feat: 分布式锁-故障转移机制

This commit is contained in:
tbphp
2025-07-08 22:02:39 +08:00
parent a159debd77
commit 2d669b7ece
2 changed files with 48 additions and 39 deletions

View File

@@ -77,9 +77,8 @@ func NewApp(params AppParams) *App {
// Start runs the application, it is a non-blocking call. // Start runs the application, it is a non-blocking call.
func (a *App) Start() error { func (a *App) Start() error {
// Perform leader election first. This is a blocking call. if err := a.leaderService.Start(); err != nil {
if err := a.leaderService.ElectLeader(); err != nil { return fmt.Errorf("leader service failed to start: %w", err)
return fmt.Errorf("leader election failed: %w", err)
} }
if a.leaderService.IsLeader() { if a.leaderService.IsLeader() {

View File

@@ -17,7 +17,6 @@ const (
leaderLockKey = "cluster:leader" leaderLockKey = "cluster:leader"
leaderLockTTL = 30 * time.Second leaderLockTTL = 30 * time.Second
leaderRenewalInterval = 10 * time.Second leaderRenewalInterval = 10 * time.Second
electionTimeout = 15 * time.Second
) )
const renewLockScript = ` const renewLockScript = `
@@ -54,51 +53,45 @@ func NewLeaderService(s store.Store) *LeaderService {
isSingleNode: !isDistributed, isSingleNode: !isDistributed,
} }
if service.isSingleNode { if service.isSingleNode {
logrus.Info("Running in single-node mode.") logrus.Debug("Running in single-node mode. Assuming leadership.")
service.isLeader.Store(true) service.isLeader.Store(true)
} else { } else {
logrus.Info("Running in distributed mode.") logrus.Debug("Running in distributed mode.")
} }
return service return service
} }
// ElectLeader attempts to become the cluster leader. This is a blocking call. // Start performs an initial leader election and starts the background leadership maintenance loop.
func (s *LeaderService) ElectLeader() error { func (s *LeaderService) Start() error {
if s.isSingleNode { if s.isSingleNode {
logrus.Info("In single-node mode, leadership is assumed. Skipping election.") logrus.Info("In single-node mode, leadership is assumed. Skipping election process.")
return nil return nil
} }
logrus.WithField("nodeID", s.nodeID).Debug("Attempting to acquire leadership...") logrus.WithField("nodeID", s.nodeID).Info("Performing initial leader election...")
if err := s.tryToBeLeader(); err != nil {
acquired, err := s.acquireLock() return fmt.Errorf("initial leader election failed: %w", err)
if err != nil {
return fmt.Errorf("failed to acquire leader lock: %w", err)
} }
if acquired { s.wg.Add(1)
logrus.WithField("nodeID", s.nodeID).Info("Successfully acquired leadership. Starting renewal process.") go s.maintainLeadershipLoop()
s.isLeader.Store(true)
s.wg.Add(1)
go s.renewalLoop()
} else {
logrus.WithField("nodeID", s.nodeID).Info("Another node is already the leader.")
s.isLeader.Store(false)
}
return nil return nil
} }
// Stop gracefully stops the leader renewal process if this node is the leader. // Stop gracefully stops the leadership maintenance process.
func (s *LeaderService) Stop() { func (s *LeaderService) Stop() {
if s.isSingleNode || !s.isLeader.Load() { if s.isSingleNode {
return return
} }
logrus.Info("Stopping leader renewal process...") logrus.Info("Stopping leadership maintenance process...")
close(s.stopChan) close(s.stopChan)
s.wg.Wait() s.wg.Wait()
s.releaseLock()
logrus.Info("Leader renewal process stopped.") if s.isLeader.Load() {
s.releaseLock()
}
logrus.Info("Leadership maintenance process stopped.")
} }
// IsLeader returns true if the current node is the leader. // IsLeader returns true if the current node is the leader.
@@ -106,28 +99,48 @@ func (s *LeaderService) IsLeader() bool {
return s.isLeader.Load() return s.isLeader.Load()
} }
// renewalLoop is the background process that keeps the leader lock alive. // maintainLeadershipLoop is the background process that keeps trying to acquire or renew the lock.
func (s *LeaderService) renewalLoop() { func (s *LeaderService) maintainLeadershipLoop() {
defer s.wg.Done() defer s.wg.Done()
ticker := time.NewTicker(leaderRenewalInterval) ticker := time.NewTicker(leaderRenewalInterval)
defer ticker.Stop() defer ticker.Stop()
logrus.Info("Leadership maintenance loop started.")
for { for {
select { select {
case <-ticker.C: case <-ticker.C:
if err := s.renewLock(); err != nil { if err := s.tryToBeLeader(); err != nil {
logrus.WithError(err).Error("Failed to renew leader lock, relinquishing leadership.") logrus.WithError(err).Warn("Error during leadership maintenance cycle.")
s.isLeader.Store(false)
return
} }
logrus.Debug("Successfully renewed leader lock.")
case <-s.stopChan: case <-s.stopChan:
logrus.Info("Leader renewal loop stopping.") logrus.Info("Leadership maintenance loop stopping.")
return return
} }
} }
} }
// tryToBeLeader is an idempotent function that attempts to acquire or renew the lock.
func (s *LeaderService) tryToBeLeader() error {
if s.isLeader.Load() {
err := s.renewLock()
if err != nil {
logrus.WithError(err).Error("Failed to renew leader lock, relinquishing leadership.")
s.isLeader.Store(false)
}
return nil
}
acquired, err := s.acquireLock()
if err != nil {
return fmt.Errorf("failed to acquire lock: %w", err)
}
if acquired {
logrus.WithField("nodeID", s.nodeID).Info("Successfully acquired leadership.")
s.isLeader.Store(true)
}
return nil
}
func (s *LeaderService) acquireLock() (bool, error) { func (s *LeaderService) acquireLock() (bool, error) {
return s.store.SetNX(leaderLockKey, []byte(s.nodeID), leaderLockTTL) return s.store.SetNX(leaderLockKey, []byte(s.nodeID), leaderLockTTL)
} }
@@ -146,9 +159,6 @@ func (s *LeaderService) renewLock() error {
} }
func (s *LeaderService) releaseLock() { func (s *LeaderService) releaseLock() {
if !s.isLeader.Load() {
return
}
luaStore := s.store.(store.LuaScripter) luaStore := s.store.(store.LuaScripter)
if _, err := luaStore.Eval(releaseLockScript, []string{leaderLockKey}, s.nodeID); err != nil { if _, err := luaStore.Eval(releaseLockScript, []string{leaderLockKey}, s.nodeID); err != nil {
logrus.WithError(err).Error("Failed to release leader lock on shutdown.") logrus.WithError(err).Error("Failed to release leader lock on shutdown.")