Files
gpt-load/internal/services/leader_service.go
2025-07-08 21:10:06 +08:00

185 lines
4.5 KiB
Go

package services
import (
"crypto/rand"
"encoding/hex"
"sync"
"sync/atomic"
"time"
"gpt-load/internal/store"
"github.com/sirupsen/logrus"
)
// Leader election settings.
const (
// leaderLockKey is the store key whose value is the node ID of the current leader.
leaderLockKey = "cluster:leader"
// leaderLockTTL is the expiry applied to the lock key when it is acquired or renewed.
leaderLockTTL = 30 * time.Second
// leaderRenewalInterval is the period of the election loop ticker. On every tick
// the node either renews the lock (if it is the leader) or tries to acquire it.
leaderRenewalInterval = 10 * time.Second
)
// renewLockScript is the Lua script used for atomic lock renewal.
// It resets the expiry of the lock key only when the value currently stored
// under that key equals the caller's node ID.
//
// KEYS[1]: lock key, ARGV[1]: node ID, ARGV[2]: TTL in seconds.
// Returns the result of the "expire" call when the value matches, otherwise 0.
const renewLockScript = `
if redis.call("get", KEYS[1]) == ARGV[1] then
return redis.call("expire", KEYS[1], ARGV[2])
else
return 0
end`
// releaseLockScript is the Lua script used for atomic lock release.
// It deletes the lock key only when the value currently stored under that key
// equals the caller's node ID, so a node never removes a lock it does not hold.
//
// KEYS[1]: lock key, ARGV[1]: node ID.
// Returns the result of the "del" call when the value matches, otherwise 0.
const releaseLockScript = `
if redis.call("get", KEYS[1]) == ARGV[1] then
return redis.call("del", KEYS[1])
else
return 0
end`
// LeaderService provides a mechanism for electing a single leader in a cluster.
// Leadership is represented by a lock key in the shared store; when the store
// does not implement store.LuaScripter the service runs in single-node mode and
// is always the leader.
type LeaderService struct {
// store is the backing store used to acquire (SetNX) and, via
// store.LuaScripter, renew and release the leader lock.
store store.Store
// nodeID is the value written to the lock key to identify this node.
nodeID string
// isLeader holds the current leadership state; it is read by IsLeader.
isLeader atomic.Bool
// stopChan is closed by Stop to make the election loop exit.
stopChan chan struct{}
// wg tracks the election loop goroutine so Stop can wait for it.
wg sync.WaitGroup
// isSingleNode is true when the store does not implement store.LuaScripter;
// Start and Stop are no-ops in that mode.
isSingleNode bool
// firstElectionDone is closed after the first election attempt (or
// immediately in single-node mode); IsLeader blocks until it is closed.
firstElectionDone chan struct{}
// firstElectionOnce ensures firstElectionDone is closed only once by the
// election attempts.
firstElectionOnce sync.Once
}
// NewLeaderService builds a LeaderService on top of the given store.
//
// The mode is picked from the store's capabilities: a store implementing
// store.LuaScripter puts the service in distributed mode (leadership is decided
// later by the election loop), while any other store puts it in single-node
// mode, in which the service is marked leader right away and the first-election
// signal is closed immediately.
func NewLeaderService(s store.Store) *LeaderService {
	svc := &LeaderService{
		store:             s,
		nodeID:            generateNodeID(),
		stopChan:          make(chan struct{}),
		firstElectionDone: make(chan struct{}),
	}

	// Lua scripting support is what distinguishes a distributed store here.
	_, supportsLua := s.(store.LuaScripter)
	svc.isSingleNode = !supportsLua

	if supportsLua {
		logrus.Info("Store supports Lua, running in distributed mode.")
		return svc
	}

	logrus.Info("Store does not support Lua, running in single-node mode. Assuming leadership.")
	svc.isLeader.Store(true)
	close(svc.firstElectionDone)
	return svc
}
// Start launches the background election loop.
// It does nothing in single-node mode, where leadership is already settled.
func (s *LeaderService) Start() {
	if !s.isSingleNode {
		// Register the goroutine before spawning it so Stop can wait for it.
		s.wg.Add(1)
		go s.electionLoop()
	}
}
// Stop gracefully stops the leader election process: it signals the election
// loop to exit and blocks until the loop goroutine has finished.
// It does nothing in single-node mode.
//
// Calling Stop again after it has returned is safe: the stop channel is closed
// only once, because closing an already-closed channel panics. Stop is still
// not intended to be called from several goroutines at the same time.
func (s *LeaderService) Stop() {
	if s.isSingleNode {
		return
	}
	select {
	case <-s.stopChan:
		// Already closed by an earlier Stop call; nothing to signal.
	default:
		close(s.stopChan)
	}
	s.wg.Wait()
}
// IsLeader reports whether this node currently holds leadership.
// The call waits for the first-election signal before answering, so in
// distributed mode it blocks until the first election attempt has finished;
// in single-node mode the signal is already closed at construction time.
func (s *LeaderService) IsLeader() bool {
	<-s.firstElectionDone
	leader := s.isLeader.Load()
	return leader
}
// electionLoop runs in its own goroutine for the lifetime of the service.
// It makes one election attempt immediately, then another on every
// leaderRenewalInterval tick, until stopChan is closed.
//
// On shutdown, if this node is the leader, the leadership flag is cleared and
// the lock is released so that IsLeader no longer reports leadership for a lock
// this node has given up. The flag is cleared before the release so there is no
// window in which another node can hold the lock while this one still claims
// to be leader.
func (s *LeaderService) electionLoop() {
	defer s.wg.Done()
	logrus.WithField("nodeID", s.nodeID).Info("Starting leader election loop...")

	// Attempt to acquire leadership immediately on start.
	s.tryToBeLeader()

	ticker := time.NewTicker(leaderRenewalInterval)
	defer ticker.Stop()

	for {
		select {
		case <-ticker.C:
			s.tryToBeLeader()
		case <-s.stopChan:
			logrus.Info("Stopping leader election loop...")
			// Swap returns the previous value: release only if we were leader.
			if s.isLeader.Swap(false) {
				s.releaseLock()
			}
			return
		}
	}
}
// tryToBeLeader performs a single election step.
// A node that is not the leader tries to acquire the lock; a node that is the
// leader tries to renew it and gives up leadership if the renewal fails.
// Whatever the outcome, the first call closes firstElectionDone on return so
// that callers blocked in IsLeader are released.
func (s *LeaderService) tryToBeLeader() {
	defer s.firstElectionOnce.Do(func() {
		close(s.firstElectionDone)
	})

	if !s.isLeader.Load() {
		won, err := s.acquireLock()
		switch {
		case err != nil:
			logrus.WithError(err).Error("Error trying to acquire leader lock.")
			s.isLeader.Store(false)
		case won:
			logrus.WithField("nodeID", s.nodeID).Info("Successfully acquired leader lock.")
			s.isLeader.Store(true)
		}
		return
	}

	// Already leader: keep the lock alive, or step down if that is not possible.
	if err := s.renewLock(); err != nil {
		logrus.WithError(err).Error("Failed to renew leader lock, relinquishing leadership.")
		s.isLeader.Store(false)
	}
}
// acquireLock tries to claim the leader lock by calling the store's SetNX on
// leaderLockKey with this node's ID as the value and leaderLockTTL as the
// expiry. The result of SetNX is returned unchanged.
func (s *LeaderService) acquireLock() (bool, error) {
	owner := []byte(s.nodeID)
	return s.store.SetNX(leaderLockKey, owner, leaderLockTTL)
}
// renewLock runs renewLockScript to extend the lock's TTL to leaderLockTTL
// (in whole seconds), provided the lock still carries this node's ID.
// It returns the Eval error if the script fails, store.ErrNotFound if the
// script result is not a non-zero int64 (the lock is no longer ours), and nil
// on success.
func (s *LeaderService) renewLock() error {
	// The assertion cannot fail in distributed mode: NewLeaderService only
	// selects that mode when the store implements store.LuaScripter.
	scripter := s.store.(store.LuaScripter)

	result, err := scripter.Eval(renewLockScript, []string{leaderLockKey}, s.nodeID, int(leaderLockTTL.Seconds()))
	if err != nil {
		return err
	}

	if renewed, isInt := result.(int64); isInt && renewed != 0 {
		return nil
	}
	// Not our lock anymore.
	return store.ErrNotFound
}
// releaseLock runs releaseLockScript, which deletes the leader lock only if it
// still carries this node's ID. The outcome is logged; a failure is not
// returned to the caller.
func (s *LeaderService) releaseLock() {
	// The assertion cannot fail in distributed mode: NewLeaderService only
	// selects that mode when the store implements store.LuaScripter.
	scripter := s.store.(store.LuaScripter)

	_, err := scripter.Eval(releaseLockScript, []string{leaderLockKey}, s.nodeID)
	if err != nil {
		logrus.WithError(err).Error("Failed to release leader lock on shutdown.")
		return
	}
	logrus.Info("Successfully released leader lock.")
}
// generateNodeID returns an identifier for this node: 16 random bytes from
// crypto/rand, hex-encoded into a 32-character string. If the random source
// fails, it falls back to "node-" followed by the current time in RFC 3339
// format with nanoseconds.
func generateNodeID() string {
	var raw [16]byte
	if _, err := rand.Read(raw[:]); err != nil {
		return "node-" + time.Now().Format(time.RFC3339Nano)
	}
	return hex.EncodeToString(raw[:])
}