feat: refactor the distributed lock into a global service

tbphp
2025-07-08 18:33:39 +08:00
parent c6495ffd04
commit 7340cdded1
4 changed files with 179 additions and 91 deletions

View File

@@ -31,6 +31,7 @@ type App struct {
 	keyCronService    *services.KeyCronService
 	keyValidationPool *services.KeyValidationPool
 	keyPoolProvider   *keypool.KeyProvider
+	leaderService     *services.LeaderService
 	proxyServer       *proxy.ProxyServer
 	storage           store.Store
 	db                *gorm.DB
@@ -49,6 +50,7 @@ type AppParams struct {
 	KeyCronService    *services.KeyCronService
 	KeyValidationPool *services.KeyValidationPool
 	KeyPoolProvider   *keypool.KeyProvider
+	LeaderService     *services.LeaderService
 	ProxyServer       *proxy.ProxyServer
 	Storage           store.Store
 	DB                *gorm.DB
@@ -65,6 +67,7 @@ func NewApp(params AppParams) *App {
 		keyCronService:    params.KeyCronService,
 		keyValidationPool: params.KeyValidationPool,
 		keyPoolProvider:   params.KeyPoolProvider,
+		leaderService:     params.LeaderService,
 		proxyServer:       params.ProxyServer,
 		storage:           params.Storage,
 		db:                params.DB,
@@ -90,6 +93,7 @@ func (a *App) Start() error {
 	// Start background services
 	a.startRequestLogger()
 	a.logCleanupService.Start()
+	a.leaderService.Start()
 	a.keyValidationPool.Start()
 	a.keyCronService.Start()
@@ -131,6 +135,7 @@ func (a *App) Stop(ctx context.Context) {
 	// Stop background services
 	a.keyCronService.Stop()
 	a.keyValidationPool.Stop()
+	a.leaderService.Stop()
 	a.logCleanupService.Stop()
 
 	// Close resources

View File

@@ -56,6 +56,9 @@ func BuildContainer() (*dig.Container, error) {
 	if err := container.Provide(services.NewLogCleanupService); err != nil {
 		return nil, err
 	}
+	if err := container.Provide(services.NewLeaderService); err != nil {
+		return nil, err
+	}
 	if err := container.Provide(keypool.NewProvider); err != nil {
 		return nil, err
 	}
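Registering the constructor with dig is what makes LeaderService a process-wide singleton: dig memoizes each provider's result, so KeyCronService and any future consumer receive the same instance. A standalone sketch of that behavior (the Leader type below is a stand-in for illustration, not the project's type):

package main

import (
	"fmt"

	"go.uber.org/dig"
)

// Leader stands in for services.LeaderService to show dig's singleton
// behavior: Provide registers a constructor, and dig calls it at most once.
type Leader struct{ id int }

func main() {
	c := dig.New()
	next := 0
	_ = c.Provide(func() *Leader { next++; return &Leader{id: next} })

	_ = c.Invoke(func(a *Leader) { fmt.Println("first consumer:", a.id) })
	_ = c.Invoke(func(b *Leader) { fmt.Println("second consumer:", b.id) }) // same id: constructed once
}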

View File

@@ -4,7 +4,6 @@ import (
 	"context"
 	"gpt-load/internal/config"
 	"gpt-load/internal/models"
-	"gpt-load/internal/store"
 	"sync"
 	"time"
@@ -12,38 +11,37 @@ import (
 	"gorm.io/gorm"
 )
 
-const (
-	leaderLockKey = "cron:leader:key_validation"
-	leaderLockTTL = 10 * time.Minute
-)
-
 // KeyCronService is responsible for periodically submitting keys for validation.
 type KeyCronService struct {
 	DB              *gorm.DB
 	SettingsManager *config.SystemSettingsManager
 	Pool            *KeyValidationPool
-	Store           store.Store
+	LeaderService   *LeaderService
 	stopChan        chan struct{}
 	wg              sync.WaitGroup
 }
 
 // NewKeyCronService creates a new KeyCronService.
-func NewKeyCronService(db *gorm.DB, settingsManager *config.SystemSettingsManager, pool *KeyValidationPool, store store.Store) *KeyCronService {
+func NewKeyCronService(
+	db *gorm.DB,
+	settingsManager *config.SystemSettingsManager,
+	pool *KeyValidationPool,
+	leaderService *LeaderService,
+) *KeyCronService {
 	return &KeyCronService{
 		DB:              db,
 		SettingsManager: settingsManager,
 		Pool:            pool,
-		Store:           store,
+		LeaderService:   leaderService,
 		stopChan:        make(chan struct{}),
 	}
 }
 
-// Start begins the leader election and cron job execution.
+// Start begins the cron job execution.
 func (s *KeyCronService) Start() {
-	logrus.Info("Starting KeyCronService with leader election...")
+	logrus.Info("Starting KeyCronService...")
 	s.wg.Add(1)
-	go s.leaderElectionLoop()
+	go s.runLoop()
 }
 
 // Stop stops the cron job.
@@ -54,80 +52,20 @@ func (s *KeyCronService) Stop() {
 	logrus.Info("KeyCronService stopped.")
 }
 
-// leaderElectionLoop is the main loop that attempts to acquire leadership.
-func (s *KeyCronService) leaderElectionLoop() {
+func (s *KeyCronService) runLoop() {
 	defer s.wg.Done()
 
-	for {
-		select {
-		case <-s.stopChan:
-			return
-		default:
-			isLeader, err := s.tryAcquireLock()
-			if err != nil {
-				logrus.Errorf("KeyCronService: Error trying to acquire leader lock: %v. Retrying in 1 minute.", err)
-				time.Sleep(1 * time.Minute)
-				continue
-			}
-			if isLeader {
-				s.runAsLeader()
-			} else {
-				logrus.Debug("KeyCronService: Not the leader. Standing by.")
-				time.Sleep(leaderLockTTL)
-			}
-		}
-	}
-}
-
-// tryAcquireLock attempts to set a key in the store, effectively acquiring a lock.
-func (s *KeyCronService) tryAcquireLock() (bool, error) {
-	exists, err := s.Store.Exists(leaderLockKey)
-	if err != nil {
-		return false, err
-	}
-	if exists {
-		return false, nil // Lock is held by another node
-	}
-	lockValue := []byte(time.Now().String())
-	err = s.Store.Set(leaderLockKey, lockValue, leaderLockTTL)
-	if err != nil {
-		return false, err
-	}
-	logrus.Info("KeyCronService: Successfully acquired leader lock.")
-	return true, nil
-}
-
-func (s *KeyCronService) runAsLeader() {
-	logrus.Info("KeyCronService: Running as leader.")
-	defer func() {
-		if err := s.Store.Delete(leaderLockKey); err != nil {
-			logrus.Errorf("KeyCronService: Failed to release leader lock: %v", err)
-		}
-		logrus.Info("KeyCronService: Released leader lock.")
-	}()
-
 	// Run once on start
 	s.submitValidationJobs()
 
 	ticker := time.NewTicker(5 * time.Minute)
 	defer ticker.Stop()
 
-	heartbeat := time.NewTicker(leaderLockTTL / 2)
-	defer heartbeat.Stop()
-
 	for {
 		select {
 		case <-ticker.C:
-			s.submitValidationJobs()
-		case <-heartbeat.C:
-			logrus.Debug("KeyCronService: Renewing leader lock.")
-			err := s.Store.Set(leaderLockKey, []byte(time.Now().String()), leaderLockTTL)
-			if err != nil {
-				logrus.Errorf("KeyCronService: Failed to renew leader lock: %v. Relinquishing leadership.", err)
-				return
+			if s.LeaderService.IsLeader() {
+				logrus.Info("KeyCronService: Running as leader, submitting validation jobs.")
+				s.submitValidationJobs()
+			} else {
+				logrus.Debug("KeyCronService: Not the leader. Standing by.")
 			}
 		case <-s.stopChan:
 			return
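With the election extracted into LeaderService, the cron loop reduces to a plain ticker plus a cheap IsLeader check, and every other background job in the process can reuse the same shared instance instead of rolling its own lock handling. A minimal sketch of a hypothetical second consumer (ReportService, its interval, and generateReports are invented for illustration):

package services

import "time"

// ReportService is a hypothetical second consumer of the shared LeaderService.
type ReportService struct {
	LeaderService *LeaderService
	stopChan      chan struct{}
}

func (s *ReportService) runLoop() {
	ticker := time.NewTicker(10 * time.Minute) // illustrative interval
	defer ticker.Stop()
	for {
		select {
		case <-ticker.C:
			// Same gate as KeyCronService: the ticker fires on every node,
			// but the work runs only on the elected leader.
			if s.LeaderService.IsLeader() {
				s.generateReports()
			}
		case <-s.stopChan:
			return
		}
	}
}

func (s *ReportService) generateReports() { /* hypothetical leader-only work */ }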

View File

@@ -0,0 +1,142 @@
+package services
+
+import (
+	"crypto/rand"
+	"encoding/hex"
+	"sync"
+	"sync/atomic"
+	"time"
+
+	"gpt-load/internal/store"
+
+	"github.com/sirupsen/logrus"
+)
+
+const (
+	leaderLockKey         = "cluster:leader"
+	leaderLockTTL         = 30 * time.Second
+	leaderRenewalInterval = 10 * time.Second
+	leaderElectionTimeout = 5 * time.Second
+)
+
+// LeaderService provides a mechanism for electing a single leader in a cluster.
+type LeaderService struct {
+	store    store.Store
+	nodeID   string
+	isLeader atomic.Bool
+	stopChan chan struct{}
+	wg       sync.WaitGroup
+}
+
+// NewLeaderService creates a new LeaderService.
+func NewLeaderService(store store.Store) *LeaderService {
+	return &LeaderService{
+		store:    store,
+		nodeID:   generateNodeID(),
+		stopChan: make(chan struct{}),
+	}
+}
+
+// Start begins the leader election process.
+func (s *LeaderService) Start() {
+	logrus.WithField("nodeID", s.nodeID).Info("Starting LeaderService...")
+	s.wg.Add(1)
+	go s.electionLoop()
+}
+
+// Stop gracefully stops the leader election process.
+func (s *LeaderService) Stop() {
+	logrus.Info("Stopping LeaderService...")
+	close(s.stopChan)
+	s.wg.Wait()
+	logrus.Info("LeaderService stopped.")
+}
+
+// IsLeader returns true if the current node is the leader.
+// This is a fast, local check against an atomic boolean.
+func (s *LeaderService) IsLeader() bool {
+	return s.isLeader.Load()
+}
+
+func (s *LeaderService) electionLoop() {
+	defer s.wg.Done()
+
+	// Attempt to acquire leadership immediately on start.
+	s.tryToBeLeader()
+
+	ticker := time.NewTicker(leaderRenewalInterval)
+	defer ticker.Stop()
+
+	for {
+		select {
+		case <-ticker.C:
+			s.tryToBeLeader()
+		case <-s.stopChan:
+			if s.IsLeader() {
+				s.releaseLock()
+			}
+			return
+		}
+	}
+}
+
+func (s *LeaderService) tryToBeLeader() {
+	if s.IsLeader() {
+		// Already the leader, just renew the lock.
+		if err := s.renewLock(); err != nil {
+			logrus.WithError(err).Error("Failed to renew leader lock, relinquishing leadership.")
+			s.isLeader.Store(false)
+		}
+		return
+	}
+
+	// Not the leader, try to acquire the lock.
+	acquired, err := s.acquireLock()
+	if err != nil {
+		logrus.WithError(err).Error("Error trying to acquire leader lock.")
+		s.isLeader.Store(false)
+		return
+	}
+
+	if acquired {
+		logrus.WithField("nodeID", s.nodeID).Info("Successfully acquired leader lock.")
+		s.isLeader.Store(true)
+	} else {
+		logrus.Debug("Could not acquire leader lock, another node is likely the leader.")
+		s.isLeader.Store(false)
+	}
+}
+
+func (s *LeaderService) acquireLock() (bool, error) {
+	// SetNX is an atomic operation. If the key already exists, it does nothing.
+	// This is the core of our distributed lock.
+	return s.store.SetNX(leaderLockKey, []byte(s.nodeID), leaderLockTTL)
+}
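The SetNX here is what fixes the race in the old KeyCronService code, which checked Exists and then called Set as two separate steps: two nodes could both observe "no lock" and both write. With set-if-not-exists, the check and the write are one atomic operation. Assuming a Redis-backed store, the call collapses to a single command; a sketch with go-redis (github.com/redis/go-redis/v9), not part of this diff:

package leaderdemo

import (
	"context"
	"time"

	"github.com/redis/go-redis/v9"
)

// acquireLock issues "SET cluster:leader <nodeID> NX PX 30000": Redis writes
// the key only if it is absent, and the reply says whether this call won.
func acquireLock(ctx context.Context, rdb *redis.Client, nodeID string) (bool, error) {
	return rdb.SetNX(ctx, "cluster:leader", nodeID, 30*time.Second).Result()
}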
+
+func (s *LeaderService) renewLock() error {
+	// To renew, we must ensure we are still the lock holder.
+	// A Lua check-and-set script is the safest way to do this atomically.
+	// For simplicity we issue a plain SET, which does not re-check ownership;
+	// this is acceptable only while the renewal interval stays well inside the TTL.
+	return s.store.Set(leaderLockKey, []byte(s.nodeID), leaderLockTTL)
+}
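The caveat in that comment is worth taking seriously: a plain SET re-asserts the key even if it has expired and another node has since acquired it, silently stealing the new leader's lock. The atomic alternative the comment alludes to is a compare-and-renew script that extends the TTL only while the stored value still matches this node's ID. A sketch under the same go-redis assumption (the project's store abstraction would need to expose script execution for this):

package leaderdemo

import (
	"context"
	"time"

	"github.com/redis/go-redis/v9"
)

// renewScript extends the TTL only when the lock still holds our node ID,
// so a lock that expired and changed hands is never overwritten.
var renewScript = redis.NewScript(`
if redis.call("GET", KEYS[1]) == ARGV[1] then
	return redis.call("PEXPIRE", KEYS[1], ARGV[2])
end
return 0`)

func renewLock(ctx context.Context, rdb *redis.Client, nodeID string, ttl time.Duration) (bool, error) {
	res, err := renewScript.Run(ctx, rdb, []string{"cluster:leader"}, nodeID, ttl.Milliseconds()).Int()
	if err != nil {
		return false, err
	}
	return res == 1, nil
}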
+
+func (s *LeaderService) releaseLock() {
+	// Best-effort attempt to release the lock on shutdown.
+	// The TTL will handle cases where this fails.
+	if err := s.store.Delete(leaderLockKey); err != nil {
+		logrus.WithError(err).Error("Failed to release leader lock on shutdown.")
+	} else {
+		logrus.Info("Successfully released leader lock.")
+	}
+	s.isLeader.Store(false)
+}
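releaseLock has the mirror-image race: an unconditional Delete can remove a key that already expired and now belongs to the next leader. On shutdown the blast radius is small and the TTL covers failures, as the comment notes, but the strict version is again a guarded script that deletes only when we still own the lock. Continuing the go-redis sketch above (same package and imports):

// releaseScript deletes the lock only if the value is still our node ID.
var releaseScript = redis.NewScript(`
if redis.call("GET", KEYS[1]) == ARGV[1] then
	return redis.call("DEL", KEYS[1])
end
return 0`)

func releaseLock(ctx context.Context, rdb *redis.Client, nodeID string) error {
	return releaseScript.Run(ctx, rdb, []string{"cluster:leader"}, nodeID).Err()
}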
+
+func generateNodeID() string {
+	bytes := make([]byte, 16)
+	if _, err := rand.Read(bytes); err != nil {
+		// Fallback to a timestamp-based ID if crypto/rand fails.
+		return "node-" + time.Now().Format(time.RFC3339Nano)
+	}
+	return hex.EncodeToString(bytes)
+}