feat: refactor the distributed lock into a global service
@@ -31,6 +31,7 @@ type App struct {
    keyCronService *services.KeyCronService
    keyValidationPool *services.KeyValidationPool
    keyPoolProvider *keypool.KeyProvider
    leaderService *services.LeaderService
    proxyServer *proxy.ProxyServer
    storage store.Store
    db *gorm.DB
@@ -49,6 +50,7 @@ type AppParams struct {
    KeyCronService *services.KeyCronService
    KeyValidationPool *services.KeyValidationPool
    KeyPoolProvider *keypool.KeyProvider
    LeaderService *services.LeaderService
    ProxyServer *proxy.ProxyServer
    Storage store.Store
    DB *gorm.DB
@@ -65,6 +67,7 @@ func NewApp(params AppParams) *App {
        keyCronService: params.KeyCronService,
        keyValidationPool: params.KeyValidationPool,
        keyPoolProvider: params.KeyPoolProvider,
        leaderService: params.LeaderService,
        proxyServer: params.ProxyServer,
        storage: params.Storage,
        db: params.DB,
@@ -90,6 +93,7 @@ func (a *App) Start() error {
    // Start background services
    a.startRequestLogger()
    a.logCleanupService.Start()
    a.leaderService.Start()
    a.keyValidationPool.Start()
    a.keyCronService.Start()
@@ -131,6 +135,7 @@ func (a *App) Stop(ctx context.Context) {
    // Stop background services
    a.keyCronService.Stop()
    a.keyValidationPool.Stop()
    a.leaderService.Stop()
    a.logCleanupService.Stop()

    // Close resources
@@ -56,6 +56,9 @@ func BuildContainer() (*dig.Container, error) {
    if err := container.Provide(services.NewLogCleanupService); err != nil {
        return nil, err
    }
    if err := container.Provide(services.NewLeaderService); err != nil {
        return nil, err
    }
    if err := container.Provide(keypool.NewProvider); err != nil {
        return nil, err
    }
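Because BuildContainer registers constructors with uber-go/dig, the new *LeaderService parameter on NewKeyCronService is resolved by type automatically and no call sites have to change. A minimal, self-contained sketch of that pattern, using simplified stand-in types rather than the project's real ones:

package main

import (
    "fmt"

    "go.uber.org/dig"
)

// Simplified stand-ins for store.Store and services.LeaderService; illustration only.
type Store struct{}

type LeaderService struct{ store *Store }

func NewStore() *Store { return &Store{} }

func NewLeaderService(s *Store) *LeaderService { return &LeaderService{store: s} }

func main() {
    c := dig.New()
    // Provide registers constructors; dig satisfies their parameters by type.
    if err := c.Provide(NewStore); err != nil {
        panic(err)
    }
    if err := c.Provide(NewLeaderService); err != nil {
        panic(err)
    }
    // Invoke builds the dependency graph on demand and hands over the resolved value.
    if err := c.Invoke(func(ls *LeaderService) {
        fmt.Println("LeaderService wired:", ls != nil)
    }); err != nil {
        panic(err)
    }
}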
@@ -4,7 +4,6 @@ import (
    "context"
    "gpt-load/internal/config"
    "gpt-load/internal/models"
    "gpt-load/internal/store"
    "sync"
    "time"
@@ -12,38 +11,37 @@
    "gorm.io/gorm"
)

const (
    leaderLockKey = "cron:leader:key_validation"
    leaderLockTTL = 10 * time.Minute
)

// KeyCronService is responsible for periodically submitting keys for validation.
type KeyCronService struct {
    DB *gorm.DB
    SettingsManager *config.SystemSettingsManager
    Pool *KeyValidationPool
    Store store.Store
    LeaderService *LeaderService
    stopChan chan struct{}
    wg sync.WaitGroup
}

// NewKeyCronService creates a new KeyCronService.
func NewKeyCronService(db *gorm.DB, settingsManager *config.SystemSettingsManager, pool *KeyValidationPool, store store.Store) *KeyCronService {
func NewKeyCronService(
    db *gorm.DB,
    settingsManager *config.SystemSettingsManager,
    pool *KeyValidationPool,
    leaderService *LeaderService,
) *KeyCronService {
    return &KeyCronService{
        DB: db,
        SettingsManager: settingsManager,
        Pool: pool,
        Store: store,
        LeaderService: leaderService,
        stopChan: make(chan struct{}),
    }
}

// Start begins the leader election and cron job execution.
// Start begins the cron job execution.
func (s *KeyCronService) Start() {
    logrus.Info("Starting KeyCronService with leader election...")
    logrus.Info("Starting KeyCronService...")
    s.wg.Add(1)
    go s.leaderElectionLoop()
    go s.runLoop()
}

// Stop stops the cron job.
@@ -54,80 +52,20 @@ func (s *KeyCronService) Stop() {
    logrus.Info("KeyCronService stopped.")
}

// leaderElectionLoop is the main loop that attempts to acquire leadership.
func (s *KeyCronService) leaderElectionLoop() {
func (s *KeyCronService) runLoop() {
    defer s.wg.Done()

    for {
        select {
        case <-s.stopChan:
            return
        default:
            isLeader, err := s.tryAcquireLock()
            if err != nil {
                logrus.Errorf("KeyCronService: Error trying to acquire leader lock: %v. Retrying in 1 minute.", err)
                time.Sleep(1 * time.Minute)
                continue
            }

            if isLeader {
                s.runAsLeader()
            } else {
                logrus.Debug("KeyCronService: Not the leader. Standing by.")
                time.Sleep(leaderLockTTL)
            }
        }
    }
}

// tryAcquireLock attempts to set a key in the store, effectively acquiring a lock.
func (s *KeyCronService) tryAcquireLock() (bool, error) {
    exists, err := s.Store.Exists(leaderLockKey)
    if err != nil {
        return false, err
    }
    if exists {
        return false, nil // Lock is held by another node
    }

    lockValue := []byte(time.Now().String())
    err = s.Store.Set(leaderLockKey, lockValue, leaderLockTTL)
    if err != nil {
        return false, err
    }

    logrus.Info("KeyCronService: Successfully acquired leader lock.")
    return true, nil
}

func (s *KeyCronService) runAsLeader() {
    logrus.Info("KeyCronService: Running as leader.")
    defer func() {
        if err := s.Store.Delete(leaderLockKey); err != nil {
            logrus.Errorf("KeyCronService: Failed to release leader lock: %v", err)
        }
        logrus.Info("KeyCronService: Released leader lock.")
    }()

    // Run once on start
    s.submitValidationJobs()

    ticker := time.NewTicker(5 * time.Minute)
    defer ticker.Stop()

    heartbeat := time.NewTicker(leaderLockTTL / 2)
    defer heartbeat.Stop()

    for {
        select {
        case <-ticker.C:
            if s.LeaderService.IsLeader() {
                logrus.Info("KeyCronService: Running as leader, submitting validation jobs.")
                s.submitValidationJobs()
        case <-heartbeat.C:
            logrus.Debug("KeyCronService: Renewing leader lock.")
            err := s.Store.Set(leaderLockKey, []byte(time.Now().String()), leaderLockTTL)
            if err != nil {
                logrus.Errorf("KeyCronService: Failed to renew leader lock: %v. Relinquishing leadership.", err)
                return
            } else {
                logrus.Debug("KeyCronService: Not the leader. Standing by.")
            }
        case <-s.stopChan:
            return
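This hunk interleaves removed and added lines, so the resulting control flow is easier to see reconstructed from the added lines alone. A sketch of the simplified loop, not the verbatim file (details such as whether the initial immediate run is kept may differ):

func (s *KeyCronService) runLoop() {
    defer s.wg.Done()

    ticker := time.NewTicker(5 * time.Minute)
    defer ticker.Stop()

    for {
        select {
        case <-ticker.C:
            // Every node ticks, but only the elected leader does the work.
            if s.LeaderService.IsLeader() {
                logrus.Info("KeyCronService: Running as leader, submitting validation jobs.")
                s.submitValidationJobs()
            } else {
                logrus.Debug("KeyCronService: Not the leader. Standing by.")
            }
        case <-s.stopChan:
            return
        }
    }
}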
internal/services/leader_service.go (new file, 142 lines)
@@ -0,0 +1,142 @@
package services

import (
    "crypto/rand"
    "encoding/hex"
    "sync"
    "sync/atomic"
    "time"

    "gpt-load/internal/store"

    "github.com/sirupsen/logrus"
)

const (
    leaderLockKey = "cluster:leader"
    leaderLockTTL = 30 * time.Second
    leaderRenewalInterval = 10 * time.Second
    leaderElectionTimeout = 5 * time.Second
)

// LeaderService provides a mechanism for electing a single leader in a cluster.
type LeaderService struct {
    store store.Store
    nodeID string
    isLeader atomic.Bool
    stopChan chan struct{}
    wg sync.WaitGroup
}

// NewLeaderService creates a new LeaderService.
func NewLeaderService(store store.Store) *LeaderService {
    return &LeaderService{
        store: store,
        nodeID: generateNodeID(),
        stopChan: make(chan struct{}),
    }
}

// Start begins the leader election process.
func (s *LeaderService) Start() {
    logrus.WithField("nodeID", s.nodeID).Info("Starting LeaderService...")
    s.wg.Add(1)
    go s.electionLoop()
}

// Stop gracefully stops the leader election process.
func (s *LeaderService) Stop() {
    logrus.Info("Stopping LeaderService...")
    close(s.stopChan)
    s.wg.Wait()
    logrus.Info("LeaderService stopped.")
}

// IsLeader returns true if the current node is the leader.
// This is a fast, local check against an atomic boolean.
func (s *LeaderService) IsLeader() bool {
    return s.isLeader.Load()
}

func (s *LeaderService) electionLoop() {
    defer s.wg.Done()

    // Attempt to acquire leadership immediately on start.
    s.tryToBeLeader()

    ticker := time.NewTicker(leaderRenewalInterval)
    defer ticker.Stop()

    for {
        select {
        case <-ticker.C:
            s.tryToBeLeader()
        case <-s.stopChan:
            if s.IsLeader() {
                s.releaseLock()
            }
            return
        }
    }
}

func (s *LeaderService) tryToBeLeader() {
    if s.IsLeader() {
        // Already the leader, just renew the lock.
        if err := s.renewLock(); err != nil {
            logrus.WithError(err).Error("Failed to renew leader lock, relinquishing leadership.")
            s.isLeader.Store(false)
        }
        return
    }

    // Not the leader, try to acquire the lock.
    acquired, err := s.acquireLock()
    if err != nil {
        logrus.WithError(err).Error("Error trying to acquire leader lock.")
        s.isLeader.Store(false)
        return
    }

    if acquired {
        logrus.WithField("nodeID", s.nodeID).Info("Successfully acquired leader lock.")
        s.isLeader.Store(true)
    } else {
        logrus.Debug("Could not acquire leader lock, another node is likely the leader.")
        s.isLeader.Store(false)
    }
}

func (s *LeaderService) acquireLock() (bool, error) {
    // SetNX is an atomic operation. If the key already exists, it does nothing.
    // This is the core of our distributed lock.
    return s.store.SetNX(leaderLockKey, []byte(s.nodeID), leaderLockTTL)
}
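On a Redis-backed store, SetNX maps onto Redis's atomic SET with the NX and PX options. A minimal, self-contained sketch of that call through go-redis v9 — the Redis backing and the client library are assumptions for illustration; the project's store.Store implementations may differ (for example, an in-memory store for single-node deployments):

package main

import (
    "context"
    "fmt"
    "time"

    "github.com/redis/go-redis/v9"
)

func main() {
    ctx := context.Background()
    rdb := redis.NewClient(&redis.Options{Addr: "localhost:6379"})

    // SET key value NX PX ttl: succeeds only if the key does not exist yet,
    // which is what makes the acquisition step of the lock atomic.
    ok, err := rdb.SetNX(ctx, "cluster:leader", "node-a", 30*time.Second).Result()
    if err != nil {
        panic(err)
    }
    fmt.Println("acquired:", ok)
}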
func (s *LeaderService) renewLock() error {
    // To renew safely, we should verify we are still the lock holder; a Lua
    // script is the safest way to do that atomically. For simplicity this just
    // re-SETs the key, relying on the election loop timing (renewal interval
    // well below the TTL) to avoid overwriting another node's lock.
    return s.store.Set(leaderLockKey, []byte(s.nodeID), leaderLockTTL)
}
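The atomic check-and-renew the comment refers to is typically a small Lua script, so the TTL is only extended while the stored value still matches this node's ID. A sketch under the same Redis/go-redis assumption as above; the helper below is hypothetical and not part of this commit:

package lockutil

import (
    "context"
    "time"

    "github.com/redis/go-redis/v9"
)

// renewScript extends the TTL only if the caller still owns the lock.
var renewScript = redis.NewScript(`
if redis.call("GET", KEYS[1]) == ARGV[1] then
    return redis.call("PEXPIRE", KEYS[1], ARGV[2])
end
return 0
`)

func renewIfOwner(ctx context.Context, rdb *redis.Client, key, nodeID string, ttl time.Duration) (bool, error) {
    n, err := renewScript.Run(ctx, rdb, []string{key}, nodeID, ttl.Milliseconds()).Int()
    if err != nil {
        return false, err
    }
    return n == 1, nil
}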
func (s *LeaderService) releaseLock() {
    // Best-effort attempt to release the lock on shutdown.
    // The TTL will handle cases where this fails.
    if err := s.store.Delete(leaderLockKey); err != nil {
        logrus.WithError(err).Error("Failed to release leader lock on shutdown.")
    } else {
        logrus.Info("Successfully released leader lock.")
    }
    s.isLeader.Store(false)
}
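An unconditional Delete can, in a narrow window, remove a lock that has already expired and been re-acquired by another node; the usual guard is the same compare-then-act Lua pattern as in the renewal sketch above. A short sketch reusing that sketch's imports (hypothetical helper, same Redis assumption):

// releaseScript deletes the lock only if the stored value is still our node ID.
var releaseScript = redis.NewScript(`
if redis.call("GET", KEYS[1]) == ARGV[1] then
    return redis.call("DEL", KEYS[1])
end
return 0
`)

func releaseIfOwner(ctx context.Context, rdb *redis.Client, key, nodeID string) error {
    return releaseScript.Run(ctx, rdb, []string{key}, nodeID).Err()
}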
func generateNodeID() string {
    bytes := make([]byte, 16)
    if _, err := rand.Read(bytes); err != nil {
        // Fallback to a timestamp-based ID if crypto/rand fails
        return "node-" + time.Now().Format(time.RFC3339Nano)
    }
    return hex.EncodeToString(bytes)
}