refactor: rework the cluster master/slave node mode
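
Replace the Redis-based leader election (store.LeaderLock) with a static
master/slave split driven by a new IS_SLAVE environment variable. The master
node performs the one-time startup work (database migration, system-settings
initialization, loading keys into Redis) and runs the master-only background
services (request logging, log cleanup, cron checker); slave nodes only
initialize the settings manager. leader_lock.go, the LuaScripter store
interface, and RedisStore.Eval are removed.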
@@ -1,6 +1,7 @@
 # Server configuration
 PORT=3001
 HOST=0.0.0.0
+IS_SLAVE=false
 
 # Server read, write, and idle connection timeouts (seconds)
 SERVER_READ_TIMEOUT=120
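
Note (not part of the diff): under the new flag, a minimal two-node deployment
might be configured as sketched below — illustrative values only, and it
assumes both nodes point at the same Redis and database so a slave can see
what the master initializes.

    # master node: runs migrations, key loading, and the background services
    IS_SLAVE=false
    PORT=3001

    # slave node: skips one-time initialization and master-only services
    IS_SLAVE=true
    PORT=3002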
@@ -33,7 +33,6 @@ type App struct {
 	requestLogService *services.RequestLogService
 	cronChecker       *keypool.CronChecker
 	keyPoolProvider   *keypool.KeyProvider
-	leaderLock        *store.LeaderLock
 	proxyServer       *proxy.ProxyServer
 	storage           store.Store
 	db                *gorm.DB
@@ -51,7 +50,6 @@ type AppParams struct {
 	RequestLogService *services.RequestLogService
 	CronChecker       *keypool.CronChecker
 	KeyPoolProvider   *keypool.KeyProvider
-	LeaderLock        *store.LeaderLock
 	ProxyServer       *proxy.ProxyServer
 	Storage           store.Store
 	DB                *gorm.DB
@@ -68,7 +66,6 @@ func NewApp(params AppParams) *App {
 		requestLogService: params.RequestLogService,
 		cronChecker:       params.CronChecker,
 		keyPoolProvider:   params.KeyPoolProvider,
-		leaderLock:        params.LeaderLock,
 		proxyServer:       params.ProxyServer,
 		storage:           params.Storage,
 		db:                params.DB,
@@ -77,26 +74,9 @@ func NewApp(params AppParams) *App {
 
 // Start runs the application, it is a non-blocking call.
 func (a *App) Start() error {
-	// Start the Leader Lock service and wait for the election result
-	if err := a.leaderLock.Start(); err != nil {
-		return fmt.Errorf("leader service failed to start: %w", err)
-	}
-
-	// The leader node performs initialization; followers wait
-	if a.leaderLock.IsLeader() {
-		logrus.Info("Leader mode. Performing initial one-time tasks...")
-		acquired, err := a.leaderLock.AcquireInitializingLock()
-		if err != nil {
-			return fmt.Errorf("failed to acquire initializing lock: %w", err)
-		}
-		if !acquired {
-			logrus.Warn("Could not acquire initializing lock, another leader might be active. Switching to follower mode for initialization.")
-			if err := a.leaderLock.WaitForInitializationToComplete(); err != nil {
-				return fmt.Errorf("failed to wait for initialization as a fallback follower: %w", err)
-			}
-		} else {
-			defer a.leaderLock.ReleaseInitializingLock()
+	// The master node performs initialization
+	if a.configManager.IsMaster() {
+		logrus.Info("Starting as Master Node.")
 
 		// Database migration
 		if err := a.db.AutoMigrate(
@@ -116,20 +96,21 @@ func (a *App) Start() error {
 		}
 		logrus.Info("System settings initialized in DB.")
 
-		a.settingsManager.Initialize(a.storage, a.groupManager, a.leaderLock)
+		a.settingsManager.Initialize(a.storage, a.groupManager, a.configManager.IsMaster())
 
 		// Load keys from the database into Redis
 		if err := a.keyPoolProvider.LoadKeysFromDB(); err != nil {
 			return fmt.Errorf("failed to load keys into key pool: %w", err)
 		}
-		logrus.Debug("API keys loaded into Redis cache by leader.")
-		}
+		logrus.Debug("API keys loaded into Redis cache by master.")
+
+		// Services started only on the master node
+		a.requestLogService.Start()
+		a.logCleanupService.Start()
+		a.cronChecker.Start()
 	} else {
-		logrus.Info("Follower Mode. Waiting for leader to complete initialization.")
-		if err := a.leaderLock.WaitForInitializationToComplete(); err != nil {
-			return fmt.Errorf("follower failed to start: %w", err)
-		}
-		a.settingsManager.Initialize(a.storage, a.groupManager, a.leaderLock)
+		logrus.Info("Starting as Slave Node.")
+		a.settingsManager.Initialize(a.storage, a.groupManager, a.configManager.IsMaster())
 	}
 
 	// Display configuration and start all background services
@@ -137,10 +118,6 @@ func (a *App) Start() error {
 
 	a.groupManager.Initialize()
 
-	a.requestLogService.Start()
-	a.logCleanupService.Start()
-	a.cronChecker.Start()
-
 	// Create HTTP server
 	serverConfig := a.configManager.GetEffectiveServerConfig()
 	a.httpServer = &http.Server{
@@ -174,8 +151,6 @@ func (a *App) Stop(ctx context.Context) {
 
 	// Dynamically compute the HTTP shutdown timeout, reserving a fixed 5 seconds for background services
 	httpShutdownTimeout := totalTimeout - 5*time.Second
-
-	// Create a dedicated context for the HTTP server's graceful shutdown
 	httpShutdownCtx, cancelHttpShutdown := context.WithTimeout(context.Background(), httpShutdownTimeout)
 	defer cancelHttpShutdown()
 
@@ -190,14 +165,18 @@ func (a *App) Stop(ctx context.Context) {
 
 	// Continue shutting down the other background services with the original total-timeout context
 	stoppableServices := []func(context.Context){
-		a.cronChecker.Stop,
-		a.leaderLock.Stop,
-		a.logCleanupService.Stop,
-		a.requestLogService.Stop,
 		a.groupManager.Stop,
 		a.settingsManager.Stop,
 	}
 
+	if serverConfig.IsMaster {
+		stoppableServices = append(stoppableServices,
+			a.cronChecker.Stop,
+			a.logCleanupService.Stop,
+			a.requestLogService.Stop,
+		)
+	}
+
 	var wg sync.WaitGroup
 	wg.Add(len(stoppableServices))
 
@@ -221,7 +200,6 @@ func (a *App) Stop(ctx context.Context) {
 		logrus.Warn("Shutdown timed out, some services may not have stopped gracefully.")
 	}
 
-	// Step 3: Close storage connection last.
 	if a.storage != nil {
 		a.storage.Close()
 	}
@@ -70,6 +70,7 @@ func (m *Manager) ReloadConfig() error {
 
 	config := &Config{
 		Server: types.ServerConfig{
+			IsMaster:     !utils.ParseBoolean(os.Getenv("IS_SLAVE"), true),
 			Port:         utils.ParseInteger(os.Getenv("PORT"), 3001),
 			Host:         utils.GetEnvOrDefault("HOST", "0.0.0.0"),
 			ReadTimeout:  utils.ParseInteger(os.Getenv("SERVER_READ_TIMEOUT"), 120),
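
Note: the fallback passed to utils.ParseBoolean above is true. Assuming the
helper returns its fallback when the variable is unset or unparseable (an
assumption — ParseBoolean's source is not in this diff), an unset IS_SLAVE
yields IsMaster == false, which is why .env.example now sets IS_SLAVE=false
explicitly. A minimal runnable sketch of that assumed behavior:

    package main

    import (
    	"fmt"
    	"os"
    	"strconv"
    )

    // parseBoolean models the assumed semantics of utils.ParseBoolean:
    // return the fallback when the value is empty or unparseable.
    func parseBoolean(raw string, fallback bool) bool {
    	v, err := strconv.ParseBool(raw)
    	if err != nil {
    		return fallback
    	}
    	return v
    }

    func main() {
    	// Mirrors ReloadConfig: IsMaster is the negation of IS_SLAVE.
    	isMaster := !parseBoolean(os.Getenv("IS_SLAVE"), true)
    	fmt.Println("IsMaster:", isMaster) // false when IS_SLAVE is unset, under this assumption
    }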
@@ -111,6 +112,11 @@ func (m *Manager) ReloadConfig() error {
 	return nil
 }
 
+// IsMaster returns the server mode
+func (m *Manager) IsMaster() bool {
+	return m.config.Server.IsMaster
+}
+
 // GetAuthConfig returns authentication configuration
 func (m *Manager) GetAuthConfig() types.AuthConfig {
 	return m.config.Auth
@@ -36,12 +36,8 @@ type groupManager interface {
 	Invalidate() error
 }
 
-type leaderLock interface {
-	IsLeader() bool
-}
-
 // Initialize initializes the SystemSettingsManager with database and store dependencies.
-func (sm *SystemSettingsManager) Initialize(store store.Store, gm groupManager, leaderLock leaderLock) error {
+func (sm *SystemSettingsManager) Initialize(store store.Store, gm groupManager, isMaster bool) error {
 	settingsLoader := func() (types.SystemSettings, error) {
 		var dbSettings []models.SystemSetting
 		if err := db.DB.Find(&dbSettings).Error; err != nil {
@@ -83,12 +79,10 @@ func (sm *SystemSettingsManager) Initialize(store store.Store, gm groupManager,
 	}
 
 	afterLoader := func(newData types.SystemSettings) {
-		if !leaderLock.IsLeader() {
+		if !isMaster {
 			return
 		}
-		if err := gm.Invalidate(); err != nil {
-			logrus.Debugf("Failed to invalidate group manager cache after settings update: %v", err)
-		}
+		gm.Invalidate()
 	}
 
 	syncer, err := syncer.NewCacheSyncer(
@@ -34,9 +34,6 @@ func BuildContainer() (*dig.Container, error) {
 	if err := container.Provide(store.NewStore); err != nil {
 		return nil, err
 	}
-	if err := container.Provide(store.NewLeaderLock); err != nil {
-		return nil, err
-	}
 	if err := container.Provide(httpclient.NewHTTPClientManager); err != nil {
 		return nil, err
 	}
@@ -4,7 +4,6 @@ import (
 	"context"
 	"gpt-load/internal/config"
 	"gpt-load/internal/models"
-	"gpt-load/internal/store"
 	"sync"
 	"sync/atomic"
 	"time"
@@ -18,7 +17,6 @@ type CronChecker struct {
 	DB              *gorm.DB
 	SettingsManager *config.SystemSettingsManager
 	Validator       *KeyValidator
-	LeaderLock      *store.LeaderLock
 	stopChan        chan struct{}
 	wg              sync.WaitGroup
 }
@@ -28,13 +26,11 @@ func NewCronChecker(
 	db *gorm.DB,
 	settingsManager *config.SystemSettingsManager,
 	validator *KeyValidator,
-	leaderLock *store.LeaderLock,
 ) *CronChecker {
 	return &CronChecker{
 		DB:              db,
 		SettingsManager: settingsManager,
 		Validator:       validator,
-		LeaderLock:      leaderLock,
 		stopChan:        make(chan struct{}),
 	}
}
@@ -68,9 +64,7 @@ func (s *CronChecker) Stop(ctx context.Context) {
 func (s *CronChecker) runLoop() {
 	defer s.wg.Done()
 
-	if s.LeaderLock.IsLeader() {
-		s.submitValidationJobs()
-	}
+	s.submitValidationJobs()
 
 	ticker := time.NewTicker(5 * time.Minute)
 	defer ticker.Stop()
@@ -78,12 +72,8 @@ func (s *CronChecker) runLoop() {
 	for {
 		select {
 		case <-ticker.C:
-			if s.LeaderLock.IsLeader() {
-				logrus.Debug("CronChecker: Running as leader, submitting validation jobs.")
-				s.submitValidationJobs()
-			} else {
-				logrus.Debug("CronChecker: Not the leader. Standing by.")
-			}
+			logrus.Debug("CronChecker: Running as Master, submitting validation jobs.")
+			s.submitValidationJobs()
 		case <-s.stopChan:
 			return
 		}
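
Note: the per-tick leadership check is gone entirely here; since CronChecker
is now started only on the master (see the Start() hunk above), the loop
submits validation jobs unconditionally on every tick.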
@@ -4,7 +4,6 @@ import (
 	"context"
 	"gpt-load/internal/config"
 	"gpt-load/internal/models"
-	"gpt-load/internal/store"
 	"sync"
 	"time"
 
@@ -16,17 +15,15 @@ import (
 type LogCleanupService struct {
 	db              *gorm.DB
 	settingsManager *config.SystemSettingsManager
-	leaderLock      *store.LeaderLock
 	stopCh          chan struct{}
 	wg              sync.WaitGroup
 }
 
 // NewLogCleanupService creates a new log cleanup service
-func NewLogCleanupService(db *gorm.DB, settingsManager *config.SystemSettingsManager, leaderLock *store.LeaderLock) *LogCleanupService {
+func NewLogCleanupService(db *gorm.DB, settingsManager *config.SystemSettingsManager) *LogCleanupService {
 	return &LogCleanupService{
 		db:              db,
 		settingsManager: settingsManager,
-		leaderLock:      leaderLock,
 		stopCh:          make(chan struct{}),
 	}
 }
@@ -77,11 +74,6 @@ func (s *LogCleanupService) run() {
 
 // cleanupExpiredLogs removes expired request logs
 func (s *LogCleanupService) cleanupExpiredLogs() {
-	if !s.leaderLock.IsLeader() {
-		logrus.Debug("Not the leader, skipping log cleanup.")
-		return
-	}
-
 	// Read the log-retention-days setting
 	settings := s.settingsManager.GetSettings()
 	retentionDays := settings.RequestLogRetentionDays
@@ -28,19 +28,17 @@ type RequestLogService struct {
 	db              *gorm.DB
 	store           store.Store
 	settingsManager *config.SystemSettingsManager
-	leaderLock      *store.LeaderLock
 	stopChan        chan struct{}
 	wg              sync.WaitGroup
 	ticker          *time.Ticker
 }
 
 // NewRequestLogService creates a new RequestLogService instance
-func NewRequestLogService(db *gorm.DB, store store.Store, sm *config.SystemSettingsManager, ls *store.LeaderLock) *RequestLogService {
+func NewRequestLogService(db *gorm.DB, store store.Store, sm *config.SystemSettingsManager) *RequestLogService {
 	return &RequestLogService{
 		db:              db,
 		store:           store,
 		settingsManager: sm,
-		leaderLock:      ls,
 		stopChan:        make(chan struct{}),
 	}
 }
@@ -133,12 +131,7 @@ func (s *RequestLogService) flush() {
 		return
 	}
 
-	if !s.leaderLock.IsLeader() {
-		logrus.Debug("Not a leader, skipping log flush.")
-		return
-	}
-
-	logrus.Debug("Leader starting to flush request logs...")
+	logrus.Debug("Master starting to flush request logs...")
 
 	for {
 		keys, err := s.store.SPopN(PendingLogKeysSet, DefaultLogFlushBatchSize)
@@ -1,248 +0,0 @@
-package store
-
-import (
-	"context"
-	"crypto/rand"
-	"encoding/hex"
-	"fmt"
-	"sync"
-	"sync/atomic"
-	"time"
-
-	"github.com/sirupsen/logrus"
-)
-
-const (
-	leaderLockKey         = "cluster:leader"
-	leaderLockTTL         = 30 * time.Second
-	leaderRenewalInterval = 10 * time.Second
-	initializingLockKey   = "cluster:initializing"
-	initializingLockTTL   = 5 * time.Minute
-)
-
-const renewLockScript = `
-if redis.call("get", KEYS[1]) == ARGV[1] then
-	return redis.call("expire", KEYS[1], ARGV[2])
-else
-	return 0
-end`
-
-const releaseLockScript = `
-if redis.call("get", KEYS[1]) == ARGV[1] then
-	return redis.call("del", KEYS[1])
-else
-	return 0
-end`
-
-// LeaderLock provides a mechanism for electing a single leader in a cluster.
-type LeaderLock struct {
-	store        Store
-	nodeID       string
-	isLeader     atomic.Bool
-	stopChan     chan struct{}
-	wg           sync.WaitGroup
-	isSingleNode bool
-}
-
-// NewLeaderLock creates a new LeaderLock.
-func NewLeaderLock(s Store) *LeaderLock {
-	_, isDistributed := s.(LuaScripter)
-	service := &LeaderLock{
-		store:        s,
-		nodeID:       generateNodeID(),
-		stopChan:     make(chan struct{}),
-		isSingleNode: !isDistributed,
-	}
-	if service.isSingleNode {
-		logrus.Debug("Running in single-node mode. Assuming leadership.")
-		service.isLeader.Store(true)
-	} else {
-		logrus.Debug("Running in distributed mode.")
-	}
-	return service
-}
-
-// Start performs an initial leader election and starts the background leadership maintenance loop.
-func (s *LeaderLock) Start() error {
-	if s.isSingleNode {
-		return nil
-	}
-
-	if err := s.tryToBeLeader(); err != nil {
-		return fmt.Errorf("initial leader election failed: %w", err)
-	}
-
-	s.wg.Add(1)
-	go s.maintainLeadershipLoop()
-
-	return nil
-}
-
-// Stop gracefully stops the leadership maintenance process.
-func (s *LeaderLock) Stop(ctx context.Context) {
-	if s.isSingleNode {
-		return
-	}
-	close(s.stopChan)
-
-	done := make(chan struct{})
-	go func() {
-		s.wg.Wait()
-		close(done)
-	}()
-
-	select {
-	case <-done:
-		logrus.Info("Leadership maintenance process stopped gracefully.")
-	case <-ctx.Done():
-		logrus.Warn("Leadership maintenance process stop timed out.")
-	}
-
-	if s.isLeader.Load() {
-		s.releaseLock()
-	}
-}
-
-// IsLeader returns true if the current node is the leader.
-func (s *LeaderLock) IsLeader() bool {
-	return s.isLeader.Load()
-}
-
-// AcquireInitializingLock sets a temporary lock to indicate that initialization is in progress.
-func (s *LeaderLock) AcquireInitializingLock() (bool, error) {
-	if !s.IsLeader() {
-		return false, nil
-	}
-	logrus.Debug("Leader acquiring initialization lock...")
-	return s.store.SetNX(initializingLockKey, []byte(s.nodeID), initializingLockTTL)
-}
-
-// ReleaseInitializingLock removes the initialization lock.
-func (s *LeaderLock) ReleaseInitializingLock() {
-	if !s.IsLeader() {
-		return
-	}
-	logrus.Debug("Leader releasing initialization lock...")
-	if err := s.store.Delete(initializingLockKey); err != nil {
-		logrus.WithError(err).Error("Failed to release initialization lock.")
-	}
-}
-
-// WaitForInitializationToComplete waits until the initialization lock is released.
-func (s *LeaderLock) WaitForInitializationToComplete() error {
-	if s.isSingleNode || s.IsLeader() {
-		return nil
-	}
-
-	logrus.Debug("Follower waiting for leader to complete initialization...")
-
-	time.Sleep(2 * time.Second)
-
-	// Use a context with timeout to prevent indefinite waiting.
-	ctx, cancel := context.WithTimeout(context.Background(), initializingLockTTL+1*time.Minute)
-	defer cancel()
-
-	ticker := time.NewTicker(2 * time.Second)
-	defer ticker.Stop()
-
-	// Initial check before starting the loop
-	exists, err := s.store.Exists(initializingLockKey)
-	if err != nil {
-		logrus.WithError(err).Warn("Initial check for initialization lock failed, will proceed to loop.")
-	} else if !exists {
-		logrus.Debug("Initialization lock not found on initial check. Assuming initialization is complete.")
-		return nil
-	}
-
-	for {
-		select {
-		case <-ctx.Done():
-			return fmt.Errorf("timed out waiting for leader initialization after %v", initializingLockTTL)
-		case <-ticker.C:
-			exists, err := s.store.Exists(initializingLockKey)
-			if err != nil {
-				logrus.WithError(err).Warn("Error checking initialization lock, will retry...")
-				continue
-			}
-			if !exists {
-				logrus.Debug("Initialization lock released. Follower proceeding with startup.")
-				return nil
-			}
-		}
-	}
-}
-
-// maintainLeadershipLoop is the background process that keeps trying to acquire or renew the lock.
-func (s *LeaderLock) maintainLeadershipLoop() {
-	defer s.wg.Done()
-	ticker := time.NewTicker(leaderRenewalInterval)
-	defer ticker.Stop()
-
-	logrus.Debug("Leadership maintenance loop started.")
-	for {
-		select {
-		case <-ticker.C:
-			if err := s.tryToBeLeader(); err != nil {
-				logrus.WithError(err).Warn("Error during leadership maintenance cycle.")
-			}
-		case <-s.stopChan:
-			return
-		}
-	}
-}
-
-// tryToBeLeader is an idempotent function that attempts to acquire or renew the lock.
-func (s *LeaderLock) tryToBeLeader() error {
-	if s.isLeader.Load() {
-		err := s.renewLock()
-		if err != nil {
-			logrus.WithError(err).Error("Failed to renew leader lock, relinquishing leadership.")
-			s.isLeader.Store(false)
-		}
-		return nil
-	}
-
-	acquired, err := s.acquireLock()
-	if err != nil {
-		return fmt.Errorf("failed to acquire lock: %w", err)
-	}
-	if acquired {
-		logrus.WithField("nodeID", s.nodeID).Info("Successfully acquired leadership.")
-		s.isLeader.Store(true)
-	}
-	return nil
-}
-
-func (s *LeaderLock) acquireLock() (bool, error) {
-	return s.store.SetNX(leaderLockKey, []byte(s.nodeID), leaderLockTTL)
-}
-
-func (s *LeaderLock) renewLock() error {
-	luaStore := s.store.(LuaScripter)
-	ttlSeconds := int(leaderLockTTL.Seconds())
-	res, err := luaStore.Eval(renewLockScript, []string{leaderLockKey}, s.nodeID, ttlSeconds)
-	if err != nil {
-		return err
-	}
-	if i, ok := res.(int64); !ok || i == 0 {
-		return fmt.Errorf("failed to renew lock, another node may have taken over")
-	}
-	return nil
-}
-
-func (s *LeaderLock) releaseLock() {
-	luaStore := s.store.(LuaScripter)
-	if _, err := luaStore.Eval(releaseLockScript, []string{leaderLockKey}, s.nodeID); err != nil {
-		logrus.WithError(err).Error("Failed to release leader lock on shutdown.")
-	} else {
-		logrus.Info("Successfully released leader lock.")
-	}
-}
-
-func generateNodeID() string {
-	bytes := make([]byte, 16)
-	if _, err := rand.Read(bytes); err != nil {
-		return "node-" + time.Now().Format(time.RFC3339Nano)
-	}
-	return hex.EncodeToString(bytes)
-}
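
Note: for reference, the file deleted above implemented election as follows:
each node generated a random ID and raced SetNX on cluster:leader with a
30-second TTL; the winner renewed the key every 10 seconds via the
compare-and-expire Lua script and released it on shutdown via
compare-and-delete, while a separate cluster:initializing key (5-minute TTL)
held followers back until the leader finished its one-time setup. All of that
runtime coordination is what the static IS_SLAVE role replaces.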
@@ -138,11 +138,6 @@ func (s *RedisStore) Pipeline() Pipeliner {
 	}
 }
 
-// Eval executes a Lua script on Redis.
-func (s *RedisStore) Eval(script string, keys []string, args ...any) (any, error) {
-	return s.client.Eval(context.Background(), script, keys, args...).Result()
-}
-
 // --- Pub/Sub operations ---
 
 // redisSubscription wraps the redis.PubSub to implement the Subscription interface.
@@ -74,8 +74,3 @@ type Pipeliner interface {
 type RedisPipeliner interface {
 	Pipeline() Pipeliner
 }
-
-// LuaScripter is an optional interface that a Store can implement to provide Lua script execution.
-type LuaScripter interface {
-	Eval(script string, keys []string, args ...any) (any, error)
-}
@@ -2,6 +2,7 @@ package types
 
 // ConfigManager defines the interface for configuration management
 type ConfigManager interface {
+	IsMaster() bool
 	GetAuthConfig() AuthConfig
 	GetCORSConfig() CORSConfig
 	GetPerformanceConfig() PerformanceConfig
@@ -41,6 +42,7 @@ type SystemSettings struct {
 type ServerConfig struct {
 	Port         int    `json:"port"`
 	Host         string `json:"host"`
+	IsMaster     bool   `json:"is_master"`
 	ReadTimeout  int    `json:"read_timeout"`
 	WriteTimeout int    `json:"write_timeout"`
 	IdleTimeout  int    `json:"idle_timeout"`