Initial aether repository structure
All checks were successful
CI / build (push) Successful in 1m13s
All checks were successful
CI / build (push) Successful in 1m13s
Distributed actor system with event sourcing for Go: - event.go - Event, ActorSnapshot, EventStore interface - eventbus.go - EventBus, EventBroadcaster for pub/sub - nats_eventbus.go - NATS-backed cross-node event broadcasting - store/ - InMemoryEventStore (testing), JetStreamEventStore (production) - cluster/ - Node discovery, leader election, shard distribution - model/ - EventStorming model types Extracted from arcadia as open-source infrastructure component. Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
331
cluster/manager.go
Normal file
331
cluster/manager.go
Normal file
@@ -0,0 +1,331 @@
|
||||
package cluster
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/nats-io/nats.go"
|
||||
)
|
||||
|
||||
// VMRegistry provides access to local VM information for cluster operations
|
||||
type VMRegistry interface {
|
||||
GetActiveVMs() map[string]interface{} // VirtualMachine interface to avoid import cycles
|
||||
GetShard(actorID string) int
|
||||
}
|
||||
|
||||
// ClusterManager coordinates distributed VM operations across the cluster
|
||||
type ClusterManager struct {
|
||||
nodeID string
|
||||
nodes map[string]*NodeInfo
|
||||
nodeUpdates chan NodeUpdate
|
||||
shardMap *ShardMap
|
||||
hashRing *ConsistentHashRing
|
||||
election *LeaderElection
|
||||
natsConn *nats.Conn
|
||||
ctx context.Context
|
||||
mutex sync.RWMutex
|
||||
logger *log.Logger
|
||||
vmRegistry VMRegistry // Interface to access local VMs
|
||||
}
|
||||
|
||||
// NewClusterManager creates a cluster coordination manager
|
||||
func NewClusterManager(nodeID string, natsConn *nats.Conn, ctx context.Context) (*ClusterManager, error) {
|
||||
cm := &ClusterManager{
|
||||
nodeID: nodeID,
|
||||
nodes: make(map[string]*NodeInfo),
|
||||
nodeUpdates: make(chan NodeUpdate, 100),
|
||||
shardMap: &ShardMap{Shards: make(map[int][]string), Nodes: make(map[string]NodeInfo)},
|
||||
hashRing: NewConsistentHashRing(),
|
||||
natsConn: natsConn,
|
||||
ctx: ctx,
|
||||
logger: log.New(os.Stdout, fmt.Sprintf("[ClusterMgr %s] ", nodeID), log.LstdFlags),
|
||||
vmRegistry: nil, // Will be set later via SetVMRegistry
|
||||
}
|
||||
|
||||
// Create leadership election with callbacks
|
||||
callbacks := LeaderElectionCallbacks{
|
||||
OnBecameLeader: func() {
|
||||
cm.logger.Printf("👑 This node became the cluster leader - can initiate rebalancing")
|
||||
},
|
||||
OnLostLeader: func() {
|
||||
cm.logger.Printf("📉 This node lost cluster leadership")
|
||||
},
|
||||
OnNewLeader: func(leaderID string) {
|
||||
cm.logger.Printf("🔄 Cluster leadership changed to: %s", leaderID)
|
||||
},
|
||||
}
|
||||
|
||||
election, err := NewLeaderElection(nodeID, natsConn, callbacks)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create leader election: %w", err)
|
||||
}
|
||||
|
||||
cm.election = election
|
||||
return cm, nil
|
||||
}
|
||||
|
||||
// Start begins cluster management operations
|
||||
func (cm *ClusterManager) Start() {
|
||||
cm.logger.Printf("🚀 Starting cluster manager")
|
||||
|
||||
// Start leader election
|
||||
cm.election.Start()
|
||||
|
||||
// Subscribe to cluster messages
|
||||
cm.natsConn.Subscribe("aether.cluster.*", cm.handleClusterMessage)
|
||||
|
||||
// Start node monitoring
|
||||
go cm.monitorNodes()
|
||||
|
||||
// Start shard rebalancing (only if leader)
|
||||
go cm.rebalanceLoop()
|
||||
}
|
||||
|
||||
// Stop gracefully stops the cluster manager
|
||||
func (cm *ClusterManager) Stop() {
|
||||
cm.logger.Printf("🛑 Stopping cluster manager")
|
||||
|
||||
if cm.election != nil {
|
||||
cm.election.Stop()
|
||||
}
|
||||
}
|
||||
|
||||
// IsLeader returns whether this node is the cluster leader
|
||||
func (cm *ClusterManager) IsLeader() bool {
|
||||
if cm.election == nil {
|
||||
return false
|
||||
}
|
||||
return cm.election.IsLeader()
|
||||
}
|
||||
|
||||
// GetLeader returns the current cluster leader ID
|
||||
func (cm *ClusterManager) GetLeader() string {
|
||||
if cm.election == nil {
|
||||
return ""
|
||||
}
|
||||
return cm.election.GetLeader()
|
||||
}
|
||||
|
||||
// SetVMRegistry sets the VM registry for accessing local VM information
|
||||
func (cm *ClusterManager) SetVMRegistry(registry VMRegistry) {
|
||||
cm.vmRegistry = registry
|
||||
}
|
||||
|
||||
// GetActorsInShard returns actors that belong to a specific shard on this node
|
||||
func (cm *ClusterManager) GetActorsInShard(shardID int) []string {
|
||||
if cm.vmRegistry == nil {
|
||||
return []string{}
|
||||
}
|
||||
|
||||
activeVMs := cm.vmRegistry.GetActiveVMs()
|
||||
var actors []string
|
||||
|
||||
for actorID := range activeVMs {
|
||||
if cm.vmRegistry.GetShard(actorID) == shardID {
|
||||
actors = append(actors, actorID)
|
||||
}
|
||||
}
|
||||
|
||||
return actors
|
||||
}
|
||||
|
||||
// handleClusterMessage processes incoming cluster coordination messages
|
||||
func (cm *ClusterManager) handleClusterMessage(msg *nats.Msg) {
|
||||
var clusterMsg ClusterMessage
|
||||
if err := json.Unmarshal(msg.Data, &clusterMsg); err != nil {
|
||||
cm.logger.Printf("⚠️ Invalid cluster message: %v", err)
|
||||
return
|
||||
}
|
||||
|
||||
switch clusterMsg.Type {
|
||||
case "rebalance":
|
||||
cm.handleRebalanceRequest(clusterMsg)
|
||||
case "migrate":
|
||||
cm.handleMigrationRequest(clusterMsg)
|
||||
case "node_update":
|
||||
if update, ok := clusterMsg.Payload.(NodeUpdate); ok {
|
||||
cm.handleNodeUpdate(update)
|
||||
}
|
||||
default:
|
||||
cm.logger.Printf("⚠️ Unknown cluster message type: %s", clusterMsg.Type)
|
||||
}
|
||||
}
|
||||
|
||||
// handleNodeUpdate processes node status updates
|
||||
func (cm *ClusterManager) handleNodeUpdate(update NodeUpdate) {
|
||||
cm.mutex.Lock()
|
||||
defer cm.mutex.Unlock()
|
||||
|
||||
switch update.Type {
|
||||
case NodeJoined:
|
||||
cm.nodes[update.Node.ID] = update.Node
|
||||
cm.hashRing.AddNode(update.Node.ID)
|
||||
cm.logger.Printf("➕ Node joined: %s", update.Node.ID)
|
||||
|
||||
case NodeLeft:
|
||||
delete(cm.nodes, update.Node.ID)
|
||||
cm.hashRing.RemoveNode(update.Node.ID)
|
||||
cm.logger.Printf("➖ Node left: %s", update.Node.ID)
|
||||
|
||||
case NodeUpdated:
|
||||
if node, exists := cm.nodes[update.Node.ID]; exists {
|
||||
// Update existing node info
|
||||
*node = *update.Node
|
||||
} else {
|
||||
// New node
|
||||
cm.nodes[update.Node.ID] = update.Node
|
||||
cm.hashRing.AddNode(update.Node.ID)
|
||||
}
|
||||
}
|
||||
|
||||
// Check for failed nodes and mark them
|
||||
now := time.Now()
|
||||
for _, node := range cm.nodes {
|
||||
if now.Sub(node.LastSeen) > 90*time.Second && node.Status != NodeStatusFailed {
|
||||
node.Status = NodeStatusFailed
|
||||
cm.logger.Printf("❌ Node marked as failed: %s (last seen: %s)",
|
||||
node.ID, node.LastSeen.Format(time.RFC3339))
|
||||
}
|
||||
}
|
||||
|
||||
// Trigger rebalancing if we're the leader and there are significant changes
|
||||
if cm.IsLeader() {
|
||||
activeNodeCount := 0
|
||||
for _, node := range cm.nodes {
|
||||
if node.Status == NodeStatusActive {
|
||||
activeNodeCount++
|
||||
}
|
||||
}
|
||||
|
||||
// Simple trigger: rebalance if we have different number of active nodes
|
||||
// than shards assigned (this is a simplified logic)
|
||||
if activeNodeCount > 0 {
|
||||
cm.triggerShardRebalancing("node topology changed")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// handleRebalanceRequest processes cluster rebalancing requests
|
||||
func (cm *ClusterManager) handleRebalanceRequest(msg ClusterMessage) {
|
||||
cm.logger.Printf("🔄 Handling rebalance request from %s", msg.From)
|
||||
|
||||
// Implementation would handle the specific rebalancing logic
|
||||
// This is a simplified version
|
||||
}
|
||||
|
||||
// handleMigrationRequest processes actor migration requests
|
||||
func (cm *ClusterManager) handleMigrationRequest(msg ClusterMessage) {
|
||||
cm.logger.Printf("🚚 Handling migration request from %s", msg.From)
|
||||
|
||||
// Implementation would handle the specific migration logic
|
||||
// This is a simplified version
|
||||
}
|
||||
|
||||
// triggerShardRebalancing initiates shard rebalancing across the cluster
|
||||
func (cm *ClusterManager) triggerShardRebalancing(reason string) {
|
||||
if !cm.IsLeader() {
|
||||
return // Only leader can initiate rebalancing
|
||||
}
|
||||
|
||||
cm.logger.Printf("⚖️ Triggering shard rebalancing: %s", reason)
|
||||
|
||||
// Get active nodes
|
||||
var activeNodes []*NodeInfo
|
||||
cm.mutex.RLock()
|
||||
for _, node := range cm.nodes {
|
||||
if node.Status == NodeStatusActive {
|
||||
activeNodes = append(activeNodes, node)
|
||||
}
|
||||
}
|
||||
cm.mutex.RUnlock()
|
||||
|
||||
if len(activeNodes) == 0 {
|
||||
cm.logger.Printf("⚠️ No active nodes available for rebalancing")
|
||||
return
|
||||
}
|
||||
|
||||
// This would implement the actual rebalancing logic
|
||||
cm.logger.Printf("🎯 Would rebalance across %d active nodes", len(activeNodes))
|
||||
}
|
||||
|
||||
// monitorNodes periodically checks node health and updates
|
||||
func (cm *ClusterManager) monitorNodes() {
|
||||
ticker := time.NewTicker(30 * time.Second)
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ticker.C:
|
||||
// Health check logic would go here
|
||||
cm.checkNodeHealth()
|
||||
|
||||
case <-cm.ctx.Done():
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// checkNodeHealth verifies the health of known nodes
|
||||
func (cm *ClusterManager) checkNodeHealth() {
|
||||
cm.mutex.Lock()
|
||||
defer cm.mutex.Unlock()
|
||||
|
||||
now := time.Now()
|
||||
for _, node := range cm.nodes {
|
||||
if now.Sub(node.LastSeen) > 90*time.Second && node.Status == NodeStatusActive {
|
||||
node.Status = NodeStatusFailed
|
||||
cm.logger.Printf("💔 Node failed: %s", node.ID)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// rebalanceLoop runs periodic rebalancing checks (leader only)
|
||||
func (cm *ClusterManager) rebalanceLoop() {
|
||||
ticker := time.NewTicker(5 * time.Minute)
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ticker.C:
|
||||
if cm.IsLeader() {
|
||||
cm.triggerShardRebalancing("periodic rebalance check")
|
||||
}
|
||||
|
||||
case <-cm.ctx.Done():
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// GetNodes returns a copy of the current cluster nodes
|
||||
func (cm *ClusterManager) GetNodes() map[string]*NodeInfo {
|
||||
cm.mutex.RLock()
|
||||
defer cm.mutex.RUnlock()
|
||||
|
||||
nodes := make(map[string]*NodeInfo)
|
||||
for id, node := range cm.nodes {
|
||||
// Create a copy to prevent external mutation
|
||||
nodeCopy := *node
|
||||
nodes[id] = &nodeCopy
|
||||
}
|
||||
return nodes
|
||||
}
|
||||
|
||||
// GetShardMap returns the current shard mapping
|
||||
func (cm *ClusterManager) GetShardMap() *ShardMap {
|
||||
cm.mutex.RLock()
|
||||
defer cm.mutex.RUnlock()
|
||||
|
||||
// Return a copy to prevent external mutation
|
||||
return &ShardMap{
|
||||
Version: cm.shardMap.Version,
|
||||
Shards: make(map[int][]string),
|
||||
Nodes: make(map[string]NodeInfo),
|
||||
UpdateTime: cm.shardMap.UpdateTime,
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user