Initial aether repository structure
All checks were successful
CI / build (push) Successful in 1m13s

Distributed actor system with event sourcing for Go:
- event.go - Event, ActorSnapshot, EventStore interface
- eventbus.go - EventBus, EventBroadcaster for pub/sub
- nats_eventbus.go - NATS-backed cross-node event broadcasting
- store/ - InMemoryEventStore (testing), JetStreamEventStore (production)
- cluster/ - Node discovery, leader election, shard distribution
- model/ - EventStorming model types

Extracted from arcadia as open-source infrastructure component.

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
2026-01-08 19:30:02 +01:00
commit e9e50c021f
22 changed files with 2588 additions and 0 deletions

331
cluster/manager.go Normal file
View File

@@ -0,0 +1,331 @@
package cluster
import (
"context"
"encoding/json"
"fmt"
"log"
"os"
"sync"
"time"
"github.com/nats-io/nats.go"
)
// VMRegistry is the view of the local VM host that the cluster layer relies
// on. It is declared here as an interface so this package does not need to
// import the VM implementation.
type VMRegistry interface {
	// GetActiveVMs returns the locally active VMs keyed by string;
	// GetActorsInShard treats every key as an actor ID. The values are left
	// untyped (VirtualMachine interface) to avoid import cycles.
	GetActiveVMs() map[string]interface{}

	// GetShard returns the shard number the given actor ID maps to.
	GetShard(actorID string) int
}
// ClusterManager coordinates distributed VM operations across the cluster:
// it tracks known nodes, owns the shard map and hash ring, and participates
// in leader election over NATS.
type ClusterManager struct {
	// nodeID is the identifier passed to NewClusterManager for this node.
	nodeID string

	// nodes holds the known cluster members keyed by node ID. It is read and
	// written under mutex.
	nodes map[string]*NodeInfo
	// nodeUpdates is a buffered channel of membership changes.
	nodeUpdates chan NodeUpdate

	// shardMap and hashRing describe how shards are placed on nodes.
	shardMap *ShardMap
	hashRing *ConsistentHashRing

	// election provides leader election; IsLeader and GetLeader delegate to it.
	election *LeaderElection
	// natsConn is the connection used for cluster coordination messages.
	natsConn *nats.Conn

	// ctx stops the background loops (monitorNodes, rebalanceLoop) when done.
	ctx context.Context
	// mutex guards nodes and shardMap.
	mutex sync.RWMutex
	// logger writes messages prefixed with this node's ID.
	logger *log.Logger

	vmRegistry VMRegistry // Interface to access local VMs; nil until SetVMRegistry is called
}
// NewClusterManager builds the cluster coordination manager for nodeID.
//
// The manager starts with empty node and shard tables, a fresh consistent
// hash ring and a logger that writes to stdout with a "[ClusterMgr <nodeID>] "
// prefix. The VM registry is left unset; supply it later via SetVMRegistry.
// A leader election is created on natsConn with callbacks that only log
// leadership transitions.
//
// It returns an error (wrapping the cause) when the leader election cannot
// be created; in that case the returned manager is nil.
func NewClusterManager(nodeID string, natsConn *nats.Conn, ctx context.Context) (*ClusterManager, error) {
	cm := &ClusterManager{
		nodeID:      nodeID,
		nodes:       make(map[string]*NodeInfo),
		nodeUpdates: make(chan NodeUpdate, 100),
		shardMap: &ShardMap{
			Shards: make(map[int][]string),
			Nodes:  make(map[string]NodeInfo),
		},
		hashRing: NewConsistentHashRing(),
		natsConn: natsConn,
		ctx:      ctx,
		logger:   log.New(os.Stdout, fmt.Sprintf("[ClusterMgr %s] ", nodeID), log.LstdFlags),
	}

	// The callbacks close over cm so leadership changes show up in its log.
	var err error
	cm.election, err = NewLeaderElection(nodeID, natsConn, LeaderElectionCallbacks{
		OnBecameLeader: func() {
			cm.logger.Printf("👑 This node became the cluster leader - can initiate rebalancing")
		},
		OnLostLeader: func() {
			cm.logger.Printf("📉 This node lost cluster leadership")
		},
		OnNewLeader: func(leaderID string) {
			cm.logger.Printf("🔄 Cluster leadership changed to: %s", leaderID)
		},
	})
	if err != nil {
		return nil, fmt.Errorf("failed to create leader election: %w", err)
	}

	return cm, nil
}
// Start begins cluster management operations: it starts leader election,
// subscribes to the "aether.cluster.*" subjects and launches the node
// monitoring and rebalancing loops in background goroutines.
//
// Start has no error return, so a failed subscription is logged rather than
// dropped silently; the background loops are still started in that case.
func (cm *ClusterManager) Start() {
	cm.logger.Printf("🚀 Starting cluster manager")

	// Start leader election
	cm.election.Start()

	// Subscribe to cluster messages. Without this subscription the node never
	// sees coordination messages, so make the failure visible.
	if _, err := cm.natsConn.Subscribe("aether.cluster.*", cm.handleClusterMessage); err != nil {
		cm.logger.Printf("⚠️ Failed to subscribe to cluster messages: %v", err)
	}

	// Start node monitoring
	go cm.monitorNodes()

	// Start shard rebalancing (only acts when this node is leader)
	go cm.rebalanceLoop()
}
// Stop logs the shutdown and stops the leader election when one is
// configured. It does nothing else if the manager has no election.
func (cm *ClusterManager) Stop() {
	cm.logger.Printf("🛑 Stopping cluster manager")

	election := cm.election
	if election == nil {
		return
	}
	election.Stop()
}
// IsLeader reports whether this node currently holds cluster leadership.
// A manager without a leader election is never the leader.
func (cm *ClusterManager) IsLeader() bool {
	return cm.election != nil && cm.election.IsLeader()
}
// GetLeader returns the ID of the current cluster leader as reported by the
// leader election, or the empty string when no election is configured.
func (cm *ClusterManager) GetLeader() string {
	if election := cm.election; election != nil {
		return election.GetLeader()
	}
	return ""
}
// SetVMRegistry installs the registry used to look up local VMs.
// Until it is called, GetActorsInShard reports no actors. The assignment is
// not synchronized with readers of the registry.
func (cm *ClusterManager) SetVMRegistry(registry VMRegistry) {
	cm.vmRegistry = registry
}
// GetActorsInShard returns the IDs of the locally active actors that the VM
// registry maps to shardID.
//
// The result is always a non-nil slice: it is empty when no registry has
// been set or when no active actor belongs to the shard, so both "nothing
// found" cases look the same to callers (and encode the same way, e.g. as
// [] in JSON). The order of the returned IDs is unspecified because it
// follows map iteration order.
func (cm *ClusterManager) GetActorsInShard(shardID int) []string {
	actors := []string{}
	if cm.vmRegistry == nil {
		return actors
	}

	for actorID := range cm.vmRegistry.GetActiveVMs() {
		if cm.vmRegistry.GetShard(actorID) == shardID {
			actors = append(actors, actorID)
		}
	}
	return actors
}
// handleClusterMessage decodes an incoming cluster coordination message and
// dispatches it by type ("rebalance", "migrate" or "node_update"). Messages
// that cannot be decoded, or whose type is unknown, are logged and dropped.
func (cm *ClusterManager) handleClusterMessage(msg *nats.Msg) {
	var clusterMsg ClusterMessage
	if err := json.Unmarshal(msg.Data, &clusterMsg); err != nil {
		cm.logger.Printf("⚠️ Invalid cluster message: %v", err)
		return
	}

	switch clusterMsg.Type {
	case "rebalance":
		cm.handleRebalanceRequest(clusterMsg)
	case "migrate":
		cm.handleMigrationRequest(clusterMsg)
	case "node_update":
		// After json.Unmarshal an interface-typed payload holds generic JSON
		// values (a map for a JSON object), never a NodeUpdate, so a plain
		// type assertion would drop every update. Re-encode the payload and
		// decode it into the concrete type instead.
		update, ok := clusterMsg.Payload.(NodeUpdate)
		if !ok {
			raw, err := json.Marshal(clusterMsg.Payload)
			if err == nil {
				err = json.Unmarshal(raw, &update)
			}
			if err != nil {
				cm.logger.Printf("⚠️ Invalid node_update payload: %v", err)
				return
			}
		}
		cm.handleNodeUpdate(update)
	default:
		cm.logger.Printf("⚠️ Unknown cluster message type: %s", clusterMsg.Type)
	}
}

// handleNodeUpdate applies a node status update to the membership table and
// hash ring, marks nodes not seen for more than 90 seconds as failed and,
// when this node is the leader and at least one node is active, triggers
// shard rebalancing. Updates that carry no node are logged and ignored.
func (cm *ClusterManager) handleNodeUpdate(update NodeUpdate) {
	if update.Node == nil {
		cm.logger.Printf("⚠️ Ignoring node update without node info")
		return
	}

	activeNodeCount := cm.applyNodeUpdate(update)

	// Rebalancing takes cm.mutex itself, and sync.RWMutex is not reentrant,
	// so it must only be triggered after applyNodeUpdate released the lock.
	// Simplified trigger: any topology update with at least one active node.
	if activeNodeCount > 0 && cm.IsLeader() {
		cm.triggerShardRebalancing("node topology changed")
	}
}

// applyNodeUpdate performs the state changes for handleNodeUpdate under the
// write lock and returns the number of nodes that are active afterwards.
// update.Node must be non-nil.
func (cm *ClusterManager) applyNodeUpdate(update NodeUpdate) int {
	cm.mutex.Lock()
	defer cm.mutex.Unlock()

	switch update.Type {
	case NodeJoined:
		cm.nodes[update.Node.ID] = update.Node
		cm.hashRing.AddNode(update.Node.ID)
		cm.logger.Printf(" Node joined: %s", update.Node.ID)
	case NodeLeft:
		delete(cm.nodes, update.Node.ID)
		cm.hashRing.RemoveNode(update.Node.ID)
		cm.logger.Printf(" Node left: %s", update.Node.ID)
	case NodeUpdated:
		if node, exists := cm.nodes[update.Node.ID]; exists {
			// Overwrite in place so existing pointers observe the new info.
			*node = *update.Node
		} else {
			// First time we hear about this node: treat it as a join.
			cm.nodes[update.Node.ID] = update.Node
			cm.hashRing.AddNode(update.Node.ID)
		}
	}

	// Mark stale nodes as failed and count the ones that remain active.
	now := time.Now()
	activeNodeCount := 0
	for _, node := range cm.nodes {
		if now.Sub(node.LastSeen) > 90*time.Second && node.Status != NodeStatusFailed {
			node.Status = NodeStatusFailed
			cm.logger.Printf("❌ Node marked as failed: %s (last seen: %s)",
				node.ID, node.LastSeen.Format(time.RFC3339))
		}
		if node.Status == NodeStatusActive {
			activeNodeCount++
		}
	}
	return activeNodeCount
}
// handleRebalanceRequest handles a cluster rebalancing request. In this
// simplified version it only logs who sent the request; the actual
// rebalancing logic is not implemented here.
func (cm *ClusterManager) handleRebalanceRequest(msg ClusterMessage) {
	sender := msg.From
	cm.logger.Printf("🔄 Handling rebalance request from %s", sender)
}
// handleMigrationRequest handles an actor migration request. In this
// simplified version it only logs who sent the request; the actual
// migration logic is not implemented here.
func (cm *ClusterManager) handleMigrationRequest(msg ClusterMessage) {
	sender := msg.From
	cm.logger.Printf("🚚 Handling migration request from %s", sender)
}
// triggerShardRebalancing starts a shard rebalancing round for the given
// reason. It is a no-op on non-leader nodes. The set of active nodes is
// collected under the read lock, so callers must not hold cm.mutex. The
// rebalancing itself is not implemented yet: the function only logs how many
// active nodes it would rebalance across, or that none are available.
func (cm *ClusterManager) triggerShardRebalancing(reason string) {
	// Only the leader may initiate rebalancing.
	if leader := cm.IsLeader(); !leader {
		return
	}

	cm.logger.Printf("⚖️ Triggering shard rebalancing: %s", reason)

	// Collect the active nodes while holding the read lock.
	var activeNodes []*NodeInfo
	cm.mutex.RLock()
	for _, candidate := range cm.nodes {
		if candidate.Status != NodeStatusActive {
			continue
		}
		activeNodes = append(activeNodes, candidate)
	}
	cm.mutex.RUnlock()

	switch count := len(activeNodes); count {
	case 0:
		cm.logger.Printf("⚠️ No active nodes available for rebalancing")
	default:
		// Placeholder for the actual rebalancing logic.
		cm.logger.Printf("🎯 Would rebalance across %d active nodes", count)
	}
}
// monitorNodes runs checkNodeHealth every 30 seconds until the manager's
// context is done. It blocks, so Start runs it in its own goroutine.
func (cm *ClusterManager) monitorNodes() {
	const interval = 30 * time.Second

	healthTick := time.NewTicker(interval)
	defer healthTick.Stop()

	for {
		select {
		case <-cm.ctx.Done():
			return
		case <-healthTick.C:
			cm.checkNodeHealth()
		}
	}
}
// checkNodeHealth marks every active node that has not been seen for more
// than 90 seconds as failed and logs the transition. Nodes in any other
// status are left untouched. It holds the write lock for the whole scan.
func (cm *ClusterManager) checkNodeHealth() {
	const staleAfter = 90 * time.Second

	cm.mutex.Lock()
	defer cm.mutex.Unlock()

	now := time.Now()
	for _, candidate := range cm.nodes {
		age := now.Sub(candidate.LastSeen)
		if age <= staleAfter || candidate.Status != NodeStatusActive {
			continue
		}
		candidate.Status = NodeStatusFailed
		cm.logger.Printf("💔 Node failed: %s", candidate.ID)
	}
}
// rebalanceLoop wakes up every 5 minutes and, if this node is currently the
// leader, triggers a periodic shard rebalancing check. It returns once the
// manager's context is done, so Start runs it in its own goroutine.
func (cm *ClusterManager) rebalanceLoop() {
	const checkEvery = 5 * time.Minute

	periodic := time.NewTicker(checkEvery)
	defer periodic.Stop()

	for {
		select {
		case <-cm.ctx.Done():
			return
		case <-periodic.C:
			if !cm.IsLeader() {
				continue
			}
			cm.triggerShardRebalancing("periodic rebalance check")
		}
	}
}
// GetNodes returns a snapshot of the known cluster nodes keyed by node ID.
// Every NodeInfo is copied by value, so assigning to fields of the returned
// entries does not change the manager's own records.
func (cm *ClusterManager) GetNodes() map[string]*NodeInfo {
	cm.mutex.RLock()
	defer cm.mutex.RUnlock()

	snapshot := make(map[string]*NodeInfo, len(cm.nodes))
	for nodeID := range cm.nodes {
		clone := new(NodeInfo)
		*clone = *cm.nodes[nodeID]
		snapshot[nodeID] = clone
	}
	return snapshot
}
// GetShardMap returns a copy of the current shard mapping.
//
// Version and UpdateTime are copied as-is; the Shards and Nodes maps are
// rebuilt with their entries, and each shard's node list is copied, so
// callers can modify the returned maps and slices without affecting the
// manager. NodeInfo values are copied by value only (NOTE(review): if
// NodeInfo holds maps or slices, those are still shared — confirm).
func (cm *ClusterManager) GetShardMap() *ShardMap {
	cm.mutex.RLock()
	defer cm.mutex.RUnlock()

	src := cm.shardMap
	snapshot := &ShardMap{
		Version:    src.Version,
		Shards:     make(map[int][]string, len(src.Shards)),
		Nodes:      make(map[string]NodeInfo, len(src.Nodes)),
		UpdateTime: src.UpdateTime,
	}
	for shardID, owners := range src.Shards {
		// Detach the slice so appends or writes by the caller cannot reach
		// the manager's backing array.
		snapshot.Shards[shardID] = append([]string(nil), owners...)
	}
	for nodeID, info := range src.Nodes {
		snapshot.Nodes[nodeID] = info
	}
	return snapshot
}