aether/cluster/distributed.go
Commit e9e50c021f by Hugo Nijhuis
Initial aether repository structure

Distributed actor system with event sourcing for Go:
- event.go - Event, ActorSnapshot, EventStore interface
- eventbus.go - EventBus, EventBroadcaster for pub/sub
- nats_eventbus.go - NATS-backed cross-node event broadcasting
- store/ - InMemoryEventStore (testing), JetStreamEventStore (production)
- cluster/ - Node discovery, leader election, shard distribution
- model/ - EventStorming model types

Extracted from arcadia as an open-source infrastructure component.

Co-Authored-By: Claude <noreply@anthropic.com>
Committed 2026-01-08 19:30:02 +01:00

package cluster

import (
	"context"
	"encoding/json"
	"fmt"
	"log"

	"github.com/nats-io/nats.go"
)
// DistributedVM manages a cluster of runtime nodes with VM-per-instance architecture
type DistributedVM struct {
	nodeID       string
	cluster      *ClusterManager
	localRuntime Runtime // Interface to avoid import cycles
	sharding     *ShardManager
	discovery    *NodeDiscovery
	natsConn     *nats.Conn
	ctx          context.Context
	cancel       context.CancelFunc
}

// Runtime interface to avoid import cycles with main aether package
type Runtime interface {
	Start() error
	LoadModel(model interface{}) error
	SendMessage(message interface{}) error
}
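
// A minimal no-op Runtime implementation might look like the following
// (illustrative sketch only; in practice the concrete runtime from the main
// aether package is what gets passed in):
//
//	type noopRuntime struct{}
//
//	func (noopRuntime) Start() error                          { return nil }
//	func (noopRuntime) LoadModel(model interface{}) error     { return nil }
//	func (noopRuntime) SendMessage(message interface{}) error { return nil }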

// DistributedVMRegistry implements VMRegistry using DistributedVM's local runtime and sharding
type DistributedVMRegistry struct {
	runtime  interface{} // Runtime interface to avoid import cycles
	sharding *ShardManager
}

// NewDistributedVM creates a distributed VM runtime cluster node
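//
// A minimal usage sketch (illustrative; the node ID, NATS URL, and runtime
// value are assumptions, not fixed by this package):
//
//	dvm, err := NewDistributedVM("node-1", []string{"nats://localhost:4222"}, myRuntime)
//	if err != nil {
//		log.Fatal(err)
//	}
//	if err := dvm.Start(); err != nil {
//		log.Fatal(err)
//	}
//	defer dvm.Stop()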
func NewDistributedVM(nodeID string, natsURLs []string, localRuntime Runtime) (*DistributedVM, error) {
	// Guard against an empty URL list, which would otherwise panic below.
	if len(natsURLs) == 0 {
		return nil, fmt.Errorf("at least one NATS URL is required")
	}

	ctx, cancel := context.WithCancel(context.Background())

	// Connect to NATS cluster
	natsURL := natsURLs[0] // Use first URL for simplicity
	natsConn, err := nats.Connect(natsURL,
		nats.Name(fmt.Sprintf("aether-runtime-%s", nodeID)))
	if err != nil {
		cancel()
		return nil, fmt.Errorf("failed to connect to NATS: %w", err)
	}

	// Create cluster components
	discovery := NewNodeDiscovery(nodeID, natsConn, ctx)
	sharding := NewShardManager(1024, 3) // 1024 shards, 3 replicas
	cluster, err := NewClusterManager(nodeID, natsConn, ctx)
	if err != nil {
		cancel()
		natsConn.Close()
		return nil, fmt.Errorf("failed to create cluster manager: %w", err)
	}

	dvm := &DistributedVM{
		nodeID:       nodeID,
		cluster:      cluster,
		localRuntime: localRuntime,
		sharding:     sharding,
		discovery:    discovery,
		natsConn:     natsConn,
		ctx:          ctx,
		cancel:       cancel,
	}

	// Create VM registry and connect it to cluster manager
	vmRegistry := &DistributedVMRegistry{
		runtime:  localRuntime,
		sharding: sharding,
	}
	cluster.SetVMRegistry(vmRegistry)

	return dvm, nil
}

// Start begins the distributed VM cluster node
func (dvm *DistributedVM) Start() error {
	// Start local runtime
	if err := dvm.localRuntime.Start(); err != nil {
		return fmt.Errorf("failed to start local runtime: %w", err)
	}

	// Start cluster services
	go dvm.discovery.Start()
	go dvm.cluster.Start()

	// Start message routing
	go dvm.startMessageRouting()

	return nil
}

// Stop gracefully shuts down the distributed VM node
func (dvm *DistributedVM) Stop() {
	dvm.cancel()
	dvm.cluster.Stop()
	dvm.discovery.Stop()
	dvm.natsConn.Close()
}

// LoadModel distributes EventStorming model across the cluster with VM templates
func (dvm *DistributedVM) LoadModel(model interface{}) error {
	// Load model locally first
	if err := dvm.localRuntime.LoadModel(model); err != nil {
		return fmt.Errorf("failed to load model locally: %w", err)
	}

	// Broadcast model to other cluster nodes
	msg := ClusterMessage{
		Type:    "load_model",
		From:    dvm.nodeID,
		To:      "broadcast",
		Payload: model,
	}
	return dvm.publishClusterMessage(msg)
}

// SendMessage routes messages across the distributed cluster
func (dvm *DistributedVM) SendMessage(message interface{}) error {
	// This is a simplified implementation
	// In practice, this would determine the target node based on sharding
	// and route the message appropriately
	return dvm.localRuntime.SendMessage(message)
}
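
// A fuller implementation might route roughly as follows (illustrative sketch;
// it assumes a target actor ID can be derived from the message, which the
// current Runtime interface does not define):
//
//	if dvm.IsLocalActor(actorID) {
//		return dvm.localRuntime.SendMessage(message)
//	}
//	return dvm.publishClusterMessage(ClusterMessage{
//		Type:    "route_message",
//		From:    dvm.nodeID,
//		To:      dvm.GetActorNode(actorID),
//		Payload: message,
//	})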

// GetActorNode determines which node should handle a specific actor
func (dvm *DistributedVM) GetActorNode(actorID string) string {
	// Use consistent hashing to determine the target node
	return dvm.cluster.hashRing.GetNode(actorID)
}

// IsLocalActor checks if an actor should be handled by this node
func (dvm *DistributedVM) IsLocalActor(actorID string) bool {
	targetNode := dvm.GetActorNode(actorID)
	return targetNode == dvm.nodeID
}

// GetActorsInShard returns actors that belong to a specific shard on this node
func (dvm *DistributedVM) GetActorsInShard(shardID int) []string {
	return dvm.cluster.GetActorsInShard(shardID)
}

// startMessageRouting begins routing messages between cluster nodes
func (dvm *DistributedVM) startMessageRouting() {
	// Subscribe to cluster messages
	if _, err := dvm.natsConn.Subscribe("aether.distributed.*", dvm.handleClusterMessage); err != nil {
		log.Printf("failed to subscribe to cluster messages: %v", err)
	}
}

// handleClusterMessage processes incoming cluster coordination messages
func (dvm *DistributedVM) handleClusterMessage(msg *nats.Msg) {
	var clusterMsg ClusterMessage
	if err := json.Unmarshal(msg.Data, &clusterMsg); err != nil {
		log.Printf("ignoring malformed cluster message: %v", err)
		return
	}

	switch clusterMsg.Type {
	case "load_model":
		// Handle model loading from other nodes
		if model := clusterMsg.Payload; model != nil {
			if err := dvm.localRuntime.LoadModel(model); err != nil {
				log.Printf("failed to load model from cluster: %v", err)
			}
		}
	case "route_message":
		// Handle message routing from other nodes
		if message := clusterMsg.Payload; message != nil {
			if err := dvm.localRuntime.SendMessage(message); err != nil {
				log.Printf("failed to deliver routed message: %v", err)
			}
		}
	case "rebalance":
		// Handle shard rebalancing requests
		dvm.handleRebalanceRequest(clusterMsg)
	}
}

// handleRebalanceRequest processes shard rebalancing requests
func (dvm *DistributedVM) handleRebalanceRequest(msg ClusterMessage) {
	// Simplified rebalancing logic
	// In practice, this would implement complex actor migration
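	//
	// A full implementation would likely need to work out which shards this node
	// gains or loses under the new assignment, snapshot or replay the affected
	// actors through the event store, and hand ownership to the target nodes
	// before resuming delivery. (Outline only; no migration protocol is
	// implemented here, and msg is currently unused.)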
}

// publishClusterMessage sends a message to other cluster nodes
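// For example, a message with Type "load_model" is published on the subject
// "aether.distributed.load_model", which is matched by the
// "aether.distributed.*" subscription set up in startMessageRouting.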
func (dvm *DistributedVM) publishClusterMessage(msg ClusterMessage) error {
	data, err := json.Marshal(msg)
	if err != nil {
		return err
	}

	subject := fmt.Sprintf("aether.distributed.%s", msg.Type)
	return dvm.natsConn.Publish(subject, data)
}

// GetClusterInfo returns information about the cluster state
func (dvm *DistributedVM) GetClusterInfo() map[string]interface{} {
	nodes := dvm.cluster.GetNodes()
	return map[string]interface{}{
		"nodeId":    dvm.nodeID,
		"isLeader":  dvm.cluster.IsLeader(),
		"leader":    dvm.cluster.GetLeader(),
		"nodeCount": len(nodes),
		"nodes":     nodes,
	}
}

// GetActiveVMs returns a map of active VMs (implementation depends on runtime)
func (dvr *DistributedVMRegistry) GetActiveVMs() map[string]interface{} {
	// This would need to access the actual runtime's VM registry
	// For now, return empty map to avoid import cycles
	return make(map[string]interface{})
}

// GetShard returns the shard number for the given actor ID
func (dvr *DistributedVMRegistry) GetShard(actorID string) int {
	return dvr.sharding.GetShard(actorID)
}