DDD Patterns: Intended vs Actual Code

This document shows side-by-side comparisons of the Cluster Coordination context as it exists today and as it should look once proper DDD patterns are applied.


Pattern 1: Commands vs Message Handlers

Current (Anemic)

// File: cluster/manager.go, line 141
func (cm *ClusterManager) handleClusterMessage(msg *nats.Msg) {
    var clusterMsg ClusterMessage
    if err := json.Unmarshal(msg.Data, &clusterMsg); err != nil {
        cm.logger.Printf("Invalid cluster message: %v", err)
        return
    }

    switch clusterMsg.Type {
    case "rebalance":
        cm.handleRebalanceRequest(clusterMsg)
    case "migrate":
        cm.handleMigrationRequest(clusterMsg)
    case "node_update":
        if update, ok := clusterMsg.Payload.(NodeUpdate); ok {
            cm.handleNodeUpdate(update)
        }
    default:
        cm.logger.Printf("Unknown cluster message type: %s", clusterMsg.Type)
    }
}

// File: cluster/manager.go, line 163
func (cm *ClusterManager) handleNodeUpdate(update NodeUpdate) {
    cm.mutex.Lock()
    defer cm.mutex.Unlock()

    switch update.Type {
    case NodeJoined:
        cm.nodes[update.Node.ID] = update.Node
        cm.hashRing.AddNode(update.Node.ID)
        cm.logger.Printf("Node joined: %s", update.Node.ID)
    // ... (more cases)
    }
}

Problems:

  • Generic message dispatch; unclear intent
  • No explicit "command" concept
  • No validation before state change
  • No events published
  • Tightly coupled to NATS message format

Intended (DDD)

// File: cluster/manager.go

// JoinCluster adds a node to the cluster (command)
func (cm *ClusterManager) JoinCluster(nodeInfo *NodeInfo) error {
    // Validate preconditions
    if nodeInfo.ID == "" {
        return fmt.Errorf("node ID cannot be empty")
    }
    if nodeInfo.Capacity <= 0 {
        return fmt.Errorf("node capacity must be > 0")
    }

    cm.mutex.Lock()
    defer cm.mutex.Unlock()

    // Check duplicate
    if _, exists := cm.nodes[nodeInfo.ID]; exists {
        return fmt.Errorf("node already exists: %s", nodeInfo.ID)
    }

    // Execute command
    cm.nodes[nodeInfo.ID] = nodeInfo
    cm.hashRing.AddNode(nodeInfo.ID)

    // Publish event
    event := &NodeJoined{
        NodeID:    nodeInfo.ID,
        Address:   nodeInfo.Address,
        Port:      nodeInfo.Port,
        Capacity:  nodeInfo.Capacity,
        Timestamp: time.Now(),
    }
    if err := cm.eventPublisher.Publish(event); err != nil {
        cm.logger.Printf("Failed to publish NodeJoined event: %v", err)
        // Decide: rollback or proceed? (Usually proceed for CQRS)
    }

    // Trigger rebalancing if leader
    if cm.IsLeader() {
        go cm.triggerShardRebalancing("node_joined")
    }

    return nil
}

// MarkNodeFailed marks a node as failed (command)
func (cm *ClusterManager) MarkNodeFailed(nodeID string, reason string) error {
    // Validate
    cm.mutex.Lock()
    node, exists := cm.nodes[nodeID]
    if !exists {
        cm.mutex.Unlock()
        return fmt.Errorf("node not found: %s", nodeID)
    }
    if node.Status == NodeStatusFailed {
        cm.mutex.Unlock()
        return fmt.Errorf("node already failed: %s", nodeID)
    }

    // Execute command
    node.Status = NodeStatusFailed
    cm.hashRing.RemoveNode(nodeID)

    // Publish event
    event := &NodeFailed{
        NodeID:    nodeID,
        Reason:    reason,
        Timestamp: time.Now(),
    }
    if err := cm.eventPublisher.Publish(event); err != nil {
        cm.logger.Printf("Failed to publish NodeFailed event: %v", err)
    }

    isLeader := cm.IsLeader()
    cm.mutex.Unlock()

    // Trigger rebalancing if leader
    if isLeader {
        go cm.triggerShardRebalancing("node_failed")
    }

    return nil
}

Benefits:

  • Explicit command methods with clear intent
  • Validation before execution
  • Atomic operations (lock held throughout)
  • Events published on success
  • Decoupled from message format (see the adapter sketch below)
  • Testable in isolation
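
With the command methods above in place, the existing NATS subscription shrinks to a thin adapter that only parses the wire format and delegates. A minimal sketch, reusing the ClusterMessage and NodeUpdate types from the current code (dispatch on update.Type and the rebalance/migrate cases are elided; the payload handling mirrors the current type assertion):

// File: cluster/manager.go (adapter layer, illustrative sketch)

// handleClusterMessage translates transport messages into explicit commands.
// The command methods never see nats.Msg or JSON.
func (cm *ClusterManager) handleClusterMessage(msg *nats.Msg) {
    var clusterMsg ClusterMessage
    if err := json.Unmarshal(msg.Data, &clusterMsg); err != nil {
        cm.logger.Printf("Invalid cluster message: %v", err)
        return
    }

    switch clusterMsg.Type {
    case "node_update":
        update, ok := clusterMsg.Payload.(NodeUpdate)
        if !ok {
            cm.logger.Printf("Unexpected node_update payload: %T", clusterMsg.Payload)
            return
        }
        // Joins delegate to JoinCluster; failures and removals would map to
        // MarkNodeFailed and similar commands in the same way.
        if err := cm.JoinCluster(update.Node); err != nil {
            cm.logger.Printf("JoinCluster rejected: %v", err)
        }
    default:
        cm.logger.Printf("Unknown cluster message type: %s", clusterMsg.Type)
    }
}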

Pattern 2: Value Objects vs Primitives

Current (Scattered Types)

// File: cluster/types.go, line 58
type NodeInfo struct {
    ID        string            `json:"id"`
    Address   string            `json:"address"`
    Port      int               `json:"port"`
    Status    NodeStatus        `json:"status"`
    Capacity  float64           `json:"capacity"`
    Load      float64           `json:"load"`
    LastSeen  time.Time         `json:"lastSeen"`
    Timestamp time.Time         `json:"timestamp"`
    Metadata  map[string]string `json:"metadata"`
    IsLeader  bool              `json:"isLeader"`
    VMCount   int               `json:"vmCount"`
    ShardIDs  []int             `json:"shardIds"`
}

// No validation, no methods
// Can create invalid nodes:
node := &NodeInfo{ID: "", Capacity: -5}  // Invalid!

Intended (Value Objects with Validation)

// File: cluster/domain.go (new file)

// NodeID is a value object representing a unique node identifier
type NodeID struct {
    id string
}

// NewNodeID creates a NodeID, validating it's non-empty
func NewNodeID(id string) (NodeID, error) {
    if id == "" {
        return NodeID{}, errors.New("node ID cannot be empty")
    }
    return NodeID{id: id}, nil
}

// String returns the node ID as string
func (n NodeID) String() string {
    return n.id
}

// Equal checks equality
func (n NodeID) Equal(other NodeID) bool {
    return n.id == other.id
}

// Capacity is a value object representing node capacity
type Capacity struct {
    value float64
}

// NewCapacity creates a Capacity, validating it's positive
func NewCapacity(value float64) (Capacity, error) {
    if value <= 0 {
        return Capacity{}, fmt.Errorf("capacity must be > 0, got %f", value)
    }
    return Capacity{value: value}, nil
}

// Value returns capacity as float64
func (c Capacity) Value() float64 {
    return c.value
}

// NodeInfo is a value object (immutable after creation)
type NodeInfo struct {
    nodeID    NodeID
    address   string
    port      int
    status    NodeStatus
    capacity  Capacity
    load      float64
    lastSeen  time.Time
    metadata  map[string]string
    isLeader  bool
    vmCount   int
    shardIDs  []int
}

// NewNodeInfo creates a NodeInfo with validation
func NewNodeInfo(
    id string,
    address string,
    port int,
    capacity float64,
) (*NodeInfo, error) {
    nodeID, err := NewNodeID(id)
    if err != nil {
        return nil, err
    }

    cap, err := NewCapacity(capacity)
    if err != nil {
        return nil, err
    }

    if port < 1 || port > 65535 {
        return nil, fmt.Errorf("invalid port: %d", port)
    }

    return &NodeInfo{
        nodeID:    nodeID,
        address:   address,
        port:      port,
        status:    NodeStatusActive,
        capacity:  cap,
        load:      0,
        lastSeen:  time.Now(),
        metadata:  make(map[string]string),
        isLeader:  false,
        vmCount:   0,
        shardIDs:  []int{},
    }, nil
}

// Getters (all return copies to prevent mutation)
func (n *NodeInfo) NodeID() NodeID { return n.nodeID }
func (n *NodeInfo) Address() string { return n.address }
func (n *NodeInfo) Port() int { return n.port }
func (n *NodeInfo) Status() NodeStatus { return n.status }
func (n *NodeInfo) Capacity() Capacity { return n.capacity }
func (n *NodeInfo) Load() float64 { return n.load }

// WithStatus returns a new NodeInfo with updated status
// (immutable pattern: create new instead of mutate)
func (n *NodeInfo) WithStatus(status NodeStatus) *NodeInfo {
    copy := *n
    copy.status = status
    return &copy
}

// WithLastSeen returns a new NodeInfo with updated last seen time
func (n *NodeInfo) WithLastSeen(t time.Time) *NodeInfo {
    copy := *n
    copy.lastSeen = t
    return &copy
}

// Equal checks value equality
func (n *NodeInfo) Equal(other *NodeInfo) bool {
    if other == nil {
        return false
    }
    return n.nodeID.Equal(other.nodeID) &&
        n.address == other.address &&
        n.port == other.port &&
        n.status == other.status &&
        n.capacity == other.capacity
}

Benefits:

  • Impossible to create invalid nodes (errors surface at construction; see the usage sketch below)
  • Type safety (can't accidentally pass negative capacity)
  • Immutable (prevents accidental mutations)
  • Methods encapsulate behavior
  • Easy to extend validation
  • Copy-on-write pattern for updates
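
A short usage sketch of the constructors and the copy-on-write update (the values and log calls are illustrative):

// Construction fails fast on invalid input.
if _, err := NewNodeInfo("", "localhost", 8080, -5); err != nil {
    log.Printf("rejected: %v", err) // "node ID cannot be empty"
}

// Valid construction.
node, err := NewNodeInfo("node-1", "10.0.0.5", 8080, 1000)
if err != nil {
    log.Fatal(err)
}

// Updates return a new value; the original is left untouched.
failed := node.WithStatus(NodeStatusFailed)
fmt.Println(node.Status() == failed.Status()) // false: the original is still active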

Pattern 3: Event Publishing

Current (No Events)

// File: cluster/manager.go
func (cm *ClusterManager) handleNodeUpdate(update NodeUpdate) {
    cm.mutex.Lock()
    defer cm.mutex.Unlock()

    switch update.Type {
    case NodeJoined:
        cm.nodes[update.Node.ID] = update.Node
        cm.hashRing.AddNode(update.Node.ID)
        cm.logger.Printf("Node joined: %s", update.Node.ID)
        // No event published
        // No way for other contexts to react
        // No audit trail
    }
}

Intended (Events as First-Class)

// File: cluster/events.go (new file)

// NodeJoined event indicates a node joined the cluster
type NodeJoined struct {
    NodeID    NodeID
    Address   string
    Port      int
    Capacity  Capacity
    Timestamp time.Time
}

// NodeFailed event indicates a node failed
type NodeFailed struct {
    NodeID    NodeID
    Reason    string // "HeartbeatTimeout", "AdminMarked", etc.
    Timestamp time.Time
}

// ShardAssigned event indicates shard assignment changed
type ShardAssigned struct {
    ShardID   int
    NodeIDs   []NodeID // [primary, replica1, replica2, ...]
    Version   uint64
    Timestamp time.Time
}

// ShardMigrated event indicates a shard moved from one node to another
type ShardMigrated struct {
    ShardID    int
    FromNodes  []NodeID
    ToNodes    []NodeID
    Timestamp  time.Time
}

// RebalancingTriggered event indicates rebalancing started
type RebalancingTriggered struct {
    LeaderID  NodeID
    Reason    string // "node_joined", "node_failed", "manual", "periodic"
    Timestamp time.Time
}

// EventPublisher is the interface for publishing domain events
type EventPublisher interface {
    // Publish publishes an event
    Publish(ctx context.Context, event interface{}) error
}

// File: cluster/manager.go (updated)

type ClusterManager struct {
    // ... existing fields ...
    eventPublisher EventPublisher  // NEW
}

// publishEvent is a helper to publish events consistently
func (cm *ClusterManager) publishEvent(ctx context.Context, event interface{}) error {
    if cm.eventPublisher == nil {
        return nil  // No-op if no publisher configured
    }
    return cm.eventPublisher.Publish(ctx, event)
}

// JoinCluster adds a node and publishes NodeJoined event
func (cm *ClusterManager) JoinCluster(ctx context.Context, nodeInfo *NodeInfo) error {
    // Validation...

    cm.mutex.Lock()
    cm.nodes[nodeInfo.NodeID().String()] = nodeInfo
    cm.hashRing.AddNode(nodeInfo.NodeID().String())
    cm.mutex.Unlock()

    // Publish event
    event := &NodeJoined{
        NodeID:    nodeInfo.NodeID(),
        Address:   nodeInfo.Address(),
        Port:      nodeInfo.Port(),
        Capacity:  nodeInfo.Capacity(),
        Timestamp: time.Now(),
    }
    return cm.publishEvent(ctx, event)
}

// MarkNodeFailed marks node as failed and publishes NodeFailed event
func (cm *ClusterManager) MarkNodeFailed(ctx context.Context, nodeID NodeID, reason string) error {
    // Validation...

    cm.mutex.Lock()
    cm.nodes[nodeID.String()] = cm.nodes[nodeID.String()].WithStatus(NodeStatusFailed)
    cm.hashRing.RemoveNode(nodeID.String())
    cm.mutex.Unlock()

    // Publish event
    event := &NodeFailed{
        NodeID:    nodeID,
        Reason:    reason,
        Timestamp: time.Now(),
    }
    return cm.publishEvent(ctx, event)
}

Benefits:

  • Events are explicit domain concepts
  • Type-safe (compiler enforces event structure)
  • Published consistently (via publishEvent helper)
  • Other contexts can subscribe and react (see the publisher sketch below)
  • Full audit trail available
  • Enables event sourcing / CQRS
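
One possible EventPublisher implementation serializes each event to JSON and publishes it on a NATS subject derived from the event's type name, so other contexts can subscribe with a wildcard such as cluster.events.>. The subject convention here is an assumption for illustration, not an existing one in the codebase:

// File: cluster/nats_publisher.go (illustrative sketch)

// NATSEventPublisher publishes domain events over NATS as JSON.
type NATSEventPublisher struct {
    conn          *nats.Conn
    subjectPrefix string // e.g. "cluster.events" (assumed convention)
}

func NewNATSEventPublisher(conn *nats.Conn, subjectPrefix string) *NATSEventPublisher {
    return &NATSEventPublisher{conn: conn, subjectPrefix: subjectPrefix}
}

// Publish sends the event on "<prefix>.<EventTypeName>", e.g. cluster.events.NodeJoined.
// Note: value objects with unexported fields (NodeID, Capacity) would need MarshalJSON
// methods, or the events would carry primitives at this boundary.
func (p *NATSEventPublisher) Publish(ctx context.Context, event interface{}) error {
    data, err := json.Marshal(event)
    if err != nil {
        return fmt.Errorf("marshal event: %w", err)
    }
    t := reflect.TypeOf(event)
    if t.Kind() == reflect.Ptr {
        t = t.Elem()
    }
    return p.conn.Publish(p.subjectPrefix+"."+t.Name(), data)
}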

Pattern 4: Invariant Validation

Current (Validation Scattered)

// File: cluster/manager.go, line 191-197
func (cm *ClusterManager) handleNodeUpdate(update NodeUpdate) {
    // ...
    now := time.Now()
    for _, node := range cm.nodes {
        if now.Sub(node.LastSeen) > 90*time.Second && node.Status != NodeStatusFailed {
            node.Status = NodeStatusFailed
            cm.logger.Printf("Node marked as failed: %s", node.ID)
        }
    }
}

// File: cluster/manager.go, line 276-288
func (cm *ClusterManager) checkNodeHealth() {
    cm.mutex.Lock()
    defer cm.mutex.Unlock()

    now := time.Now()
    for _, node := range cm.nodes {
        if now.Sub(node.LastSeen) > 90*time.Second && node.Status == NodeStatusActive {
            node.Status = NodeStatusFailed
            cm.logger.Printf("Node failed: %s", node.ID)
        }
    }
}

// Duplicate logic! Easy to miss cases.
// No central validation.

Intended (Centralized Invariants)

// File: cluster/invariants.go (new file)

// ClusterInvariants defines the consistency rules for the cluster
type ClusterInvariants struct {
    shardCount int
}

// NewClusterInvariants creates an invariant validator
func NewClusterInvariants(shardCount int) *ClusterInvariants {
    return &ClusterInvariants{shardCount: shardCount}
}

// ValidateNodeHealth checks Invariant 5: Leader is active
func (i *ClusterInvariants) ValidateNodeHealth(nodes map[string]*NodeInfo, leaderID string) error {
    if leaderID == "" {
        return nil  // No leader yet, OK
    }

    leaderNode, exists := nodes[leaderID]
    if !exists {
        return fmt.Errorf("leader node %s not in cluster", leaderID)
    }

    if leaderNode.Status() != NodeStatusActive {
        return fmt.Errorf("leader node %s is not active (status: %v)", leaderID, leaderNode.Status())
    }

    return nil
}

// ValidateShardCoverage checks Invariant 2: All shards assigned
func (i *ClusterInvariants) ValidateShardCoverage(shardMap *ShardMap) error {
    if shardMap == nil {
        return errors.New("shard map is nil")
    }

    assignedShards := make(map[int]bool)
    for shardID := range shardMap.Shards {
        assignedShards[shardID] = true
    }

    for shardID := 0; shardID < i.shardCount; shardID++ {
        if !assignedShards[shardID] {
            return fmt.Errorf("shard %d is not assigned (orphaned)", shardID)
        }
    }

    return nil
}

// ValidateShardOwnership checks Invariant 3: Only healthy nodes own shards
func (i *ClusterInvariants) ValidateShardOwnership(shardMap *ShardMap) error {
    if shardMap == nil {
        return errors.New("shard map is nil")
    }

    for shardID, nodeIDs := range shardMap.Shards {
        for _, nodeID := range nodeIDs {
            nodeInfo, exists := shardMap.Nodes[nodeID]
            if !exists {
                return fmt.Errorf("shard %d assigned to unknown node %s", shardID, nodeID)
            }

            if nodeInfo.Status() != NodeStatusActive {
                return fmt.Errorf("shard %d assigned to unhealthy node %s (status: %v)",
                    shardID, nodeID, nodeInfo.Status())
            }
        }
    }

    return nil
}

// ValidateAll runs all invariant checks
func (i *ClusterInvariants) ValidateAll(topology *ClusterTopology) error {
    if err := i.ValidateNodeHealth(topology.nodes, topology.leaderID); err != nil {
        return fmt.Errorf("invariant violation (I5): %w", err)
    }

    if err := i.ValidateShardCoverage(topology.shardMap); err != nil {
        return fmt.Errorf("invariant violation (I2): %w", err)
    }

    if err := i.ValidateShardOwnership(topology.shardMap); err != nil {
        return fmt.Errorf("invariant violation (I3): %w", err)
    }

    return nil
}

// File: cluster/manager.go (updated)

type ClusterManager struct {
    // ... existing fields ...
    invariants *ClusterInvariants  // NEW
}

// MarkNodeFailed marks node as failed with invariant checks
func (cm *ClusterManager) MarkNodeFailed(ctx context.Context, nodeID NodeID, reason string) error {
    cm.mutex.Lock()
    defer cm.mutex.Unlock()

    // Validate preconditions
    node, exists := cm.nodes[nodeID.String()]
    if !exists {
        return fmt.Errorf("node not found: %s", nodeID)
    }
    if node.Status() == NodeStatusFailed {
        return fmt.Errorf("node already failed: %s", nodeID)
    }

    // Execute command
    failedNode := node.WithStatus(NodeStatusFailed)
    cm.nodes[nodeID.String()] = failedNode
    cm.hashRing.RemoveNode(nodeID.String())

    // Validate invariants still hold
    if err := cm.invariants.ValidateNodeHealth(cm.nodes, cm.currentLeaderID); err != nil {
        // Note: the node is already marked failed at this point; a production
        // implementation would validate against a copy first or roll back here.
        return fmt.Errorf("invariant violation after node failure: %w", err)
    }

    // Publish event
    event := &NodeFailed{
        NodeID:    nodeID,
        Reason:    reason,
        Timestamp: time.Now(),
    }
    _ = cm.publishEvent(ctx, event)

    return nil
}

// AssignShards assigns shards with invariant validation
func (cm *ClusterManager) AssignShards(ctx context.Context, newShardMap *ShardMap) error {
    // Only leader can assign
    if !cm.IsLeader() {
        return errors.New("only leader can assign shards")
    }

    cm.mutex.Lock()
    defer cm.mutex.Unlock()

    // Validate preconditions
    if err := cm.invariants.ValidateAll(&ClusterTopology{
        nodes:     cm.nodes,
        shardMap:  newShardMap,
        leaderID:  cm.currentLeaderID,
    }); err != nil {
        return fmt.Errorf("cannot assign shards: %w", err)
    }

    // Execute command
    oldShardMap := cm.shardMap
    cm.shardMap = newShardMap

    // Publish events
    for shardID, newNodes := range newShardMap.Shards {
        oldNodes := oldShardMap.Shards[shardID]
        if !nodeListEqual(oldNodes, newNodes) {
            event := &ShardMigrated{
                ShardID:    shardID,
                FromNodes:  stringListToNodeIDList(oldNodes),
                ToNodes:    stringListToNodeIDList(newNodes),
                Timestamp:  time.Now(),
            }
            _ = cm.publishEvent(ctx, event)
        }
    }

    return nil
}

Benefits:

  • Invariants defined in one place
  • Easy to audit what's being validated
  • Consistent application across all commands
  • Clear error messages
  • Testable in isolation (see the test sketch below)
  • Easy to add new invariants
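
Because the invariants are plain functions over in-memory state, they can be unit tested without NATS or a KV store. A minimal table-driven sketch for the shard coverage check, using the map[int][]string shard representation from Pattern 5:

// File: cluster/invariants_test.go (illustrative sketch)

func TestClusterInvariants_ValidateShardCoverage(t *testing.T) {
    inv := NewClusterInvariants(3) // 3 shards keeps the cases small

    cases := []struct {
        name    string
        shards  map[int][]string
        wantErr bool
    }{
        {"all shards assigned", map[int][]string{0: {"n1"}, 1: {"n1"}, 2: {"n2"}}, false},
        {"orphaned shard", map[int][]string{0: {"n1"}, 2: {"n2"}}, true},
    }

    for _, tc := range cases {
        t.Run(tc.name, func(t *testing.T) {
            err := inv.ValidateShardCoverage(&ShardMap{Shards: tc.shards})
            if (err != nil) != tc.wantErr {
                t.Fatalf("wantErr=%v, got err=%v", tc.wantErr, err)
            }
        })
    }
}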

Pattern 5: Rebalancing Strategy

Current (Stubbed)

// File: cluster/shard.go, line 210
func (chp *ConsistentHashPlacement) RebalanceShards(
    currentMap *ShardMap,
    nodes map[string]*NodeInfo,
) (*ShardMap, error) {
    // This is a simplified implementation
    // In practice, this would implement sophisticated rebalancing logic
    return currentMap, nil  // BUG: Returns unchanged!
}

Intended (Real Implementation)

// File: cluster/rebalancing.go (new file)

// RebalancingStrategy defines how to distribute shards across nodes
type RebalancingStrategy interface {
    // Rebalance computes new shard assignments
    // Returns new ShardMap or error if unable to rebalance
    Rebalance(
        current *ShardMap,
        activeNodes map[string]*NodeInfo,
    ) (*ShardMap, error)
}

// ConsistentHashRebalancer uses consistent hashing to minimize movements
type ConsistentHashRebalancer struct {
    virtualNodes int
    shardCount   int
}

// NewConsistentHashRebalancer creates a rebalancer
func NewConsistentHashRebalancer(virtualNodes, shardCount int) *ConsistentHashRebalancer {
    return &ConsistentHashRebalancer{
        virtualNodes: virtualNodes,
        shardCount:   shardCount,
    }
}

// Rebalance computes new assignments using consistent hashing
func (chr *ConsistentHashRebalancer) Rebalance(
    current *ShardMap,
    activeNodes map[string]*NodeInfo,
) (*ShardMap, error) {
    if len(activeNodes) == 0 {
        return nil, errors.New("no active nodes to rebalance to")
    }

    // Build new hash ring from active nodes
    ring := NewConsistentHashRingWithConfig(HashRingConfig{
        VirtualNodes: chr.virtualNodes,
    })
    for nodeID := range activeNodes {
        ring.AddNode(nodeID)
    }

    // Reassign each shard via consistent hash
    newAssignments := make(map[int][]string)
    for shardID := 0; shardID < chr.shardCount; shardID++ {
        // Primary node via consistent hash
        primaryNode := ring.GetNode(fmt.Sprintf("shard-%d", shardID))
        if primaryNode == "" {
            return nil, fmt.Errorf("no node assigned for shard %d", shardID)
        }

        // TODO: Add replicas (for now: primary only, no replicas)
        newAssignments[shardID] = []string{primaryNode}
    }

    return &ShardMap{
        Version:    current.Version + 1,
        Shards:     newAssignments,
        Nodes:      activeNodes,
        UpdateTime: time.Now(),
    }, nil
}

// LoadBalancingRebalancer assigns based on current load (future strategy)
type LoadBalancingRebalancer struct {
    shardCount int
}

// Rebalance assigns shards to least-loaded nodes
func (lbr *LoadBalancingRebalancer) Rebalance(
    current *ShardMap,
    activeNodes map[string]*NodeInfo,
) (*ShardMap, error) {
    // Sort nodes by load
    type nodeLoad struct {
        id   string
        load float64
    }
    var nodes []nodeLoad
    for id, node := range activeNodes {
        nodes = append(nodes, nodeLoad{id, node.Load()})
    }
    sort.Slice(nodes, func(i, j int) bool {
        return nodes[i].load < nodes[j].load
    })

    // Assign each shard to least-loaded node
    newAssignments := make(map[int][]string)
    for shardID := 0; shardID < lbr.shardCount; shardID++ {
        // Round-robin through sorted nodes
        idx := shardID % len(nodes)
        newAssignments[shardID] = []string{nodes[idx].id}
    }

    return &ShardMap{
        Version:    current.Version + 1,
        Shards:     newAssignments,
        Nodes:      activeNodes,
        UpdateTime: time.Now(),
    }, nil
}

// File: cluster/manager.go (updated)

// RebalanceShards coordinates rebalancing
func (cm *ClusterManager) RebalanceShards(ctx context.Context, reason string) error {
    if !cm.IsLeader() {
        return errors.New("only leader can rebalance")
    }

    cm.mutex.Lock()

    // Get active nodes
    activeNodes := make(map[string]*NodeInfo)
    for id, node := range cm.nodes {
        if node.Status() == NodeStatusActive {
            activeNodes[id] = node
        }
    }

    if len(activeNodes) == 0 {
        cm.mutex.Unlock()
        return errors.New("no active nodes to rebalance to")
    }

    // Publish rebalancing started
    startEvent := &RebalancingTriggered{
        LeaderID:  NodeID{id: cm.currentLeaderID},
        Reason:    reason,
        Timestamp: time.Now(),
    }
    _ = cm.publishEvent(ctx, startEvent)

    // Compute new assignments
    strategy := NewConsistentHashRebalancer(DefaultVirtualNodes, DefaultNumShards)
    newShardMap, err := strategy.Rebalance(cm.shardMap, activeNodes)
    if err != nil {
        cm.mutex.Unlock()
        return fmt.Errorf("rebalancing strategy failed: %w", err)
    }

    // Validate new assignments
    if err := cm.invariants.ValidateAll(&ClusterTopology{
        nodes:     cm.nodes,
        shardMap:  newShardMap,
        leaderID:  cm.currentLeaderID,
    }); err != nil {
        cm.mutex.Unlock()
        return fmt.Errorf("new shard map violates invariants: %w", err)
    }

    // Apply new assignments
    oldShardMap := cm.shardMap
    cm.shardMap = newShardMap

    migratedCount := 0
    for shardID, newNodes := range newShardMap.Shards {
        oldNodes := oldShardMap.Shards[shardID]
        if !nodeListEqual(oldNodes, newNodes) {
            migratedCount++
            // Publish event for each migration
            event := &ShardMigrated{
                ShardID:    shardID,
                FromNodes:  stringListToNodeIDList(oldNodes),
                ToNodes:    stringListToNodeIDList(newNodes),
                Timestamp:  time.Now(),
            }
            _ = cm.publishEvent(ctx, event)
        }
    }

    cm.mutex.Unlock()

    // Publish rebalancing completed
    completeEvent := &RebalancingCompleted{
        LeaderID:         NodeID{id: cm.currentLeaderID},
        MigratedCount:    migratedCount,
        CompletedAt:      time.Now(),
    }
    _ = cm.publishEvent(ctx, completeEvent)

    return nil
}
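
The code above references RebalancingCompleted and two helpers (nodeListEqual, stringListToNodeIDList) that are not defined elsewhere in this document. Minimal sketches, assuming assignment order is significant (primary first, then replicas):

// RebalancingCompleted event indicates rebalancing finished
type RebalancingCompleted struct {
    LeaderID      NodeID
    MigratedCount int
    CompletedAt   time.Time
}

// nodeListEqual reports whether two shard assignments are identical,
// treating order as significant.
func nodeListEqual(a, b []string) bool {
    if len(a) != len(b) {
        return false
    }
    for i := range a {
        if a[i] != b[i] {
            return false
        }
    }
    return true
}

// stringListToNodeIDList converts raw node ID strings to NodeID value objects,
// skipping any that fail validation.
func stringListToNodeIDList(ids []string) []NodeID {
    out := make([]NodeID, 0, len(ids))
    for _, id := range ids {
        nodeID, err := NewNodeID(id)
        if err != nil {
            continue
        }
        out = append(out, nodeID)
    }
    return out
}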

Benefits:

  • Strategy pattern allows multiple algorithms
  • Real rebalancing actually redistributes shards
  • New strategies can be plugged in (e.g., load-aware)
  • Invariants checked before applying
  • Events published for observability
  • Testable in isolation

Pattern 6: Testing Aggregates

Current (Hard to Test)

// Testing is difficult because:
// 1. No dependency injection (NATS, KV store hardcoded)
// 2. No way to verify events (none published)
// 3. No way to inject clock (time.Now() hardcoded)
// 4. All state is private; hard to assert

func TestClusterManager_JoinNode(t *testing.T) {
    // Can't create without real NATS connection!
    natsConn, _ := nats.Connect(nats.DefaultURL)
    defer natsConn.Close()

    ctx, _ := context.WithTimeout(context.Background(), 10*time.Second)
    cm, _ := NewClusterManager("node-1", natsConn, ctx)

    // Can't control time
    // Can't verify events
    // Can't assert invariants
}

Intended (Testable with Mocks)

// File: cluster/manager_test.go

// MockEventPublisher captures published events for testing
type MockEventPublisher struct {
    events []interface{}
    mu     sync.Mutex
}

func (m *MockEventPublisher) Publish(ctx context.Context, event interface{}) error {
    m.mu.Lock()
    defer m.mu.Unlock()
    m.events = append(m.events, event)
    return nil
}

func (m *MockEventPublisher) GetEvents(t *testing.T) []interface{} {
    m.mu.Lock()
    defer m.mu.Unlock()
    return m.events
}

func (m *MockEventPublisher) Clear() {
    m.mu.Lock()
    defer m.mu.Unlock()
    m.events = []interface{}{}
}

// MockClock allows controlling time in tests
type MockClock struct {
    now time.Time
}

func (mc *MockClock) Now() time.Time {
    return mc.now
}

func (mc *MockClock) Advance(d time.Duration) {
    mc.now = mc.now.Add(d)
}

// ClusterManager gains a Clock field so tests can control time
type ClusterManager struct {
    // ... existing fields ...
    clock Clock  // NEW
}

type Clock interface {
    Now() time.Time
}

// Test: JoinCluster publishes NodeJoined event
func TestClusterManager_JoinCluster_PublishesEvent(t *testing.T) {
    // Arrange
    publisher := &MockEventPublisher{}
    cm := &ClusterManager{
        nodes:          make(map[string]*NodeInfo),
        hashRing:       NewConsistentHashRing(),
        eventPublisher: publisher,
        invariants:     NewClusterInvariants(1024),
    }

    nodeInfo, _ := NewNodeInfo("node-1", "localhost", 8080, 1000)

    // Act
    ctx := context.Background()
    err := cm.JoinCluster(ctx, nodeInfo)

    // Assert
    if err != nil {
        t.Fatalf("JoinCluster failed: %v", err)
    }

    events := publisher.GetEvents(t)
    if len(events) != 1 {
        t.Fatalf("expected 1 event, got %d", len(events))
    }

    joinedEvent, ok := events[0].(*NodeJoined)
    if !ok {
        t.Fatalf("expected NodeJoined event, got %T", events[0])
    }

    if joinedEvent.NodeID.String() != "node-1" {
        t.Errorf("expected node-1, got %s", joinedEvent.NodeID)
    }
}

// Test: MarkNodeFailed with invariant violation
func TestClusterManager_MarkNodeFailed_ValidatesInvariants(t *testing.T) {
    // Arrange
    publisher := &MockEventPublisher{}
    cm := &ClusterManager{
        nodes:            make(map[string]*NodeInfo),
        hashRing:         NewConsistentHashRing(),
        eventPublisher:   publisher,
        currentLeaderID:  "node-1",
        invariants:       NewClusterInvariants(1024),
    }

    // Only one node: the leader
    node1, _ := NewNodeInfo("node-1", "localhost", 8080, 1000)
    cm.nodes["node-1"] = node1

    // Act: Try to fail the only (leader) node
    ctx := context.Background()
    nodeID, _ := NewNodeID("node-1")
    err := cm.MarkNodeFailed(ctx, nodeID, "test")

    // Assert: Should fail because it violates Invariant 5 (leader must be active)
    if err == nil {
        t.Fatal("expected error when failing leader, got nil")
    }
    if !strings.Contains(err.Error(), "invariant") {
        t.Errorf("expected invariant error, got: %v", err)
    }
}

// Test: Rebalance uses strategy to compute assignments
func TestClusterManager_RebalanceShards_UsesStrategy(t *testing.T) {
    // Arrange
    publisher := &MockEventPublisher{}
    cm := &ClusterManager{
        nodes:           make(map[string]*NodeInfo),
        hashRing:        NewConsistentHashRing(),
        shardMap:        &ShardMap{Shards: make(map[int][]string)},
        currentLeaderID: "node-1",
        eventPublisher:  publisher,
        invariants:      NewClusterInvariants(DefaultNumShards),  // must match the shard count used by RebalanceShards
    }

    // Add nodes
    for i := 1; i <= 2; i++ {
        id := fmt.Sprintf("node-%d", i)
        node, _ := NewNodeInfo(id, "localhost", 8080+i, 1000)
        cm.nodes[id] = node
        cm.hashRing.AddNode(id)
    }

    // Act: Rebalance
    ctx := context.Background()
    err := cm.RebalanceShards(ctx, "test")

    // Assert
    if err != nil {
        t.Fatalf("RebalanceShards failed: %v", err)
    }

    // Check that shards are now assigned
    assignedCount := len(cm.shardMap.Shards)
    if assignedCount != DefaultNumShards {
        t.Errorf("expected %d shards assigned, got %d", DefaultNumShards, assignedCount)
    }

    // Check that events were published
    events := publisher.GetEvents(t)
    hasShardMigrated := false
    for _, event := range events {
        if _, ok := event.(*ShardMigrated); ok {
            hasShardMigrated = true
            break
        }
    }
    if !hasShardMigrated {
        t.Error("expected at least one ShardMigrated event")
    }
}

Benefits:

  • Dependency injection (publisher, clock, strategy; see the clock-driven test sketch below)
  • Easy to verify events
  • Can test invariant validation
  • Can test without NATS
  • Clear, maintainable tests
  • Behavior-focused (what happened, not how)
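
With the Clock in place, the heartbeat-timeout behavior from Pattern 4 becomes deterministic to test. This sketch assumes checkNodeHealth is updated to read cm.clock.Now() instead of time.Now() and to use the immutable NodeInfo from Pattern 2:

// Test: a node is marked failed once its heartbeat goes stale (illustrative sketch)
func TestClusterManager_CheckNodeHealth_MarksStaleNodeFailed(t *testing.T) {
    // Arrange
    clock := &MockClock{now: time.Date(2026, 1, 1, 0, 0, 0, 0, time.UTC)}
    publisher := &MockEventPublisher{}
    cm := &ClusterManager{
        nodes:          make(map[string]*NodeInfo),
        hashRing:       NewConsistentHashRing(),
        eventPublisher: publisher,
        clock:          clock,
    }

    node, _ := NewNodeInfo("node-1", "localhost", 8080, 1000)
    cm.nodes["node-1"] = node.WithLastSeen(clock.Now())

    // Act + Assert: not yet stale, nothing should change
    cm.checkNodeHealth()
    if cm.nodes["node-1"].Status() == NodeStatusFailed {
        t.Fatal("node marked failed before heartbeat timeout")
    }

    // Advance past the 90s heartbeat timeout and re-check
    clock.Advance(2 * time.Minute)
    cm.checkNodeHealth()
    if cm.nodes["node-1"].Status() != NodeStatusFailed {
        t.Error("expected node-1 to be marked failed after heartbeat timeout")
    }
}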

Summary: Key Patterns to Adopt

| Pattern      | Current          | Intended                     | Benefit                 |
|--------------|------------------|------------------------------|-------------------------|
| Commands     | Message handlers | Explicit methods             | Clear intent            |
| Events       | None published   | First-class domain events    | Event-driven, auditable |
| Validation   | Scattered        | Centralized invariants       | Consistent, testable    |
| Immutability | Mutable state    | Value objects, copy-on-write | Prevents bugs           |
| Strategy     | Stubbed          | Real implementation          | Actually works          |
| Testing      | Hard (coupled)   | Dependency injection, mocks  | Easy, comprehensive     |

References