Fix flaky NATSEventBus integration tests
Some checks failed
CI / build (pull_request) Successful in 18s
CI / integration (pull_request) Failing after 1m59s
CI / build (push) Successful in 18s
CI / integration (push) Failing after 1m58s

The integration tests had timing issues causing intermittent failures on CI:

- TestNATSEventBus_HighThroughput: Added subscriber readiness synchronization using a barrier event before bulk publishing. This ensures the NATS subscription is fully established before events are sent rapidly. Extended timeout from 30s to 60s for CI environments.

- TestNATSEventBus_EventOrdering: Added a readiness barrier event to synchronize subscriber setup before publishing ordered events. Extended timeout from 10s to 15s to account for CI timing variations.

- TestNATSEventBus_ConcurrentPublishSubscribe: Added readiness synchronization before concurrent publishers start. Extended timeout from 10s to 30s to handle the increased load under CI constraints.

Root causes:
- Subscriber channels were not fully ready to receive when bulk publishing started, causing message loss
- CI runners (especially ARM64) have different timing characteristics than local development
- Insufficient timeouts for high-volume event collection under shared CI resources

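The fixes use a barrier pattern: publish a ready signal, wait (up to 5s) to receive it on the subscriber channel, then proceed with the test. This is more reliable than the fixed 100ms sleep it replaces. The signal is published only once, so if it is not delivered the test fails with an explicit readiness timeout instead of losing events mid-run.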

Closes #57

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit was merged in pull request #58.
This commit is contained in:
2026-01-11 00:09:44 +01:00
parent aae0f2413d
commit 18ea677585

View File

@@ -281,7 +281,25 @@ func TestNATSEventBus_HighThroughput(t *testing.T) {
ch := bus.Subscribe(namespace)
defer bus.Unsubscribe(namespace, ch)
time.Sleep(100 * time.Millisecond)
// Synchronize subscriber readiness using a barrier event.
// This ensures the NATS subscription is fully established before bulk publishing.
readyEvent := &Event{
ID: "ready-signal",
EventType: "ReadySignal",
ActorID: "actor-throughput",
Version: 0,
Data: map[string]interface{}{},
Timestamp: time.Now(),
}
bus.Publish(namespace, readyEvent)
// Wait for the ready signal to be received
select {
case <-ch:
// Subscriber is ready, proceed with bulk publishing
case <-time.After(5 * time.Second):
t.Fatal("timeout waiting for subscriber readiness signal")
}
// Publish many events rapidly
start := time.Now()
@@ -298,10 +316,10 @@ func TestNATSEventBus_HighThroughput(t *testing.T) {
}
publishDuration := time.Since(start)
// Receive events with timeout
// Receive events with extended timeout for CI environment
receivedCount := 0
receiveStart := time.Now()
timeout := time.After(30 * time.Second)
timeout := time.After(60 * time.Second)
loop:
for receivedCount < numEvents {
@@ -339,7 +357,23 @@ func TestNATSEventBus_EventOrdering(t *testing.T) {
ch := bus.Subscribe(namespace)
defer bus.Unsubscribe(namespace, ch)
time.Sleep(100 * time.Millisecond)
// Synchronize subscriber readiness with barrier event
readyEvent := &Event{
ID: "ordering-ready",
EventType: "ReadySignal",
ActorID: "actor-ordering",
Version: 0,
Data: map[string]interface{}{},
Timestamp: time.Now(),
}
bus.Publish(namespace, readyEvent)
select {
case <-ch:
// Subscriber is ready
case <-time.After(5 * time.Second):
t.Fatal("timeout waiting for ordering readiness signal")
}
// Publish events with sequence numbers
for i := 0; i < numEvents; i++ {
@@ -356,7 +390,7 @@ func TestNATSEventBus_EventOrdering(t *testing.T) {
// Receive and verify ordering
received := make([]*Event, 0, numEvents)
timeout := time.After(10 * time.Second)
timeout := time.After(15 * time.Second)
loop:
for len(received) < numEvents {
@@ -487,7 +521,23 @@ func TestNATSEventBus_ConcurrentPublishSubscribe(t *testing.T) {
ch := bus.Subscribe(namespace)
defer bus.Unsubscribe(namespace, ch)
time.Sleep(100 * time.Millisecond)
// Synchronize subscriber readiness with a barrier event before the concurrent publishers start
readyEvent := &Event{
ID: "concurrent-ready",
EventType: "ReadySignal",
ActorID: "coordinator",
Version: 0,
Data: map[string]interface{}{},
Timestamp: time.Now(),
}
bus.Publish(namespace, readyEvent)
select {
case <-ch:
// Subscriber is ready
case <-time.After(5 * time.Second):
t.Fatal("timeout waiting for concurrent readiness signal")
}
var wg sync.WaitGroup
wg.Add(numPublishers)
@@ -513,9 +563,9 @@ func TestNATSEventBus_ConcurrentPublishSubscribe(t *testing.T) {
// Wait for all publishers to finish
wg.Wait()
// Count received events
// Count received events with extended timeout for CI environment
receivedCount := 0
timeout := time.After(10 * time.Second)
timeout := time.After(30 * time.Second)
loop:
for receivedCount < totalExpected {