commit a36745c9b42abd96d15ce219e0f22f1013420c87
parent c247b9228803408d96904c20c96d6e69fcadbdd0
Author: Michael Camilleri <[email protected]>
Date: Tue, 2 Jun 2026 11:20:30 +0900
Harden the engagement room and keep idle sockets warm
The live channel is a Cloudflare Durable Object (EngagementRoom) the app
reaches over a .default URLSession WebSocket. Reviewing a diagnostic
log revealed the following:
1. The room accepts every authenticated socket and never kicks a peer,
so an error that occurs with one peer can abort the fan-out with
every peer later in the iteration silently missed that frame. Each
send is now wrapped so one dead socket can't drop a live letter or
cursor for the rest of the room.
2. Neither side sent keepalives, so an idle foreground socket was reaped
by NAT / edge idle timeouts and surfaced as 'Software caused
connection abort'. The room now answers application pings via
setWebSocketAutoResponse — handled by the runtime without waking the
Durable Object — and EngagementHost sends a 'ping' every 25s,
filtering 'ping' and 'pong' out of the inbound stream so keepalive
never reaches the engagement decoder.
3. webSocketMessage wrote lastSeenAt and rescheduled the expiry alarm on
every message — a durable write per keystroke. That work moves into a
debounced touch(). Fixing this exposed a latent alarm bug: while
sockets were connected the alarm re-armed from a stale lastSeenAt, so
a long idle-but-connected room — now common, since keepalive stops
the socket dropping — would refire in a tight loop. The alarm re-arms
from now when sockets are present.
Co-Authored-By: Claude Opus 4.8 <[email protected]>
Diffstat:
2 files changed, 86 insertions(+), 4 deletions(-)
diff --git a/Crossmate/Services/EngagementHost.swift b/Crossmate/Services/EngagementHost.swift
@@ -14,8 +14,11 @@ final class EngagementHost: NSObject {
var onEvent: ((Event) -> Void)?
+ private static let keepaliveInterval: Duration = .seconds(25)
+
private var sockets: [UUID: URLSessionWebSocketTask] = [:]
private var engagementIDsByTask: [ObjectIdentifier: UUID] = [:]
+ private var pingTasks: [UUID: Task<Void, Never>] = [:]
// A `.default` URLSession cannot hold a socket open while the app is
// suspended — iOS aborts it (`Software caused connection abort`) on the way
// to the background. This is by design: the live channel is a foreground-only
@@ -44,9 +47,35 @@ final class EngagementHost: NSObject {
engagementIDsByTask[ObjectIdentifier(task)] = engagementID
task.resume()
receiveNext(engagementID: engagementID, task: task)
+ startPing(engagementID: engagementID)
onEvent?(.diagnostic(engagementID: engagementID, message: "socket connecting \(room.roomID.uuidString)"))
}
+ /// Sends an app-level "ping" on a timer so an idle foreground socket
+ /// survives NAT / Cloudflare edge idle timeouts. The room Worker answers
+ /// "pong" via `setWebSocketAutoResponse` without waking the Durable Object;
+ /// both strings are filtered out of the inbound stream in `receiveNext`
+ /// (they are not engagement messages). Stops on the first failed send or
+ /// when the socket is gone — the close path then tears the channel down.
+ private func startPing(engagementID: UUID) {
+ pingTasks[engagementID]?.cancel()
+ pingTasks[engagementID] = Task { @MainActor [weak self] in
+ while !Task.isCancelled {
+ try? await Task.sleep(for: Self.keepaliveInterval)
+ guard !Task.isCancelled, let self, let task = self.sockets[engagementID] else { return }
+ do {
+ try await task.send(.string("ping"))
+ } catch {
+ return
+ }
+ }
+ }
+ }
+
+ private func stopPing(engagementID: UUID) {
+ pingTasks.removeValue(forKey: engagementID)?.cancel()
+ }
+
func send(engagementID: UUID, message: Data) async throws {
guard let task = sockets[engagementID] else {
throw EngagementHostError.missingSocket
@@ -55,6 +84,7 @@ final class EngagementHost: NSObject {
}
func disconnect(engagementID: UUID) {
+ stopPing(engagementID: engagementID)
guard let task = sockets.removeValue(forKey: engagementID) else { return }
engagementIDsByTask.removeValue(forKey: ObjectIdentifier(task))
task.cancel(with: .goingAway, reason: nil)
@@ -70,10 +100,17 @@ final class EngagementHost: NSObject {
self.onEvent?(.channelMessage(engagementID: engagementID, message: data))
self.receiveNext(engagementID: engagementID, task: task)
case .success(.string(let string)):
+ // Keepalive frames (see `startPing`) are not engagement
+ // messages; swallow them and keep listening.
+ guard string != "ping", string != "pong" else {
+ self.receiveNext(engagementID: engagementID, task: task)
+ return
+ }
let data = Data(string.utf8)
self.onEvent?(.channelMessage(engagementID: engagementID, message: data))
self.receiveNext(engagementID: engagementID, task: task)
case .failure(let error):
+ self.stopPing(engagementID: engagementID)
self.sockets.removeValue(forKey: engagementID)
self.engagementIDsByTask.removeValue(forKey: ObjectIdentifier(task))
self.onEvent?(.error(engagementID: engagementID, message: error.localizedDescription))
@@ -158,6 +195,7 @@ extension EngagementHost: URLSessionWebSocketDelegate {
guard let engagementID = engagementIDsByTask.removeValue(forKey: ObjectIdentifier(webSocketTask)) else {
return
}
+ stopPing(engagementID: engagementID)
sockets.removeValue(forKey: engagementID)
onEvent?(.channelClose(engagementID: engagementID))
}
diff --git a/Worker/engagement-worker.js b/Worker/engagement-worker.js
@@ -1,7 +1,18 @@
+// Coarse debounce for the durable `lastSeenAt` write on the message hot path.
+// The room TTL only needs second-ish granularity, so a write + alarm reschedule
+// on every keystroke broadcast is pure churn. In-memory, so it resets to a
+// guaranteed write on each hibernation wake — which is fine.
+const TOUCH_DEBOUNCE_MS = 30 * 1000;
+
export class EngagementRoom {
constructor(state, env) {
this.state = state;
this.env = env;
+ // Answer keepalive pings without waking the Durable Object, so an otherwise
+ // idle foreground socket stays warm through NAT / edge idle timeouts. Pairs
+ // with the client's periodic "ping" (EngagementHost.startPing). Auto-answered
+ // frames never reach `webSocketMessage`, so they cost nothing here.
+ this.state.setWebSocketAutoResponse(new WebSocketRequestResponsePair("ping", "pong"));
}
async fetch(request) {
@@ -28,6 +39,20 @@ export class EngagementRoom {
deviceID: auth.deviceID,
connectedAt: Date.now()
});
+ // Supersede any earlier socket from the same device. A reconnect would
+ // otherwise leave the stale one lingering until its TCP dies, so the room
+ // fans every broadcast at a zombie and can echo the device's own prior
+ // frames back to it. `server` is not accepted yet, so it is not in the set.
+ for (const existing of this.state.getWebSockets()) {
+ const attachment = existing.deserializeAttachment();
+ if (attachment && attachment.authorID === auth.authorID && attachment.deviceID === auth.deviceID) {
+ try {
+ existing.close(1000, "Superseded by a newer connection");
+ } catch {
+ // Already closing; the runtime will reap it.
+ }
+ }
+ }
this.state.acceptWebSocket(server);
return new Response(null, {
@@ -80,17 +105,23 @@ export class EngagementRoom {
await this.state.storage.put(nonceKey, Date.now());
await this.pruneNonces();
await this.scheduleExpiry();
+ this.lastTouchAt = Date.now();
return { ok: true, authorID, deviceID };
}
async webSocketMessage(ws, message) {
const data = typeof message === "string" ? message : message.slice(0);
- await this.state.storage.put("lastSeenAt", Date.now());
- await this.scheduleExpiry();
+ await this.touch();
for (const peer of this.state.getWebSockets()) {
if (peer === ws) continue;
- peer.send(data);
+ try {
+ peer.send(data);
+ } catch {
+ // A peer caught mid-close throws here; skip it so the rest of the room
+ // still receives this frame. The dead socket is reaped via
+ // `webSocketClose` / runtime cleanup.
+ }
}
}
@@ -105,7 +136,11 @@ export class EngagementRoom {
const ttlMs = Number(this.env.ROOM_TTL_SECONDS || "600") * 1000;
const sockets = this.state.getWebSockets();
if (sockets.length > 0) {
- await this.scheduleExpiry();
+ // Still connected — the room is alive regardless of how long since the
+ // last broadcast (keepalive pings are auto-answered and don't advance
+ // `lastSeenAt`). Re-arm from now, not the stale deadline, or a long idle
+ // session would refire the alarm in a tight loop.
+ await this.state.storage.setAlarm(Date.now() + ttlMs);
return;
}
@@ -118,6 +153,15 @@ export class EngagementRoom {
await this.scheduleExpiry();
}
+ async touch() {
+ const now = Date.now();
+ // Debounce the durable write + alarm reschedule off the message hot path.
+ if (this.lastTouchAt && now - this.lastTouchAt < TOUCH_DEBOUNCE_MS) return;
+ this.lastTouchAt = now;
+ await this.state.storage.put("lastSeenAt", now);
+ await this.scheduleExpiry();
+ }
+
async pruneNonces() {
const maxAgeMs = Number(this.env.NONCE_TTL_SECONDS || "300") * 1000;
const cutoff = Date.now() - maxAgeMs;