crossmate

A collaborative crossword app for iOS
Log | Files | Refs | LICENSE

commit a36745c9b42abd96d15ce219e0f22f1013420c87
parent c247b9228803408d96904c20c96d6e69fcadbdd0
Author: Michael Camilleri <[email protected]>
Date:   Tue,  2 Jun 2026 11:20:30 +0900

Harden the engagement room and keep idle sockets warm

The live channel is a Cloudflare Durable Object (EngagementRoom) the app
reaches over a .default URLSession WebSocket. Reviewing a diagnostic
log revealed the following:

1. The room accepts every authenticated socket and never kicks a peer,
   so an error that occurs with one peer can abort the fan-out with
   every peer later in the iteration silently missed that frame. Each
   send is now wrapped so one dead socket can't drop a live letter or
   cursor for the rest of the room.

2. Neither side sent keepalives, so an idle foreground socket was reaped
   by NAT / edge idle timeouts and surfaced as 'Software caused
   connection abort'. The room now answers application pings via
   setWebSocketAutoResponse — handled by the runtime without waking the
   Durable Object — and EngagementHost sends a 'ping' every 25s,
   filtering 'ping' and 'pong' out of the inbound stream so keepalive
   never reaches the engagement decoder.

3. webSocketMessage wrote lastSeenAt and rescheduled the expiry alarm on
   every message — a durable write per keystroke. That work moves into a
   debounced touch(). Fixing this exposed a latent alarm bug: while
   sockets were connected the alarm re-armed from a stale lastSeenAt, so
   a long idle-but-connected room — now common, since keepalive stops
   the socket dropping — would refire in a tight loop. The alarm re-arms
   from now when sockets are present.

Co-Authored-By: Claude Opus 4.8 <[email protected]>

Diffstat:
MCrossmate/Services/EngagementHost.swift | 38++++++++++++++++++++++++++++++++++++++
MWorker/engagement-worker.js | 52++++++++++++++++++++++++++++++++++++++++++++++++----
2 files changed, 86 insertions(+), 4 deletions(-)

diff --git a/Crossmate/Services/EngagementHost.swift b/Crossmate/Services/EngagementHost.swift @@ -14,8 +14,11 @@ final class EngagementHost: NSObject { var onEvent: ((Event) -> Void)? + private static let keepaliveInterval: Duration = .seconds(25) + private var sockets: [UUID: URLSessionWebSocketTask] = [:] private var engagementIDsByTask: [ObjectIdentifier: UUID] = [:] + private var pingTasks: [UUID: Task<Void, Never>] = [:] // A `.default` URLSession cannot hold a socket open while the app is // suspended — iOS aborts it (`Software caused connection abort`) on the way // to the background. This is by design: the live channel is a foreground-only @@ -44,9 +47,35 @@ final class EngagementHost: NSObject { engagementIDsByTask[ObjectIdentifier(task)] = engagementID task.resume() receiveNext(engagementID: engagementID, task: task) + startPing(engagementID: engagementID) onEvent?(.diagnostic(engagementID: engagementID, message: "socket connecting \(room.roomID.uuidString)")) } + /// Sends an app-level "ping" on a timer so an idle foreground socket + /// survives NAT / Cloudflare edge idle timeouts. The room Worker answers + /// "pong" via `setWebSocketAutoResponse` without waking the Durable Object; + /// both strings are filtered out of the inbound stream in `receiveNext` + /// (they are not engagement messages). Stops on the first failed send or + /// when the socket is gone — the close path then tears the channel down. + private func startPing(engagementID: UUID) { + pingTasks[engagementID]?.cancel() + pingTasks[engagementID] = Task { @MainActor [weak self] in + while !Task.isCancelled { + try? await Task.sleep(for: Self.keepaliveInterval) + guard !Task.isCancelled, let self, let task = self.sockets[engagementID] else { return } + do { + try await task.send(.string("ping")) + } catch { + return + } + } + } + } + + private func stopPing(engagementID: UUID) { + pingTasks.removeValue(forKey: engagementID)?.cancel() + } + func send(engagementID: UUID, message: Data) async throws { guard let task = sockets[engagementID] else { throw EngagementHostError.missingSocket @@ -55,6 +84,7 @@ final class EngagementHost: NSObject { } func disconnect(engagementID: UUID) { + stopPing(engagementID: engagementID) guard let task = sockets.removeValue(forKey: engagementID) else { return } engagementIDsByTask.removeValue(forKey: ObjectIdentifier(task)) task.cancel(with: .goingAway, reason: nil) @@ -70,10 +100,17 @@ final class EngagementHost: NSObject { self.onEvent?(.channelMessage(engagementID: engagementID, message: data)) self.receiveNext(engagementID: engagementID, task: task) case .success(.string(let string)): + // Keepalive frames (see `startPing`) are not engagement + // messages; swallow them and keep listening. + guard string != "ping", string != "pong" else { + self.receiveNext(engagementID: engagementID, task: task) + return + } let data = Data(string.utf8) self.onEvent?(.channelMessage(engagementID: engagementID, message: data)) self.receiveNext(engagementID: engagementID, task: task) case .failure(let error): + self.stopPing(engagementID: engagementID) self.sockets.removeValue(forKey: engagementID) self.engagementIDsByTask.removeValue(forKey: ObjectIdentifier(task)) self.onEvent?(.error(engagementID: engagementID, message: error.localizedDescription)) @@ -158,6 +195,7 @@ extension EngagementHost: URLSessionWebSocketDelegate { guard let engagementID = engagementIDsByTask.removeValue(forKey: ObjectIdentifier(webSocketTask)) else { return } + stopPing(engagementID: engagementID) sockets.removeValue(forKey: engagementID) onEvent?(.channelClose(engagementID: engagementID)) } diff --git a/Worker/engagement-worker.js b/Worker/engagement-worker.js @@ -1,7 +1,18 @@ +// Coarse debounce for the durable `lastSeenAt` write on the message hot path. +// The room TTL only needs second-ish granularity, so a write + alarm reschedule +// on every keystroke broadcast is pure churn. In-memory, so it resets to a +// guaranteed write on each hibernation wake — which is fine. +const TOUCH_DEBOUNCE_MS = 30 * 1000; + export class EngagementRoom { constructor(state, env) { this.state = state; this.env = env; + // Answer keepalive pings without waking the Durable Object, so an otherwise + // idle foreground socket stays warm through NAT / edge idle timeouts. Pairs + // with the client's periodic "ping" (EngagementHost.startPing). Auto-answered + // frames never reach `webSocketMessage`, so they cost nothing here. + this.state.setWebSocketAutoResponse(new WebSocketRequestResponsePair("ping", "pong")); } async fetch(request) { @@ -28,6 +39,20 @@ export class EngagementRoom { deviceID: auth.deviceID, connectedAt: Date.now() }); + // Supersede any earlier socket from the same device. A reconnect would + // otherwise leave the stale one lingering until its TCP dies, so the room + // fans every broadcast at a zombie and can echo the device's own prior + // frames back to it. `server` is not accepted yet, so it is not in the set. + for (const existing of this.state.getWebSockets()) { + const attachment = existing.deserializeAttachment(); + if (attachment && attachment.authorID === auth.authorID && attachment.deviceID === auth.deviceID) { + try { + existing.close(1000, "Superseded by a newer connection"); + } catch { + // Already closing; the runtime will reap it. + } + } + } this.state.acceptWebSocket(server); return new Response(null, { @@ -80,17 +105,23 @@ export class EngagementRoom { await this.state.storage.put(nonceKey, Date.now()); await this.pruneNonces(); await this.scheduleExpiry(); + this.lastTouchAt = Date.now(); return { ok: true, authorID, deviceID }; } async webSocketMessage(ws, message) { const data = typeof message === "string" ? message : message.slice(0); - await this.state.storage.put("lastSeenAt", Date.now()); - await this.scheduleExpiry(); + await this.touch(); for (const peer of this.state.getWebSockets()) { if (peer === ws) continue; - peer.send(data); + try { + peer.send(data); + } catch { + // A peer caught mid-close throws here; skip it so the rest of the room + // still receives this frame. The dead socket is reaped via + // `webSocketClose` / runtime cleanup. + } } } @@ -105,7 +136,11 @@ export class EngagementRoom { const ttlMs = Number(this.env.ROOM_TTL_SECONDS || "600") * 1000; const sockets = this.state.getWebSockets(); if (sockets.length > 0) { - await this.scheduleExpiry(); + // Still connected — the room is alive regardless of how long since the + // last broadcast (keepalive pings are auto-answered and don't advance + // `lastSeenAt`). Re-arm from now, not the stale deadline, or a long idle + // session would refire the alarm in a tight loop. + await this.state.storage.setAlarm(Date.now() + ttlMs); return; } @@ -118,6 +153,15 @@ export class EngagementRoom { await this.scheduleExpiry(); } + async touch() { + const now = Date.now(); + // Debounce the durable write + alarm reschedule off the message hot path. + if (this.lastTouchAt && now - this.lastTouchAt < TOUCH_DEBOUNCE_MS) return; + this.lastTouchAt = now; + await this.state.storage.put("lastSeenAt", now); + await this.scheduleExpiry(); + } + async pruneNonces() { const maxAgeMs = Number(this.env.NONCE_TTL_SECONDS || "300") * 1000; const cutoff = Date.now() - maxAgeMs;