crossmate

A collaborative crossword app for iOS
Log | Files | Refs | LICENSE

commit 468919ba2913eab8c1f3effaadf348a7db1896ae
parent d42141b1dad973acead5d6cd098dd920914bc4e4
Author: Michael Camilleri <[email protected]>
Date:   Fri, 29 May 2026 12:26:48 +0900

Harden live-engagement reconnection

A live engagement channel that dropped often failed to come back on its
own — the only reliable recovery was manually re-offering the
connection. Two things caused it. Auto-reconnect was purely
edge-triggered (channel close, foreground, a peer's cursor move) and
peer presence was gated on a 60-second cursor-freshness window. So a
present-but-quiet peer aged out of 'present' after a minute of not
moving their cursor, and with no edge re-firing the channel sat dead
until the user intervened.

Presence now rides the readAt active-lease instead of cursor freshness:
presentPeers/hasPresentPeer gate on readAt > now, and
peerPresenceFreshnessWindow is gone. readAt is set ~10 minutes ahead
while a puzzle is open and collapsed to now on leave, so it survives
think-time and self-expires if a peer vanishes uncleanly. The value is
already persisted per peer (RecordApplier adopts every inbound Player
record's readAt), so this is a predicate swap, not new plumbing.

On top of that signal, three reconnect paths:

- Re-establish on foreground. scenePhase .active now re-offers
  engagement; nothing previously rebuilt the socket that backgrounding
  tears down.

- A periodic backstop. While a shared puzzle is open, a 30s timer
  re-runs the coordinator's presence check, so a reconnect edge lost to
  suspension, a failed connect, or a readAt-only lease refresh
  self-heals instead of waiting for a manual nudge. It is a no-op in
  steady state (live, or no present peer).

- A lease refresh on that same timer. A peer's own typing never advances
  its own lease — only open, foreground, and incoming peer moves do — so
  an active solo driver whose partner is idle would otherwise lapse
  mid-session and look absent. The timer renews the local lease
  (floor-gated, so it writes at most ~once per 5 minutes), making readAt
  a true 'I'm on this puzzle' heartbeat.

The reliable heartbeat is what lets the coordinator finally tear down a
stale channel: peerPresenceMayHaveChanged now disconnects any live
channel whose connected peer's lease has expired. It collects targets
before awaiting (so the states walk can't race a reentrant mutation) and
disconnects via the host without pre-clearing state, so the resulting
channelClose drives the normal idle transition and cleanup. .connecting
is left to the existing 30s sweep, since an in-flight hail may just be
racing the peer's first lease write.

Co-Authored-By: Claude Opus 4.8 <[email protected]>

Diffstat:
MCrossmate/CrossmateApp.swift | 10+++++++++-
MCrossmate/Services/AppServices.swift | 73+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------
MCrossmate/Sync/EngagementCoordinator.swift | 34++++++++++++++++++++++++++++++++++
MTests/Unit/Sync/AppServicesAnnouncementTests.swift | 24++++++++++++------------
MTests/Unit/Sync/EngagementCoordinatorTests.swift | 54++++++++++++++++++++++++++++++++++++++++++++++++++++++
5 files changed, 172 insertions(+), 23 deletions(-)

diff --git a/Crossmate/CrossmateApp.swift b/Crossmate/CrossmateApp.swift @@ -565,7 +565,15 @@ private struct PuzzleDisplayView: View { let id = gameID switch newPhase { case .active: - Task { await services.publishReadCursor(for: id, mode: .activeLease) } + Task { + await services.publishReadCursor(for: id, mode: .activeLease) + // Backgrounding tears the engagement socket down without + // rebuilding it, so a live session that dropped while we + // were away never comes back on its own. Re-offer on + // resume; this is a no-op when the channel is still live + // (the coordinator only acts from an idle state). + await services.startEngagementIfPossible(gameID: id) + } case .background: Task { await services.publishReadCursor(for: id, mode: .currentTime) } case .inactive: diff --git a/Crossmate/Services/AppServices.swift b/Crossmate/Services/AppServices.swift @@ -24,9 +24,19 @@ final class AppServices { private static let readLeaseDuration: TimeInterval = 10 * 60 private static let readLeaseRefreshFloor: TimeInterval = 5 * 60 - private nonisolated static let peerPresenceFreshnessWindow: TimeInterval = 60 private static let engagementTeardownDelaySeconds = 120 private static let engagementTeardownDelay: Duration = .seconds(engagementTeardownDelaySeconds) + /// How often a foregrounded shared puzzle re-runs the engagement reconnect + /// check. Engagement auto-connect is otherwise edge-triggered (channel + /// close, foreground, a peer's cursor move), so a drop whose re-connect + /// edge never lands — a failed connect, a `readAt`-only lease refresh, an + /// edge lost to suspension — stays disconnected until the user nudges it + /// manually. This timer is the backstop. It re-runs `peerPresenceMayHave\ + /// Changed`, which is a no-op unless the coordinator is idle *and* a peer + /// holds a live lease, so a steady-state live (or peerless) session does + /// no work and writes nothing; the coordinator's connecting-state guard + /// caps any actual re-hail rate. 
+ private static let engagementReconnectInterval: Duration = .seconds(30) enum FreshenReason { case appeared @@ -145,6 +155,7 @@ final class AppServices { private var isGameListVisible = false private var latestLocalSelections: [UUID: PlayerSelection] = [:] private var scheduledEngagementEndTasks: [UUID: Task<Void, Never>] = [:] + private var engagementReconnectTasks: [UUID: Task<Void, Never>] = [:] init() { let preferences = PlayerPreferences() @@ -992,10 +1003,12 @@ final class AppServices { guard preferences.isICloudSyncEnabled else { return } guard await ensureICloudSyncStarted() else { return } await engagementCoordinator.peerPresenceMayHaveChanged(gameIDs: [gameID]) + startEngagementReconnectRetry(gameID: gameID) } func endEngagement(gameID: UUID) async { cancelScheduledEngagementEnd(gameID: gameID) + cancelEngagementReconnectRetry(gameID: gameID) syncMonitor.note("engagement: ending for \(gameID.uuidString)") engagementStatus.setLive(false, gameID: gameID) latestLocalSelections[gameID] = nil @@ -1005,6 +1018,11 @@ final class AppServices { func scheduleEngagementEnd(gameID: UUID) { cancelScheduledEngagementEnd(gameID: gameID) + // Leaving the puzzle stops the reconnect backstop regardless of + // whether a channel ever went live — otherwise a puzzle that was + // opened but never connected would tick forever. A quick return + // re-arms it via `startEngagementIfPossible`. + cancelEngagementReconnectRetry(gameID: gameID) guard engagementStatus.isLive(gameID: gameID) else { return } syncMonitor.note( "engagement: scheduled ending for \(gameID.uuidString) " + @@ -1032,6 +1050,40 @@ final class AppServices { await endEngagement(gameID: gameID) } + /// Arms the periodic engagement-presence tick for `gameID`, replacing any + /// prior timer for the same game (so a re-arm on foreground doesn't stack). + /// Each tick does two things: + /// + /// 1. 
Renews our own read lease (`publishReadCursor(.activeLease)`, which + /// is floor-gated so it writes at most ~once per 5 min). This is what + /// makes `readAt` a true "foregrounded on this puzzle" heartbeat: a + /// peer's own typing never advances their own lease, so without this an + /// active solo driver would lapse mid-session and look absent. + /// 2. Re-runs the coordinator's presence check, which both reconnects a + /// dropped channel and tears down a live channel whose peer's lease has + /// expired — neither of which is safe until (1) keeps present peers + /// from falsely lapsing. + /// + /// Both are no-ops in steady state (lease has >5 min left; coordinator is + /// live with a present peer). Cancelled on leave via `scheduleEngagement + /// End`/`endEngagement`; goes dormant under suspension (the sleep can't + /// advance) and is re-armed by the foreground `startEngagementIfPossible`. + private func startEngagementReconnectRetry(gameID: UUID) { + engagementReconnectTasks[gameID]?.cancel() + engagementReconnectTasks[gameID] = Task { [weak self] in + while !Task.isCancelled { + try? await Task.sleep(for: Self.engagementReconnectInterval) + guard !Task.isCancelled, let self else { return } + await self.publishReadCursor(for: gameID, mode: .activeLease) + await self.engagementCoordinator.peerPresenceMayHaveChanged(gameIDs: [gameID]) + } + } + } + + private func cancelEngagementReconnectRetry(gameID: UUID) { + engagementReconnectTasks.removeValue(forKey: gameID)?.cancel() + } + func noteLocalSelection(_ selection: PlayerSelection, gameID: UUID) async { latestLocalSelections[gameID] = selection guard engagementStatus.isLive(gameID: gameID), @@ -2357,10 +2409,13 @@ final class AppServices { } } - /// True iff some non-local participant in `gameID` has a fresh cursor - /// track. 
Cursor fields are the user-visible presence signal; stale - /// cursor rows are ignored so a solo solver in a shared puzzle does not - /// keep publishing local cursor changes after the peer has left. + /// True iff some non-local participant in `gameID` currently holds a + /// valid read lease (`readAt` in the future). The active-lease cursor — + /// set ~10 minutes ahead while the puzzle is open and collapsed to `now` + /// on leave — is the presence signal: it survives think-time without + /// cursor movement and self-expires if a peer vanishes uncleanly, so a + /// solo solver in a shared puzzle stops treating a departed peer as + /// present within the lease window rather than on every paused minute. static func hasPresentPeer( persistence: PersistenceController, gameID: UUID, @@ -2370,11 +2425,10 @@ final class AppServices { return await withCheckedContinuation { continuation in context.perform { let req = NSFetchRequest<PlayerEntity>(entityName: "PlayerEntity") - let cutoff = Date().addingTimeInterval(-peerPresenceFreshnessWindow) req.predicate = NSPredicate( - format: "game.id == %@ AND updatedAt > %@ AND selRow != nil AND selCol != nil AND selDir != nil", + format: "game.id == %@ AND readAt > %@", gameID as CVarArg, - cutoff as NSDate + Date() as NSDate ) let players = (try? context.fetch(req)) ?? 
[] let hasPeer = players.contains { player in @@ -2397,9 +2451,8 @@ final class AppServices { return await withCheckedContinuation { continuation in context.perform { let req = NSFetchRequest<PlayerEntity>(entityName: "PlayerEntity") - let cutoff = Date().addingTimeInterval(-peerPresenceFreshnessWindow) var predicates = [ - NSPredicate(format: "updatedAt > %@ AND selRow != nil AND selCol != nil AND selDir != nil", cutoff as NSDate) + NSPredicate(format: "readAt > %@", Date() as NSDate) ] if let gameIDs, !gameIDs.isEmpty { predicates.append(NSPredicate(format: "game.id IN %@", Array(gameIDs))) diff --git a/Crossmate/Sync/EngagementCoordinator.swift b/Crossmate/Sync/EngagementCoordinator.swift @@ -172,6 +172,16 @@ actor EngagementCoordinator { room.roomID } } + + var peerAuthorID: String? { + switch self { + case .idle: + nil + case .connecting(let peerAuthorID, _, _, _), + .live(let peerAuthorID, _, _): + peerAuthorID + } + } } private let host: any EngagementTransporting @@ -217,6 +227,30 @@ actor EngagementCoordinator { guard let localAuthorID = await localAuthorID(), !localAuthorID.isEmpty else { return } await sweepStaleConnections() let peersByGame = await presentPeers(gameIDs) + + // Tear down any live channel whose connected peer no longer holds a + // valid lease — they've left or vanished. Disconnecting the host + // fires `.channelClose`, which drives the normal idle transition and + // downstream cleanup, so we don't mutate state here. Collected before + // awaiting so the `states` walk doesn't race a reentrant mutation. + // `.connecting` is left to `sweepStaleConnections`' timeout, since an + // in-flight hail may simply be racing the peer's first lease write. 
+ var expired: [(gameID: UUID, engagementID: UUID, peerAuthorID: String)] = [] + for (gameID, state) in states { + if let gameIDs, !gameIDs.contains(gameID) { continue } + guard case .live(let peerAuthorID, let engagementID, _) = state else { continue } + if !(peersByGame[gameID]?.contains(peerAuthorID) ?? false) { + expired.append((gameID, engagementID, peerAuthorID)) + } + } + for entry in expired { + await log( + "engagement: peer \(entry.peerAuthorID) lease expired for " + + "\(entry.gameID.uuidString), tearing down \(entry.engagementID.uuidString)" + ) + await host.disconnect(engagementID: entry.engagementID) + } + for (gameID, peers) in peersByGame { guard state(for: gameID) == .idle else { continue } guard let peerAuthorID = peers.sorted().first(where: { localAuthorID < $0 }) else { continue } diff --git a/Tests/Unit/Sync/AppServicesAnnouncementTests.swift b/Tests/Unit/Sync/AppServicesAnnouncementTests.swift @@ -56,14 +56,14 @@ struct AppServicesAnnouncementTests { @MainActor struct AppServicesPeerPresenceTests { - @Test("fresh cursor track counts as a present peer without a read lease") - func freshCursorTrackCountsAsPresentPeer() async throws { + @Test("valid read lease counts as a present peer, even without a cursor") + func validReadLeaseCountsAsPresentPeer() async throws { let (persistence, gameID) = try makePersistence(authorID: "alice") try addPlayer( gameID: gameID, authorID: "bob", - selection: PlayerSelection(row: 1, col: 2, direction: .down), - readAt: nil, + selection: nil, + readAt: Date().addingTimeInterval(10 * 60), updatedAt: Date(), persistence: persistence ) @@ -83,15 +83,15 @@ struct AppServicesPeerPresenceTests { #expect(peers[gameID] == ["bob"]) } - @Test("stale cursor track does not count as a present peer") - func staleCursorTrackDoesNotCountAsPresentPeer() async throws { + @Test("expired read lease does not count as a present peer") + func expiredReadLeaseDoesNotCountAsPresentPeer() async throws { let (persistence, gameID) = try 
makePersistence(authorID: "alice") try addPlayer( gameID: gameID, authorID: "bob", selection: PlayerSelection(row: 1, col: 2, direction: .down), - readAt: nil, - updatedAt: Date().addingTimeInterval(-90), + readAt: Date().addingTimeInterval(-1), + updatedAt: Date(), persistence: persistence ) @@ -110,14 +110,14 @@ struct AppServicesPeerPresenceTests { #expect(peers[gameID] == nil) } - @Test("read lease alone does not count as engagement presence") - func readLeaseWithoutCursorDoesNotCountAsPresentPeer() async throws { + @Test("a fresh cursor without a read lease does not count as a present peer") + func freshCursorWithoutLeaseDoesNotCountAsPresentPeer() async throws { let (persistence, gameID) = try makePersistence(authorID: "alice") try addPlayer( gameID: gameID, authorID: "bob", - selection: nil, - readAt: Date().addingTimeInterval(60), + selection: PlayerSelection(row: 1, col: 2, direction: .down), + readAt: nil, updatedAt: Date(), persistence: persistence ) diff --git a/Tests/Unit/Sync/EngagementCoordinatorTests.swift b/Tests/Unit/Sync/EngagementCoordinatorTests.swift @@ -246,6 +246,43 @@ struct EngagementCoordinatorTests { #expect(host.connections.count == 2) } + @Test("live channel tears down when the peer's lease expires, and not before") + @MainActor + func livePeerLeaseExpiryTearsDown() async throws { + let gameID = UUID(uuidString: "99999999-9999-9999-9999-999999999999")! + let host = MockEngagementHost() + let sink = EngagementCoordinatorTestSink() + let presence = PresenceSource([gameID: ["bob"]]) + let coordinator = EngagementCoordinator( + host: host, + localAuthorID: { "alice" }, + localDeviceID: "deviceA", + presentPeers: { await presence.current($0) }, + sendHail: { gameID, payload, addressee in + await sink.send(gameID: gameID, payload: payload, addressee: addressee) + }, + deletePing: { recordName, gameID in + await sink.delete(recordName: recordName, gameID: gameID) + } + ) + + // Connect and bring the channel live. 
+ await coordinator.peerPresenceMayHaveChanged(gameIDs: [gameID]) + let engagementID = try #require(host.connections.first?.engagementID) + #expect(await coordinator.channelOpened(engagementID: engagementID) == gameID) + + // A later tick with the peer still present leaves the channel alone. + await coordinator.peerPresenceMayHaveChanged(gameIDs: [gameID]) + #expect(host.disconnected.isEmpty) + + // Peer's lease lapses → present set drops them → channel is torn down, + // and the now-peerless game is not immediately re-hailed. + await presence.set([:]) + await coordinator.peerPresenceMayHaveChanged(gameIDs: [gameID]) + #expect(host.disconnected == [engagementID]) + #expect(await sink.sentHails().count == 1) + } + @Test("stale and expired hails are deleted") @MainActor func staleAndExpiredHailsAreDeleted() async throws { @@ -362,6 +399,23 @@ private struct DeletedPing: Equatable, Sendable { var gameID: UUID } +private actor PresenceSource { + private var peersByGame: [UUID: [String]] + + init(_ initial: [UUID: [String]]) { + self.peersByGame = initial + } + + func set(_ next: [UUID: [String]]) { + peersByGame = next + } + + func current(_ gameIDs: Set<UUID>?) -> [UUID: [String]] { + guard let gameIDs else { return peersByGame } + return peersByGame.filter { gameIDs.contains($0.key) } + } +} + private actor EngagementCoordinatorTestSink { private var sent: [SentHail] = [] private var deleted: [DeletedPing] = []