commit 468919ba2913eab8c1f3effaadf348a7db1896ae
parent d42141b1dad973acead5d6cd098dd920914bc4e4
Author: Michael Camilleri <[email protected]>
Date: Fri, 29 May 2026 12:26:48 +0900
Harden live-engagement reconnection
A live engagement channel that dropped often failed to come back on its
own — the only reliable recovery was manually re-offering the
connection. Two things caused it. Auto-reconnect was purely
edge-triggered (channel close, foreground, a peer's cursor move) and
peer presence was gated on a 60-second cursor-freshness window. So a
present-but-quiet peer aged out of 'present' after a minute of not
moving their cursor, and with no edge re-firing the channel sat dead
until the user intervened.
Presence now rides the readAt active-lease instead of cursor freshness:
presentPeers/hasPresentPeer gate on readAt > now, and
peerPresenceFreshnessWindow is gone. readAt is set ~10 minutes ahead
while a puzzle is open and collapsed to now on leave, so it survives
think-time and self-expires if a peer vanishes uncleanly. The value is
already persisted per peer (RecordApplier adopts every inbound Player
record's readAt), so this is a predicate swap, not new plumbing.
On top of that signal, three reconnect paths:
- Re-establish on foreground. scenePhase .active now re-offers
engagement; nothing previously rebuilt the socket that backgrounding
tears down.
- A periodic backstop. While a shared puzzle is open, a 30s timer
re-runs the coordinator's presence check, so a reconnect edge lost to
suspension, a failed connect, or a readAt-only lease refresh
self-heals instead of waiting for a manual nudge. It is a no-op in
steady state (live, or no present peer).
- A lease refresh on that same timer. A peer's own typing never advances
its own lease — only open, foreground, and incoming peer moves do — so
an active solo driver whose partner is idle would otherwise lapse
mid-session and look absent. The timer renews the local lease
(floor-gated, so it writes at most ~once per 5 minutes), making readAt
a true 'I'm on this puzzle' heartbeat.
The reliable heartbeat is what lets the coordinator finally tear down a
stale channel: peerPresenceMayHaveChanged now disconnects any live
channel whose connected peer's lease has expired. It collects targets
before awaiting (so the states walk can't race a reentrant mutation) and
disconnects via the host without pre-clearing state, so the resulting
channelClose drives the normal idle transition and cleanup. .connecting
is left to the existing 30s sweep, since an in-flight hail may just be
racing the peer's first lease write.
Co-Authored-By: Claude Opus 4.8 <[email protected]>
Diffstat:
5 files changed, 172 insertions(+), 23 deletions(-)
diff --git a/Crossmate/CrossmateApp.swift b/Crossmate/CrossmateApp.swift
@@ -565,7 +565,15 @@ private struct PuzzleDisplayView: View {
let id = gameID
switch newPhase {
case .active:
- Task { await services.publishReadCursor(for: id, mode: .activeLease) }
+ Task {
+ await services.publishReadCursor(for: id, mode: .activeLease)
+ // Backgrounding tears the engagement socket down without
+ // rebuilding it, so a live session that dropped while we
+ // were away never comes back on its own. Re-offer on
+ // resume; this is a no-op when the channel is still live
+ // (the coordinator only acts from an idle state).
+ await services.startEngagementIfPossible(gameID: id)
+ }
case .background:
Task { await services.publishReadCursor(for: id, mode: .currentTime) }
case .inactive:
diff --git a/Crossmate/Services/AppServices.swift b/Crossmate/Services/AppServices.swift
@@ -24,9 +24,19 @@ final class AppServices {
private static let readLeaseDuration: TimeInterval = 10 * 60
private static let readLeaseRefreshFloor: TimeInterval = 5 * 60
- private nonisolated static let peerPresenceFreshnessWindow: TimeInterval = 60
private static let engagementTeardownDelaySeconds = 120
private static let engagementTeardownDelay: Duration = .seconds(engagementTeardownDelaySeconds)
+ /// How often a foregrounded shared puzzle re-runs the engagement reconnect
+ /// check. Engagement auto-connect is otherwise edge-triggered (channel
+ /// close, foreground, a peer's cursor move), so a drop whose re-connect
+ /// edge never lands — a failed connect, a `readAt`-only lease refresh, an
+ /// edge lost to suspension — stays disconnected until the user nudges it
+ /// manually. This timer is the backstop. It re-runs the coordinator's
+ /// `peerPresenceMayHaveChanged`, which acts only to re-hail from idle when a
+ /// peer holds a live lease, or to drop a live channel whose peer's lease has
+ /// expired — so a steady-state live (or peerless) session does no work and
+ /// writes nothing; the connecting-state guard caps any actual re-hail rate.
+ private static let engagementReconnectInterval: Duration = .seconds(30)
enum FreshenReason {
case appeared
@@ -145,6 +155,7 @@ final class AppServices {
private var isGameListVisible = false
private var latestLocalSelections: [UUID: PlayerSelection] = [:]
private var scheduledEngagementEndTasks: [UUID: Task<Void, Never>] = [:]
+ private var engagementReconnectTasks: [UUID: Task<Void, Never>] = [:]
init() {
let preferences = PlayerPreferences()
@@ -992,10 +1003,12 @@ final class AppServices {
guard preferences.isICloudSyncEnabled else { return }
guard await ensureICloudSyncStarted() else { return }
await engagementCoordinator.peerPresenceMayHaveChanged(gameIDs: [gameID])
+ startEngagementReconnectRetry(gameID: gameID)
}
func endEngagement(gameID: UUID) async {
cancelScheduledEngagementEnd(gameID: gameID)
+ cancelEngagementReconnectRetry(gameID: gameID)
syncMonitor.note("engagement: ending for \(gameID.uuidString)")
engagementStatus.setLive(false, gameID: gameID)
latestLocalSelections[gameID] = nil
@@ -1005,6 +1018,11 @@ final class AppServices {
func scheduleEngagementEnd(gameID: UUID) {
cancelScheduledEngagementEnd(gameID: gameID)
+ // Leaving the puzzle stops the reconnect backstop regardless of
+ // whether a channel ever went live — otherwise a puzzle that was
+ // opened but never connected would tick forever. A quick return
+ // re-arms it via `startEngagementIfPossible`.
+ cancelEngagementReconnectRetry(gameID: gameID)
guard engagementStatus.isLive(gameID: gameID) else { return }
syncMonitor.note(
"engagement: scheduled ending for \(gameID.uuidString) " +
@@ -1032,6 +1050,40 @@ final class AppServices {
await endEngagement(gameID: gameID)
}
+ /// Arms the periodic engagement-presence tick for `gameID`, replacing any
+ /// prior timer for the same game (so a re-arm on foreground doesn't stack).
+ /// Each tick does two things:
+ ///
+ /// 1. Renews our own read lease (`publishReadCursor(.activeLease)`, which
+ /// is floor-gated so it writes at most ~once per 5 min). This is what
+ /// makes `readAt` a true "foregrounded on this puzzle" heartbeat: a
+ /// peer's own typing never advances their own lease, so without this an
+ /// active solo driver would lapse mid-session and look absent.
+ /// 2. Re-runs the coordinator's presence check, which both reconnects a
+ /// dropped channel and tears down a live channel whose peer's lease has
+ /// expired — neither of which is safe until (1) keeps present peers
+ /// from falsely lapsing.
+ ///
+ /// Both are no-ops in steady state (lease has >5 min left; coordinator is
+ /// live with a present peer). Cancelled on leave via `scheduleEngagementEnd`/
+ /// `endEngagement`; goes dormant under suspension (the sleep can't advance)
+ /// and is re-armed by the foreground `startEngagementIfPossible`.
+ private func startEngagementReconnectRetry(gameID: UUID) {
+ engagementReconnectTasks[gameID]?.cancel()
+ engagementReconnectTasks[gameID] = Task { [weak self] in
+ while !Task.isCancelled {
+ try? await Task.sleep(for: Self.engagementReconnectInterval)
+ guard !Task.isCancelled, let self else { return }
+ await self.publishReadCursor(for: gameID, mode: .activeLease)
+ await self.engagementCoordinator.peerPresenceMayHaveChanged(gameIDs: [gameID])
+ }
+ }
+ }
+
+ private func cancelEngagementReconnectRetry(gameID: UUID) {
+ engagementReconnectTasks.removeValue(forKey: gameID)?.cancel()
+ }
+
func noteLocalSelection(_ selection: PlayerSelection, gameID: UUID) async {
latestLocalSelections[gameID] = selection
guard engagementStatus.isLive(gameID: gameID),
@@ -2357,10 +2409,13 @@ final class AppServices {
}
}
- /// True iff some non-local participant in `gameID` has a fresh cursor
- /// track. Cursor fields are the user-visible presence signal; stale
- /// cursor rows are ignored so a solo solver in a shared puzzle does not
- /// keep publishing local cursor changes after the peer has left.
+ /// True iff some non-local participant in `gameID` currently holds a
+ /// valid read lease (`readAt` in the future). The active-lease cursor —
+ /// set ~10 minutes ahead while the puzzle is open and collapsed to `now`
+ /// on leave — is the presence signal: it survives think-time without
+ /// cursor movement and self-expires if a peer vanishes uncleanly, so a
+ /// solo solver in a shared puzzle stops treating a departed peer as
+ /// present within the lease window rather than on every paused minute.
static func hasPresentPeer(
persistence: PersistenceController,
gameID: UUID,
@@ -2370,11 +2425,10 @@ final class AppServices {
return await withCheckedContinuation { continuation in
context.perform {
let req = NSFetchRequest<PlayerEntity>(entityName: "PlayerEntity")
- let cutoff = Date().addingTimeInterval(-peerPresenceFreshnessWindow)
req.predicate = NSPredicate(
- format: "game.id == %@ AND updatedAt > %@ AND selRow != nil AND selCol != nil AND selDir != nil",
+ format: "game.id == %@ AND readAt > %@",
gameID as CVarArg,
- cutoff as NSDate
+ Date() as NSDate
)
let players = (try? context.fetch(req)) ?? []
let hasPeer = players.contains { player in
@@ -2397,9 +2451,8 @@ final class AppServices {
return await withCheckedContinuation { continuation in
context.perform {
let req = NSFetchRequest<PlayerEntity>(entityName: "PlayerEntity")
- let cutoff = Date().addingTimeInterval(-peerPresenceFreshnessWindow)
var predicates = [
- NSPredicate(format: "updatedAt > %@ AND selRow != nil AND selCol != nil AND selDir != nil", cutoff as NSDate)
+ NSPredicate(format: "readAt > %@", Date() as NSDate)
]
if let gameIDs, !gameIDs.isEmpty {
predicates.append(NSPredicate(format: "game.id IN %@", Array(gameIDs)))
diff --git a/Crossmate/Sync/EngagementCoordinator.swift b/Crossmate/Sync/EngagementCoordinator.swift
@@ -172,6 +172,16 @@ actor EngagementCoordinator {
room.roomID
}
}
+
+ var peerAuthorID: String? {
+ switch self {
+ case .idle:
+ nil
+ case .connecting(let peerAuthorID, _, _, _),
+ .live(let peerAuthorID, _, _):
+ peerAuthorID
+ }
+ }
}
private let host: any EngagementTransporting
@@ -217,6 +227,30 @@ actor EngagementCoordinator {
guard let localAuthorID = await localAuthorID(), !localAuthorID.isEmpty else { return }
await sweepStaleConnections()
let peersByGame = await presentPeers(gameIDs)
+
+ // Tear down any live channel whose connected peer no longer holds a
+ // valid lease — they've left or vanished. Disconnecting the host
+ // fires `.channelClose`, which drives the normal idle transition and
+ // downstream cleanup, so we don't mutate state here. Collected before
+ // awaiting so the `states` walk doesn't race a reentrant mutation.
+ // `.connecting` is left to `sweepStaleConnections`' timeout, since an
+ // in-flight hail may simply be racing the peer's first lease write.
+ var expired: [(gameID: UUID, engagementID: UUID, peerAuthorID: String)] = []
+ for (gameID, state) in states {
+ if let gameIDs, !gameIDs.contains(gameID) { continue }
+ guard case .live(let peerAuthorID, let engagementID, _) = state else { continue }
+ if !(peersByGame[gameID]?.contains(peerAuthorID) ?? false) {
+ expired.append((gameID, engagementID, peerAuthorID))
+ }
+ }
+ for entry in expired {
+ await log(
+ "engagement: peer \(entry.peerAuthorID) lease expired for " +
+ "\(entry.gameID.uuidString), tearing down \(entry.engagementID.uuidString)"
+ )
+ await host.disconnect(engagementID: entry.engagementID)
+ }
+
for (gameID, peers) in peersByGame {
guard state(for: gameID) == .idle else { continue }
guard let peerAuthorID = peers.sorted().first(where: { localAuthorID < $0 }) else { continue }
diff --git a/Tests/Unit/Sync/AppServicesAnnouncementTests.swift b/Tests/Unit/Sync/AppServicesAnnouncementTests.swift
@@ -56,14 +56,14 @@ struct AppServicesAnnouncementTests {
@MainActor
struct AppServicesPeerPresenceTests {
- @Test("fresh cursor track counts as a present peer without a read lease")
- func freshCursorTrackCountsAsPresentPeer() async throws {
+ @Test("valid read lease counts as a present peer, even without a cursor")
+ func validReadLeaseCountsAsPresentPeer() async throws {
let (persistence, gameID) = try makePersistence(authorID: "alice")
try addPlayer(
gameID: gameID,
authorID: "bob",
- selection: PlayerSelection(row: 1, col: 2, direction: .down),
- readAt: nil,
+ selection: nil,
+ readAt: Date().addingTimeInterval(10 * 60),
updatedAt: Date(),
persistence: persistence
)
@@ -83,15 +83,15 @@ struct AppServicesPeerPresenceTests {
#expect(peers[gameID] == ["bob"])
}
- @Test("stale cursor track does not count as a present peer")
- func staleCursorTrackDoesNotCountAsPresentPeer() async throws {
+ @Test("expired read lease does not count as a present peer")
+ func expiredReadLeaseDoesNotCountAsPresentPeer() async throws {
let (persistence, gameID) = try makePersistence(authorID: "alice")
try addPlayer(
gameID: gameID,
authorID: "bob",
selection: PlayerSelection(row: 1, col: 2, direction: .down),
- readAt: nil,
- updatedAt: Date().addingTimeInterval(-90),
+ readAt: Date().addingTimeInterval(-1),
+ updatedAt: Date(),
persistence: persistence
)
@@ -110,14 +110,14 @@ struct AppServicesPeerPresenceTests {
#expect(peers[gameID] == nil)
}
- @Test("read lease alone does not count as engagement presence")
- func readLeaseWithoutCursorDoesNotCountAsPresentPeer() async throws {
+ @Test("a fresh cursor without a read lease does not count as a present peer")
+ func freshCursorWithoutLeaseDoesNotCountAsPresentPeer() async throws {
let (persistence, gameID) = try makePersistence(authorID: "alice")
try addPlayer(
gameID: gameID,
authorID: "bob",
- selection: nil,
- readAt: Date().addingTimeInterval(60),
+ selection: PlayerSelection(row: 1, col: 2, direction: .down),
+ readAt: nil,
updatedAt: Date(),
persistence: persistence
)
diff --git a/Tests/Unit/Sync/EngagementCoordinatorTests.swift b/Tests/Unit/Sync/EngagementCoordinatorTests.swift
@@ -246,6 +246,43 @@ struct EngagementCoordinatorTests {
#expect(host.connections.count == 2)
}
+ @Test("live channel tears down when the peer's lease expires, and not before")
+ @MainActor
+ func livePeerLeaseExpiryTearsDown() async throws {
+ let gameID = UUID(uuidString: "99999999-9999-9999-9999-999999999999")!
+ let host = MockEngagementHost()
+ let sink = EngagementCoordinatorTestSink()
+ let presence = PresenceSource([gameID: ["bob"]])
+ let coordinator = EngagementCoordinator(
+ host: host,
+ localAuthorID: { "alice" },
+ localDeviceID: "deviceA",
+ presentPeers: { await presence.current($0) },
+ sendHail: { gameID, payload, addressee in
+ await sink.send(gameID: gameID, payload: payload, addressee: addressee)
+ },
+ deletePing: { recordName, gameID in
+ await sink.delete(recordName: recordName, gameID: gameID)
+ }
+ )
+
+ // Connect and bring the channel live.
+ await coordinator.peerPresenceMayHaveChanged(gameIDs: [gameID])
+ let engagementID = try #require(host.connections.first?.engagementID)
+ #expect(await coordinator.channelOpened(engagementID: engagementID) == gameID)
+
+ // A later tick with the peer still present leaves the channel alone.
+ await coordinator.peerPresenceMayHaveChanged(gameIDs: [gameID])
+ #expect(host.disconnected.isEmpty)
+
+ // Peer's lease lapses → present set drops them → channel is torn down,
+ // and the now-peerless game is not immediately re-hailed.
+ await presence.set([:])
+ await coordinator.peerPresenceMayHaveChanged(gameIDs: [gameID])
+ #expect(host.disconnected == [engagementID])
+ #expect(await sink.sentHails().count == 1)
+ }
+
@Test("stale and expired hails are deleted")
@MainActor
func staleAndExpiredHailsAreDeleted() async throws {
@@ -362,6 +399,23 @@ private struct DeletedPing: Equatable, Sendable {
var gameID: UUID
}
+private actor PresenceSource {
+ private var peersByGame: [UUID: [String]]
+
+ init(_ initial: [UUID: [String]]) {
+ self.peersByGame = initial
+ }
+
+ func set(_ next: [UUID: [String]]) {
+ peersByGame = next
+ }
+
+ func current(_ gameIDs: Set<UUID>?) -> [UUID: [String]] {
+ guard let gameIDs else { return peersByGame }
+ return peersByGame.filter { gameIDs.contains($0.key) }
+ }
+}
+
private actor EngagementCoordinatorTestSink {
private var sent: [SentHail] = []
private var deleted: [DeletedPing] = []