commit f049050afd89651a66f610fe40c460851a828769
parent f6a40456a5a1b40eeae08b77989a9bbc0151de0a
Author: Michael Camilleri <[email protected]>
Date: Thu, 25 Jun 2026 12:46:54 +0900
Retry transient push-worker sends and wait for connectivity
This commit addresses push-worker registrations that fail at app
foreground with `-1005 networkConnectionLost` (or `-1001 timedOut`)
while the device is plainly online — CloudKit keeps syncing in the
same second. The failure is a keep-alive connection-reuse race against
the Cloudflare worker, but registerGameCredential swallowed it with no
retry, so the device could sit unregistered with the worker until an
unrelated reconcile. During that window a co-player's join broadcast
resolves to no targets and the banner is lost, while the badge still
arrives over the independent CloudKit path.
sendAuthorized now attempts a send up to three times (the first try plus
two retries) with a short backoff between attempts for the transient
transport errors, re-signing each attempt so the monotonic App Attest
assertion counter is never replayed. Every worker write reached through
it is idempotent, so re-sending is safe.
The client also moves off URLSession.shared onto a dedicated session
with waitsForConnectivity enabled, so a send issued before the radio is
back up waits for the path rather than failing immediately, and the
worker's connection pool no longer churns against unrelated traffic.
Co-Authored-By: Claude Opus 4.8 <[email protected]>
Diffstat:
1 file changed, 62 insertions(+), 9 deletions(-)
diff --git a/Crossmate/Services/PushClient.swift b/Crossmate/Services/PushClient.swift
@@ -69,9 +69,25 @@ final class PushClient {
/// `Local.xcconfig`) or when the bundle's APNs environment is missing or
/// unrecognised. The rest of the app treats a nil PushClient as "push
/// notifications are disabled" rather than crashing.
+ /// NOTE(review): the `///` lines directly above were the `init?` doc comment;
+ /// inserting this declaration here re-attaches them to it — consider moving it.
+ /// Builds the dedicated `URLSession` for worker traffic. `waitsForConnectivity`
+ /// lets a send issued before the radio/path is back up wait for connectivity
+ /// instead of failing immediately, bounded by the resource timeout. It covers
+ /// connection establishment only, so a drop on an established connection is
+ /// still left to the transport retry in `sendAuthorized`. A session separate
+ /// from `.shared` keeps worker requests out of the shared session's pool.
+ static func makeWorkerSession() -> URLSession {
+ let config = URLSessionConfiguration.default
+ config.waitsForConnectivity = true
+ config.timeoutIntervalForRequest = 30 // seconds
+ config.timeoutIntervalForResource = 60 // seconds; also bounds the wait for connectivity
+ return URLSession(configuration: config)
+ }
+
init?(
deviceID: String = RecordSerializer.localDeviceID,
- session: URLSession = .shared,
+ session: URLSession = PushClient.makeWorkerSession(),
log: @escaping (String) -> Void = { _ in }
) {
guard
@@ -483,14 +499,51 @@ final class PushClient {
body: Data,
gameCredential: GamePushCredentials? = nil
) async throws -> (Data, HTTPURLResponse) {
- try await sendAuthorized(
- request,
- method: method,
- path: path,
- body: body,
- gameCredential: gameCredential,
- retryAfterRegistrationReset: true
- )
+ // Transport-level retry. `-1005 networkConnectionLost` / `-1001 timedOut`
+ // against the Cloudflare worker are usually a keep-alive connection-reuse
+ // race — the edge closed a pooled connection the device then reused —
+ // rather than a real outage (CloudKit keeps syncing through them). They're
+ // retryable: a fresh attempt opens a new connection. Each attempt re-signs
+ // via the full overload below — `signedHeaders` mints a fresh nonce and a
+ // new App Attest assertion whose counter is monotonic — so a retry can't
+ // replay a stale assertion and regress the worker's stored counter. Every
+ // worker write reached through here is idempotent, so re-sending is safe.
+ // Bounded: a genuinely offline device exhausts the attempts and the next
+ // reconcile trigger re-registers.
+ let maxAttempts = 3
+ var attempt = 0
+ while true {
+ do {
+ return try await sendAuthorized(
+ request,
+ method: method,
+ path: path,
+ body: body,
+ gameCredential: gameCredential,
+ retryAfterRegistrationReset: true
+ )
+ } catch let error as URLError
+ where Self.isRetryableTransport(error.code) && attempt + 1 < maxAttempts {
+ attempt += 1
+ // Short backoff (200ms, 400ms) so an instantaneous reuse race
+ // isn't hammered and a momentarily-busy edge gets a beat.
+ try? await Task.sleep(for: .milliseconds(200 * attempt))
+ }
+ }
+ }
+
+ /// Whether `code` is a transient transport failure worth another attempt.
+ /// `.notConnectedToInternet`, `.cancelled` and all unlisted codes return false.
+ /// NOTE(review): on a `waitsForConnectivity` session an offline send presumably
+ /// ends as `.timedOut`, which IS retried here — confirm that wait is intended.
+ private static func isRetryableTransport(_ code: URLError.Code) -> Bool {
+ switch code {
+ case .networkConnectionLost, .timedOut, .cannotConnectToHost,
+ .cannotFindHost, .dnsLookupFailed, .secureConnectionFailed:
+ return true
+ default:
+ return false
+ }
}
private func sendAuthorized(