crossmate

A collaborative crossword app for iOS
Log | Files | Refs | LICENSE

DebuggingMonitors.swift (17997B)


      1 import CloudKit
      2 import Foundation
      3 import Observation
      4 import os
      5 
      6 struct PerformanceDiagnosticEntry: Identifiable, Sendable {
      7     let id = UUID()
      8     let timestamp: Date
      9     let name: String
     10     let durationMS: Double
     11     let detail: String
     12 }
     13 
     14 @MainActor
     15 @Observable
     16 final class PerformanceMonitor {
     17     private(set) var entries: [PerformanceDiagnosticEntry] = []
     18     private(set) var totalEvents = 0
     19     private(set) var suppressedEvents = 0
     20 
     21     private let maxEntries = 120
     22     private var nextRenderProbeID = 1
     23     private var pendingRenderProbes: [Int: RenderProbe] = [:]
     24     @ObservationIgnored private var viewBodyCounts: [String: Int] = [:]
     25     @ObservationIgnored private var viewBodyDetails: [String: String] = [:]
     26     @ObservationIgnored private var scheduledViewBodyReports: Set<String> = []
     27 
     28     private struct RenderProbe {
     29         let name: String
     30         let position: GridPosition
     31         let expectedEntry: String
     32         let startedAt: ContinuousClock.Instant
     33         let detail: String
     34     }
     35 
     36     func record(
     37         _ name: String,
     38         durationMS: Double,
     39         detail: String = "",
     40         thresholdMS: Double = 8
     41     ) {
     42         totalEvents += 1
     43         guard durationMS >= thresholdMS else {
     44             suppressedEvents += 1
     45             return
     46         }
     47         entries.append(PerformanceDiagnosticEntry(
     48             timestamp: Date(),
     49             name: name,
     50             durationMS: durationMS,
     51             detail: detail
     52         ))
     53         if entries.count > maxEntries {
     54             entries.removeFirst(entries.count - maxEntries)
     55         }
     56     }
     57 
     58     func note(_ name: String, detail: String = "") {
     59         entries.append(PerformanceDiagnosticEntry(
     60             timestamp: Date(),
     61             name: name,
     62             durationMS: 0,
     63             detail: detail
     64         ))
     65         if entries.count > maxEntries {
     66             entries.removeFirst(entries.count - maxEntries)
     67         }
     68     }
     69 
     70     func clear() {
     71         entries.removeAll()
     72         totalEvents = 0
     73         suppressedEvents = 0
     74         pendingRenderProbes.removeAll()
     75         viewBodyCounts.removeAll()
     76         viewBodyDetails.removeAll()
     77         scheduledViewBodyReports.removeAll()
     78     }
     79 
     80     func beginRenderProbe(
     81         name: String,
     82         position: GridPosition,
     83         expectedEntry: String,
     84         detail: String = ""
     85     ) -> Int {
     86         let id = nextRenderProbeID
     87         nextRenderProbeID += 1
     88         pendingRenderProbes[id] = RenderProbe(
     89             name: name,
     90             position: position,
     91             expectedEntry: expectedEntry,
     92             startedAt: .now,
     93             detail: detail
     94         )
     95         return id
     96     }
     97 
     98     func renderProbeDetailPrefix(for id: Int) -> String? {
     99         guard let probe = pendingRenderProbes[id] else { return nil }
    100         return "probe=\(id) row=\(probe.position.row) col=\(probe.position.col)"
    101     }
    102 
    103     func completeRenderProbe(id: Int, position: GridPosition, entry: String) {
    104         guard let probe = pendingRenderProbes[id],
    105               probe.position == position,
    106               probe.expectedEntry == entry
    107         else { return }
    108         pendingRenderProbes.removeValue(forKey: id)
    109         let duration = probe.startedAt.duration(to: .now)
    110         let milliseconds = Double(duration.components.seconds) * 1000
    111             + Double(duration.components.attoseconds) / 1_000_000_000_000_000
    112         let baseDetail = "probe=\(id) row=\(probe.position.row) col=\(probe.position.col)"
    113         let detail = probe.detail.isEmpty ? baseDetail : "\(baseDetail) \(probe.detail)"
    114         record(
    115             probe.name,
    116             durationMS: milliseconds,
    117             detail: detail,
    118             thresholdMS: 6
    119         )
    120     }
    121 
    122     func recordDeferred(
    123         _ name: String,
    124         start: ContinuousClock.Instant,
    125         detail: String = "",
    126         thresholdMS: Double = 8
    127     ) {
    128         let milliseconds = Self.milliseconds(from: start.duration(to: .now))
    129         guard milliseconds >= thresholdMS else { return }
    130         Task { @MainActor [weak self] in
    131             self?.record(
    132                 name,
    133                 durationMS: milliseconds,
    134                 detail: detail,
    135                 thresholdMS: thresholdMS
    136             )
    137         }
    138     }
    139 
    140     func markViewBodyDeferred(
    141         _ name: String,
    142         key: String = "",
    143         detail: String = ""
    144     ) {
    145         Task { @MainActor [weak self] in
    146             self?.markViewBody(name, key: key, detail: detail)
    147         }
    148     }
    149 
    150     private func markViewBody(_ name: String, key: String, detail: String) {
    151         let reportKey = key.isEmpty ? name : "\(name)|\(key)"
    152         viewBodyCounts[reportKey, default: 0] += 1
    153         viewBodyDetails[reportKey] = detail
    154 
    155         guard !scheduledViewBodyReports.contains(reportKey) else { return }
    156         scheduledViewBodyReports.insert(reportKey)
    157         Task { @MainActor [weak self] in
    158             try? await Task.sleep(for: .milliseconds(50))
    159             self?.flushViewBodyReport(name: name, reportKey: reportKey)
    160         }
    161     }
    162 
    163     private func flushViewBodyReport(name: String, reportKey: String) {
    164         scheduledViewBodyReports.remove(reportKey)
    165         let count = viewBodyCounts.removeValue(forKey: reportKey) ?? 0
    166         guard count > 0 else { return }
    167         let detail = viewBodyDetails.removeValue(forKey: reportKey) ?? ""
    168         let countDetail = detail.isEmpty ? "count=\(count)" : "count=\(count) \(detail)"
    169         note("\(name).body", detail: countDetail)
    170     }
    171 
    172     private static func milliseconds(from duration: Duration) -> Double {
    173         Double(duration.components.seconds) * 1000
    174             + Double(duration.components.attoseconds) / 1_000_000_000_000_000
    175     }
    176 }
    177 
    178 struct EventLogEntry: Identifiable, Sendable, Codable {
    179     var id = UUID()
    180     let timestamp: Date
    181     let level: String
    182     let message: String
    183 }
    184 
    185 /// Truncates identifiers before they land in the event log, so a shared
    186 /// diagnostics dump can't expose a complete game UUID, CloudKit user/device
    187 /// record name, or share-link token. The first eight characters survive —
    188 /// enough to correlate lines within one log (matching the existing
    189 /// `PlayerRoster[0A5207B4]` and truncated-APNs-token conventions) without
    190 /// reconstructing the real identifier.
    191 ///
    192 /// Main-actor-confined because `Regex` is not `Sendable`; both producers
    193 /// (`EventLog`, `SyncMonitor`) already live on the main actor.
    194 @MainActor
    195 enum LogScrubber {
    196     private static let uuid =
    197         /[0-9A-Fa-f]{8}-[0-9A-Fa-f]{4}-[0-9A-Fa-f]{4}-[0-9A-Fa-f]{4}-[0-9A-Fa-f]{12}/
    198     /// CloudKit user record names are `_` + 32 hex; device IDs inside ping
    199     /// record names are bare 32-hex. `{32,}` swallows anything longer (e.g. a
    200     /// full APNs token) in one match rather than leaving a usable tail.
    201     /// Sentinels like `__defaultOwner__` contain non-hex letters and pass
    202     /// through untouched.
    203     private static let hexToken = /_?[0-9A-Fa-f]{32,}/
    204     /// An iCloud share URL's path token is a bearer credential — anyone
    205     /// holding the full link can join the game — so only a stub survives.
    206     private static let shareURLToken = /(icloud\.com\/share\/)([A-Za-z0-9._~-]{9,})/
    207 
    208     static func scrub(_ message: String) -> String {
    209         var result = message
    210         result = result.replacing(uuid) { "\($0.output.prefix(8))…" }
    211         result = result.replacing(hexToken) { match in
    212             let keep = match.output.hasPrefix("_") ? 9 : 8
    213             return "\(match.output.prefix(keep))…"
    214         }
    215         result = result.replacing(shareURLToken) { "\($0.output.1)\($0.output.2.prefix(8))…" }
    216         return result
    217     }
    218 }
    219 
    220 /// Off-main durable backing for `EventLog`. Owns a single atomically-rewritten
    221 /// snapshot file in Application Support — chosen over Caches, which iOS can
    222 /// purge under storage pressure, exactly when a tester needs the log most.
    223 /// The file survives termination, so a collaborator can still share their
    224 /// diagnostics hours after the activity, long after their app was suspended.
    225 actor EventLogStore {
    226     private let fileURL: URL
    227 
    228     init(filename: String = "event-log.json") {
    229         let dir = FileManager.default
    230             .urls(for: .applicationSupportDirectory, in: .userDomainMask)[0]
    231         try? FileManager.default.createDirectory(at: dir, withIntermediateDirectories: true)
    232         self.fileURL = dir.appendingPathComponent(filename)
    233     }
    234 
    235     func load() -> [EventLogEntry] {
    236         guard let data = try? Data(contentsOf: fileURL),
    237               let entries = try? JSONDecoder().decode([EventLogEntry].self, from: data)
    238         else { return [] }
    239         return entries
    240     }
    241 
    242     /// Rewrites the file from the already-pruned in-memory snapshot. Called
    243     /// coalesced (debounced after a burst, and on backgrounding) rather than
    244     /// per-append, so the O(n) encode stays rare even during an intense
    245     /// ~5/sec co-solve.
    246     func persist(_ entries: [EventLogEntry]) {
    247         guard let data = try? JSONEncoder().encode(entries) else { return }
    248         try? data.write(to: fileURL, options: .atomic)
    249     }
    250 }
    251 
    252 /// Generic in-app event log. Anything that wants to leave a breadcrumb the
    253 /// user can read in the diagnostics screen calls `note(_:)`. SyncMonitor and
    254 /// other producers compose this — they don't subclass it.
    255 ///
    256 /// Backed by `EventLogStore` so the buffer outlives the process: `loadPersisted`
    257 /// hydrates the in-memory view on launch, and a debounced flush mirrors live
    258 /// breadcrumbs back to disk. That durability is what lets a quiet-overnight
    259 /// device still surface a collaborator's hours-old session the next morning.
    260 @MainActor
    261 @Observable
    262 final class EventLog {
    263     private(set) var entries: [EventLogEntry] = []
    264 
    265     /// How far back the log reaches. A time window — rather than a fixed entry
    266     /// count — guarantees the buffer always spans a known wall-clock duration
    267     /// regardless of how chatty the session is. During an intense co-solve the
    268     /// log churns at ~5 events/sec, so a count cap of a few thousand only holds
    269     /// a couple of minutes and evicts the start of the session before anyone
    270     /// can grab it. A day comfortably spans a collaborator's evening session
    271     /// plus a quiet night before someone collects the log in the morning, while
    272     /// staying cheap on an idle device (a quiet night writes a handful of rows).
    273     private let retention: TimeInterval = 60 * 60 * 24
    274 
    275     /// Hard ceiling so a runaway producer (e.g. a tight error-retry loop) can't
    276     /// grow the buffer without bound inside the retention window. Sized well
    277     /// above an hour of normal+intense traffic (~18k entries) so the time
    278     /// window stays the governing limit in practice; this only trips on a fault.
    279     private let maxEntries = 30_000
    280 
    281     private let store: EventLogStore
    282 
    283     /// Mirror of every breadcrumb into the unified system log. The file-backed
    284     /// buffer remains the system of record for TestFlight shares; this mirror
    285     /// adds live streaming in Xcode/Console.app on tethered runs, capture in
    286     /// sysdiagnoses, and crash resilience — a hard crash loses up to 2s of
    287     /// tail to the debounced file flush, but the system log keeps it.
    288     /// Interpolation is `.public` because these messages are already exposed
    289     /// verbatim through the diagnostics share sheet.
    290     @ObservationIgnored private let systemLog = Logger(
    291         subsystem: Bundle.main.bundleIdentifier ?? "Crossmate",
    292         category: "events"
    293     )
    294 
    295     /// Until the on-disk history has been merged in, appends must not flush:
    296     /// a flush before `loadPersisted` would clobber the file with a partial,
    297     /// pre-hydration snapshot and lose the very overnight history we're after.
    298     private var hydrated = false
    299     private var flushTask: Task<Void, Never>?
    300 
    301     init(store: EventLogStore = EventLogStore()) {
    302         self.store = store
    303     }
    304 
    305     /// Merges the persisted history into the in-memory view. Call once, early
    306     /// in launch. Any breadcrumbs noted before hydration are kept and ordered
    307     /// in by timestamp, so nothing recorded during startup is lost.
    308     func loadPersisted() async {
    309         let persisted = await store.load()
    310         entries = (persisted + entries).sorted { $0.timestamp < $1.timestamp }
    311         prune(now: Date())
    312         hydrated = true
    313         scheduleFlush()
    314     }
    315 
    316     func note(_ message: String, level: String = "info") {
    317         append(level: level, message)
    318     }
    319 
    320     fileprivate func append(level: String, _ message: String) {
    321         let message = LogScrubber.scrub(message)
    322         switch level {
    323         case "error":
    324             systemLog.error("\(message, privacy: .public)")
    325         case "warn":
    326             systemLog.warning("\(message, privacy: .public)")
    327         default:
    328             systemLog.info("\(message, privacy: .public)")
    329         }
    330         let now = Date()
    331         entries.append(EventLogEntry(timestamp: now, level: level, message: message))
    332         prune(now: now)
    333         scheduleFlush()
    334     }
    335 
    336     /// Coalesce writes: one persist a couple of seconds after the last append,
    337     /// not one per breadcrumb. A fresh append cancels the pending write and
    338     /// re-arms it, so a steady burst collapses into a single trailing flush.
    339     private func scheduleFlush() {
    340         guard hydrated else { return }
    341         flushTask?.cancel()
    342         flushTask = Task { [weak self] in
    343             try? await Task.sleep(for: .seconds(2))
    344             guard !Task.isCancelled, let self else { return }
    345             await self.store.persist(self.entries)
    346         }
    347     }
    348 
    349     /// Persist immediately, bypassing the debounce. Call on backgrounding so an
    350     /// imminent suspend keeps the tail that the timer hasn't written yet.
    351     func flush() async {
    352         guard hydrated else { return }
    353         flushTask?.cancel()
    354         await store.persist(entries)
    355     }
    356 
    357     /// Drops entries older than the retention window, then enforces the count
    358     /// ceiling. Entries are appended in timestamp order, so the expired ones are
    359     /// always a prefix — `firstIndex` stops at the first fresh entry, making the
    360     /// scan proportional to how many just aged out (typically zero or one per
    361     /// append), not to the buffer size. The entry just appended carries `now`,
    362     /// so at least one entry always survives the cutoff.
    363     private func prune(now: Date) {
    364         let cutoff = now.addingTimeInterval(-retention)
    365         if let firstFresh = entries.firstIndex(where: { $0.timestamp >= cutoff }), firstFresh > 0 {
    366             entries.removeFirst(firstFresh)
    367         }
    368         if entries.count > maxEntries {
    369             entries.removeFirst(entries.count - maxEntries)
    370         }
    371     }
    372 }
    373 
    374 @MainActor
    375 @Observable
    376 final class SyncMonitor {
    377     private(set) var lastSuccessAt: Date?
    378     private(set) var lastErrorPhase: String?
    379     private(set) var lastErrorDomain: String?
    380     private(set) var lastErrorCode: Int?
    381     private(set) var lastErrorDescription: String?
    382     private(set) var snapshot: SyncEngine.DiagnosticSnapshot?
    383 
    384     private let log: EventLog
    385 
    386     init(log: EventLog) {
    387         self.log = log
    388     }
    389 
    390     func recordStart(_ phase: String) {
    391         log.append(level: "info", "starting \(phase)")
    392     }
    393 
    394     func recordSuccess(_ phase: String) {
    395         lastSuccessAt = Date()
    396         // Once the phase that last errored runs to completion, the stored
    397         // error is stale — the engine has recovered. Clear it so the
    398         // diagnostics panel reflects current health instead of pinning to
    399         // a transient failure (e.g. an OpLock that retried successfully).
    400         if lastErrorPhase == phase {
    401             clearLastError()
    402         }
    403         log.append(level: "info", "\(phase) succeeded")
    404     }
    405 
    406     func recordError(_ phase: String, _ error: Error) {
    407         let nsError = error as NSError
    408         lastErrorPhase = phase
    409         lastErrorDomain = nsError.domain
    410         lastErrorCode = nsError.code
    411         // Scrub at storage, not display: this string surfaces both in the
    412         // diagnostics view and in the shared dump's header, and CloudKit
    413         // error text can embed full record and zone names.
    414         lastErrorDescription = LogScrubber.scrub(nsError.localizedDescription)
    415         let userInfoSummary = nsError.userInfo
    416             .map { "\($0.key)=\($0.value)" }
    417             .joined(separator: " | ")
    418         let message = "\(phase) failed: domain=\(nsError.domain) code=\(nsError.code) \(nsError.localizedDescription) | userInfo: \(userInfoSummary)"
    419         log.append(level: "error", message)
    420     }
    421 
    422     func note(_ message: String) {
    423         log.note(message)
    424     }
    425 
    426     /// Bumps `lastSuccessAt` without writing a `succeeded` log line. Used by
    427     /// the CKSyncEngine delegate handlers so that auto-drained sends and
    428     /// framework-initiated fetches — which never go through `run(_:_:)` —
    429     /// still count as "the engine is alive and round-tripping with the
    430     /// server". Without this the timestamp pins to whichever instrumented
    431     /// phase last ran and silently lies about engine health when only
    432     /// auto-drain activity is happening.
    433     func noteSuccess() {
    434         lastSuccessAt = Date()
    435         clearLastError()
    436     }
    437 
    438     func updateSnapshot(_ snapshot: SyncEngine.DiagnosticSnapshot) {
    439         self.snapshot = snapshot
    440     }
    441 
    442     func clearLastError() {
    443         lastErrorPhase = nil
    444         lastErrorDomain = nil
    445         lastErrorCode = nil
    446         lastErrorDescription = nil
    447     }
    448 
    449     func run(_ phase: String, _ body: @Sendable () async throws -> Void) async {
    450         recordStart(phase)
    451         do {
    452             try await body()
    453             recordSuccess(phase)
    454         } catch {
    455             recordError(phase, error)
    456         }
    457     }
    458 
    459     func run<T: Sendable>(_ phase: String, _ body: @Sendable () async throws -> T) async -> T? {
    460         recordStart(phase)
    461         do {
    462             let result = try await body()
    463             recordSuccess(phase)
    464             return result
    465         } catch {
    466             recordError(phase, error)
    467             return nil
    468         }
    469     }
    470 }