DebuggingMonitors.swift (17997B)
1 import CloudKit 2 import Foundation 3 import Observation 4 import os 5 6 struct PerformanceDiagnosticEntry: Identifiable, Sendable { 7 let id = UUID() 8 let timestamp: Date 9 let name: String 10 let durationMS: Double 11 let detail: String 12 } 13 14 @MainActor 15 @Observable 16 final class PerformanceMonitor { 17 private(set) var entries: [PerformanceDiagnosticEntry] = [] 18 private(set) var totalEvents = 0 19 private(set) var suppressedEvents = 0 20 21 private let maxEntries = 120 22 private var nextRenderProbeID = 1 23 private var pendingRenderProbes: [Int: RenderProbe] = [:] 24 @ObservationIgnored private var viewBodyCounts: [String: Int] = [:] 25 @ObservationIgnored private var viewBodyDetails: [String: String] = [:] 26 @ObservationIgnored private var scheduledViewBodyReports: Set<String> = [] 27 28 private struct RenderProbe { 29 let name: String 30 let position: GridPosition 31 let expectedEntry: String 32 let startedAt: ContinuousClock.Instant 33 let detail: String 34 } 35 36 func record( 37 _ name: String, 38 durationMS: Double, 39 detail: String = "", 40 thresholdMS: Double = 8 41 ) { 42 totalEvents += 1 43 guard durationMS >= thresholdMS else { 44 suppressedEvents += 1 45 return 46 } 47 entries.append(PerformanceDiagnosticEntry( 48 timestamp: Date(), 49 name: name, 50 durationMS: durationMS, 51 detail: detail 52 )) 53 if entries.count > maxEntries { 54 entries.removeFirst(entries.count - maxEntries) 55 } 56 } 57 58 func note(_ name: String, detail: String = "") { 59 entries.append(PerformanceDiagnosticEntry( 60 timestamp: Date(), 61 name: name, 62 durationMS: 0, 63 detail: detail 64 )) 65 if entries.count > maxEntries { 66 entries.removeFirst(entries.count - maxEntries) 67 } 68 } 69 70 func clear() { 71 entries.removeAll() 72 totalEvents = 0 73 suppressedEvents = 0 74 pendingRenderProbes.removeAll() 75 viewBodyCounts.removeAll() 76 viewBodyDetails.removeAll() 77 scheduledViewBodyReports.removeAll() 78 } 79 80 func beginRenderProbe( 81 name: String, 82 position: GridPosition, 83 expectedEntry: String, 84 detail: String = "" 85 ) -> Int { 86 let id = nextRenderProbeID 87 nextRenderProbeID += 1 88 pendingRenderProbes[id] = RenderProbe( 89 name: name, 90 position: position, 91 expectedEntry: expectedEntry, 92 startedAt: .now, 93 detail: detail 94 ) 95 return id 96 } 97 98 func renderProbeDetailPrefix(for id: Int) -> String? { 99 guard let probe = pendingRenderProbes[id] else { return nil } 100 return "probe=\(id) row=\(probe.position.row) col=\(probe.position.col)" 101 } 102 103 func completeRenderProbe(id: Int, position: GridPosition, entry: String) { 104 guard let probe = pendingRenderProbes[id], 105 probe.position == position, 106 probe.expectedEntry == entry 107 else { return } 108 pendingRenderProbes.removeValue(forKey: id) 109 let duration = probe.startedAt.duration(to: .now) 110 let milliseconds = Double(duration.components.seconds) * 1000 111 + Double(duration.components.attoseconds) / 1_000_000_000_000_000 112 let baseDetail = "probe=\(id) row=\(probe.position.row) col=\(probe.position.col)" 113 let detail = probe.detail.isEmpty ? baseDetail : "\(baseDetail) \(probe.detail)" 114 record( 115 probe.name, 116 durationMS: milliseconds, 117 detail: detail, 118 thresholdMS: 6 119 ) 120 } 121 122 func recordDeferred( 123 _ name: String, 124 start: ContinuousClock.Instant, 125 detail: String = "", 126 thresholdMS: Double = 8 127 ) { 128 let milliseconds = Self.milliseconds(from: start.duration(to: .now)) 129 guard milliseconds >= thresholdMS else { return } 130 Task { @MainActor [weak self] in 131 self?.record( 132 name, 133 durationMS: milliseconds, 134 detail: detail, 135 thresholdMS: thresholdMS 136 ) 137 } 138 } 139 140 func markViewBodyDeferred( 141 _ name: String, 142 key: String = "", 143 detail: String = "" 144 ) { 145 Task { @MainActor [weak self] in 146 self?.markViewBody(name, key: key, detail: detail) 147 } 148 } 149 150 private func markViewBody(_ name: String, key: String, detail: String) { 151 let reportKey = key.isEmpty ? name : "\(name)|\(key)" 152 viewBodyCounts[reportKey, default: 0] += 1 153 viewBodyDetails[reportKey] = detail 154 155 guard !scheduledViewBodyReports.contains(reportKey) else { return } 156 scheduledViewBodyReports.insert(reportKey) 157 Task { @MainActor [weak self] in 158 try? await Task.sleep(for: .milliseconds(50)) 159 self?.flushViewBodyReport(name: name, reportKey: reportKey) 160 } 161 } 162 163 private func flushViewBodyReport(name: String, reportKey: String) { 164 scheduledViewBodyReports.remove(reportKey) 165 let count = viewBodyCounts.removeValue(forKey: reportKey) ?? 0 166 guard count > 0 else { return } 167 let detail = viewBodyDetails.removeValue(forKey: reportKey) ?? "" 168 let countDetail = detail.isEmpty ? "count=\(count)" : "count=\(count) \(detail)" 169 note("\(name).body", detail: countDetail) 170 } 171 172 private static func milliseconds(from duration: Duration) -> Double { 173 Double(duration.components.seconds) * 1000 174 + Double(duration.components.attoseconds) / 1_000_000_000_000_000 175 } 176 } 177 178 struct EventLogEntry: Identifiable, Sendable, Codable { 179 var id = UUID() 180 let timestamp: Date 181 let level: String 182 let message: String 183 } 184 185 /// Truncates identifiers before they land in the event log, so a shared 186 /// diagnostics dump can't expose a complete game UUID, CloudKit user/device 187 /// record name, or share-link token. The first eight characters survive — 188 /// enough to correlate lines within one log (matching the existing 189 /// `PlayerRoster[0A5207B4]` and truncated-APNs-token conventions) without 190 /// reconstructing the real identifier. 191 /// 192 /// Main-actor-confined because `Regex` is not `Sendable`; both producers 193 /// (`EventLog`, `SyncMonitor`) already live on the main actor. 194 @MainActor 195 enum LogScrubber { 196 private static let uuid = 197 /[0-9A-Fa-f]{8}-[0-9A-Fa-f]{4}-[0-9A-Fa-f]{4}-[0-9A-Fa-f]{4}-[0-9A-Fa-f]{12}/ 198 /// CloudKit user record names are `_` + 32 hex; device IDs inside ping 199 /// record names are bare 32-hex. `{32,}` swallows anything longer (e.g. a 200 /// full APNs token) in one match rather than leaving a usable tail. 201 /// Sentinels like `__defaultOwner__` contain non-hex letters and pass 202 /// through untouched. 203 private static let hexToken = /_?[0-9A-Fa-f]{32,}/ 204 /// An iCloud share URL's path token is a bearer credential — anyone 205 /// holding the full link can join the game — so only a stub survives. 206 private static let shareURLToken = /(icloud\.com\/share\/)([A-Za-z0-9._~-]{9,})/ 207 208 static func scrub(_ message: String) -> String { 209 var result = message 210 result = result.replacing(uuid) { "\($0.output.prefix(8))…" } 211 result = result.replacing(hexToken) { match in 212 let keep = match.output.hasPrefix("_") ? 9 : 8 213 return "\(match.output.prefix(keep))…" 214 } 215 result = result.replacing(shareURLToken) { "\($0.output.1)\($0.output.2.prefix(8))…" } 216 return result 217 } 218 } 219 220 /// Off-main durable backing for `EventLog`. Owns a single atomically-rewritten 221 /// snapshot file in Application Support — chosen over Caches, which iOS can 222 /// purge under storage pressure, exactly when a tester needs the log most. 223 /// The file survives termination, so a collaborator can still share their 224 /// diagnostics hours after the activity, long after their app was suspended. 225 actor EventLogStore { 226 private let fileURL: URL 227 228 init(filename: String = "event-log.json") { 229 let dir = FileManager.default 230 .urls(for: .applicationSupportDirectory, in: .userDomainMask)[0] 231 try? FileManager.default.createDirectory(at: dir, withIntermediateDirectories: true) 232 self.fileURL = dir.appendingPathComponent(filename) 233 } 234 235 func load() -> [EventLogEntry] { 236 guard let data = try? Data(contentsOf: fileURL), 237 let entries = try? JSONDecoder().decode([EventLogEntry].self, from: data) 238 else { return [] } 239 return entries 240 } 241 242 /// Rewrites the file from the already-pruned in-memory snapshot. Called 243 /// coalesced (debounced after a burst, and on backgrounding) rather than 244 /// per-append, so the O(n) encode stays rare even during an intense 245 /// ~5/sec co-solve. 246 func persist(_ entries: [EventLogEntry]) { 247 guard let data = try? JSONEncoder().encode(entries) else { return } 248 try? data.write(to: fileURL, options: .atomic) 249 } 250 } 251 252 /// Generic in-app event log. Anything that wants to leave a breadcrumb the 253 /// user can read in the diagnostics screen calls `note(_:)`. SyncMonitor and 254 /// other producers compose this — they don't subclass it. 255 /// 256 /// Backed by `EventLogStore` so the buffer outlives the process: `loadPersisted` 257 /// hydrates the in-memory view on launch, and a debounced flush mirrors live 258 /// breadcrumbs back to disk. That durability is what lets a quiet-overnight 259 /// device still surface a collaborator's hours-old session the next morning. 260 @MainActor 261 @Observable 262 final class EventLog { 263 private(set) var entries: [EventLogEntry] = [] 264 265 /// How far back the log reaches. A time window — rather than a fixed entry 266 /// count — guarantees the buffer always spans a known wall-clock duration 267 /// regardless of how chatty the session is. During an intense co-solve the 268 /// log churns at ~5 events/sec, so a count cap of a few thousand only holds 269 /// a couple of minutes and evicts the start of the session before anyone 270 /// can grab it. A day comfortably spans a collaborator's evening session 271 /// plus a quiet night before someone collects the log in the morning, while 272 /// staying cheap on an idle device (a quiet night writes a handful of rows). 273 private let retention: TimeInterval = 60 * 60 * 24 274 275 /// Hard ceiling so a runaway producer (e.g. a tight error-retry loop) can't 276 /// grow the buffer without bound inside the retention window. Sized well 277 /// above an hour of normal+intense traffic (~18k entries) so the time 278 /// window stays the governing limit in practice; this only trips on a fault. 279 private let maxEntries = 30_000 280 281 private let store: EventLogStore 282 283 /// Mirror of every breadcrumb into the unified system log. The file-backed 284 /// buffer remains the system of record for TestFlight shares; this mirror 285 /// adds live streaming in Xcode/Console.app on tethered runs, capture in 286 /// sysdiagnoses, and crash resilience — a hard crash loses up to 2s of 287 /// tail to the debounced file flush, but the system log keeps it. 288 /// Interpolation is `.public` because these messages are already exposed 289 /// verbatim through the diagnostics share sheet. 290 @ObservationIgnored private let systemLog = Logger( 291 subsystem: Bundle.main.bundleIdentifier ?? "Crossmate", 292 category: "events" 293 ) 294 295 /// Until the on-disk history has been merged in, appends must not flush: 296 /// a flush before `loadPersisted` would clobber the file with a partial, 297 /// pre-hydration snapshot and lose the very overnight history we're after. 298 private var hydrated = false 299 private var flushTask: Task<Void, Never>? 300 301 init(store: EventLogStore = EventLogStore()) { 302 self.store = store 303 } 304 305 /// Merges the persisted history into the in-memory view. Call once, early 306 /// in launch. Any breadcrumbs noted before hydration are kept and ordered 307 /// in by timestamp, so nothing recorded during startup is lost. 308 func loadPersisted() async { 309 let persisted = await store.load() 310 entries = (persisted + entries).sorted { $0.timestamp < $1.timestamp } 311 prune(now: Date()) 312 hydrated = true 313 scheduleFlush() 314 } 315 316 func note(_ message: String, level: String = "info") { 317 append(level: level, message) 318 } 319 320 fileprivate func append(level: String, _ message: String) { 321 let message = LogScrubber.scrub(message) 322 switch level { 323 case "error": 324 systemLog.error("\(message, privacy: .public)") 325 case "warn": 326 systemLog.warning("\(message, privacy: .public)") 327 default: 328 systemLog.info("\(message, privacy: .public)") 329 } 330 let now = Date() 331 entries.append(EventLogEntry(timestamp: now, level: level, message: message)) 332 prune(now: now) 333 scheduleFlush() 334 } 335 336 /// Coalesce writes: one persist a couple of seconds after the last append, 337 /// not one per breadcrumb. A fresh append cancels the pending write and 338 /// re-arms it, so a steady burst collapses into a single trailing flush. 339 private func scheduleFlush() { 340 guard hydrated else { return } 341 flushTask?.cancel() 342 flushTask = Task { [weak self] in 343 try? await Task.sleep(for: .seconds(2)) 344 guard !Task.isCancelled, let self else { return } 345 await self.store.persist(self.entries) 346 } 347 } 348 349 /// Persist immediately, bypassing the debounce. Call on backgrounding so an 350 /// imminent suspend keeps the tail that the timer hasn't written yet. 351 func flush() async { 352 guard hydrated else { return } 353 flushTask?.cancel() 354 await store.persist(entries) 355 } 356 357 /// Drops entries older than the retention window, then enforces the count 358 /// ceiling. Entries are appended in timestamp order, so the expired ones are 359 /// always a prefix — `firstIndex` stops at the first fresh entry, making the 360 /// scan proportional to how many just aged out (typically zero or one per 361 /// append), not to the buffer size. The entry just appended carries `now`, 362 /// so at least one entry always survives the cutoff. 363 private func prune(now: Date) { 364 let cutoff = now.addingTimeInterval(-retention) 365 if let firstFresh = entries.firstIndex(where: { $0.timestamp >= cutoff }), firstFresh > 0 { 366 entries.removeFirst(firstFresh) 367 } 368 if entries.count > maxEntries { 369 entries.removeFirst(entries.count - maxEntries) 370 } 371 } 372 } 373 374 @MainActor 375 @Observable 376 final class SyncMonitor { 377 private(set) var lastSuccessAt: Date? 378 private(set) var lastErrorPhase: String? 379 private(set) var lastErrorDomain: String? 380 private(set) var lastErrorCode: Int? 381 private(set) var lastErrorDescription: String? 382 private(set) var snapshot: SyncEngine.DiagnosticSnapshot? 383 384 private let log: EventLog 385 386 init(log: EventLog) { 387 self.log = log 388 } 389 390 func recordStart(_ phase: String) { 391 log.append(level: "info", "starting \(phase)") 392 } 393 394 func recordSuccess(_ phase: String) { 395 lastSuccessAt = Date() 396 // Once the phase that last errored runs to completion, the stored 397 // error is stale — the engine has recovered. Clear it so the 398 // diagnostics panel reflects current health instead of pinning to 399 // a transient failure (e.g. an OpLock that retried successfully). 400 if lastErrorPhase == phase { 401 clearLastError() 402 } 403 log.append(level: "info", "\(phase) succeeded") 404 } 405 406 func recordError(_ phase: String, _ error: Error) { 407 let nsError = error as NSError 408 lastErrorPhase = phase 409 lastErrorDomain = nsError.domain 410 lastErrorCode = nsError.code 411 // Scrub at storage, not display: this string surfaces both in the 412 // diagnostics view and in the shared dump's header, and CloudKit 413 // error text can embed full record and zone names. 414 lastErrorDescription = LogScrubber.scrub(nsError.localizedDescription) 415 let userInfoSummary = nsError.userInfo 416 .map { "\($0.key)=\($0.value)" } 417 .joined(separator: " | ") 418 let message = "\(phase) failed: domain=\(nsError.domain) code=\(nsError.code) \(nsError.localizedDescription) | userInfo: \(userInfoSummary)" 419 log.append(level: "error", message) 420 } 421 422 func note(_ message: String) { 423 log.note(message) 424 } 425 426 /// Bumps `lastSuccessAt` without writing a `succeeded` log line. Used by 427 /// the CKSyncEngine delegate handlers so that auto-drained sends and 428 /// framework-initiated fetches — which never go through `run(_:_:)` — 429 /// still count as "the engine is alive and round-tripping with the 430 /// server". Without this the timestamp pins to whichever instrumented 431 /// phase last ran and silently lies about engine health when only 432 /// auto-drain activity is happening. 433 func noteSuccess() { 434 lastSuccessAt = Date() 435 clearLastError() 436 } 437 438 func updateSnapshot(_ snapshot: SyncEngine.DiagnosticSnapshot) { 439 self.snapshot = snapshot 440 } 441 442 func clearLastError() { 443 lastErrorPhase = nil 444 lastErrorDomain = nil 445 lastErrorCode = nil 446 lastErrorDescription = nil 447 } 448 449 func run(_ phase: String, _ body: @Sendable () async throws -> Void) async { 450 recordStart(phase) 451 do { 452 try await body() 453 recordSuccess(phase) 454 } catch { 455 recordError(phase, error) 456 } 457 } 458 459 func run<T: Sendable>(_ phase: String, _ body: @Sendable () async throws -> T) async -> T? { 460 recordStart(phase) 461 do { 462 let result = try await body() 463 recordSuccess(phase) 464 return result 465 } catch { 466 recordError(phase, error) 467 return nil 468 } 469 } 470 }