main.swift (35384B)
import Foundation

// MARK: - Static word data

/// Weekday names in Monday-first order; an index into this array is the weekday number used everywhere else.
let weekdayNames = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
/// Maps a lowercased weekday name to its Monday-first index (0...6).
let weekdayLookup = Dictionary(uniqueKeysWithValues: weekdayNames.enumerated().map { ($0.element.lowercased(), $0.offset) })

/// Short entries rejected by the easy-fill filter as "known weak short fill".
let veryWeakShortWords: Set<String> = [
    "aah", "aal", "aar", "aba", "abe", "abo", "abu", "ado", "adz", "aer",
    "ane", "ani", "ara", "ase", "ato", "ava", "een", "eme", "ene", "eon",
    "ere", "ese", "ess", "est", "eta", "ete", "eth", "ier", "iii", "ile",
    "ita", "ite", "lai", "lar", "mee", "nee", "oer", "ona", "onea",
    "ort", "ose", "oto", "ree", "rei", "ria", "ser", "tae", "tba", "tbs",
    "tko", "tnt", "tpe", "tsp", "ute", "wye", "xed", "xii",
    "ance", "elee", "orle", "eder", "ssts"
]

/// Short entries exempted from the stricter short-word count and staleness rules of the easy-fill filter.
let commonShortWords: Set<String> = [
    "able", "acre", "act", "acts", "age", "aged", "ages", "air", "airs",
    "ale", "ales", "all", "ally", "also", "and", "ant", "ante", "anti",
    "ants", "ape", "apes", "area", "are", "ark", "arks", "arm", "arms",
    "art", "arts", "ash", "ate", "aunt", "awe", "awed", "awes", "bad",
    "bag", "bags", "bar", "bare", "bars", "base", "bash", "bat", "bats",
    "bed", "beds", "bee", "beer", "bees", "bell", "belt", "bent", "best",
    "bet", "bets", "bite", "blue", "bus", "but", "can", "cap", "car",
    "care", "case", "cat", "cats", "clay", "cod", "coil", "cold", "cord",
    "cow", "cue", "curl", "cut", "cuts", "deer", "dog", "dogs", "dry",
    "ear", "ears", "eat", "eats", "eel", "eels", "era", "eras", "eye",
    "eyes", "far", "farm", "fast", "fat", "feet", "few", "fire", "fish",
    "fit", "fits", "free", "fun", "game", "gas", "gate", "get", "gets",
    "goat", "good", "hair", "hand", "hard", "hat", "hate", "heat", "hen",
    "her", "here", "hit", "hits", "ice", "idea", "ink", "inn", "iron",
    "its", "kid", "kids", "knee", "land", "last", "late", "law", "lead",
    "left", "let", "lets", "lie", "life", "line", "long", "lose", "lost",
    "man", "many", "map", "men", "mile", "mind", "more", "name", "near",
    "new", "note", "oar", "oars", "oil", "old", "one", "open", "ore",
    "ores", "our", "out", "over", "own", "pan", "part", "past", "pen",
    "pet", "pets", "pie", "pin", "pins", "play", "pot", "red", "rest",
    "rice", "ring", "road", "root", "rose", "run", "runs", "sad", "salt",
    "sat", "save", "sea", "seat", "see", "seed", "sees", "set", "sets",
    "shoe", "side", "sit", "site", "sky", "snow", "son", "song", "star",
    "step", "stop", "sun", "take", "tea", "team", "ten", "tens", "test",
    "the", "tie", "ties", "time", "tree", "try", "use", "used", "uses",
    "war", "way", "west", "wet", "win", "wine", "word", "work", "yard",
    "year", "yes"
]

/// Entries of at most five letters rejected as "short suffix-like fragment".
let shortExactFragments: Set<String> = [
    "ance", "ence", "enne", "ette", "ible", "iest", "ism", "isms", "itis"
]

/// Entries always rejected by the easy-fill filter as "known crosswordese/glue fill".
let crosswordeseWords: Set<String> = [
    "adue", "anent", "anet", "antae", "atee", "atit", "atle", "bassi",
    "brocatelle", "ecce", "ente", "esne", "etes", "gnar", "imset",
    "labile", "laic", "laram", "mtida", "neer", "onor", "relee", "rete",
    "terete", "togae", "ulee",
    "aereo", "alai", "aretes", "aril", "atri", "aussi", "eaude", "ees",
    "emeer", "enote", "erian", "ers", "esnes", "evoe", "ilia", "leisterer",
    "mesne", "olea", "ooo", "oss", "ranee", "rorem", "seral",
    "snee", "soras", "sri", "tse", "yeses",
    "abas", "adano", "agena", "agin", "agorae", "ams", "ans", "aper",
    "arear", "bretharte", "donees", "eloi", "endat", "etnas", "etui",
    "hammshams", "nin", "oas", "ogees", "olan", "ont", "ossa", "otra",
    "poetaster", "rainintheface", "rea", "retia", "roone", "rpi", "seta",
    "sisi", "soli", "tet", "tieto", "totoe", "tra", "yser"
]

/// Entries accepted by the easy-fill filter before any other rule is consulted.
let easyAllowedWords: Set<String> = [
    "n", "s", "e", "w", "ne", "nw", "se", "sw",
    "nne", "nnw", "ene", "ese", "sse", "ssw", "wsw", "wnw"
]

// MARK: - Models

/// Command-line configuration; every field starts at the default used when its flag is absent.
struct Options {
    var inputPath = "Data"
    var outputPath = "Generated/word_list.json"
    var countsOutputPath: String?
    var minLength = 2
    var maxLength: Int?
    var minCount = 1
    var lengthThresholds = false
    var qualityFilter = false
    var easyFillFilter = false
    var badWordsPath: String?
    var rejectionsOutputPath: String?
    var qualityOutputPath: String?
    /// Monday-first weekday indices to keep, or `nil` to keep every puzzle.
    var weekdays: Set<Int>?
    var pretty = false
}

/// Per-answer tallies gathered while scanning puzzles for the easy-fill filter.
struct AnswerEvidence {
    /// Number of times the answer appeared.
    var count = 0
    /// Appearances whose clue matched `badClueRegex`.
    var badClueCount = 0
    /// Appearances whose clue matched `obscureNameClueRegex`.
    var obscureNameClueCount = 0
    /// Appearances whose clue matched `fillBlankRegex`.
    var fillBlankCount = 0
    /// Appearances whose clue matched `foreignLanguageClueRegex`.
    var foreignLanguageClueCount = 0
    /// Appearances whose clue matched `cluePartRegex`.
    var cluePartCount = 0
    /// Appearances whose clue matched `continuationClueRegex`.
    var continuationClueCount = 0
    /// Appearances whose clue matched `themeClueRegex`.
    var themeClueCount = 0
    /// Greatest `YYYY-MM-DD` date derived from the paths of the puzzles containing the answer, if any.
    var latestSeen: String?
    /// Up to five non-empty clues recorded for the answer.
    var sampleClues: [String] = []
}

/// Error carrying a ready-to-print message.
struct WordmakeError: Error, CustomStringConvertible {
    let description: String
}

// MARK: - Clue classification patterns

// The patterns are compile-time constants, so a failure to compile one is a programmer error
// and `try!` is deliberate.
let badClueRegex = try! NSRegularExpression(
    pattern: #"\b(abbr|var|prefix|suffix|comb\.? form|archaic|poetic|dial|slang|obs|old-style|old fashioned|scot|irish|brit|fr|french|ger|german|lat|latin|span|spanish|ital|italian|port|portuguese|hebr|hebrew|yiddish|jap|japanese|inits|initials|letters|compass point|bearing|direction|shoe width|monogram|degs|degree|of yore|old|olden|heraldry|genus|legal|in law|anatomical|serf|serfs|slave|slaves|vassal|pilaster|opera voices|persian fairy|mideast|eastern v\.?i\.?p|arab prince|arab chieftain|hindu title|indian princess|rajah|dirk|knife of old|seed covering|seed casing|seed envelope|hip bones|pelvic bones|bitter vetch|marsh birds)\b"#,
    options: [.caseInsensitive]
)
let obscureNameClueRegex = try! NSRegularExpression(
    pattern: #"\b(actor|actress|author|composer|singer|poet|novelist|painter|artist|playwright|violinist|pianist|conductor|golfer|pitcher|baseballer|songwriter|politician|senator|governor|pope)\b"#,
    options: [.caseInsensitive]
)
let fillBlankRegex = try! NSRegularExpression(pattern: #"_{2,}|\.{3,}"#)
let foreignLanguageClueRegex = try! NSRegularExpression(
    pattern: #":\s*(?:Sp|Span|Fr|Ger|Germ|It|Ital|Lat|Port|Heb|Yid|Jap|Rus|Gr|Gk|Swed|Norw|Dan|Arab|Chin|Hind|Skt|Turk|Finn)\.|,\s+in\s+[A-Z][a-z]"#
)
let cluePartRegex = try! NSRegularExpression(
    pattern: #"\b(?:part|section|segment|installment)\s+\d+\b|\b\d+(?:st|nd|rd|th)\s+(?:part|section|segment|installment)\b"#,
    options: [.caseInsensitive]
)
let continuationClueRegex = try! NSRegularExpression(
    pattern: #"\b(?:end|start|middle|continuation)\s+of\s+(?:the\s+)?(?:quote|quip|remark|saying|joke)\b|^see\s+\d+"#,
    options: [.caseInsensitive]
)
let themeClueRegex = try! NSRegularExpression(pattern: #"^\s*\*"#)
let clueNumberRegex = try! NSRegularExpression(pattern: #"^\s*\d+\.\s*"#)
let datePathRegex = try! NSRegularExpression(pattern: #"(\d{4})/(\d{2})/(\d{2})\.json$"#)

// MARK: - Command line

/// Prints the command-line help text to standard output.
func printUsage() {
    // NOTE(review): the column spacing of this text was reconstructed; confirm against the intended layout.
    print("""
    Usage: Wordmake [options]

    Options:
      --input PATH              Directory containing crossword JSON files. Default: Data
      --output PATH             Path for the generated JSON word list. Default: Generated/word_list.json
      --counts-output PATH      Optional path for a JSON object of normalized answer frequencies.
      --min-length N            Minimum normalized answer length to include. Default: 2
      --max-length N            Maximum normalized answer length to include.
      --min-count N             Minimum number of appearances required to include an answer. Default: 1
      --length-thresholds       Use length-based appearance thresholds.
      --quality-filter          Use stricter count thresholds intended for generated fill.
      --easy-fill-filter        Use clue-aware filters for easier generated fill.
      --bad-words PATH          Optional JSON word list to exclude from the generated dictionary.
      --rejections-output PATH  Optional path for rejected word metadata from --easy-fill-filter.
      --quality-output PATH     Optional path for per-answer quality metadata from --easy-fill-filter.
      --weekdays LIST           Optional comma-separated weekday filter, e.g. Monday,Tuesday.
      --pretty                  Pretty-print generated JSON.
      -h, --help                Show this help.
    """)
}

/// Builds `Options` from a full argument vector.
///
/// - Parameter arguments: The process arguments; element 0 (the program name) is skipped.
/// - Returns: The parsed options. `--help`/`-h` prints the usage text and exits with status 0 instead.
/// - Throws: `WordmakeError` for an unknown flag, a flag missing its value, or a non-integer numeric value.
func parseOptions(_ arguments: [String]) throws -> Options {
    var options = Options()
    var index = 1

    /// Consumes and returns the argument following the current flag.
    func requireValue(_ name: String) throws -> String {
        guard index + 1 < arguments.count else {
            throw WordmakeError(description: "Missing value for \(name)")
        }
        index += 1
        return arguments[index]
    }

    /// Consumes the argument following the current flag and parses it as an integer.
    func requireInt(_ name: String) throws -> Int {
        guard let value = Int(try requireValue(name)) else {
            throw WordmakeError(description: "\(name) must be an integer")
        }
        return value
    }

    while index < arguments.count {
        let argument = arguments[index]
        switch argument {
        case "--input":
            options.inputPath = try requireValue(argument)
        case "--output":
            options.outputPath = try requireValue(argument)
        case "--counts-output":
            options.countsOutputPath = try requireValue(argument)
        case "--min-length":
            options.minLength = try requireInt(argument)
        case "--max-length":
            options.maxLength = try requireInt(argument)
        case "--min-count":
            options.minCount = try requireInt(argument)
        case "--length-thresholds":
            options.lengthThresholds = true
        case "--quality-filter":
            options.qualityFilter = true
        case "--easy-fill-filter":
            options.easyFillFilter = true
        case "--bad-words":
            options.badWordsPath = try requireValue(argument)
        case "--rejections-output":
            options.rejectionsOutputPath = try requireValue(argument)
        case "--quality-output":
            options.qualityOutputPath = try requireValue(argument)
        case "--weekdays":
            options.weekdays = try parseWeekdays(try requireValue(argument))
        case "--pretty":
            options.pretty = true
        case "--help", "-h":
            printUsage()
            exit(0)
        default:
            throw WordmakeError(description: "Unknown argument: \(argument)")
        }
        index += 1
    }

    return options
}

/// Parses a comma-separated list of weekday names into Monday-first indices.
///
/// Names are trimmed and matched case-insensitively; empty items are ignored.
/// - Throws: `WordmakeError` when a name is not a weekday or the list yields no weekday at all.
func parseWeekdays(_ value: String) throws -> Set<Int> {
    var weekdays: Set<Int> = []
    for rawName in value.split(separator: ",") {
        let name = rawName.trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
        if name.isEmpty {
            continue
        }
        guard let weekday = weekdayLookup[name] else {
            throw WordmakeError(description: "Unknown weekday '\(rawName)'; expected one of: \(weekdayNames.joined(separator: ", "))")
        }
        weekdays.insert(weekday)
    }
    guard !weekdays.isEmpty else {
        throw WordmakeError(description: "--weekdays must include at least one weekday name")
    }
    return weekdays
}

// MARK: - Text helpers

/// Reduces `text` to its ASCII letters, lowercased; every other scalar is dropped.
func normalizeAnswer(_ text: String) -> String {
    var letters = String.UnicodeScalarView()
    for scalar in text.unicodeScalars {
        switch scalar.value {
        case 65...90:
            // "A"..."Z": shift into the lowercase range. The sum is at most 122, so it fits in a UInt8.
            letters.append(Unicode.Scalar(UInt8(scalar.value + 32)))
        case 97...122:
            letters.append(scalar)
        default:
            break
        }
    }
    return String(letters)
}

/// Removes a leading "12. "-style clue number and trims surrounding whitespace.
func stripClueNumber(_ clue: String) -> String {
    let wholeClue = NSRange(clue.startIndex..<clue.endIndex, in: clue)
    let withoutNumber = clueNumberRegex.stringByReplacingMatches(in: clue, range: wholeClue, withTemplate: "")
    return withoutNumber.trimmingCharacters(in: .whitespacesAndNewlines)
}

/// Returns whether `regex` matches anywhere in `text`.
func regexMatches(_ regex: NSRegularExpression, _ text: String) -> Bool {
    let wholeText = NSRange(text.startIndex..<text.endIndex, in: text)
    return regex.firstMatch(in: text, range: wholeText) != nil
}

// MARK: - Puzzle loading helpers

/// Recursively lists the regular, non-hidden files with a `json` extension under `directory`, sorted by path.
///
/// Returns an empty array when the directory cannot be enumerated.
/// - Throws: Any error raised while reading a file's resource values.
func jsonFiles(in directory: URL) throws -> [URL] {
    guard let enumerator = FileManager.default.enumerator(
        at: directory,
        includingPropertiesForKeys: [.isRegularFileKey],
        options: [.skipsHiddenFiles]
    ) else {
        return []
    }
    let files = try enumerator.compactMap { item -> URL? in
        guard let url = item as? URL, url.pathExtension == "json" else {
            return nil
        }
        let values = try url.resourceValues(forKeys: [.isRegularFileKey])
        return values.isRegularFile == true ? url : nil
    }
    return files.sorted { $0.path < $1.path }
}

/// Reads and parses the JSON document at `url`.
func loadJSONObject(from url: URL) throws -> Any {
    let data = try Foundation.Data(contentsOf: url)
    return try JSONSerialization.jsonObject(with: data)
}

/// Extracts the normalized across and down answers from a parsed puzzle.
///
/// Non-string entries and answers that normalize to the empty string are skipped.
/// A puzzle without an `answers` object yields an empty array.
func normalizedAnswers(from puzzle: Any) -> [String] {
    guard let puzzle = puzzle as? [String: Any],
          let answers = puzzle["answers"] as? [String: Any] else {
        return []
    }
    return ["across", "down"].flatMap { direction -> [String] in
        guard let entries = answers[direction] as? [Any] else {
            return []
        }
        return entries.compactMap { entry in
            guard let answer = entry as? String else {
                return nil
            }
            let normalized = normalizeAnswer(answer)
            return normalized.isEmpty ? nil : normalized
        }
    }
}

/// Extracts `(normalized answer, clue)` pairs from a parsed puzzle.
///
/// Answers and clues are paired positionally per direction, so surplus items on either side are
/// dropped. Clue numbers are stripped and a non-string clue becomes the empty string.
/// A puzzle missing either `answers` or `clues` yields an empty array.
func normalizedEntries(from puzzle: Any) -> [(answer: String, clue: String)] {
    guard let puzzle = puzzle as? [String: Any],
          let answers = puzzle["answers"] as? [String: Any],
          let clues = puzzle["clues"] as? [String: Any] else {
        return []
    }
    return ["across", "down"].flatMap { direction -> [(answer: String, clue: String)] in
        guard let answerEntries = answers[direction] as? [Any],
              let clueEntries = clues[direction] as? [Any] else {
            return []
        }
        return zip(answerEntries, clueEntries).compactMap { answerEntry, clueEntry in
            guard let answer = answerEntry as? String else {
                return nil
            }
            let normalized = normalizeAnswer(answer)
            guard !normalized.isEmpty else {
                return nil
            }
            let clue = stripClueNumber(clueEntry as? String ?? "")
            return (normalized, clue)
        }
    }
}

/// Returns whether `word` has at least `minLength` characters and, when `maxLength` is given, at most `maxLength`.
func shouldInclude(_ word: String, minLength: Int, maxLength: Int?) -> Bool {
    guard word.count >= minLength else {
        return false
    }
    if let maxLength, word.count > maxLength {
        return false
    }
    return true
}

// MARK: - Dates and weekdays

/// Extracts the `YYYY`, `MM` and `DD` digit groups from a path ending in `YYYY/MM/DD.json`.
///
/// Backslashes are treated as path separators. Returns `nil` when the path does not end that way.
private func datePathFields(in url: URL) -> (year: String, month: String, day: String)? {
    let path = url.path.replacingOccurrences(of: "\\", with: "/")
    let wholePath = NSRange(path.startIndex..<path.endIndex, in: path)
    guard let match = datePathRegex.firstMatch(in: path, range: wholePath),
          match.numberOfRanges == 4,
          let yearRange = Range(match.range(at: 1), in: path),
          let monthRange = Range(match.range(at: 2), in: path),
          let dayRange = Range(match.range(at: 3), in: path) else {
        return nil
    }
    return (String(path[yearRange]), String(path[monthRange]), String(path[dayRange]))
}

/// Derives the Monday-first weekday index (0 = Monday ... 6 = Sunday) from a `YYYY/MM/DD.json` path.
///
/// Returns `nil` when the path carries no date or the date does not exist in the Gregorian calendar
/// (for example `2023/02/30`).
func weekdayForPath(_ url: URL) -> Int? {
    guard let fields = datePathFields(in: url),
          let year = Int(fields.year),
          let month = Int(fields.month),
          let day = Int(fields.day) else {
        return nil
    }

    var calendar = Calendar(identifier: .gregorian)
    // Pin the time zone so the result does not depend on the machine running the tool.
    if let utc = TimeZone(secondsFromGMT: 0) {
        calendar.timeZone = utc
    }
    guard let date = calendar.date(from: DateComponents(year: year, month: month, day: day)) else {
        return nil
    }

    // Calendar resolves out-of-range fields by rolling over; reject anything that did not
    // round-trip to the requested day.
    let resolved = calendar.dateComponents([.year, .month, .day, .weekday], from: date)
    guard resolved.year == year,
          resolved.month == month,
          resolved.day == day,
          let calendarWeekday = resolved.weekday else {
        return nil
    }
    // Calendar weekdays run 1 = Sunday ... 7 = Saturday; shift to 0 = Monday ... 6 = Sunday.
    return (calendarWeekday + 5) % 7
}

/// Formats the date embedded in a `YYYY/MM/DD.json` path as `YYYY-MM-DD`, or returns `nil` if there is none.
///
/// The digit groups are copied verbatim; no calendar validation is performed.
func isoDateForPath(_ url: URL) -> String? {
    guard let fields = datePathFields(in: url) else {
        return nil
    }
    return "\(fields.year)-\(fields.month)-\(fields.day)"
}

/// Returns the puzzle's Monday-first weekday index.
///
/// A recognizable `dow` string in the puzzle takes precedence; otherwise the date in `path` is used.
func puzzleWeekday(_ puzzle: Any, path: URL) -> Int? {
    if let puzzle = puzzle as? [String: Any],
       let dow = puzzle["dow"] as? String,
       let weekday = weekdayLookup[dow.trimmingCharacters(in: .whitespacesAndNewlines).lowercased()] {
        return weekday
    }
    return weekdayForPath(path)
}

/// Returns whether the puzzle passes the weekday filter.
///
/// With no filter every puzzle passes; with a filter, a puzzle whose weekday cannot be determined is excluded.
func shouldIncludePuzzle(_ puzzle: Any, path: URL, weekdays: Set<Int>?) -> Bool {
    guard let weekdays else {
        return true
    }
    guard let weekday = puzzleWeekday(puzzle, path: path) else {
        return false
    }
    return weekdays.contains(weekday)
}

// MARK: - Count thresholds

/// Returns the minimum number of appearances `word` needs under the non-easy filters.
///
/// `qualityFilter` takes precedence over `lengthThresholds`; `flatMinCount` applies only when both are off.
func requiredCount(word: String, flatMinCount: Int, lengthThresholds: Bool, qualityFilter: Bool) -> Int {
    let length = word.count
    if qualityFilter {
        if length < 5 {
            return 15
        }
        return length > 7 ? 1 : 3
    }
    guard lengthThresholds else {
        return flatMinCount
    }
    if length < 5 {
        return 10
    }
    return length > 7 ? 1 : 2
}

/// Returns the minimum number of appearances `word` needs under the easy-fill filter.
func easyRequiredCount(_ word: String) -> Int {
    switch word.count {
    case ...4:
        return 15
    case 5...7:
        return 3
    default:
        return 1
    }
}

// MARK: - Word shape heuristics

/// Returns whether `word` contains any of `a e i o u y`.
func hasVowel(_ word: String) -> Bool {
    word.contains { "aeiouy".contains($0) }
}

/// Returns `part / whole` as a fraction, treating a `whole` below 1 as 1 to avoid dividing by zero.
func clueRatio(_ part: Int, _ whole: Int) -> Double {
    Double(part) / Double(max(whole, 1))
}

/// Returns whether `word` contains a run of at least `minimumLength` consecutive code points
/// (for example `abcd`).
///
/// A `minimumLength` below 1 is treated as 1, so any non-empty word qualifies in that case.
func hasSequentialLetterRun(_ word: String, minimumLength: Int = 4) -> Bool {
    let scalars = word.unicodeScalars.map(\.value)
    let required = max(minimumLength, 1)
    guard scalars.count >= required else {
        return false
    }
    // A single scalar is already a run of length one.
    guard required > 1 else {
        return true
    }

    var runLength = 1
    for (previous, current) in zip(scalars, scalars.dropFirst()) {
        if current == previous + 1 {
            runLength += 1
            if runLength >= required {
                return true
            }
        } else {
            runLength = 1
        }
    }
    return false
}

/// Returns whether `word` has at least six letters and starts with one of the listed two-letter pairs.
///
/// NOTE(review): presumably meant to catch fused "initials + surname" answers, but prefixes such as
/// "ed", "el", "em" and "et" also match ordinary words of six or more letters - confirm this is intended.
func isInitialSurnameArtifact(_ word: String) -> Bool {
    let fusedNames: Set<String> = [
        "ed", "ek", "el", "em", "et",
        "jk", "jl", "jr",
        "kd", "kt",
        "mj",
        "oj",
        "tj",
        "wc"
    ]

    guard word.count >= 6 else {
        return false
    }
    return fusedNames.contains { word.hasPrefix($0) }
}

// MARK: - Easy-fill heuristics

/// Returns whether `word` looks like a fragment of a longer phrase.
///
/// Words shorter than five letters never qualify. Otherwise a word qualifies when it starts with
/// one of the listed prefixes, or when at least 80% of its clues are fill-in-the-blank and its
/// length is 5...7 or at least 10. Lengths 8 and 9 are not flagged on the ratio alone.
func isPhraseFragmentArtifact(_ word: String, fillBlankRatio: Double) -> Bool {
    guard word.count >= 5 else {
        return false
    }
    let prefixes = [
        "aan", "aand", "anold", "asof", "ated", "cani",
        "cular",
        "igota", "igot", "imeta", "imet", "ihave", "itisa", "itwas",
        "weare", "youre"
    ]
    if prefixes.contains(where: { word.hasPrefix($0) }) {
        return true
    }
    if word.count >= 10 {
        return fillBlankRatio >= 0.8
    }
    return fillBlankRatio >= 0.8 && word.count <= 7
}

/// Returns whether a dated, single-appearance answer of eight or more letters is suspicious:
/// it was clued with an obscure-name keyword, or it ends in `c`, `v` or `x`.
func isVerySuspiciousOneOff(_ word: String, evidence: AnswerEvidence) -> Bool {
    guard word.count >= 8, evidence.count == 1, evidence.latestSeen != nil else {
        return false
    }
    if evidence.obscureNameClueCount > 0 {
        return true
    }
    return word.hasSuffix("c") || word.hasSuffix("v") || word.hasSuffix("x")
}

/// Returns whether a single-appearance answer of eight or more letters was last seen before 2000-01-01.
func isStaleOneOffLongFill(_ word: String, evidence: AnswerEvidence) -> Bool {
    guard word.count >= 8,
          evidence.count == 1,
          let latestSeen = evidence.latestSeen else {
        return false
    }
    // ISO `YYYY-MM-DD` strings order chronologically under plain string comparison.
    return latestSeen < "2000-01-01"
}

/// Returns whether a single-appearance answer of eight or more letters has a sampled clue containing `?`.
func isPunnyOneOff(_ word: String, evidence: AnswerEvidence) -> Bool {
    guard word.count >= 8, evidence.count == 1 else {
        return false
    }
    return evidence.sampleClues.contains { $0.contains("?") }
}

/// Returns the stricter appearance threshold for three- to five-letter words, or `nil` for other lengths.
func stricterShortRequiredCount(_ word: String) -> Int? {
    switch word.count {
    case 3:
        return 25
    case 4:
        return 12
    case 5:
        return 6
    default:
        return nil
    }
}

/// Decides whether the easy-fill filter rejects `word`.
///
/// The rules are evaluated strictly in the order written and the first one that applies wins,
/// so reordering them changes both the outcome and the reported reason.
/// - Returns: A human-readable rejection reason, or `nil` when the word is accepted.
func easyRejectionReason(word: String, evidence: AnswerEvidence) -> String? {
    let length = word.count
    let count = evidence.count

    // Explicit allow-list (compass directions) bypasses every other rule.
    if easyAllowedWords.contains(word) {
        return nil
    }

    if crosswordeseWords.contains(word) {
        return "known crosswordese/glue fill"
    }

    let badRatio = clueRatio(evidence.badClueCount, count)
    let obscureNameRatio = clueRatio(evidence.obscureNameClueCount, count)
    let fillBlankRatio = clueRatio(evidence.fillBlankCount, count)
    let foreignLanguageRatio = clueRatio(evidence.foreignLanguageClueCount, count)
    let cluePartRatio = clueRatio(evidence.cluePartCount, count)

    if count < easyRequiredCount(word) {
        return "count \(count) below easy threshold \(easyRequiredCount(word))"
    }

    if let stricterCount = stricterShortRequiredCount(word),
       !commonShortWords.contains(word),
       count < stricterCount {
        return "short entry count \(count) below generated-fill threshold \(stricterCount)"
    }

    if length <= 5 && count < 80 && evidence.badClueCount >= 2 && badRatio >= 0.35 {
        return "short entry mostly clued as abbreviation/variant/form"
    }

    if length <= 5 &&
        !commonShortWords.contains(word),
       evidence.latestSeen.map({ $0 < "2000-01-01" }) == true {
        return "stale short fill"
    }

    if hasSequentialLetterRun(word) {
        return "alphabet-run artifact"
    }

    if length >= 5 && !hasVowel(word) {
        return "long entry with no vowel"
    }

    if isInitialSurnameArtifact(word) {
        return "initial-plus-name artifact"
    }

    if isPhraseFragmentArtifact(word, fillBlankRatio: fillBlankRatio) {
        return "phrase fragment artifact"
    }

    if evidence.themeClueCount > 0 && (count <= 2 || clueRatio(evidence.themeClueCount, count) >= 0.5) {
        return "one-off theme clue artifact"
    }

    if evidence.continuationClueCount > 0 && (count <= 3 || clueRatio(evidence.continuationClueCount, count) >= 0.5) {
        return "cross-reference or quote-continuation artifact"
    }

    if isStaleOneOffLongFill(word, evidence: evidence) {
        return "stale one-off long fill"
    }

    if isVerySuspiciousOneOff(word, evidence: evidence) {
        return "one-off long artifact"
    }

    if isPunnyOneOff(word, evidence: evidence) {
        return "one-off pun/theme artifact"
    }

    if length <= 5 && evidence.badClueCount >= 2 && badRatio >= 0.45 {
        return "entry mostly clued as abbreviation/variant/form"
    }

    if length <= 5 && evidence.obscureNameClueCount >= 2 && obscureNameRatio >= 0.5 {
        return "short entry mostly clued as obscure name/title"
    }

    // Known-good short words that reached this point are accepted without the remaining checks.
    if length <= 4 && commonShortWords.contains(word) {
        return nil
    }

    if length <= 4 && veryWeakShortWords.contains(word) {
        return "known weak short fill"
    }

    if length <= 4 && !hasVowel(word) {
        return "short entry with no vowel"
    }

    if length <= 5 && shortExactFragments.contains(word) {
        return "short suffix-like fragment"
    }

    if evidence.cluePartCount > 0 && (count <= 3 || cluePartRatio >= 0.5) {
        return "entry mostly clued as one part of a longer answer"
    }

    if length <= 5 &&
        evidence.foreignLanguageClueCount >= 2 &&
        (count <= 10 || foreignLanguageRatio >= 0.5) {
        return "entry mostly clued as foreign-language translation"
    }

    if length <= 4 && evidence.badClueCount >= 2 && badRatio >= 0.18 {
        return "short entry mostly clued as abbreviation/variant/foreign/form"
    }

    if length <= 4 && evidence.obscureNameClueCount >= 4 && obscureNameRatio >= 0.35 {
        return "short entry mostly clued as obscure name"
    }

    if length <= 4 && count < 40 && evidence.fillBlankCount >= 8 && fillBlankRatio >= 0.55 {
        return "short entry mostly dependent on fill-in-the-blank clues"
    }

    if length <= 3 && count < 60 && (badRatio + obscureNameRatio + fillBlankRatio) >= 0.45 {
        return "rare short entry with weak clue evidence"
    }

    return nil
}

// MARK: - Corpus scanning

/// Counts normalized answers across every JSON puzzle under `inputURL`.
///
/// Files that cannot be read or parsed are reported on stderr and skipped.
/// - Returns: The per-answer counts, the number of files parsed, and the number that passed the weekday filter.
/// - Throws: Errors raised while enumerating the input directory.
func loadAnswers(inputURL: URL, options: Options) throws -> (counts: [String: Int], filesRead: Int, filesUsed: Int) {
    var counts: [String: Int] = [:]
    var filesRead = 0
    var filesUsed = 0

    for path in try jsonFiles(in: inputURL) {
        do {
            let puzzle = try loadJSONObject(from: path)
            filesRead += 1
            guard shouldIncludePuzzle(puzzle, path: path, weekdays: options.weekdays) else {
                continue
            }
            filesUsed += 1
            for answer in normalizedAnswers(from: puzzle) where shouldInclude(answer, minLength: options.minLength, maxLength: options.maxLength) {
                counts[answer, default: 0] += 1
            }
        } catch {
            // Best effort: one bad file must not abort the whole run.
            fputs("Skipping \(path.path): \(error)\n", stderr)
        }
    }

    return (counts, filesRead, filesUsed)
}

/// Collects per-answer clue evidence across every JSON puzzle under `inputURL`.
///
/// Files that cannot be read or parsed are reported on stderr and skipped.
/// - Returns: The per-answer evidence, the number of files parsed, and the number that passed the weekday filter.
/// - Throws: Errors raised while enumerating the input directory.
func loadAnswerEvidence(inputURL: URL, options: Options) throws -> (evidence: [String: AnswerEvidence], filesRead: Int, filesUsed: Int) {
    var evidence: [String: AnswerEvidence] = [:]
    var filesRead = 0
    var filesUsed = 0

    for path in try jsonFiles(in: inputURL) {
        do {
            let puzzle = try loadJSONObject(from: path)
            filesRead += 1
            guard shouldIncludePuzzle(puzzle, path: path, weekdays: options.weekdays) else {
                continue
            }
            let latestSeen = isoDateForPath(path)
            filesUsed += 1
            for (answer, clue) in normalizedEntries(from: puzzle) where shouldInclude(answer, minLength: options.minLength, maxLength: options.maxLength) {
                var entry = evidence[answer] ?? AnswerEvidence()
                entry.count += 1
                // Keep the greatest date seen so far.
                if let latestSeen {
                    if entry.latestSeen.map({ latestSeen > $0 }) ?? true {
                        entry.latestSeen = latestSeen
                    }
                }
                if entry.sampleClues.count < 5 && !clue.isEmpty {
                    entry.sampleClues.append(clue)
                }
                if regexMatches(badClueRegex, clue) {
                    entry.badClueCount += 1
                }
                if regexMatches(obscureNameClueRegex, clue) {
                    entry.obscureNameClueCount += 1
                }
                if regexMatches(fillBlankRegex, clue) {
                    entry.fillBlankCount += 1
                }
                if regexMatches(foreignLanguageClueRegex, clue) {
                    entry.foreignLanguageClueCount += 1
                }
                if regexMatches(cluePartRegex, clue) {
                    entry.cluePartCount += 1
                }
                if regexMatches(continuationClueRegex, clue) {
                    entry.continuationClueCount += 1
                }
                if regexMatches(themeClueRegex, clue) {
                    entry.themeClueCount += 1
                }
                evidence[answer] = entry
            }
        } catch {
            // Best effort: one bad file must not abort the whole run.
            fputs("Skipping \(path.path): \(error)\n", stderr)
        }
    }

    return (evidence, filesRead, filesUsed)
}

/// Loads a JSON array of words and returns their normalized forms.
///
/// Non-string items and words that normalize to the empty string are ignored.
/// - Throws: File or JSON errors, or `WordmakeError` when the document is not an array.
func loadBadWords(path: String) throws -> Set<String> {
    let value = try loadJSONObject(from: URL(fileURLWithPath: path))
    guard let words = value as? [Any] else {
        throw WordmakeError(description: "expected JSON array")
    }
    return Set(words.compactMap { word in
        guard let word = word as? String else {
            return nil
        }
        let normalized = normalizeAnswer(word)
        return normalized.isEmpty ? nil : normalized
    })
}

// MARK: - JSON output

/// Returns `dictionary` unchanged.
///
/// A Swift `Dictionary` has no defined order, so rebuilding it from sorted keys cannot change the
/// result; key ordering of the written file comes from `.sortedKeys` in `writeJSON`.
func sortedDictionary(_ dictionary: [String: Any]) -> [String: Any] {
    dictionary
}

/// Builds the JSON fields shared by the quality and rejection records of one answer.
private func evidenceFields(_ entry: AnswerEvidence) -> [String: Any] {
    [
        "count": entry.count,
        "bad_clue_count": entry.badClueCount,
        "obscure_name_clue_count": entry.obscureNameClueCount,
        "fill_blank_count": entry.fillBlankCount,
        "foreign_language_clue_count": entry.foreignLanguageClueCount,
        "clue_part_count": entry.cluePartCount,
        "continuation_clue_count": entry.continuationClueCount,
        "theme_clue_count": entry.themeClueCount,
        "latest_seen": entry.latestSeen as Any? ?? NSNull(),
        "sample_clues": entry.sampleClues
    ]
}

/// Builds the per-answer quality record; a missing rejection reason is written as JSON `null`.
func evidenceObject(_ entry: AnswerEvidence, rejectionReason: String?) -> [String: Any] {
    var object = evidenceFields(entry)
    object["rejection_reason"] = rejectionReason as Any? ?? NSNull()
    return object
}

/// Builds the record written for an answer rejected by the easy-fill filter.
func rejectionObject(reason: String, entry: AnswerEvidence) -> [String: Any] {
    var object = evidenceFields(entry)
    object["reason"] = reason
    return object
}

/// Serializes `value` as JSON with sorted keys and a trailing newline, and writes it to `path`.
///
/// Missing parent directories are created. The file is written atomically, so a failed write
/// does not leave a truncated file behind.
/// - Throws: Directory-creation, serialization or write errors.
func writeJSON(_ value: Any, to path: String, pretty: Bool) throws {
    let url = URL(fileURLWithPath: path)
    let directory = url.deletingLastPathComponent()
    if directory.path != "." {
        try FileManager.default.createDirectory(at: directory, withIntermediateDirectories: true)
    }
    var options: JSONSerialization.WritingOptions = [.sortedKeys]
    if pretty {
        options.insert(.prettyPrinted)
    }
    var output = try JSONSerialization.data(withJSONObject: value, options: options)
    output.append(0x0A)
    try output.write(to: url, options: .atomic)
}

// MARK: - Entry point

/// Checks the parsed options for consistency before any work is done.
///
/// - Throws: `WordmakeError` when the input directory is missing, a numeric bound is out of range,
///   more than one filter mode is selected, or an easy-fill-only output is requested without `--easy-fill-filter`.
func validate(_ options: Options, inputURL: URL) throws {
    var isDirectory: ObjCBool = false
    guard FileManager.default.fileExists(atPath: inputURL.path, isDirectory: &isDirectory), isDirectory.boolValue else {
        throw WordmakeError(description: "Input directory not found: \(options.inputPath)")
    }
    guard options.minLength >= 1 else {
        throw WordmakeError(description: "--min-length must be at least 1")
    }
    if let maxLength = options.maxLength, maxLength < options.minLength {
        throw WordmakeError(description: "--max-length must be greater than or equal to --min-length")
    }
    guard options.minCount >= 1 else {
        throw WordmakeError(description: "--min-count must be at least 1")
    }
    let filters = [options.lengthThresholds, options.qualityFilter, options.easyFillFilter].filter { $0 }.count
    guard filters <= 1 else {
        throw WordmakeError(description: "Choose only one of --length-thresholds, --quality-filter, or --easy-fill-filter")
    }
    if (options.rejectionsOutputPath != nil || options.qualityOutputPath != nil) && !options.easyFillFilter {
        throw WordmakeError(description: "--rejections-output and --quality-output require --easy-fill-filter")
    }
}

/// Runs the tool: parses and validates the options, scans the corpus, applies the selected filter,
/// writes the requested JSON outputs and prints a summary.
func run() throws {
    let options = try parseOptions(CommandLine.arguments)
    let inputURL = URL(fileURLWithPath: options.inputPath)
    try validate(options, inputURL: inputURL)

    let badWords: Set<String>
    if let badWordsPath = options.badWordsPath {
        do {
            badWords = try loadBadWords(path: badWordsPath)
        } catch {
            throw WordmakeError(description: "Could not read --bad-words \(badWordsPath): \(error)")
        }
    } else {
        badWords = []
    }

    let counts: [String: Int]
    let filesRead: Int
    let filesUsed: Int
    var words: [String]
    var easyRejections: [String: Any] = [:]
    var wordQuality: [String: Any] = [:]

    if options.easyFillFilter {
        let result = try loadAnswerEvidence(inputURL: inputURL, options: options)
        filesRead = result.filesRead
        filesUsed = result.filesUsed
        counts = result.evidence.mapValues(\.count)
        words = []

        // Visit words alphabetically so `words` comes out sorted.
        for (word, entry) in result.evidence.sorted(by: { $0.key < $1.key }) {
            let reason = easyRejectionReason(word: word, evidence: entry)
            let isBadWord = badWords.contains(word)
            wordQuality[word] = evidenceObject(entry, rejectionReason: isBadWord ? "bad word" : reason)
            // Bad words appear in the quality metadata only, never in the rejections output.
            if isBadWord {
                continue
            }
            if let reason {
                easyRejections[word] = rejectionObject(reason: reason, entry: entry)
            } else {
                words.append(word)
            }
        }
    } else {
        let result = try loadAnswers(inputURL: inputURL, options: options)
        let tally = result.counts
        counts = tally
        filesRead = result.filesRead
        filesUsed = result.filesUsed
        words = tally.keys
            .filter { word in
                !badWords.contains(word) &&
                    (tally[word] ?? 0) >= requiredCount(
                        word: word,
                        flatMinCount: options.minCount,
                        lengthThresholds: options.lengthThresholds,
                        qualityFilter: options.qualityFilter
                    )
            }
            .sorted()
    }

    try writeJSON(words, to: options.outputPath, pretty: options.pretty)
    if let countsOutputPath = options.countsOutputPath {
        // Key order in the file is applied by `.sortedKeys` inside `writeJSON`.
        try writeJSON(counts, to: countsOutputPath, pretty: options.pretty)
    }
    if let rejectionsOutputPath = options.rejectionsOutputPath {
        try writeJSON(sortedDictionary(easyRejections), to: rejectionsOutputPath, pretty: options.pretty)
    }
    if let qualityOutputPath = options.qualityOutputPath {
        try writeJSON(sortedDictionary(wordQuality), to: qualityOutputPath, pretty: options.pretty)
    }

    print("Read \(filesRead) puzzle files")
    if let weekdays = options.weekdays {
        let selected = weekdays.sorted().map { weekdayNames[$0] }.joined(separator: ", ")
        print("Used \(filesUsed) \(selected) puzzle files")
    }
    print("Wrote \(words.count) unique answers to \(options.outputPath)")
    if options.lengthThresholds {
        print("Included answers using length thresholds: <5 letters >=10, 5-7 letters >=2, >7 letters >=1")
    } else if options.qualityFilter {
        print("Included answers using quality filter: <5 letters >=15, 5-7 letters >=3, >7 letters >=1")
    } else if options.easyFillFilter {
        print("Included answers using clue-aware easy fill filter")
        print("Rejected \(easyRejections.count) answers with easy fill filter")
    } else if options.minCount > 1 {
        print("Included answers appearing at least \(options.minCount) times")
    }
    if let badWordsPath = options.badWordsPath {
        // Reports the size of the loaded list, not how many corpus answers it actually removed.
        print("Excluded \(badWords.count) bad words from \(badWordsPath)")
    }
    if let countsOutputPath = options.countsOutputPath {
        print("Wrote answer frequencies to \(countsOutputPath)")
    }
    if let rejectionsOutputPath = options.rejectionsOutputPath {
        print("Wrote easy-filter rejections to \(rejectionsOutputPath)")
    }
    if let qualityOutputPath = options.qualityOutputPath {
        print("Wrote word quality metadata to \(qualityOutputPath)")
    }
}

do {
    try run()
} catch {
    fputs("Wordmake: \(error)\n", stderr)
    exit(1)
}