crossmate

A collaborative crossword app for iOS
Log | Files | Refs | LICENSE

generate_puzzles.sh (11227B)


      1 #!/usr/bin/env bash
      2 set -euo pipefail
      3 
      4 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
      5 CROSSMAKE_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"
      6 cd "$CROSSMAKE_DIR"
      7 
      8 DATA_DIR="Data"
      9 GENERATED_DIR="Generated"
     10 GRID_LIST="Sources/Fillmake/Resources/grid_list.json"
     11 WORD_LIST="${GENERATED_DIR}/word_list.json"
     12 COUNTS_FILE="${GENERATED_DIR}/answer_counts.json"
     13 QUALITY_FILE="${GENERATED_DIR}/word_quality.json"
     14 BAD_WORDS_FILE="${DATA_DIR}/bad_words.json"
     15 TIMEOUT_SECONDS="${CROSSMAKE_TIMEOUT_SECONDS:-10}"
     16 EXAMPLES_PER_GRID="${CROSSMAKE_EXAMPLES_PER_GRID:-4}"
     17 OUTPUT_PREFIX="Crossmate"
     18 MIN_FILL_SCORE="${CROSSMAKE_MIN_FILL_SCORE:-7500}"
     19 BREADTH="${CROSSMAKE_BREADTH:-80}"
     20 MAX_ANSWER_USAGES="${CROSSMAKE_MAX_ANSWER_USAGES:-1}"
     21 MAX_PARALLEL_JOBS="${CROSSMAKE_JOBS:-}"
     22 # Use CROSSMAKE_GRID_SELECTION=ranked to sample from Gridmake's top-ranked grids.
     23 GRID_SELECTION="${CROSSMAKE_GRID_SELECTION:-random}"
     24 GRID_RANK_LIMIT="${CROSSMAKE_GRID_RANK_LIMIT:-250}"
     25 FIXED_GRID_INDEX="${CROSSMAKE_GRID_INDEX:-}"
     26 FIXED_SEED="${CROSSMAKE_SEED:-}"
     27 TARGET_PUZZLES="${1:-10}"
     28 FILLMAKE_EXECUTABLE=".build/release/Fillmake"
     29 GRIDMAKE_EXECUTABLE=".build/release/Gridmake"
     30 
     31 if [[ ! -f "$WORD_LIST" ]]; then
     32   echo "Word list not found: $WORD_LIST" >&2
     33   exit 1
     34 fi
     35 
     36 if [[ ! -f "$GRID_LIST" ]]; then
     37   echo "Grid list not found: $GRID_LIST" >&2
     38   exit 1
     39 fi
     40 
     41 if [[ ! -f "$COUNTS_FILE" ]]; then
     42   echo "Answer counts not found: $COUNTS_FILE" >&2
     43   exit 1
     44 fi
     45 
     46 if [[ ! -f "$QUALITY_FILE" ]]; then
     47   echo "Word quality metadata not found: $QUALITY_FILE" >&2
     48   exit 1
     49 fi
     50 
     51 if [[ ! -f "$BAD_WORDS_FILE" ]]; then
     52   echo "Bad-word list not found: $BAD_WORDS_FILE" >&2
     53   exit 1
     54 fi
     55 
     56 if ! command -v jq >/dev/null 2>&1; then
     57   echo "jq is required to read $GRID_LIST" >&2
     58   exit 1
     59 fi
     60 
     61 if [[ -z "$MAX_PARALLEL_JOBS" ]]; then
     62   MAX_PARALLEL_JOBS="$(sysctl -n hw.ncpu 2>/dev/null || getconf _NPROCESSORS_ONLN 2>/dev/null || echo 4)"
     63   if ((MAX_PARALLEL_JOBS > 4)); then
     64     MAX_PARALLEL_JOBS=4
     65   fi
     66 fi
     67 
     68 if ! [[ "$MAX_PARALLEL_JOBS" =~ ^[0-9]+$ ]] || ((10#$MAX_PARALLEL_JOBS < 1)); then
     69   echo "CROSSMAKE_JOBS must be a positive integer" >&2
     70   exit 1
     71 fi
     72 
     73 if ! [[ "$TIMEOUT_SECONDS" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
     74   echo "CROSSMAKE_TIMEOUT_SECONDS must be a positive number" >&2
     75   exit 1
     76 fi
     77 
     78 if ! [[ "$EXAMPLES_PER_GRID" =~ ^[0-9]+$ ]] || ((10#$EXAMPLES_PER_GRID < 1)); then
     79   echo "CROSSMAKE_EXAMPLES_PER_GRID must be a positive integer" >&2
     80   exit 1
     81 fi
     82 
     83 if ! [[ "$MIN_FILL_SCORE" =~ ^-?[0-9]+$ ]]; then
     84   echo "CROSSMAKE_MIN_FILL_SCORE must be an integer" >&2
     85   exit 1
     86 fi
     87 
     88 if ! [[ "$BREADTH" =~ ^[0-9]+$ ]] || ((10#$BREADTH < 1)); then
     89   echo "CROSSMAKE_BREADTH must be a positive integer" >&2
     90   exit 1
     91 fi
     92 
     93 if ! [[ "$MAX_ANSWER_USAGES" =~ ^[0-9]+$ ]] || ((10#$MAX_ANSWER_USAGES < 1)); then
     94   echo "CROSSMAKE_MAX_ANSWER_USAGES must be a positive integer" >&2
     95   exit 1
     96 fi
     97 
     98 if [[ "$GRID_SELECTION" != "ranked" && "$GRID_SELECTION" != "random" ]]; then
     99   echo "CROSSMAKE_GRID_SELECTION must be ranked or random" >&2
    100   exit 1
    101 fi
    102 
    103 if [[ -n "$FIXED_GRID_INDEX" ]] && ! [[ "$FIXED_GRID_INDEX" =~ ^[0-9]+$ ]]; then
    104   echo "CROSSMAKE_GRID_INDEX must be a non-negative integer" >&2
    105   exit 1
    106 fi
    107 
    108 if [[ -n "$FIXED_SEED" ]] && ! [[ "$FIXED_SEED" =~ ^[0-9]+$ ]]; then
    109   echo "CROSSMAKE_SEED must be a non-negative integer" >&2
    110   exit 1
    111 fi
    112 
    113 if ! [[ "$GRID_RANK_LIMIT" =~ ^[0-9]+$ ]] || ((10#$GRID_RANK_LIMIT < 1)); then
    114   echo "CROSSMAKE_GRID_RANK_LIMIT must be a positive integer" >&2
    115   exit 1
    116 fi
    117 
    118 if ! [[ "$TARGET_PUZZLES" =~ ^[0-9]+$ ]]; then
    119   echo "Usage: $0 [positive-puzzle-count]" >&2
    120   exit 1
    121 fi
    122 
    123 target_puzzle_count=$((10#$TARGET_PUZZLES))
    124 if ((target_puzzle_count < 1)); then
    125   echo "Usage: $0 [positive-puzzle-count]" >&2
    126   exit 1
    127 fi
    128 
    129 next_output_number() {
    130   local highest=0
    131   local path filename number number_value
    132 
    133   for path in "${GENERATED_DIR}/${OUTPUT_PREFIX}-"*.xd; do
    134     [[ -e "$path" ]] || continue
    135     filename="${path##*/}"
    136     [[ "$filename" =~ ^${OUTPUT_PREFIX}-([0-9]+)\.xd$ ]] || continue
    137     number="${BASH_REMATCH[1]}"
    138     number_value=$((10#$number))
    139     if ((number_value > highest)); then
    140       highest="$number_value"
    141     fi
    142   done
    143 
    144   echo "$((highest + 1))"
    145 }
    146 
    147 output_number="$(next_output_number)"
    148 mkdir -p "$GENERATED_DIR"
    149 tmp_dir="$(mktemp -d)"
    150 trap 'rm -rf "$tmp_dir"' EXIT
    151 
    152 echo "Building Fillmake in release mode"
    153 swift build -c release
    154 
    155 if [[ ! -x "$FILLMAKE_EXECUTABLE" ]]; then
    156   echo "Fillmake executable not found after build: $FILLMAKE_EXECUTABLE" >&2
    157   exit 1
    158 fi
    159 
    160 if [[ ! -x "$GRIDMAKE_EXECUTABLE" ]]; then
    161   echo "Gridmake executable not found after build: $GRIDMAKE_EXECUTABLE" >&2
    162   exit 1
    163 fi
    164 
    165 available_grid_count="$(jq 'length' "$GRID_LIST")"
    166 grid_indices="$tmp_dir/grid-indices.txt"
    167 
    168 if [[ -n "$FIXED_GRID_INDEX" ]]; then
    169   if ((10#$FIXED_GRID_INDEX >= available_grid_count)); then
    170     echo "CROSSMAKE_GRID_INDEX ${FIXED_GRID_INDEX} is out of range; found ${available_grid_count} grids" >&2
    171     exit 1
    172   fi
    173   echo "$FIXED_GRID_INDEX" >"$grid_indices"
    174 elif [[ "$GRID_SELECTION" == "ranked" ]]; then
    175   "$GRIDMAKE_EXECUTABLE" --grids "$GRID_LIST" --limit "$GRID_RANK_LIMIT" \
    176     | awk 'NR > 1 { print $2 }' >"$grid_indices"
    177 else
    178   for ((grid_index = 0; grid_index < available_grid_count; grid_index++)); do
    179     echo "$grid_index"
    180   done >"$grid_indices"
    181 fi
    182 
    183 selection_grid_count="$(wc -l <"$grid_indices" | tr -d '[:space:]')"
    184 if ((10#$selection_grid_count < 1)); then
    185   echo "No grids available for selection" >&2
    186   exit 1
    187 fi
    188 
    189 selected_indices=()
    190 while IFS= read -r selected_index; do
    191   selected_indices+=("$selected_index")
    192 done <"$grid_indices"
    193 
    194 random_grid_index() {
    195   echo "${selected_indices[$((RANDOM % selection_grid_count))]}"
    196 }
    197 
    198 if [[ -n "$FIXED_GRID_INDEX" ]]; then
    199   echo "Generating ${target_puzzle_count} puzzle(s) from fixed grid ${FIXED_GRID_INDEX} with ${EXAMPLES_PER_GRID} fill attempt(s) per puzzle, ${MAX_PARALLEL_JOBS} parallel job(s), timeout ${TIMEOUT_SECONDS}s per run, breadth ${BREADTH}, min score ${MIN_FILL_SCORE}, max answer usage ${MAX_ANSWER_USAGES}"
    200 elif [[ "$GRID_SELECTION" == "ranked" ]]; then
    201   echo "Generating ${target_puzzle_count} puzzle(s) from the top ${selection_grid_count} ranked grid(s) with ${EXAMPLES_PER_GRID} fill attempt(s) per puzzle, ${MAX_PARALLEL_JOBS} parallel job(s), timeout ${TIMEOUT_SECONDS}s per run, breadth ${BREADTH}, min score ${MIN_FILL_SCORE}, max answer usage ${MAX_ANSWER_USAGES}"
    202 else
    203   echo "Generating ${target_puzzle_count} puzzle(s) from ${selection_grid_count} random grid(s) with ${EXAMPLES_PER_GRID} fill attempt(s) per puzzle, ${MAX_PARALLEL_JOBS} parallel job(s), timeout ${TIMEOUT_SECONDS}s per run, breadth ${BREADTH}, min score ${MIN_FILL_SCORE}, max answer usage ${MAX_ANSWER_USAGES}"
    204 fi
    205 
    206 running_job_count() {
    207   jobs -rp | wc -l | tr -d '[:space:]'
    208 }
    209 
    210 wait_for_job_slot() {
    211   while ((10#$(running_job_count) >= 10#$MAX_PARALLEL_JOBS)); do
    212     sleep 1
    213   done
    214 }
    215 
    216 candidate_index=0
    217 
    218 start_candidate() {
    219   local group_id="$1"
    220   local grid_index="$2"
    221   local example="$3"
    222   local candidate_id="$4"
    223   local seed="$5"
    224   local output_path="${tmp_dir}/candidate-${candidate_id}.xd"
    225   local log_path="${tmp_dir}/candidate-${candidate_id}.log"
    226   local ok_path="${tmp_dir}/candidate-${candidate_id}.ok"
    227   local status_path="${tmp_dir}/candidate-${candidate_id}.status"
    228   local group_path="${tmp_dir}/candidate-${candidate_id}.group"
    229 
    230   candidate_outputs+=("$output_path")
    231   echo "$group_id" >"$group_path"
    232   echo "Generating candidate ${candidate_id} from grid index ${grid_index} (${example}/${EXAMPLES_PER_GRID}) with seed ${seed}"
    233 
    234   (
    235     set +e
    236     "$FILLMAKE_EXECUTABLE" \
    237       --words "$WORD_LIST" \
    238       --counts "$COUNTS_FILE" \
    239       --quality "$QUALITY_FILE" \
    240       --bad-words "$BAD_WORDS_FILE" \
    241       --grid-index "$grid_index" \
    242       --seed "$seed" \
    243       --timeout "$TIMEOUT_SECONDS" \
    244       --breadth "$BREADTH" \
    245       --optimize-fill \
    246       --min-fill-score "$MIN_FILL_SCORE" \
    247       --max-answer-usages "$MAX_ANSWER_USAGES" \
    248       --output "$output_path" >"$log_path" 2>&1
    249     status="$?"
    250     echo "$status" >"$status_path"
    251     if [[ "$status" == "0" ]]; then
    252       touch "$ok_path"
    253     fi
    254   ) &
    255 }
    256 
    257 accepted_count=0
    258 rejected_count=0
    259 discarded_count=0
    260 attempted_group_count=0
    261 
    262 fill_score_for_log() {
    263   local log_path="$1"
    264   local score
    265 
    266   score="$(
    267     awk '
    268       /kept best fill score/ {
    269         value = $(NF)
    270         gsub(/\./, "", value)
    271         print value
    272       }
    273       /Selected fill score:/ {
    274         print $NF
    275       }
    276     ' "$log_path" | tail -1
    277   )"
    278 
    279   if [[ -n "$score" ]]; then
    280     echo "$score"
    281   else
    282     echo "-999999999"
    283   fi
    284 }
    285 
    286 run_grid_group() {
    287   local group_id="$1"
    288   local grid_index="$2"
    289   best_candidate_id=""
    290   best_output_path=""
    291   best_score="-999999999"
    292   candidate_outputs=()
    293 
    294   echo "Starting grid group ${group_id} from grid index ${grid_index}"
    295 
    296   for ((example = 1; example <= EXAMPLES_PER_GRID; example++)); do
    297     wait_for_job_slot
    298     candidate_index=$((candidate_index + 1))
    299     if [[ -n "$FIXED_SEED" ]]; then
    300       seed="$FIXED_SEED"
    301     else
    302       seed="$(($(date +%s) * 1000000 + candidate_index * 1000 + RANDOM))"
    303     fi
    304     start_candidate "$group_id" "$grid_index" "$example" "$candidate_index" "$seed"
    305   done
    306 
    307   wait
    308 
    309   for output_path in "${candidate_outputs[@]}"; do
    310     candidate_id="${output_path##*/candidate-}"
    311     candidate_id="${candidate_id%.xd}"
    312     group_path="${tmp_dir}/candidate-${candidate_id}.group"
    313     ok_path="${tmp_dir}/candidate-${candidate_id}.ok"
    314     log_path="${tmp_dir}/candidate-${candidate_id}.log"
    315 
    316     [[ -f "$group_path" ]] || continue
    317     [[ "$(cat "$group_path")" == "$group_id" ]] || continue
    318     [[ -f "$ok_path" ]] || continue
    319 
    320     score="$(fill_score_for_log "$log_path")"
    321     if ((score > best_score)); then
    322       best_score="$score"
    323       best_candidate_id="$candidate_id"
    324       best_output_path="$output_path"
    325     fi
    326   done
    327 
    328   if [[ -n "$best_candidate_id" ]]; then
    329     printf -v final_output_path "%s/%s-%04d.xd" "$GENERATED_DIR" "$OUTPUT_PREFIX" "$output_number"
    330     mv "$best_output_path" "$final_output_path"
    331     echo "Wrote ${final_output_path} from candidate ${best_candidate_id}"
    332     grep -E "Searched .* seconds|Selected fill score:" "${tmp_dir}/candidate-${best_candidate_id}.log" || true
    333     output_number=$((output_number + 1))
    334     accepted_count=$((accepted_count + 1))
    335   else
    336     echo "No accepted fill for grid group ${group_id}." >&2
    337   fi
    338 
    339   for output_path in "${candidate_outputs[@]}"; do
    340     candidate_id="${output_path##*/candidate-}"
    341     candidate_id="${candidate_id%.xd}"
    342     log_path="${tmp_dir}/candidate-${candidate_id}.log"
    343     ok_path="${tmp_dir}/candidate-${candidate_id}.ok"
    344 
    345     if [[ -f "$ok_path" ]]; then
    346       if [[ -e "$output_path" ]]; then
    347         discarded_count=$((discarded_count + 1))
    348       fi
    349     else
    350       rejected_count=$((rejected_count + 1))
    351       echo "Rejected candidate ${candidate_id}:" >&2
    352       tail -3 "$log_path" >&2 || true
    353     fi
    354   done
    355 }
    356 
    357 while ((accepted_count < target_puzzle_count)); do
    358   attempted_group_count=$((attempted_group_count + 1))
    359   run_grid_group "$attempted_group_count" "$(random_grid_index)"
    360 done
    361 
    362 echo "Accepted ${accepted_count} puzzle(s) after ${attempted_group_count} grid group(s); discarded ${discarded_count} lower-scoring accepted fill(s); rejected ${rejected_count}."