#!/usr/bin/env bash
#
# Generate raw folksy sayings across all 7 templates.
#
# Usage:  <this script> [COUNT_PER_TEMPLATE]
#   COUNT_PER_TEMPLATE  sayings requested per template (default: 1500)
#
# Output: corpus/corpus_raw.jsonl under the project directory (the parent of
#         this script's directory). With the default count that is
#         7 x 1500 = ~10,500 entries. Any existing file is truncated first.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
CORPUS_DIR="$PROJECT_DIR/corpus"
GENERATOR="$PROJECT_DIR/folksy_generator.py"
COUNT_PER_TEMPLATE=${1:-1500}

# Print a message to stderr and abort.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

# Validate inputs before touching the output file, so a bad invocation does
# not wipe an existing corpus.
[[ "$COUNT_PER_TEMPLATE" =~ ^[0-9]+$ ]] \
  || die "COUNT_PER_TEMPLATE must be a non-negative integer, got '$COUNT_PER_TEMPLATE'"
[[ -f "$GENERATOR" ]] || die "generator not found: $GENERATOR"

mkdir -p "$CORPUS_DIR"
OUTPUT="$CORPUS_DIR/corpus_raw.jsonl"

# Generator stderr is captured here instead of being discarded: it stays out
# of the normal progress output but is replayed if the generator fails.
err_log="$(mktemp)" || die "mktemp failed"
trap 'rm -f -- "$err_log"' EXIT

# Clear existing file
: > "$OUTPUT"

TEMPLATES=(
  deconstruction
  denial_of_consequences
  ironic_deficiency
  futile_preparation
  hypocritical_complaint
  tautological_wisdom
  false_equivalence
)

echo "Generating $COUNT_PER_TEMPLATE sayings per template (${#TEMPLATES[@]} templates)..."
echo "Output: $OUTPUT"

total=0
for template in "${TEMPLATES[@]}"; do
  echo -n " $template ($COUNT_PER_TEMPLATE)... "
  # Count lines before/after the append to learn how many entries this
  # template actually contributed (may differ from the requested count).
  before=$(wc -l < "$OUTPUT")
  if ! python3 "$GENERATOR" --template "$template" \
      --count "$COUNT_PER_TEMPLATE" --json >> "$OUTPUT" 2> "$err_log"; then
    echo "FAILED"
    cat -- "$err_log" >&2
    die "generator failed for template '$template'"
  fi
  after=$(wc -l < "$OUTPUT")
  generated=$((after - before))
  total=$((total + generated))
  echo "$generated generated"
done

echo ""
echo "Total: $total raw sayings in $OUTPUT"
echo ""

# Check template distribution.
# The path is passed as an argument (not spliced into the Python source) so
# that quotes or backslashes in the project path cannot break or alter the
# program; the quoted heredoc delimiter disables all shell expansion.
echo "Template distribution:"
python3 - "$OUTPUT" <<'PY'
import json
import sys
from collections import Counter

counts = Counter()
with open(sys.argv[1]) as f:
    for line in f:
        entry = json.loads(line)
        counts[entry['meta_template']] += 1

for template, count in sorted(counts.items()):
    print(f' {template:30s} {count:5d}')
print(f" {'TOTAL':30s} {sum(counts.values()):5d}")
PY