folksy_idioms/scripts/generate_raw_batch.sh

#!/usr/bin/env bash
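# Generate raw folksy sayings across all 7 templates.
# Usage: generate_raw_batch.sh [COUNT_PER_TEMPLATE]   (default: 1500)
# Output: corpus/corpus_raw.jsonl, overwritten on every run
#         (~10,500 entries at the default count: 7 templates x 1,500 each)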

set -euo pipefail

# Succeeds when $1 consists solely of decimal digits (a non-negative integer).
is_valid_count() {
    [[ "$1" =~ ^[0-9]+$ ]]
}

# Resolve everything relative to this script's own location so the script
# behaves the same regardless of the caller's working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
CORPUS_DIR="$PROJECT_DIR/corpus"
GENERATOR="$PROJECT_DIR/folksy_generator.py"

# Optional first argument: how many sayings to request per template.
COUNT_PER_TEMPLATE=${1:-1500}

# Validate before any filesystem change: the output file is truncated below,
# so a mistyped count must be rejected first or it would wipe an existing corpus.
if ! is_valid_count "$COUNT_PER_TEMPLATE"; then
    echo "error: COUNT_PER_TEMPLATE must be a non-negative integer, got '$COUNT_PER_TEMPLATE'" >&2
    exit 2
fi

mkdir -p "$CORPUS_DIR"

OUTPUT="$CORPUS_DIR/corpus_raw.jsonl"
# Start from an empty file; later steps append to it.
: > "$OUTPUT"

# The seven meta-template identifiers, in the order they are generated.
# Each one is handed to the generator through its --template flag.
declare -a TEMPLATES=(
    "deconstruction"         "denial_of_consequences"
    "ironic_deficiency"      "futile_preparation"
    "hypocritical_complaint" "tautological_wisdom"
    "false_equivalence"
)

echo "Generating $COUNT_PER_TEMPLATE sayings per template (${#TEMPLATES[@]} templates)..."
echo "Output: $OUTPUT"

# Generator stderr is captured instead of discarded: the console stays quiet
# on success, but the diagnostics can still be shown if a run fails.
generator_stderr=$(mktemp)
trap 'rm -f -- "$generator_stderr"' EXIT

# Run the generator once per template, appending its JSON lines to $OUTPUT.
# The number generated is measured as the growth in the file's line count.
total=0
for template in "${TEMPLATES[@]}"; do
    printf '  %s (%s)... ' "$template" "$COUNT_PER_TEMPLATE"
    before=$(wc -l < "$OUTPUT")

    # Capture the exit status explicitly so a failure is reported with context
    # rather than aborting silently under `set -e`.
    status=0
    python3 "$GENERATOR" --template "$template" --count "$COUNT_PER_TEMPLATE" --json \
        >> "$OUTPUT" 2> "$generator_stderr" || status=$?
    if (( status != 0 )); then
        echo "FAILED (exit $status)"
        echo "error: generator failed for template '$template':" >&2
        cat "$generator_stderr" >&2
        exit "$status"
    fi

    after=$(wc -l < "$OUTPUT")
    generated=$((after - before))
    total=$((total + generated))
    echo "$generated generated"
done

echo ""
echo "Total: $total raw sayings in $OUTPUT"
echo ""

# Check template distribution: count entries in $OUTPUT by their
# 'meta_template' field and print one row per template plus a total.
# The path is passed as an argument (sys.argv[1]) and the heredoc delimiter is
# quoted, so no shell value is ever spliced into the Python source.
echo "Template distribution:"
python3 - "$OUTPUT" <<'PY'
import json
import sys
from collections import Counter

counts = Counter()
with open(sys.argv[1]) as f:
    for line in f:
        # Tolerate blank lines instead of crashing in json.loads.
        if not line.strip():
            continue
        entry = json.loads(line)
        counts[entry['meta_template']] += 1
for template, count in sorted(counts.items()):
    print(f'  {template:30s} {count:5d}')
print(f"  {'TOTAL':30s} {sum(counts.values()):5d}")
PY