Pre-corpus fixes (from EVALUATION.md):
- Clean 2,264 contaminated rows from augmented relations (bridge artifacts, full-sentence HasProperty values, null bytes, empty words)
- Fix article logic: dynamic a/an across Deconstruction, FalseEquivalence, DenialOfConsequences, TautologicalWisdom templates
- Tighten _short_concepts() default from max_words=3 to 2
- Fix FutilePreparation gerunding: filter vocab nouns and noun-suffix words from UsedFor targets; fix CVC doubling for 'y'-ending words
- Add _looks_like_verb() heuristic, improve _a() for vowel-sound edges

Pipeline hardening:
- polish_corpus.py: context-size fallback (truncate chain, then minimal prompt), classified error types, consecutive-error circuit breaker, 10-entry flush granularity, ETA tracking, KeyboardInterrupt handling
- generate_raw_batch.sh: fix python -> python3

Corpus generation run (9,835 raw -> 5,499 polished -> 2,312 filtered):
- 44.1% discard rate, 0 errors, 82 minutes on RTX 4090
- 9,257 training pairs across 5 input framing types
- 97.6% vocab coverage (609/624 words)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
61 lines
1.6 KiB
Bash
Executable file
61 lines
1.6 KiB
Bash
Executable file
#!/usr/bin/env bash
#
# Generate raw folksy sayings across all 7 templates.
#
# Usage:  <this script> [COUNT_PER_TEMPLATE]
#   COUNT_PER_TEMPLATE  number of sayings requested per template (default: 1500)
#
# Output: corpus/corpus_raw.jsonl (~10,500 entries with the default count)

set -euo pipefail

# All paths are derived from this script's own location, so the script can be
# invoked from any working directory. The project root is the parent of the
# directory that contains this script.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
CORPUS_DIR="$PROJECT_DIR/corpus"
GENERATOR="$PROJECT_DIR/folksy_generator.py"

COUNT_PER_TEMPLATE=${1:-1500}

# Validate everything BEFORE the output file is truncated below, so that a
# bad invocation cannot wipe an existing corpus.
if ! [[ "$COUNT_PER_TEMPLATE" =~ ^[0-9]+$ ]]; then
  printf 'error: COUNT_PER_TEMPLATE must be a non-negative integer, got "%s"\n' \
    "$COUNT_PER_TEMPLATE" >&2
  printf 'usage: %s [COUNT_PER_TEMPLATE]\n' "${0##*/}" >&2
  exit 2
fi

if [[ ! -f "$GENERATOR" ]]; then
  printf 'error: generator not found: %s\n' "$GENERATOR" >&2
  exit 1
fi

mkdir -p "$CORPUS_DIR"

OUTPUT="$CORPUS_DIR/corpus_raw.jsonl"

# Clear existing file (later steps append to it).
: > "$OUTPUT"

# Template names, each passed in turn to the generator's --template option.
TEMPLATES=(
  deconstruction
  denial_of_consequences
  ironic_deficiency
  futile_preparation
  hypocritical_complaint
  tautological_wisdom
  false_equivalence
)

echo "Generating $COUNT_PER_TEMPLATE sayings per template (${#TEMPLATES[@]} templates)..."
echo "Output: $OUTPUT"

# The generator's stderr is captured rather than discarded: it stays out of
# the progress output on success, but is replayed if a run fails so the
# failure is diagnosable.
err_log=$(mktemp)
trap 'rm -f -- "$err_log"' EXIT

total=0
for template in "${TEMPLATES[@]}"; do
  printf ' %s (%s)... ' "$template" "$COUNT_PER_TEMPLATE"

  # Count lines before and after the append; the difference is the number of
  # entries this template actually produced (it may differ from the request).
  before=$(wc -l < "$OUTPUT")
  if ! python3 "$GENERATOR" --template "$template" \
      --count "$COUNT_PER_TEMPLATE" --json >> "$OUTPUT" 2> "$err_log"; then
    echo "FAILED"
    printf 'error: generator failed for template "%s":\n' "$template" >&2
    cat -- "$err_log" >&2
    exit 1
  fi
  after=$(wc -l < "$OUTPUT")

  generated=$((after - before))
  total=$((total + generated))
  echo "$generated generated"
done

echo ""
echo "Total: $total raw sayings in $OUTPUT"
echo ""

# Check template distribution: count entries per 'meta_template' value in the
# JSONL output and print one row per template plus a grand total.
#
# The output path is passed as an argument (sys.argv[1]) instead of being
# spliced into the Python source, so paths containing quotes or backslashes
# cannot break or alter the program. The quoted heredoc delimiter keeps the
# shell from expanding anything inside the Python code.
echo "Template distribution:"
python3 - "$OUTPUT" <<'PY'
import json
import sys
from collections import Counter

counts = Counter()
with open(sys.argv[1], encoding="utf-8") as f:
    for line in f:
        # Tolerate blank lines instead of crashing in json.loads.
        if not line.strip():
            continue
        entry = json.loads(line)
        counts[entry["meta_template"]] += 1

for template, count in sorted(counts.items()):
    print(f" {template:30s} {count:5d}")
print(f" {'TOTAL':30s} {sum(counts.values()):5d}")
PY