Fix generator quality issues and run initial corpus pipeline

Pre-corpus fixes (from EVALUATION.md):
- Clean 2,264 contaminated rows from augmented relations (bridge
  artifacts, full-sentence HasProperty values, null bytes, empty words)
- Fix article logic: dynamic a/an across Deconstruction, FalseEquivalence,
  DenialOfConsequences, TautologicalWisdom templates
- Tighten _short_concepts() default from max_words=3 to 2
- Fix FutilePreparation gerunding: filter vocab nouns and noun-suffix
  words from UsedFor targets; fix CVC doubling for 'y'-ending words
- Add _looks_like_verb() heuristic; improve _a() for vowel-sound edge cases
  (e.g. "a user", "an hour")

Pipeline hardening:
- polish_corpus.py: context-size fallback (truncate chain, then minimal
  prompt), classified error types, consecutive-error circuit breaker,
  10-entry flush granularity, ETA tracking, KeyboardInterrupt handling
- generate_raw_batch.sh: fix python -> python3

Corpus generation run (9,835 raw -> 5,499 polished -> 2,312 filtered):
- 44.1% discard rate, 0 errors, 82 minutes on RTX 4090
- 9,257 training pairs across 5 input framing types
- 97.6% vocab coverage (609/624 words)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
John McCardle 2026-03-10 04:33:56 -04:00
commit 651ec3ffc6
10 changed files with 34853 additions and 2406 deletions

View file

@ -199,10 +199,10 @@ class Deconstruction(MetaTemplate):
id = "deconstruction"
name = "Deconstruction"
surface_templates = [
"You know what they say, a {A} with no {B} is just a {C} {D}.",
"You know what they say, {A_article} with no {B} is just a {C} {D}.",
"Take the {B} out of {A} and all you've got left is {C} {D}.",
"{A} without {B}? That's just {D} with ideas above its station.",
"An {A} ain't nothing but {D} that met some {B}.",
"{A_Article} ain't nothing but {D} that met some {B}.",
]
def generate(self, seed_word=None, seed_category=None):
@ -255,7 +255,8 @@ class Deconstruction(MetaTemplate):
c_word = random.choice(["plain", "sorry", "old", "humble", "dry", "wet", "cold"])
template = self._pick_template()
saying = template.format(A=a, B=b_word, C=c_word, D=d_word)
saying = template.format(A=a, A_article=_a(a), A_Article=_a(a).capitalize(),
B=b_word, C=c_word, D=d_word)
debug = {
"template_family": self.id,
@ -275,8 +276,8 @@ class DenialOfConsequences(MetaTemplate):
surface_templates = [
"Don't {C} the {A} and say you ain't got {B}.",
"Don't {C} the {A} and act surprised when the {B} show up.",
"Man who {C}s a {A} can't complain about {B}.",
"You can't {C} a {A} and then wonder where all the {B} came from.",
"Man who {C}s {A_article} can't complain about {B}.",
"You can't {C} {A_article} and then wonder where all the {B} came from.",
]
def generate(self, seed_word=None, seed_category=None):
@ -323,7 +324,7 @@ class DenialOfConsequences(MetaTemplate):
c_word = random.choice(["build", "set up", "put out", "lay down", "make"])
template = self._pick_template()
saying = template.format(A=a, B=b_word, C=c_word)
saying = template.format(A=a, A_article=_a(a), B=b_word, C=c_word)
debug = {
"template_family": self.id,
@ -407,8 +408,9 @@ class FutilePreparation(MetaTemplate):
if not seed:
return None, None
# What is the seed used for?
# What is the seed used for? Filter out vocab nouns and noun-like words — we need verbs.
uses = _short_concepts(self.graph.neighbors(seed, "UsedFor", min_weight=0.5), max_words=2)
uses = [u for u in uses if u[0] not in self.graph.vocab and _looks_like_verb(u[0])]
if not uses:
return None, None
@ -532,9 +534,9 @@ class TautologicalWisdom(MetaTemplate):
id = "tautological_wisdom"
name = "Tautological Wisdom"
surface_templates = [
"You know what they say, it takes a {X} to get a {Y}.",
"You know what they say, it takes {X_article} to get {Y_article}.",
"My daddy always said, can't have {Y} without {X}.",
"A {Y} don't come without its {X}, now does it?",
"{Y_Article} don't come without its {X}, now does it?",
"You want {Y}? Well, first you're gonna need {X}.",
"Ain't no {Y} ever came from nothing — you need {X}.",
]
@ -566,7 +568,9 @@ class TautologicalWisdom(MetaTemplate):
chain_edge = choice[3]
template = self._pick_template()
saying = template.format(X=x_word, Y=y_word)
saying = template.format(X=x_word, Y=y_word,
X_article=_a(x_word), Y_article=_a(y_word),
Y_Article=_a(y_word).capitalize())
debug = {
"template_family": self.id,
@ -584,10 +588,10 @@ class FalseEquivalence(MetaTemplate):
id = "false_equivalence"
name = "False Equivalence"
surface_templates = [
"A {A} is just a {B} that's got {P}.",
"What's a {A} but a {B} with {P}?",
"The only difference between a {A} and a {B} is {P}.",
"Take the {P} from a {A} and you've got yourself a {B}.",
"{A_article} is just {B_article} that's got {P}.",
"What's {A_article} but {B_article} with {P}?",
"The only difference between {A_article} and {B_article} is {P}.",
"Take the {P} from {A_article} and you've got yourself {B_article}.",
]
def generate(self, seed_word=None, seed_category=None):
@ -635,7 +639,8 @@ class FalseEquivalence(MetaTemplate):
p_word = random.choice(["ambition", "an attitude", "a plan", "patience"])
template = self._pick_template()
saying = template.format(A=a, B=b_word, P=p_word)
saying = template.format(A=a, B=b_word, P=p_word,
A_article=_a(a), B_article=_a(b_word))
debug = {
"template_family": self.id,
@ -656,7 +661,7 @@ def _readable(concept):
return concept.replace("_", " ")
def _short_concepts(items, max_words=3):
def _short_concepts(items, max_words=2):
"""Filter concept tuples to only those with short readable names.
Items can be tuples where first element is the concept string.
@ -672,14 +677,37 @@ def _gerund(word):
return word[:-1] + "ing"
if word.endswith("ing"):
return word
if len(word) > 2 and word[-1] not in "aeiou" and word[-2] in "aeiou" and word[-3] not in "aeiou":
# CVC doubling: "run" -> "running", "sit" -> "sitting"
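# Treat a trailing 'y' as a vowel so it is never doubled (prey->preying, not preyying).
# NOTE(review): 'w' is not in the exclusion set below, so e.g. "show" would still double — confirm intent.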
if len(word) > 2 and word[-1] not in "aeiouy" and word[-2] in "aeiou" and word[-3] not in "aeiou":
return word + word[-1] + "ing"
return word + "ing"
def _looks_like_verb(word):
"""Heuristic: does this word look like it could be a verb?
Rejects words with obvious noun/adjective suffixes."""
w = word.split("_")[0].lower() if "_" in word else word.lower()
noun_suffixes = ("tion", "sion", "ment", "ness", "ity", "ance", "ence",
"ture", "ism", "ist", "ery", "ory", "ling")
return not any(w.endswith(s) for s in noun_suffixes)
def _a(word):
"""Add 'a' or 'an' article."""
if word and word[0] in "aeiou":
if not word:
return "a"
first = word.split()[0].lower() if word else ""
# Words that start with a vowel letter but consonant sound
consonant_sound = ("uni", "use", "used", "user", "usual", "usu", "uti", "uto",
"uro", "uku", "ula")
# Words that start with a consonant letter but vowel sound
vowel_sound = ("hour", "honest", "honor", "honour", "heir", "herb")
if any(first.startswith(p) for p in vowel_sound):
return f"an {word}"
if any(first.startswith(p) for p in consonant_sound):
return f"a {word}"
if first[0] in "aeiou":
return f"an {word}"
return f"a {word}"