Fix generator quality issues and run initial corpus pipeline
Pre-corpus fixes (from EVALUATION.md): - Clean 2,264 contaminated rows from augmented relations (bridge artifacts, full-sentence HasProperty values, null bytes, empty words) - Fix article logic: dynamic a/an across Deconstruction, FalseEquivalence, DenialOfConsequences, TautologicalWisdom templates - Tighten _short_concepts() default from max_words=3 to 2 - Fix FutilePreparation gerunding: filter vocab nouns and noun-suffix words from UsedFor targets; fix CVC doubling for 'y'-ending words - Add _looks_like_verb() heuristic, improve _a() for vowel-sound edges Pipeline hardening: - polish_corpus.py: context-size fallback (truncate chain, then minimal prompt), classified error types, consecutive-error circuit breaker, 10-entry flush granularity, ETA tracking, KeyboardInterrupt handling - generate_raw_batch.sh: fix python -> python3 Corpus generation run (9,835 raw -> 5,499 polished -> 2,312 filtered): - 44.1% discard rate, 0 errors, 82 minutes on RTX 4090 - 9,257 training pairs across 5 input framing types - 97.6% vocab coverage (609/624 words) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
356b62c6ea
commit
651ec3ffc6
10 changed files with 34853 additions and 2406 deletions
2312
corpus/corpus_filtered.jsonl
Normal file
2312
corpus/corpus_filtered.jsonl
Normal file
File diff suppressed because it is too large
Load diff
9835
corpus/corpus_polished.jsonl
Normal file
9835
corpus/corpus_polished.jsonl
Normal file
File diff suppressed because it is too large
Load diff
9835
corpus/corpus_raw.jsonl
Normal file
9835
corpus/corpus_raw.jsonl
Normal file
File diff suppressed because it is too large
Load diff
91
corpus/corpus_stats.json
Normal file
91
corpus/corpus_stats.json
Normal file
|
|
@ -0,0 +1,91 @@
|
||||||
|
{
|
||||||
|
"raw_count": 9835,
|
||||||
|
"raw_by_template": {
|
||||||
|
"deconstruction": 1500,
|
||||||
|
"denial_of_consequences": 1500,
|
||||||
|
"false_equivalence": 1500,
|
||||||
|
"futile_preparation": 1500,
|
||||||
|
"hypocritical_complaint": 1500,
|
||||||
|
"ironic_deficiency": 1500,
|
||||||
|
"tautological_wisdom": 835
|
||||||
|
},
|
||||||
|
"polished_count": 5499,
|
||||||
|
"discarded_during_polish": 4336,
|
||||||
|
"errors_during_polish": 0,
|
||||||
|
"polish_discard_rate": "44.1%",
|
||||||
|
"polished_by_template": {
|
||||||
|
"deconstruction": 1105,
|
||||||
|
"denial_of_consequences": 733,
|
||||||
|
"false_equivalence": 590,
|
||||||
|
"futile_preparation": 882,
|
||||||
|
"hypocritical_complaint": 573,
|
||||||
|
"ironic_deficiency": 831,
|
||||||
|
"tautological_wisdom": 785
|
||||||
|
},
|
||||||
|
"discarded_by_template": {
|
||||||
|
"deconstruction": 395,
|
||||||
|
"denial_of_consequences": 767,
|
||||||
|
"false_equivalence": 910,
|
||||||
|
"futile_preparation": 618,
|
||||||
|
"hypocritical_complaint": 927,
|
||||||
|
"ironic_deficiency": 669,
|
||||||
|
"tautological_wisdom": 50
|
||||||
|
},
|
||||||
|
"filtered_count": 2312,
|
||||||
|
"filtered_by_template": {
|
||||||
|
"deconstruction": 619,
|
||||||
|
"denial_of_consequences": 159,
|
||||||
|
"false_equivalence": 517,
|
||||||
|
"futile_preparation": 284,
|
||||||
|
"hypocritical_complaint": 168,
|
||||||
|
"ironic_deficiency": 358,
|
||||||
|
"tautological_wisdom": 207
|
||||||
|
},
|
||||||
|
"discarded_during_filter": 3187,
|
||||||
|
"training_pair_count": 9257,
|
||||||
|
"training_by_template": {
|
||||||
|
"deconstruction": 2488,
|
||||||
|
"denial_of_consequences": 630,
|
||||||
|
"false_equivalence": 2059,
|
||||||
|
"futile_preparation": 1146,
|
||||||
|
"hypocritical_complaint": 681,
|
||||||
|
"ironic_deficiency": 1429,
|
||||||
|
"tautological_wisdom": 824
|
||||||
|
},
|
||||||
|
"training_by_input_type": {
|
||||||
|
"category_seeded": 2312,
|
||||||
|
"open_ended": 562,
|
||||||
|
"persona_seeded": 2312,
|
||||||
|
"template_seeded": 1759,
|
||||||
|
"word_seeded": 2312
|
||||||
|
},
|
||||||
|
"unique_slot_words_used": 609,
|
||||||
|
"total_vocab_words": 624,
|
||||||
|
"vocab_coverage": "97.6%",
|
||||||
|
"words_never_used": [
|
||||||
|
"agate",
|
||||||
|
"alabaster",
|
||||||
|
"anise",
|
||||||
|
"azalea",
|
||||||
|
"bee",
|
||||||
|
"blowfish",
|
||||||
|
"cattail",
|
||||||
|
"cypress",
|
||||||
|
"emerald",
|
||||||
|
"gem",
|
||||||
|
"grebe",
|
||||||
|
"juniper",
|
||||||
|
"lyre",
|
||||||
|
"spear",
|
||||||
|
"theater"
|
||||||
|
],
|
||||||
|
"words_never_used_count": 15,
|
||||||
|
"avg_saying_length_words": 13.1,
|
||||||
|
"min_saying_length_words": 6,
|
||||||
|
"max_saying_length_words": 23,
|
||||||
|
"balance_warnings": [
|
||||||
|
"WARNING: denial_of_consequences has only 159 entries (6.9%) — below 10% threshold. Generate more raw sayings for this family.",
|
||||||
|
"WARNING: hypocritical_complaint has only 168 entries (7.3%) — below 10% threshold. Generate more raw sayings for this family.",
|
||||||
|
"WARNING: tautological_wisdom has only 207 entries (9.0%) — below 10% threshold. Generate more raw sayings for this family."
|
||||||
|
]
|
||||||
|
}
|
||||||
3188
corpus/discard_analysis.csv
Normal file
3188
corpus/discard_analysis.csv
Normal file
File diff suppressed because it is too large
Load diff
9257
corpus/training_pairs.jsonl
Normal file
9257
corpus/training_pairs.jsonl
Normal file
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
|
|
@ -199,10 +199,10 @@ class Deconstruction(MetaTemplate):
|
||||||
id = "deconstruction"
|
id = "deconstruction"
|
||||||
name = "Deconstruction"
|
name = "Deconstruction"
|
||||||
surface_templates = [
|
surface_templates = [
|
||||||
"You know what they say, a {A} with no {B} is just a {C} {D}.",
|
"You know what they say, {A_article} with no {B} is just a {C} {D}.",
|
||||||
"Take the {B} out of {A} and all you've got left is {C} {D}.",
|
"Take the {B} out of {A} and all you've got left is {C} {D}.",
|
||||||
"{A} without {B}? That's just {D} with ideas above its station.",
|
"{A} without {B}? That's just {D} with ideas above its station.",
|
||||||
"An {A} ain't nothing but {D} that met some {B}.",
|
"{A_Article} ain't nothing but {D} that met some {B}.",
|
||||||
]
|
]
|
||||||
|
|
||||||
def generate(self, seed_word=None, seed_category=None):
|
def generate(self, seed_word=None, seed_category=None):
|
||||||
|
|
@ -255,7 +255,8 @@ class Deconstruction(MetaTemplate):
|
||||||
c_word = random.choice(["plain", "sorry", "old", "humble", "dry", "wet", "cold"])
|
c_word = random.choice(["plain", "sorry", "old", "humble", "dry", "wet", "cold"])
|
||||||
|
|
||||||
template = self._pick_template()
|
template = self._pick_template()
|
||||||
saying = template.format(A=a, B=b_word, C=c_word, D=d_word)
|
saying = template.format(A=a, A_article=_a(a), A_Article=_a(a).capitalize(),
|
||||||
|
B=b_word, C=c_word, D=d_word)
|
||||||
|
|
||||||
debug = {
|
debug = {
|
||||||
"template_family": self.id,
|
"template_family": self.id,
|
||||||
|
|
@ -275,8 +276,8 @@ class DenialOfConsequences(MetaTemplate):
|
||||||
surface_templates = [
|
surface_templates = [
|
||||||
"Don't {C} the {A} and say you ain't got {B}.",
|
"Don't {C} the {A} and say you ain't got {B}.",
|
||||||
"Don't {C} the {A} and act surprised when the {B} show up.",
|
"Don't {C} the {A} and act surprised when the {B} show up.",
|
||||||
"Man who {C}s a {A} can't complain about {B}.",
|
"Man who {C}s {A_article} can't complain about {B}.",
|
||||||
"You can't {C} a {A} and then wonder where all the {B} came from.",
|
"You can't {C} {A_article} and then wonder where all the {B} came from.",
|
||||||
]
|
]
|
||||||
|
|
||||||
def generate(self, seed_word=None, seed_category=None):
|
def generate(self, seed_word=None, seed_category=None):
|
||||||
|
|
@ -323,7 +324,7 @@ class DenialOfConsequences(MetaTemplate):
|
||||||
c_word = random.choice(["build", "set up", "put out", "lay down", "make"])
|
c_word = random.choice(["build", "set up", "put out", "lay down", "make"])
|
||||||
|
|
||||||
template = self._pick_template()
|
template = self._pick_template()
|
||||||
saying = template.format(A=a, B=b_word, C=c_word)
|
saying = template.format(A=a, A_article=_a(a), B=b_word, C=c_word)
|
||||||
|
|
||||||
debug = {
|
debug = {
|
||||||
"template_family": self.id,
|
"template_family": self.id,
|
||||||
|
|
@ -407,8 +408,9 @@ class FutilePreparation(MetaTemplate):
|
||||||
if not seed:
|
if not seed:
|
||||||
return None, None
|
return None, None
|
||||||
|
|
||||||
# What is the seed used for?
|
# What is the seed used for? Filter out vocab nouns and noun-like words — we need verbs.
|
||||||
uses = _short_concepts(self.graph.neighbors(seed, "UsedFor", min_weight=0.5), max_words=2)
|
uses = _short_concepts(self.graph.neighbors(seed, "UsedFor", min_weight=0.5), max_words=2)
|
||||||
|
uses = [u for u in uses if u[0] not in self.graph.vocab and _looks_like_verb(u[0])]
|
||||||
if not uses:
|
if not uses:
|
||||||
return None, None
|
return None, None
|
||||||
|
|
||||||
|
|
@ -532,9 +534,9 @@ class TautologicalWisdom(MetaTemplate):
|
||||||
id = "tautological_wisdom"
|
id = "tautological_wisdom"
|
||||||
name = "Tautological Wisdom"
|
name = "Tautological Wisdom"
|
||||||
surface_templates = [
|
surface_templates = [
|
||||||
"You know what they say, it takes a {X} to get a {Y}.",
|
"You know what they say, it takes {X_article} to get {Y_article}.",
|
||||||
"My daddy always said, can't have {Y} without {X}.",
|
"My daddy always said, can't have {Y} without {X}.",
|
||||||
"A {Y} don't come without its {X}, now does it?",
|
"{Y_Article} don't come without its {X}, now does it?",
|
||||||
"You want {Y}? Well, first you're gonna need {X}.",
|
"You want {Y}? Well, first you're gonna need {X}.",
|
||||||
"Ain't no {Y} ever came from nothing — you need {X}.",
|
"Ain't no {Y} ever came from nothing — you need {X}.",
|
||||||
]
|
]
|
||||||
|
|
@ -566,7 +568,9 @@ class TautologicalWisdom(MetaTemplate):
|
||||||
chain_edge = choice[3]
|
chain_edge = choice[3]
|
||||||
|
|
||||||
template = self._pick_template()
|
template = self._pick_template()
|
||||||
saying = template.format(X=x_word, Y=y_word)
|
saying = template.format(X=x_word, Y=y_word,
|
||||||
|
X_article=_a(x_word), Y_article=_a(y_word),
|
||||||
|
Y_Article=_a(y_word).capitalize())
|
||||||
|
|
||||||
debug = {
|
debug = {
|
||||||
"template_family": self.id,
|
"template_family": self.id,
|
||||||
|
|
@ -584,10 +588,10 @@ class FalseEquivalence(MetaTemplate):
|
||||||
id = "false_equivalence"
|
id = "false_equivalence"
|
||||||
name = "False Equivalence"
|
name = "False Equivalence"
|
||||||
surface_templates = [
|
surface_templates = [
|
||||||
"A {A} is just a {B} that's got {P}.",
|
"{A_article} is just {B_article} that's got {P}.",
|
||||||
"What's a {A} but a {B} with {P}?",
|
"What's {A_article} but {B_article} with {P}?",
|
||||||
"The only difference between a {A} and a {B} is {P}.",
|
"The only difference between {A_article} and {B_article} is {P}.",
|
||||||
"Take the {P} from a {A} and you've got yourself a {B}.",
|
"Take the {P} from {A_article} and you've got yourself {B_article}.",
|
||||||
]
|
]
|
||||||
|
|
||||||
def generate(self, seed_word=None, seed_category=None):
|
def generate(self, seed_word=None, seed_category=None):
|
||||||
|
|
@ -635,7 +639,8 @@ class FalseEquivalence(MetaTemplate):
|
||||||
p_word = random.choice(["ambition", "an attitude", "a plan", "patience"])
|
p_word = random.choice(["ambition", "an attitude", "a plan", "patience"])
|
||||||
|
|
||||||
template = self._pick_template()
|
template = self._pick_template()
|
||||||
saying = template.format(A=a, B=b_word, P=p_word)
|
saying = template.format(A=a, B=b_word, P=p_word,
|
||||||
|
A_article=_a(a), B_article=_a(b_word))
|
||||||
|
|
||||||
debug = {
|
debug = {
|
||||||
"template_family": self.id,
|
"template_family": self.id,
|
||||||
|
|
@ -656,7 +661,7 @@ def _readable(concept):
|
||||||
return concept.replace("_", " ")
|
return concept.replace("_", " ")
|
||||||
|
|
||||||
|
|
||||||
def _short_concepts(items, max_words=3):
|
def _short_concepts(items, max_words=2):
|
||||||
"""Filter concept tuples to only those with short readable names.
|
"""Filter concept tuples to only those with short readable names.
|
||||||
|
|
||||||
Items can be tuples where first element is the concept string.
|
Items can be tuples where first element is the concept string.
|
||||||
|
|
@ -672,14 +677,37 @@ def _gerund(word):
|
||||||
return word[:-1] + "ing"
|
return word[:-1] + "ing"
|
||||||
if word.endswith("ing"):
|
if word.endswith("ing"):
|
||||||
return word
|
return word
|
||||||
if len(word) > 2 and word[-1] not in "aeiou" and word[-2] in "aeiou" and word[-3] not in "aeiou":
|
# CVC doubling: "run" -> "running", "sit" -> "sitting"
|
||||||
|
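# Treat a trailing 'y' as a vowel so it is never doubled (prey -> preying, not preyying).
# NOTE(review): the condition below only excludes 'y'; a trailing 'w' or 'x' is still doubled (chew -> chewwing, fix -> fixxing).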
|
||||||
|
if len(word) > 2 and word[-1] not in "aeiouy" and word[-2] in "aeiou" and word[-3] not in "aeiou":
|
||||||
return word + word[-1] + "ing"
|
return word + word[-1] + "ing"
|
||||||
return word + "ing"
|
return word + "ing"
|
||||||
|
|
||||||
|
|
||||||
|
def _looks_like_verb(word):
|
||||||
|
"""Heuristic: does this word look like it could be a verb?
|
||||||
|
Rejects words with obvious noun/adjective suffixes."""
|
||||||
|
w = word.split("_")[0].lower() if "_" in word else word.lower()
|
||||||
|
noun_suffixes = ("tion", "sion", "ment", "ness", "ity", "ance", "ence",
|
||||||
|
"ture", "ism", "ist", "ery", "ory", "ling")
|
||||||
|
return not any(w.endswith(s) for s in noun_suffixes)
|
||||||
|
|
||||||
|
|
||||||
def _a(word):
|
def _a(word):
|
||||||
"""Add 'a' or 'an' article."""
|
"""Add 'a' or 'an' article."""
|
||||||
if word and word[0] in "aeiou":
|
if not word:
|
||||||
|
return "a"
|
||||||
|
first = word.split()[0].lower() if word else ""
|
||||||
|
# Words that start with a vowel letter but consonant sound
|
||||||
|
consonant_sound = ("uni", "use", "used", "user", "usual", "usu", "uti", "uto",
|
||||||
|
"uro", "uku", "ula")
|
||||||
|
# Words that start with a consonant letter but vowel sound
|
||||||
|
vowel_sound = ("hour", "honest", "honor", "honour", "heir", "herb")
|
||||||
|
if any(first.startswith(p) for p in vowel_sound):
|
||||||
|
return f"an {word}"
|
||||||
|
if any(first.startswith(p) for p in consonant_sound):
|
||||||
|
return f"a {word}"
|
||||||
|
if first[0] in "aeiou":
|
||||||
return f"an {word}"
|
return f"an {word}"
|
||||||
return f"a {word}"
|
return f"a {word}"
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -34,7 +34,7 @@ total=0
|
||||||
for template in "${TEMPLATES[@]}"; do
|
for template in "${TEMPLATES[@]}"; do
|
||||||
echo -n " $template ($COUNT_PER_TEMPLATE)... "
|
echo -n " $template ($COUNT_PER_TEMPLATE)... "
|
||||||
before=$(wc -l < "$OUTPUT")
|
before=$(wc -l < "$OUTPUT")
|
||||||
python "$GENERATOR" --template "$template" --count "$COUNT_PER_TEMPLATE" --json >> "$OUTPUT" 2>/dev/null
|
python3 "$GENERATOR" --template "$template" --count "$COUNT_PER_TEMPLATE" --json >> "$OUTPUT" 2>/dev/null
|
||||||
after=$(wc -l < "$OUTPUT")
|
after=$(wc -l < "$OUTPUT")
|
||||||
generated=$((after - before))
|
generated=$((after - before))
|
||||||
total=$((total + generated))
|
total=$((total + generated))
|
||||||
|
|
@ -47,7 +47,7 @@ echo ""
|
||||||
|
|
||||||
# Check template distribution
|
# Check template distribution
|
||||||
echo "Template distribution:"
|
echo "Template distribution:"
|
||||||
python -c "
|
python3 -c "
|
||||||
import json, sys
|
import json, sys
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
counts = Counter()
|
counts = Counter()
|
||||||
|
|
|
||||||
|
|
@ -4,6 +4,13 @@
|
||||||
Reads corpus_raw.jsonl, sends each to GLM4-32B for polish.
|
Reads corpus_raw.jsonl, sends each to GLM4-32B for polish.
|
||||||
Output file is the checkpoint — append mode with resume detection.
|
Output file is the checkpoint — append mode with resume detection.
|
||||||
|
|
||||||
|
Robust error handling:
|
||||||
|
- Context size errors: truncates chain data and retries
|
||||||
|
- JSON parse errors: retries, then marks as error
|
||||||
|
- Transient HTTP errors: exponential backoff retry
|
||||||
|
- Keyboard interrupt: flushes and exits cleanly
|
||||||
|
- Safe resume: skips entries already in output file
|
||||||
|
|
||||||
Usage:
|
Usage:
|
||||||
python scripts/polish_corpus.py
|
python scripts/polish_corpus.py
|
||||||
python scripts/polish_corpus.py --input corpus/corpus_raw.jsonl --output corpus/corpus_polished.jsonl
|
python scripts/polish_corpus.py --input corpus/corpus_raw.jsonl --output corpus/corpus_polished.jsonl
|
||||||
|
|
@ -62,8 +69,27 @@ Chain: canoe UsedFor transport, fire UsedFor boiling_food
|
||||||
Polished: DISCARD"""
|
Polished: DISCARD"""
|
||||||
|
|
||||||
|
|
||||||
|
class LLMError(Exception):
    """Root of the exception hierarchy for failures talking to the LLM."""


class ContextTooLong(LLMError):
    """The prompt did not fit in the model's context window."""


class TransientError(LLMError):
    """A recoverable failure (network trouble, server overload, etc.)."""
|
||||||
|
|
||||||
|
|
||||||
def llm_chat_completion(messages, max_retries=3):
|
def llm_chat_completion(messages, max_retries=3):
|
||||||
"""Chat completion with retry logic."""
|
"""Chat completion with retry logic and error classification.
|
||||||
|
|
||||||
|
Returns (response_text, error_type) tuple.
|
||||||
|
response_text is None on failure; error_type is None on success.
|
||||||
|
"""
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
for attempt in range(max_retries):
|
for attempt in range(max_retries):
|
||||||
|
|
@ -71,21 +97,94 @@ def llm_chat_completion(messages, max_retries=3):
|
||||||
resp = requests.post(LLM_ENDPOINT, json={
|
resp = requests.post(LLM_ENDPOINT, json={
|
||||||
"model": LLM_MODEL,
|
"model": LLM_MODEL,
|
||||||
"messages": messages,
|
"messages": messages,
|
||||||
|
"temperature": 0.7,
|
||||||
}, timeout=120)
|
}, timeout=120)
|
||||||
resp.raise_for_status()
|
|
||||||
data = resp.json()
|
# Check for context length errors (HTTP 400 typically)
|
||||||
return data["choices"][0]["message"]["content"].strip()
|
if resp.status_code == 400:
|
||||||
except Exception as e:
|
body = resp.text.lower()
|
||||||
wait = (2 ** attempt)
|
if any(kw in body for kw in ["context", "token", "length", "too long", "exceed"]):
|
||||||
print(f" LLM error (attempt {attempt+1}/{max_retries}): {e}", file=sys.stderr)
|
return None, "context_too_long"
|
||||||
|
# Other 400 errors — log and retry
|
||||||
|
print(f" HTTP 400 (attempt {attempt+1}): {resp.text[:200]}", file=sys.stderr)
|
||||||
|
if attempt < max_retries - 1:
|
||||||
|
time.sleep(2 ** attempt)
|
||||||
|
continue
|
||||||
|
return None, "http_400"
|
||||||
|
|
||||||
|
if resp.status_code == 503 or resp.status_code == 429:
|
||||||
|
wait = 2 ** (attempt + 1)
|
||||||
|
print(f" HTTP {resp.status_code} (attempt {attempt+1}), waiting {wait}s...",
|
||||||
|
file=sys.stderr)
|
||||||
if attempt < max_retries - 1:
|
if attempt < max_retries - 1:
|
||||||
time.sleep(wait)
|
time.sleep(wait)
|
||||||
else:
|
continue
|
||||||
return None
|
return None, "server_overload"
|
||||||
|
|
||||||
|
resp.raise_for_status()
|
||||||
|
|
||||||
|
# Parse JSON response
|
||||||
|
try:
|
||||||
|
data = resp.json()
|
||||||
|
except (json.JSONDecodeError, ValueError) as e:
|
||||||
|
print(f" JSON parse error (attempt {attempt+1}): {e}", file=sys.stderr)
|
||||||
|
print(f" Response body: {resp.text[:300]}", file=sys.stderr)
|
||||||
|
if attempt < max_retries - 1:
|
||||||
|
time.sleep(2 ** attempt)
|
||||||
|
continue
|
||||||
|
return None, "json_parse"
|
||||||
|
|
||||||
|
# Extract content from response
|
||||||
|
try:
|
||||||
|
content = data["choices"][0]["message"]["content"]
|
||||||
|
if content is None:
|
||||||
|
print(f" Null content in response (attempt {attempt+1})", file=sys.stderr)
|
||||||
|
if attempt < max_retries - 1:
|
||||||
|
time.sleep(1)
|
||||||
|
continue
|
||||||
|
return None, "null_content"
|
||||||
|
return content.strip(), None
|
||||||
|
except (KeyError, IndexError) as e:
|
||||||
|
print(f" Unexpected JSON structure (attempt {attempt+1}): {e}", file=sys.stderr)
|
||||||
|
print(f" Keys: {list(data.keys()) if isinstance(data, dict) else type(data)}",
|
||||||
|
file=sys.stderr)
|
||||||
|
if attempt < max_retries - 1:
|
||||||
|
time.sleep(1)
|
||||||
|
continue
|
||||||
|
return None, "json_structure"
|
||||||
|
|
||||||
|
except requests.exceptions.Timeout:
|
||||||
|
wait = 2 ** (attempt + 1)
|
||||||
|
print(f" Timeout (attempt {attempt+1}), waiting {wait}s...", file=sys.stderr)
|
||||||
|
if attempt < max_retries - 1:
|
||||||
|
time.sleep(wait)
|
||||||
|
continue
|
||||||
|
return None, "timeout"
|
||||||
|
|
||||||
|
except requests.exceptions.ConnectionError as e:
|
||||||
|
wait = 2 ** (attempt + 2) # longer wait for connection errors
|
||||||
|
print(f" Connection error (attempt {attempt+1}): {e}", file=sys.stderr)
|
||||||
|
if attempt < max_retries - 1:
|
||||||
|
time.sleep(wait)
|
||||||
|
continue
|
||||||
|
return None, "connection"
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print(f" Unexpected error (attempt {attempt+1}): {type(e).__name__}: {e}",
|
||||||
|
file=sys.stderr)
|
||||||
|
if attempt < max_retries - 1:
|
||||||
|
time.sleep(2 ** attempt)
|
||||||
|
continue
|
||||||
|
return None, "unexpected"
|
||||||
|
|
||||||
|
return None, "exhausted_retries"
|
||||||
|
|
||||||
|
|
||||||
def format_chain(chain_edges):
|
def format_chain(chain_edges, truncate=False):
|
||||||
"""Format chain_edges list into readable string for LLM context."""
|
"""Format chain_edges list into readable string for LLM context.
|
||||||
|
|
||||||
|
If truncate=True, omit the per-edge weights to reduce token count.
|
||||||
|
"""
|
||||||
if not chain_edges:
|
if not chain_edges:
|
||||||
return "(no chain data)"
|
return "(no chain data)"
|
||||||
parts = []
|
parts = []
|
||||||
|
|
@ -93,6 +192,9 @@ def format_chain(chain_edges):
|
||||||
start = edge.get("start", "?")
|
start = edge.get("start", "?")
|
||||||
rel = edge.get("relation", "?")
|
rel = edge.get("relation", "?")
|
||||||
end = edge.get("end", "?")
|
end = edge.get("end", "?")
|
||||||
|
if truncate:
|
||||||
|
parts.append(f"{start} --{rel}--> {end}")
|
||||||
|
else:
|
||||||
weight = edge.get("weight", 0)
|
weight = edge.get("weight", 0)
|
||||||
parts.append(f"{start} --{rel}--> {end} (w:{weight:.1f})")
|
parts.append(f"{start} --{rel}--> {end} (w:{weight:.1f})")
|
||||||
return ", ".join(parts)
|
return ", ".join(parts)
|
||||||
|
|
@ -103,9 +205,33 @@ def format_slots(slots):
|
||||||
return ", ".join(f"{k}={v}" for k, v in slots.items())
|
return ", ".join(f"{k}={v}" for k, v in slots.items())
|
||||||
|
|
||||||
|
|
||||||
|
def build_messages(entry, truncate_chain=False):
    """Assemble the chat messages used to polish one raw corpus entry.

    Args:
        entry: Raw corpus record; the keys ``raw_text``, ``meta_template``,
            ``chain`` and ``slots`` are read, each with an empty default.
        truncate_chain: Forwarded to ``format_chain`` as ``truncate``.

    Returns:
        A two-element list: the system message carrying ``SYSTEM_PROMPT``
        followed by the user message describing the entry.
    """
    saying = entry.get("raw_text", "")
    template_name = entry.get("meta_template", "")
    chain_text = format_chain(entry.get("chain", []), truncate=truncate_chain)
    slot_text = format_slots(entry.get("slots", {}))

    system_message = {"role": "system", "content": SYSTEM_PROMPT}
    user_message = {
        "role": "user",
        "content": (
            f"Meta-template: {template_name}\n"
            f"Relationship chain: {chain_text}\n"
            f"Slot fills: {slot_text}\n"
            f"Raw saying: {saying}"
        ),
    }
    return [system_message, user_message]
|
||||||
|
|
||||||
|
|
||||||
def load_already_processed(output_path):
|
def load_already_processed(output_path):
|
||||||
"""Load set of raw_text strings already processed (for resume)."""
|
"""Load set of raw_text strings already processed (for resume).
|
||||||
|
|
||||||
|
Also returns counts of each status for accurate progress reporting.
|
||||||
|
"""
|
||||||
processed = set()
|
processed = set()
|
||||||
|
counts = {"polished": 0, "discarded": 0, "error": 0}
|
||||||
if output_path.exists():
|
if output_path.exists():
|
||||||
with open(output_path, encoding="utf-8") as f:
|
with open(output_path, encoding="utf-8") as f:
|
||||||
for line in f:
|
for line in f:
|
||||||
|
|
@ -115,9 +241,12 @@ def load_already_processed(output_path):
|
||||||
try:
|
try:
|
||||||
entry = json.loads(line)
|
entry = json.loads(line)
|
||||||
processed.add(entry.get("raw_text", ""))
|
processed.add(entry.get("raw_text", ""))
|
||||||
|
status = entry.get("status", "")
|
||||||
|
if status in counts:
|
||||||
|
counts[status] += 1
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
continue
|
continue
|
||||||
return processed
|
return processed, counts
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
|
@ -141,15 +270,21 @@ def main():
|
||||||
for line in f:
|
for line in f:
|
||||||
line = line.strip()
|
line = line.strip()
|
||||||
if line:
|
if line:
|
||||||
|
try:
|
||||||
raw_entries.append(json.loads(line))
|
raw_entries.append(json.loads(line))
|
||||||
|
except json.JSONDecodeError as e:
|
||||||
|
print(f"Warning: skipping malformed input line: {e}", file=sys.stderr)
|
||||||
|
|
||||||
print(f"Loaded {len(raw_entries)} raw entries from {input_path}")
|
print(f"Loaded {len(raw_entries)} raw entries from {input_path}")
|
||||||
|
|
||||||
# Check what's already been processed
|
# Check what's already been processed
|
||||||
already_processed = load_already_processed(output_path)
|
already_processed, prev_counts = load_already_processed(output_path)
|
||||||
remaining = [e for e in raw_entries if e.get("raw_text", "") not in already_processed]
|
remaining = [e for e in raw_entries if e.get("raw_text", "") not in already_processed]
|
||||||
|
|
||||||
print(f"Already processed: {len(already_processed)}")
|
print(f"Already processed: {len(already_processed)} "
|
||||||
|
f"(polished={prev_counts['polished']}, "
|
||||||
|
f"discarded={prev_counts['discarded']}, "
|
||||||
|
f"errors={prev_counts['error']})")
|
||||||
print(f"Remaining: {len(remaining)}")
|
print(f"Remaining: {len(remaining)}")
|
||||||
|
|
||||||
if not remaining:
|
if not remaining:
|
||||||
|
|
@ -159,55 +294,105 @@ def main():
|
||||||
discards = 0
|
discards = 0
|
||||||
polished = 0
|
polished = 0
|
||||||
errors = 0
|
errors = 0
|
||||||
|
error_types = {}
|
||||||
|
consecutive_errors = 0
|
||||||
|
start_time = time.time()
|
||||||
|
|
||||||
|
try:
|
||||||
with open(output_path, "a", encoding="utf-8") as out:
|
with open(output_path, "a", encoding="utf-8") as out:
|
||||||
for i, entry in enumerate(remaining):
|
for i, entry in enumerate(remaining):
|
||||||
|
# First attempt with full chain data
|
||||||
|
messages = build_messages(entry, truncate_chain=False)
|
||||||
|
response, error_type = llm_chat_completion(messages)
|
||||||
|
|
||||||
|
# If context too long, retry with truncated chain
|
||||||
|
if error_type == "context_too_long":
|
||||||
|
print(f" #{i+1}: context too long, retrying with truncated chain...",
|
||||||
|
file=sys.stderr)
|
||||||
|
messages = build_messages(entry, truncate_chain=True)
|
||||||
|
response, error_type = llm_chat_completion(messages)
|
||||||
|
|
||||||
|
# If still too long, try with just the raw text
|
||||||
|
if error_type == "context_too_long":
|
||||||
|
print(f" #{i+1}: still too long, retrying with minimal prompt...",
|
||||||
|
file=sys.stderr)
|
||||||
raw_text = entry.get("raw_text", "")
|
raw_text = entry.get("raw_text", "")
|
||||||
meta_template = entry.get("meta_template", "")
|
|
||||||
chain = format_chain(entry.get("chain", []))
|
|
||||||
slots = format_slots(entry.get("slots", {}))
|
|
||||||
|
|
||||||
user_prompt = (
|
|
||||||
f"Meta-template: {meta_template}\n"
|
|
||||||
f"Relationship chain: {chain}\n"
|
|
||||||
f"Slot fills: {slots}\n"
|
|
||||||
f"Raw saying: {raw_text}"
|
|
||||||
)
|
|
||||||
|
|
||||||
messages = [
|
messages = [
|
||||||
{"role": "system", "content": SYSTEM_PROMPT},
|
{"role": "system", "content": SYSTEM_PROMPT},
|
||||||
{"role": "user", "content": user_prompt},
|
{"role": "user", "content": f"Raw saying: {raw_text}"},
|
||||||
]
|
]
|
||||||
|
response, error_type = llm_chat_completion(messages)
|
||||||
response = llm_chat_completion(messages)
|
|
||||||
|
|
||||||
if response is None:
|
if response is None:
|
||||||
entry["status"] = "error"
|
entry["status"] = "error"
|
||||||
|
entry["error_type"] = error_type or "unknown"
|
||||||
errors += 1
|
errors += 1
|
||||||
|
consecutive_errors += 1
|
||||||
|
error_types[error_type] = error_types.get(error_type, 0) + 1
|
||||||
|
|
||||||
|
# If we get 20 consecutive errors, something is seriously wrong
|
||||||
|
if consecutive_errors >= 20:
|
||||||
|
print(f"\nFATAL: {consecutive_errors} consecutive errors. "
|
||||||
|
f"Last error type: {error_type}", file=sys.stderr)
|
||||||
|
print("Flushing output and stopping. Re-run to resume.", file=sys.stderr)
|
||||||
|
out.write(json.dumps(entry, ensure_ascii=False) + "\n")
|
||||||
|
out.flush()
|
||||||
|
sys.exit(1)
|
||||||
elif response.strip().upper() == "DISCARD":
|
elif response.strip().upper() == "DISCARD":
|
||||||
entry["status"] = "discarded"
|
entry["status"] = "discarded"
|
||||||
discards += 1
|
discards += 1
|
||||||
|
consecutive_errors = 0
|
||||||
else:
|
else:
|
||||||
entry["polished_text"] = response.strip()
|
# Sanity check the response
|
||||||
|
cleaned = response.strip()
|
||||||
|
# Sometimes the LLM wraps in quotes
|
||||||
|
if cleaned.startswith('"') and cleaned.endswith('"'):
|
||||||
|
cleaned = cleaned[1:-1]
|
||||||
|
# Sometimes the LLM prefixes with "Polished:" or similar
|
||||||
|
for prefix in ["Polished:", "polished:", "Output:", "Result:"]:
|
||||||
|
if cleaned.startswith(prefix):
|
||||||
|
cleaned = cleaned[len(prefix):].strip()
|
||||||
|
entry["polished_text"] = cleaned
|
||||||
entry["status"] = "polished"
|
entry["status"] = "polished"
|
||||||
polished += 1
|
polished += 1
|
||||||
|
consecutive_errors = 0
|
||||||
|
|
||||||
out.write(json.dumps(entry, ensure_ascii=False) + "\n")
|
out.write(json.dumps(entry, ensure_ascii=False) + "\n")
|
||||||
|
|
||||||
if (i + 1) % 100 == 0:
|
# Flush every 10 entries for fine-grained resume safety
|
||||||
|
if (i + 1) % 10 == 0:
|
||||||
out.flush()
|
out.flush()
|
||||||
|
|
||||||
|
# Progress report every 100 entries
|
||||||
|
if (i + 1) % 100 == 0:
|
||||||
total_done = len(already_processed) + i + 1
|
total_done = len(already_processed) + i + 1
|
||||||
|
elapsed = time.time() - start_time
|
||||||
|
rate = (i + 1) / elapsed
|
||||||
|
eta_sec = (len(remaining) - (i + 1)) / rate if rate > 0 else 0
|
||||||
|
eta_min = eta_sec / 60
|
||||||
print(f" [{total_done}/{len(raw_entries)}] "
|
print(f" [{total_done}/{len(raw_entries)}] "
|
||||||
f"polished={polished}, discarded={discards}, errors={errors}")
|
f"polished={polished}, discarded={discards}, errors={errors} "
|
||||||
|
f"({rate:.1f}/s, ETA {eta_min:.0f}m)")
|
||||||
|
|
||||||
time.sleep(0.1)
|
time.sleep(0.1)
|
||||||
|
|
||||||
total_done = len(already_processed) + len(remaining)
|
except KeyboardInterrupt:
|
||||||
print(f"\nDone: {total_done} total entries processed.")
|
print(f"\nInterrupted at entry {i+1}/{len(remaining)}. "
|
||||||
|
f"Progress saved — re-run to resume.", file=sys.stderr)
|
||||||
|
|
||||||
|
# Final report
|
||||||
|
elapsed = time.time() - start_time
|
||||||
|
total_done = len(already_processed) + polished + discards + errors
|
||||||
|
print(f"\nSession complete: {polished + discards + errors} entries processed "
|
||||||
|
f"in {elapsed/60:.1f} minutes.")
|
||||||
print(f" Polished: {polished}")
|
print(f" Polished: {polished}")
|
||||||
print(f" Discarded: {discards}")
|
print(f" Discarded: {discards}")
|
||||||
print(f" Errors: {errors}")
|
print(f" Errors: {errors}")
|
||||||
print(f" Discard rate: {discards/(polished+discards)*100:.1f}%" if (polished+discards) else " N/A")
|
if error_types:
|
||||||
|
print(f" Error breakdown: {error_types}")
|
||||||
|
if polished + discards > 0:
|
||||||
|
print(f" Discard rate: {discards/(polished+discards)*100:.1f}%")
|
||||||
|
print(f" Total across all sessions: {total_done}/{len(raw_entries)}")
|
||||||
print(f"Output: {output_path}")
|
print(f"Output: {output_path}")
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue