Fix generator quality issues and run initial corpus pipeline

Pre-corpus fixes (from EVALUATION.md):
- Clean 2,264 contaminated rows from augmented relations (bridge
  artifacts, full-sentence HasProperty values, null bytes, empty words)
- Fix article logic: dynamic a/an across Deconstruction, FalseEquivalence,
  DenialOfConsequences, TautologicalWisdom templates
- Tighten _short_concepts() default from max_words=3 to 2
- Fix FutilePreparation gerunding: filter vocab nouns and noun-suffix
  words from UsedFor targets; fix CVC doubling for 'y'-ending words
- Add _looks_like_verb() heuristic, improve _a() for vowel-sound edges

Pipeline hardening:
- polish_corpus.py: context-size fallback (truncate chain, then minimal
  prompt), classified error types, consecutive-error circuit breaker,
  10-entry flush granularity, ETA tracking, KeyboardInterrupt handling
- generate_raw_batch.sh: fix python -> python3

Corpus generation run (9,835 raw -> 5,499 polished -> 2,312 filtered):
- 44.1% discard rate, 0 errors, 82 minutes on RTX 4090
- 9,257 training pairs across 5 input framing types
- 97.6% vocab coverage (609/624 words)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
John McCardle 2026-03-10 04:33:56 -04:00
commit 651ec3ffc6
10 changed files with 34853 additions and 2406 deletions

2312
corpus/corpus_filtered.jsonl Normal file

File diff suppressed because it is too large Load diff

9835
corpus/corpus_polished.jsonl Normal file

File diff suppressed because it is too large Load diff

9835
corpus/corpus_raw.jsonl Normal file

File diff suppressed because it is too large Load diff

91
corpus/corpus_stats.json Normal file
View file

@ -0,0 +1,91 @@
{
"raw_count": 9835,
"raw_by_template": {
"deconstruction": 1500,
"denial_of_consequences": 1500,
"false_equivalence": 1500,
"futile_preparation": 1500,
"hypocritical_complaint": 1500,
"ironic_deficiency": 1500,
"tautological_wisdom": 835
},
"polished_count": 5499,
"discarded_during_polish": 4336,
"errors_during_polish": 0,
"polish_discard_rate": "44.1%",
"polished_by_template": {
"deconstruction": 1105,
"denial_of_consequences": 733,
"false_equivalence": 590,
"futile_preparation": 882,
"hypocritical_complaint": 573,
"ironic_deficiency": 831,
"tautological_wisdom": 785
},
"discarded_by_template": {
"deconstruction": 395,
"denial_of_consequences": 767,
"false_equivalence": 910,
"futile_preparation": 618,
"hypocritical_complaint": 927,
"ironic_deficiency": 669,
"tautological_wisdom": 50
},
"filtered_count": 2312,
"filtered_by_template": {
"deconstruction": 619,
"denial_of_consequences": 159,
"false_equivalence": 517,
"futile_preparation": 284,
"hypocritical_complaint": 168,
"ironic_deficiency": 358,
"tautological_wisdom": 207
},
"discarded_during_filter": 3187,
"training_pair_count": 9257,
"training_by_template": {
"deconstruction": 2488,
"denial_of_consequences": 630,
"false_equivalence": 2059,
"futile_preparation": 1146,
"hypocritical_complaint": 681,
"ironic_deficiency": 1429,
"tautological_wisdom": 824
},
"training_by_input_type": {
"category_seeded": 2312,
"open_ended": 562,
"persona_seeded": 2312,
"template_seeded": 1759,
"word_seeded": 2312
},
"unique_slot_words_used": 609,
"total_vocab_words": 624,
"vocab_coverage": "97.6%",
"words_never_used": [
"agate",
"alabaster",
"anise",
"azalea",
"bee",
"blowfish",
"cattail",
"cypress",
"emerald",
"gem",
"grebe",
"juniper",
"lyre",
"spear",
"theater"
],
"words_never_used_count": 15,
"avg_saying_length_words": 13.1,
"min_saying_length_words": 6,
"max_saying_length_words": 23,
"balance_warnings": [
"WARNING: denial_of_consequences has only 159 entries (6.9%) — below 10% threshold. Generate more raw sayings for this family.",
"WARNING: hypocritical_complaint has only 168 entries (7.3%) — below 10% threshold. Generate more raw sayings for this family.",
"WARNING: tautological_wisdom has only 207 entries (9.0%) — below 10% threshold. Generate more raw sayings for this family."
]
}

3188
corpus/discard_analysis.csv Normal file

File diff suppressed because it is too large Load diff

9257
corpus/training_pairs.jsonl Normal file

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -199,10 +199,10 @@ class Deconstruction(MetaTemplate):
id = "deconstruction" id = "deconstruction"
name = "Deconstruction" name = "Deconstruction"
surface_templates = [ surface_templates = [
"You know what they say, a {A} with no {B} is just a {C} {D}.", "You know what they say, {A_article} with no {B} is just a {C} {D}.",
"Take the {B} out of {A} and all you've got left is {C} {D}.", "Take the {B} out of {A} and all you've got left is {C} {D}.",
"{A} without {B}? That's just {D} with ideas above its station.", "{A} without {B}? That's just {D} with ideas above its station.",
"An {A} ain't nothing but {D} that met some {B}.", "{A_Article} ain't nothing but {D} that met some {B}.",
] ]
def generate(self, seed_word=None, seed_category=None): def generate(self, seed_word=None, seed_category=None):
@ -255,7 +255,8 @@ class Deconstruction(MetaTemplate):
c_word = random.choice(["plain", "sorry", "old", "humble", "dry", "wet", "cold"]) c_word = random.choice(["plain", "sorry", "old", "humble", "dry", "wet", "cold"])
template = self._pick_template() template = self._pick_template()
saying = template.format(A=a, B=b_word, C=c_word, D=d_word) saying = template.format(A=a, A_article=_a(a), A_Article=_a(a).capitalize(),
B=b_word, C=c_word, D=d_word)
debug = { debug = {
"template_family": self.id, "template_family": self.id,
@ -275,8 +276,8 @@ class DenialOfConsequences(MetaTemplate):
surface_templates = [ surface_templates = [
"Don't {C} the {A} and say you ain't got {B}.", "Don't {C} the {A} and say you ain't got {B}.",
"Don't {C} the {A} and act surprised when the {B} show up.", "Don't {C} the {A} and act surprised when the {B} show up.",
"Man who {C}s a {A} can't complain about {B}.", "Man who {C}s {A_article} can't complain about {B}.",
"You can't {C} a {A} and then wonder where all the {B} came from.", "You can't {C} {A_article} and then wonder where all the {B} came from.",
] ]
def generate(self, seed_word=None, seed_category=None): def generate(self, seed_word=None, seed_category=None):
@ -323,7 +324,7 @@ class DenialOfConsequences(MetaTemplate):
c_word = random.choice(["build", "set up", "put out", "lay down", "make"]) c_word = random.choice(["build", "set up", "put out", "lay down", "make"])
template = self._pick_template() template = self._pick_template()
saying = template.format(A=a, B=b_word, C=c_word) saying = template.format(A=a, A_article=_a(a), B=b_word, C=c_word)
debug = { debug = {
"template_family": self.id, "template_family": self.id,
@ -407,8 +408,9 @@ class FutilePreparation(MetaTemplate):
if not seed: if not seed:
return None, None return None, None
# What is the seed used for? # What is the seed used for? Filter out vocab nouns and noun-like words — we need verbs.
uses = _short_concepts(self.graph.neighbors(seed, "UsedFor", min_weight=0.5), max_words=2) uses = _short_concepts(self.graph.neighbors(seed, "UsedFor", min_weight=0.5), max_words=2)
uses = [u for u in uses if u[0] not in self.graph.vocab and _looks_like_verb(u[0])]
if not uses: if not uses:
return None, None return None, None
@ -532,9 +534,9 @@ class TautologicalWisdom(MetaTemplate):
id = "tautological_wisdom" id = "tautological_wisdom"
name = "Tautological Wisdom" name = "Tautological Wisdom"
surface_templates = [ surface_templates = [
"You know what they say, it takes a {X} to get a {Y}.", "You know what they say, it takes {X_article} to get {Y_article}.",
"My daddy always said, can't have {Y} without {X}.", "My daddy always said, can't have {Y} without {X}.",
"A {Y} don't come without its {X}, now does it?", "{Y_Article} don't come without its {X}, now does it?",
"You want {Y}? Well, first you're gonna need {X}.", "You want {Y}? Well, first you're gonna need {X}.",
"Ain't no {Y} ever came from nothing — you need {X}.", "Ain't no {Y} ever came from nothing — you need {X}.",
] ]
@ -566,7 +568,9 @@ class TautologicalWisdom(MetaTemplate):
chain_edge = choice[3] chain_edge = choice[3]
template = self._pick_template() template = self._pick_template()
saying = template.format(X=x_word, Y=y_word) saying = template.format(X=x_word, Y=y_word,
X_article=_a(x_word), Y_article=_a(y_word),
Y_Article=_a(y_word).capitalize())
debug = { debug = {
"template_family": self.id, "template_family": self.id,
@ -584,10 +588,10 @@ class FalseEquivalence(MetaTemplate):
id = "false_equivalence" id = "false_equivalence"
name = "False Equivalence" name = "False Equivalence"
surface_templates = [ surface_templates = [
"A {A} is just a {B} that's got {P}.", "{A_article} is just {B_article} that's got {P}.",
"What's a {A} but a {B} with {P}?", "What's {A_article} but {B_article} with {P}?",
"The only difference between a {A} and a {B} is {P}.", "The only difference between {A_article} and {B_article} is {P}.",
"Take the {P} from a {A} and you've got yourself a {B}.", "Take the {P} from {A_article} and you've got yourself {B_article}.",
] ]
def generate(self, seed_word=None, seed_category=None): def generate(self, seed_word=None, seed_category=None):
@ -635,7 +639,8 @@ class FalseEquivalence(MetaTemplate):
p_word = random.choice(["ambition", "an attitude", "a plan", "patience"]) p_word = random.choice(["ambition", "an attitude", "a plan", "patience"])
template = self._pick_template() template = self._pick_template()
saying = template.format(A=a, B=b_word, P=p_word) saying = template.format(A=a, B=b_word, P=p_word,
A_article=_a(a), B_article=_a(b_word))
debug = { debug = {
"template_family": self.id, "template_family": self.id,
@ -656,7 +661,7 @@ def _readable(concept):
return concept.replace("_", " ") return concept.replace("_", " ")
def _short_concepts(items, max_words=3): def _short_concepts(items, max_words=2):
"""Filter concept tuples to only those with short readable names. """Filter concept tuples to only those with short readable names.
Items can be tuples where first element is the concept string. Items can be tuples where first element is the concept string.
@ -672,14 +677,37 @@ def _gerund(word):
return word[:-1] + "ing" return word[:-1] + "ing"
if word.endswith("ing"): if word.endswith("ing"):
return word return word
if len(word) > 2 and word[-1] not in "aeiou" and word[-2] in "aeiou" and word[-3] not in "aeiou": # CVC doubling: "run" -> "running", "sit" -> "sitting"
# Treat 'y' and 'w' as vowels at word end (prey->preying, not preyying)
if len(word) > 2 and word[-1] not in "aeiouy" and word[-2] in "aeiou" and word[-3] not in "aeiou":
return word + word[-1] + "ing" return word + word[-1] + "ing"
return word + "ing" return word + "ing"
def _looks_like_verb(word):
"""Heuristic: does this word look like it could be a verb?
Rejects words with obvious noun/adjective suffixes."""
w = word.split("_")[0].lower() if "_" in word else word.lower()
noun_suffixes = ("tion", "sion", "ment", "ness", "ity", "ance", "ence",
"ture", "ism", "ist", "ery", "ory", "ling")
return not any(w.endswith(s) for s in noun_suffixes)
def _a(word): def _a(word):
"""Add 'a' or 'an' article.""" """Add 'a' or 'an' article."""
if word and word[0] in "aeiou": if not word:
return "a"
first = word.split()[0].lower() if word else ""
# Words that start with a vowel letter but consonant sound
consonant_sound = ("uni", "use", "used", "user", "usual", "usu", "uti", "uto",
"uro", "uku", "ula")
# Words that start with a consonant letter but vowel sound
vowel_sound = ("hour", "honest", "honor", "honour", "heir", "herb")
if any(first.startswith(p) for p in vowel_sound):
return f"an {word}"
if any(first.startswith(p) for p in consonant_sound):
return f"a {word}"
if first[0] in "aeiou":
return f"an {word}" return f"an {word}"
return f"a {word}" return f"a {word}"

View file

@ -34,7 +34,7 @@ total=0
for template in "${TEMPLATES[@]}"; do for template in "${TEMPLATES[@]}"; do
echo -n " $template ($COUNT_PER_TEMPLATE)... " echo -n " $template ($COUNT_PER_TEMPLATE)... "
before=$(wc -l < "$OUTPUT") before=$(wc -l < "$OUTPUT")
python "$GENERATOR" --template "$template" --count "$COUNT_PER_TEMPLATE" --json >> "$OUTPUT" 2>/dev/null python3 "$GENERATOR" --template "$template" --count "$COUNT_PER_TEMPLATE" --json >> "$OUTPUT" 2>/dev/null
after=$(wc -l < "$OUTPUT") after=$(wc -l < "$OUTPUT")
generated=$((after - before)) generated=$((after - before))
total=$((total + generated)) total=$((total + generated))
@ -47,7 +47,7 @@ echo ""
# Check template distribution # Check template distribution
echo "Template distribution:" echo "Template distribution:"
python -c " python3 -c "
import json, sys import json, sys
from collections import Counter from collections import Counter
counts = Counter() counts = Counter()

View file

@ -4,6 +4,13 @@
Reads corpus_raw.jsonl, sends each to GLM4-32B for polish. Reads corpus_raw.jsonl, sends each to GLM4-32B for polish.
Output file is the checkpoint append mode with resume detection. Output file is the checkpoint append mode with resume detection.
Robust error handling:
- Context size errors: truncates chain data and retries
- JSON parse errors: retries, then marks as error
- Transient HTTP errors: exponential backoff retry
- Keyboard interrupt: flushes and exits cleanly
- Safe resume: skips entries already in output file
Usage: Usage:
python scripts/polish_corpus.py python scripts/polish_corpus.py
python scripts/polish_corpus.py --input corpus/corpus_raw.jsonl --output corpus/corpus_polished.jsonl python scripts/polish_corpus.py --input corpus/corpus_raw.jsonl --output corpus/corpus_polished.jsonl
@ -62,8 +69,27 @@ Chain: canoe UsedFor transport, fire UsedFor boiling_food
Polished: DISCARD""" Polished: DISCARD"""
class LLMError(Exception):
"""Base class for LLM errors."""
pass
class ContextTooLong(LLMError):
"""Prompt exceeded context window."""
pass
class TransientError(LLMError):
"""Recoverable error (network, server overload, etc.)."""
pass
def llm_chat_completion(messages, max_retries=3): def llm_chat_completion(messages, max_retries=3):
"""Chat completion with retry logic.""" """Chat completion with retry logic and error classification.
Returns (response_text, error_type) tuple.
response_text is None on failure; error_type is None on success.
"""
import requests import requests
for attempt in range(max_retries): for attempt in range(max_retries):
@ -71,21 +97,94 @@ def llm_chat_completion(messages, max_retries=3):
resp = requests.post(LLM_ENDPOINT, json={ resp = requests.post(LLM_ENDPOINT, json={
"model": LLM_MODEL, "model": LLM_MODEL,
"messages": messages, "messages": messages,
"temperature": 0.7,
}, timeout=120) }, timeout=120)
resp.raise_for_status()
data = resp.json() # Check for context length errors (HTTP 400 typically)
return data["choices"][0]["message"]["content"].strip() if resp.status_code == 400:
except Exception as e: body = resp.text.lower()
wait = (2 ** attempt) if any(kw in body for kw in ["context", "token", "length", "too long", "exceed"]):
print(f" LLM error (attempt {attempt+1}/{max_retries}): {e}", file=sys.stderr) return None, "context_too_long"
# Other 400 errors — log and retry
print(f" HTTP 400 (attempt {attempt+1}): {resp.text[:200]}", file=sys.stderr)
if attempt < max_retries - 1:
time.sleep(2 ** attempt)
continue
return None, "http_400"
if resp.status_code == 503 or resp.status_code == 429:
wait = 2 ** (attempt + 1)
print(f" HTTP {resp.status_code} (attempt {attempt+1}), waiting {wait}s...",
file=sys.stderr)
if attempt < max_retries - 1: if attempt < max_retries - 1:
time.sleep(wait) time.sleep(wait)
else: continue
return None return None, "server_overload"
resp.raise_for_status()
# Parse JSON response
try:
data = resp.json()
except (json.JSONDecodeError, ValueError) as e:
print(f" JSON parse error (attempt {attempt+1}): {e}", file=sys.stderr)
print(f" Response body: {resp.text[:300]}", file=sys.stderr)
if attempt < max_retries - 1:
time.sleep(2 ** attempt)
continue
return None, "json_parse"
# Extract content from response
try:
content = data["choices"][0]["message"]["content"]
if content is None:
print(f" Null content in response (attempt {attempt+1})", file=sys.stderr)
if attempt < max_retries - 1:
time.sleep(1)
continue
return None, "null_content"
return content.strip(), None
except (KeyError, IndexError) as e:
print(f" Unexpected JSON structure (attempt {attempt+1}): {e}", file=sys.stderr)
print(f" Keys: {list(data.keys()) if isinstance(data, dict) else type(data)}",
file=sys.stderr)
if attempt < max_retries - 1:
time.sleep(1)
continue
return None, "json_structure"
except requests.exceptions.Timeout:
wait = 2 ** (attempt + 1)
print(f" Timeout (attempt {attempt+1}), waiting {wait}s...", file=sys.stderr)
if attempt < max_retries - 1:
time.sleep(wait)
continue
return None, "timeout"
except requests.exceptions.ConnectionError as e:
wait = 2 ** (attempt + 2) # longer wait for connection errors
print(f" Connection error (attempt {attempt+1}): {e}", file=sys.stderr)
if attempt < max_retries - 1:
time.sleep(wait)
continue
return None, "connection"
except Exception as e:
print(f" Unexpected error (attempt {attempt+1}): {type(e).__name__}: {e}",
file=sys.stderr)
if attempt < max_retries - 1:
time.sleep(2 ** attempt)
continue
return None, "unexpected"
return None, "exhausted_retries"
def format_chain(chain_edges): def format_chain(chain_edges, truncate=False):
"""Format chain_edges list into readable string for LLM context.""" """Format chain_edges list into readable string for LLM context.
If truncate=True, omit weights and surface_text to reduce token count.
"""
if not chain_edges: if not chain_edges:
return "(no chain data)" return "(no chain data)"
parts = [] parts = []
@ -93,6 +192,9 @@ def format_chain(chain_edges):
start = edge.get("start", "?") start = edge.get("start", "?")
rel = edge.get("relation", "?") rel = edge.get("relation", "?")
end = edge.get("end", "?") end = edge.get("end", "?")
if truncate:
parts.append(f"{start} --{rel}--> {end}")
else:
weight = edge.get("weight", 0) weight = edge.get("weight", 0)
parts.append(f"{start} --{rel}--> {end} (w:{weight:.1f})") parts.append(f"{start} --{rel}--> {end} (w:{weight:.1f})")
return ", ".join(parts) return ", ".join(parts)
@ -103,9 +205,33 @@ def format_slots(slots):
return ", ".join(f"{k}={v}" for k, v in slots.items()) return ", ".join(f"{k}={v}" for k, v in slots.items())
def build_messages(entry, truncate_chain=False):
"""Build the messages list for a single entry."""
raw_text = entry.get("raw_text", "")
meta_template = entry.get("meta_template", "")
chain = format_chain(entry.get("chain", []), truncate=truncate_chain)
slots = format_slots(entry.get("slots", {}))
user_prompt = (
f"Meta-template: {meta_template}\n"
f"Relationship chain: {chain}\n"
f"Slot fills: {slots}\n"
f"Raw saying: {raw_text}"
)
return [
{"role": "system", "content": SYSTEM_PROMPT},
{"role": "user", "content": user_prompt},
]
def load_already_processed(output_path): def load_already_processed(output_path):
"""Load set of raw_text strings already processed (for resume).""" """Load set of raw_text strings already processed (for resume).
Also returns counts of each status for accurate progress reporting.
"""
processed = set() processed = set()
counts = {"polished": 0, "discarded": 0, "error": 0}
if output_path.exists(): if output_path.exists():
with open(output_path, encoding="utf-8") as f: with open(output_path, encoding="utf-8") as f:
for line in f: for line in f:
@ -115,9 +241,12 @@ def load_already_processed(output_path):
try: try:
entry = json.loads(line) entry = json.loads(line)
processed.add(entry.get("raw_text", "")) processed.add(entry.get("raw_text", ""))
status = entry.get("status", "")
if status in counts:
counts[status] += 1
except json.JSONDecodeError: except json.JSONDecodeError:
continue continue
return processed return processed, counts
def main(): def main():
@ -141,15 +270,21 @@ def main():
for line in f: for line in f:
line = line.strip() line = line.strip()
if line: if line:
try:
raw_entries.append(json.loads(line)) raw_entries.append(json.loads(line))
except json.JSONDecodeError as e:
print(f"Warning: skipping malformed input line: {e}", file=sys.stderr)
print(f"Loaded {len(raw_entries)} raw entries from {input_path}") print(f"Loaded {len(raw_entries)} raw entries from {input_path}")
# Check what's already been processed # Check what's already been processed
already_processed = load_already_processed(output_path) already_processed, prev_counts = load_already_processed(output_path)
remaining = [e for e in raw_entries if e.get("raw_text", "") not in already_processed] remaining = [e for e in raw_entries if e.get("raw_text", "") not in already_processed]
print(f"Already processed: {len(already_processed)}") print(f"Already processed: {len(already_processed)} "
f"(polished={prev_counts['polished']}, "
f"discarded={prev_counts['discarded']}, "
f"errors={prev_counts['error']})")
print(f"Remaining: {len(remaining)}") print(f"Remaining: {len(remaining)}")
if not remaining: if not remaining:
@ -159,55 +294,105 @@ def main():
discards = 0 discards = 0
polished = 0 polished = 0
errors = 0 errors = 0
error_types = {}
consecutive_errors = 0
start_time = time.time()
try:
with open(output_path, "a", encoding="utf-8") as out: with open(output_path, "a", encoding="utf-8") as out:
for i, entry in enumerate(remaining): for i, entry in enumerate(remaining):
# First attempt with full chain data
messages = build_messages(entry, truncate_chain=False)
response, error_type = llm_chat_completion(messages)
# If context too long, retry with truncated chain
if error_type == "context_too_long":
print(f" #{i+1}: context too long, retrying with truncated chain...",
file=sys.stderr)
messages = build_messages(entry, truncate_chain=True)
response, error_type = llm_chat_completion(messages)
# If still too long, try with just the raw text
if error_type == "context_too_long":
print(f" #{i+1}: still too long, retrying with minimal prompt...",
file=sys.stderr)
raw_text = entry.get("raw_text", "") raw_text = entry.get("raw_text", "")
meta_template = entry.get("meta_template", "")
chain = format_chain(entry.get("chain", []))
slots = format_slots(entry.get("slots", {}))
user_prompt = (
f"Meta-template: {meta_template}\n"
f"Relationship chain: {chain}\n"
f"Slot fills: {slots}\n"
f"Raw saying: {raw_text}"
)
messages = [ messages = [
{"role": "system", "content": SYSTEM_PROMPT}, {"role": "system", "content": SYSTEM_PROMPT},
{"role": "user", "content": user_prompt}, {"role": "user", "content": f"Raw saying: {raw_text}"},
] ]
response, error_type = llm_chat_completion(messages)
response = llm_chat_completion(messages)
if response is None: if response is None:
entry["status"] = "error" entry["status"] = "error"
entry["error_type"] = error_type or "unknown"
errors += 1 errors += 1
consecutive_errors += 1
error_types[error_type] = error_types.get(error_type, 0) + 1
# If we get 20 consecutive errors, something is seriously wrong
if consecutive_errors >= 20:
print(f"\nFATAL: {consecutive_errors} consecutive errors. "
f"Last error type: {error_type}", file=sys.stderr)
print("Flushing output and stopping. Re-run to resume.", file=sys.stderr)
out.write(json.dumps(entry, ensure_ascii=False) + "\n")
out.flush()
sys.exit(1)
elif response.strip().upper() == "DISCARD": elif response.strip().upper() == "DISCARD":
entry["status"] = "discarded" entry["status"] = "discarded"
discards += 1 discards += 1
consecutive_errors = 0
else: else:
entry["polished_text"] = response.strip() # Sanity check the response
cleaned = response.strip()
# Sometimes the LLM wraps in quotes
if cleaned.startswith('"') and cleaned.endswith('"'):
cleaned = cleaned[1:-1]
# Sometimes the LLM prefixes with "Polished:" or similar
for prefix in ["Polished:", "polished:", "Output:", "Result:"]:
if cleaned.startswith(prefix):
cleaned = cleaned[len(prefix):].strip()
entry["polished_text"] = cleaned
entry["status"] = "polished" entry["status"] = "polished"
polished += 1 polished += 1
consecutive_errors = 0
out.write(json.dumps(entry, ensure_ascii=False) + "\n") out.write(json.dumps(entry, ensure_ascii=False) + "\n")
if (i + 1) % 100 == 0: # Flush every 10 entries for fine-grained resume safety
if (i + 1) % 10 == 0:
out.flush() out.flush()
# Progress report every 100 entries
if (i + 1) % 100 == 0:
total_done = len(already_processed) + i + 1 total_done = len(already_processed) + i + 1
elapsed = time.time() - start_time
rate = (i + 1) / elapsed
eta_sec = (len(remaining) - (i + 1)) / rate if rate > 0 else 0
eta_min = eta_sec / 60
print(f" [{total_done}/{len(raw_entries)}] " print(f" [{total_done}/{len(raw_entries)}] "
f"polished={polished}, discarded={discards}, errors={errors}") f"polished={polished}, discarded={discards}, errors={errors} "
f"({rate:.1f}/s, ETA {eta_min:.0f}m)")
time.sleep(0.1) time.sleep(0.1)
total_done = len(already_processed) + len(remaining) except KeyboardInterrupt:
print(f"\nDone: {total_done} total entries processed.") print(f"\nInterrupted at entry {i+1}/{len(remaining)}. "
f"Progress saved — re-run to resume.", file=sys.stderr)
# Final report
elapsed = time.time() - start_time
total_done = len(already_processed) + polished + discards + errors
print(f"\nSession complete: {polished + discards + errors} entries processed "
f"in {elapsed/60:.1f} minutes.")
print(f" Polished: {polished}") print(f" Polished: {polished}")
print(f" Discarded: {discards}") print(f" Discarded: {discards}")
print(f" Errors: {errors}") print(f" Errors: {errors}")
print(f" Discard rate: {discards/(polished+discards)*100:.1f}%" if (polished+discards) else " N/A") if error_types:
print(f" Error breakdown: {error_types}")
if polished + discards > 0:
print(f" Discard rate: {discards/(polished+discards)*100:.1f}%")
print(f" Total across all sessions: {total_done}/{len(raw_entries)}")
print(f"Output: {output_path}") print(f"Output: {output_path}")