Fix generator quality issues and run initial corpus pipeline

Pre-corpus fixes (from EVALUATION.md):
- Clean 2,264 contaminated rows from augmented relations (bridge
  artifacts, full-sentence HasProperty values, null bytes, empty words)
- Fix article logic: dynamic a/an across Deconstruction, FalseEquivalence,
  DenialOfConsequences, TautologicalWisdom templates
- Tighten _short_concepts() default from max_words=3 to 2
- Fix FutilePreparation gerunding: filter vocab nouns and noun-suffix
  words from UsedFor targets; fix CVC doubling for 'y'-ending words
- Add _looks_like_verb() heuristic, improve _a() for vowel-sound edge
  cases (expected behavior sketched just below)
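
Illustrative expectations for the fixed helpers (the import path
"sayings_generator" is a placeholder, not the real module name; the actual
definitions appear in the generator diff further down):

    # Placeholder module name; point this at the actual generator file.
    from sayings_generator import _a, _gerund, _looks_like_verb

    assert _a("hour") == "an hour"            # consonant letter, vowel sound
    assert _a("unicorn") == "a unicorn"       # vowel letter, consonant sound
    assert _a("old map") == "an old map"      # article keyed to the first word
    assert _gerund("run") == "running"        # CVC doubling still applies
    assert _gerund("prey") == "preying"       # final 'y' no longer doubled
    assert not _looks_like_verb("ambition")   # noun suffix "tion" is rejected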

Pipeline hardening:
- polish_corpus.py: context-size fallback (truncate chain, then minimal
  prompt; sketched after this list), classified error types,
  consecutive-error circuit breaker, 10-entry flush granularity, ETA
  tracking, KeyboardInterrupt handling
- generate_raw_batch.sh: fix python -> python3
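
Schematically, the context-size fallback is a two-step ladder (condensed
from the polish_corpus.py diff below; names match that script):

    # First attempt with the full chain; on context overflow, degrade twice.
    response, err = llm_chat_completion(build_messages(entry, truncate_chain=False))
    if err == "context_too_long":
        # Drop weights/surface_text from the chain and retry
        response, err = llm_chat_completion(build_messages(entry, truncate_chain=True))
    if err == "context_too_long":
        # Last resort: system prompt plus the raw saying only
        response, err = llm_chat_completion([
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": f"Raw saying: {entry['raw_text']}"},
        ])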

Corpus generation run (9,835 raw -> 5,499 polished -> 2,312 filtered):
- 44.1% discard rate, 0 errors, 82 minutes on RTX 4090
- 9,257 training pairs across 5 input framing types
- 97.6% vocab coverage (609/624 words)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
John McCardle, 2026-03-10 04:33:56 -04:00
commit 651ec3ffc6
10 changed files with 34853 additions and 2406 deletions

corpus/corpus_filtered.jsonl (new file, 2312 lines; diff suppressed: too large)
corpus/corpus_polished.jsonl (new file, 9835 lines; diff suppressed: too large)
corpus/corpus_raw.jsonl (new file, 9835 lines; diff suppressed: too large)

corpus/corpus_stats.json (new file, 91 lines):
@@ -0,0 +1,91 @@
{
  "raw_count": 9835,
  "raw_by_template": {
    "deconstruction": 1500,
    "denial_of_consequences": 1500,
    "false_equivalence": 1500,
    "futile_preparation": 1500,
    "hypocritical_complaint": 1500,
    "ironic_deficiency": 1500,
    "tautological_wisdom": 835
  },
  "polished_count": 5499,
  "discarded_during_polish": 4336,
  "errors_during_polish": 0,
  "polish_discard_rate": "44.1%",
  "polished_by_template": {
    "deconstruction": 1105,
    "denial_of_consequences": 733,
    "false_equivalence": 590,
    "futile_preparation": 882,
    "hypocritical_complaint": 573,
    "ironic_deficiency": 831,
    "tautological_wisdom": 785
  },
  "discarded_by_template": {
    "deconstruction": 395,
    "denial_of_consequences": 767,
    "false_equivalence": 910,
    "futile_preparation": 618,
    "hypocritical_complaint": 927,
    "ironic_deficiency": 669,
    "tautological_wisdom": 50
  },
  "filtered_count": 2312,
  "filtered_by_template": {
    "deconstruction": 619,
    "denial_of_consequences": 159,
    "false_equivalence": 517,
    "futile_preparation": 284,
    "hypocritical_complaint": 168,
    "ironic_deficiency": 358,
    "tautological_wisdom": 207
  },
  "discarded_during_filter": 3187,
  "training_pair_count": 9257,
  "training_by_template": {
    "deconstruction": 2488,
    "denial_of_consequences": 630,
    "false_equivalence": 2059,
    "futile_preparation": 1146,
    "hypocritical_complaint": 681,
    "ironic_deficiency": 1429,
    "tautological_wisdom": 824
  },
  "training_by_input_type": {
    "category_seeded": 2312,
    "open_ended": 562,
    "persona_seeded": 2312,
    "template_seeded": 1759,
    "word_seeded": 2312
  },
  "unique_slot_words_used": 609,
  "total_vocab_words": 624,
  "vocab_coverage": "97.6%",
  "words_never_used": [
    "agate",
    "alabaster",
    "anise",
    "azalea",
    "bee",
    "blowfish",
    "cattail",
    "cypress",
    "emerald",
    "gem",
    "grebe",
    "juniper",
    "lyre",
    "spear",
    "theater"
  ],
  "words_never_used_count": 15,
  "avg_saying_length_words": 13.1,
  "min_saying_length_words": 6,
  "max_saying_length_words": 23,
  "balance_warnings": [
    "WARNING: denial_of_consequences has only 159 entries (6.9%) — below 10% threshold. Generate more raw sayings for this family.",
    "WARNING: hypocritical_complaint has only 168 entries (7.3%) — below 10% threshold. Generate more raw sayings for this family.",
    "WARNING: tautological_wisdom has only 207 entries (9.0%) — below 10% threshold. Generate more raw sayings for this family."
  ]
}
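
The balance_warnings block reads like a straight share check against
filtered_by_template; a minimal sketch of the likely logic (the threshold
and the message wording are inferred from the warnings above):

    import json

    with open("corpus/corpus_stats.json", encoding="utf-8") as f:
        stats = json.load(f)

    total = stats["filtered_count"]
    for family, n in sorted(stats["filtered_by_template"].items()):
        share = n / total
        if share < 0.10:
            print(f"WARNING: {family} has only {n} entries ({share:.1%}), "
                  f"below the 10% threshold.")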

corpus/discard_analysis.csv (new file, 3188 lines; diff suppressed: too large)
corpus/training_pairs.jsonl (new file, 9257 lines; diff suppressed: too large)


@@ -199,10 +199,10 @@ class Deconstruction(MetaTemplate):
id = "deconstruction"
name = "Deconstruction"
surface_templates = [
"You know what they say, a {A} with no {B} is just a {C} {D}.",
"You know what they say, {A_article} with no {B} is just a {C} {D}.",
"Take the {B} out of {A} and all you've got left is {C} {D}.",
"{A} without {B}? That's just {D} with ideas above its station.",
"An {A} ain't nothing but {D} that met some {B}.",
"{A_Article} ain't nothing but {D} that met some {B}.",
]
def generate(self, seed_word=None, seed_category=None):
@@ -255,7 +255,8 @@ class Deconstruction(MetaTemplate):
c_word = random.choice(["plain", "sorry", "old", "humble", "dry", "wet", "cold"])
template = self._pick_template()
saying = template.format(A=a, B=b_word, C=c_word, D=d_word)
saying = template.format(A=a, A_article=_a(a), A_Article=_a(a).capitalize(),
B=b_word, C=c_word, D=d_word)
debug = {
"template_family": self.id,
@@ -275,8 +276,8 @@ class DenialOfConsequences(MetaTemplate):
surface_templates = [
"Don't {C} the {A} and say you ain't got {B}.",
"Don't {C} the {A} and act surprised when the {B} show up.",
"Man who {C}s a {A} can't complain about {B}.",
"You can't {C} a {A} and then wonder where all the {B} came from.",
"Man who {C}s {A_article} can't complain about {B}.",
"You can't {C} {A_article} and then wonder where all the {B} came from.",
]
def generate(self, seed_word=None, seed_category=None):
@@ -323,7 +324,7 @@ class DenialOfConsequences(MetaTemplate):
c_word = random.choice(["build", "set up", "put out", "lay down", "make"])
template = self._pick_template()
saying = template.format(A=a, B=b_word, C=c_word)
saying = template.format(A=a, A_article=_a(a), B=b_word, C=c_word)
debug = {
"template_family": self.id,
@@ -407,8 +408,9 @@ class FutilePreparation(MetaTemplate):
if not seed:
return None, None
# What is the seed used for?
# What is the seed used for? Filter out vocab nouns and noun-like words — we need verbs.
uses = _short_concepts(self.graph.neighbors(seed, "UsedFor", min_weight=0.5), max_words=2)
uses = [u for u in uses if u[0] not in self.graph.vocab and _looks_like_verb(u[0])]
if not uses:
return None, None
@@ -532,9 +534,9 @@ class TautologicalWisdom(MetaTemplate):
id = "tautological_wisdom"
name = "Tautological Wisdom"
surface_templates = [
"You know what they say, it takes a {X} to get a {Y}.",
"You know what they say, it takes {X_article} to get {Y_article}.",
"My daddy always said, can't have {Y} without {X}.",
"A {Y} don't come without its {X}, now does it?",
"{Y_Article} don't come without its {X}, now does it?",
"You want {Y}? Well, first you're gonna need {X}.",
"Ain't no {Y} ever came from nothing — you need {X}.",
]
@@ -566,7 +568,9 @@ class TautologicalWisdom(MetaTemplate):
chain_edge = choice[3]
template = self._pick_template()
saying = template.format(X=x_word, Y=y_word)
saying = template.format(X=x_word, Y=y_word,
X_article=_a(x_word), Y_article=_a(y_word),
Y_Article=_a(y_word).capitalize())
debug = {
"template_family": self.id,
@@ -584,10 +588,10 @@ class FalseEquivalence(MetaTemplate):
id = "false_equivalence"
name = "False Equivalence"
surface_templates = [
"A {A} is just a {B} that's got {P}.",
"What's a {A} but a {B} with {P}?",
"The only difference between a {A} and a {B} is {P}.",
"Take the {P} from a {A} and you've got yourself a {B}.",
"{A_article} is just {B_article} that's got {P}.",
"What's {A_article} but {B_article} with {P}?",
"The only difference between {A_article} and {B_article} is {P}.",
"Take the {P} from {A_article} and you've got yourself {B_article}.",
]
def generate(self, seed_word=None, seed_category=None):
@@ -635,7 +639,8 @@ class FalseEquivalence(MetaTemplate):
p_word = random.choice(["ambition", "an attitude", "a plan", "patience"])
template = self._pick_template()
saying = template.format(A=a, B=b_word, P=p_word)
saying = template.format(A=a, B=b_word, P=p_word,
A_article=_a(a), B_article=_a(b_word))
debug = {
"template_family": self.id,
@@ -656,7 +661,7 @@ def _readable(concept):
return concept.replace("_", " ")
def _short_concepts(items, max_words=3):
def _short_concepts(items, max_words=2):
"""Filter concept tuples to only those with short readable names.
Items can be tuples where first element is the concept string.
@@ -672,14 +677,37 @@ def _gerund(word):
return word[:-1] + "ing"
if word.endswith("ing"):
return word
if len(word) > 2 and word[-1] not in "aeiou" and word[-2] in "aeiou" and word[-3] not in "aeiou":
# CVC doubling: "run" -> "running", "sit" -> "sitting"
# Treat 'y' and 'w' as vowels at word end (prey->preying, not preyying)
if len(word) > 2 and word[-1] not in "aeiouy" and word[-2] in "aeiou" and word[-3] not in "aeiou":
return word + word[-1] + "ing"
return word + "ing"
def _looks_like_verb(word):
"""Heuristic: does this word look like it could be a verb?
Rejects words with obvious noun/adjective suffixes."""
w = word.split("_")[0].lower() if "_" in word else word.lower()
noun_suffixes = ("tion", "sion", "ment", "ness", "ity", "ance", "ence",
"ture", "ism", "ist", "ery", "ory", "ling")
return not any(w.endswith(s) for s in noun_suffixes)
def _a(word):
"""Add 'a' or 'an' article."""
if word and word[0] in "aeiou":
if not word:
return "a"
first = word.split()[0].lower() if word else ""
# Words that start with a vowel letter but consonant sound
consonant_sound = ("uni", "use", "used", "user", "usual", "usu", "uti", "uto",
"uro", "uku", "ula")
# Words that start with a consonant letter but vowel sound
vowel_sound = ("hour", "honest", "honor", "honour", "heir", "herb")
if any(first.startswith(p) for p in vowel_sound):
return f"an {word}"
if any(first.startswith(p) for p in consonant_sound):
return f"a {word}"
if first[0] in "aeiou":
return f"an {word}"
return f"a {word}"

generate_raw_batch.sh:

@@ -34,7 +34,7 @@ total=0
for template in "${TEMPLATES[@]}"; do
echo -n " $template ($COUNT_PER_TEMPLATE)... "
before=$(wc -l < "$OUTPUT")
python "$GENERATOR" --template "$template" --count "$COUNT_PER_TEMPLATE" --json >> "$OUTPUT" 2>/dev/null
python3 "$GENERATOR" --template "$template" --count "$COUNT_PER_TEMPLATE" --json >> "$OUTPUT" 2>/dev/null
after=$(wc -l < "$OUTPUT")
generated=$((after - before))
total=$((total + generated))
@@ -47,7 +47,7 @@ echo ""
# Check template distribution
echo "Template distribution:"
python -c "
python3 -c "
import json, sys
from collections import Counter
counts = Counter()
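
The tail of that python3 -c distribution check is cut off above; a
plausible body, reading raw entries from stdin (the "meta_template" field
name matches the polish script below):

    import json, sys
    from collections import Counter

    counts = Counter()
    for line in sys.stdin:
        line = line.strip()
        if line:
            # One JSON object per line; tally by template family
            counts[json.loads(line).get("meta_template", "?")] += 1
    for name, n in sorted(counts.items()):
        print(f"  {name}: {n}")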

scripts/polish_corpus.py:

@@ -4,6 +4,13 @@
Reads corpus_raw.jsonl, sends each to GLM4-32B for polish.
Output file is the checkpoint: append mode with resume detection.
Robust error handling:
- Context size errors: truncates chain data and retries
- JSON parse errors: retries, then marks as error
- Transient HTTP errors: exponential backoff retry
- Keyboard interrupt: flushes and exits cleanly
- Safe resume: skips entries already in output file
Usage:
python scripts/polish_corpus.py
python scripts/polish_corpus.py --input corpus/corpus_raw.jsonl --output corpus/corpus_polished.jsonl
@@ -62,8 +69,27 @@ Chain: canoe UsedFor transport, fire UsedFor boiling_food
Polished: DISCARD"""
class LLMError(Exception):
"""Base class for LLM errors."""
pass
class ContextTooLong(LLMError):
"""Prompt exceeded context window."""
pass
class TransientError(LLMError):
"""Recoverable error (network, server overload, etc.)."""
pass
def llm_chat_completion(messages, max_retries=3):
"""Chat completion with retry logic."""
"""Chat completion with retry logic and error classification.
Returns (response_text, error_type) tuple.
response_text is None on failure; error_type is None on success.
"""
import requests
for attempt in range(max_retries):
@@ -71,21 +97,94 @@ def llm_chat_completion(messages, max_retries=3):
resp = requests.post(LLM_ENDPOINT, json={
"model": LLM_MODEL,
"messages": messages,
"temperature": 0.7,
}, timeout=120)
resp.raise_for_status()
data = resp.json()
return data["choices"][0]["message"]["content"].strip()
except Exception as e:
wait = (2 ** attempt)
print(f" LLM error (attempt {attempt+1}/{max_retries}): {e}", file=sys.stderr)
# Check for context length errors (HTTP 400 typically)
if resp.status_code == 400:
body = resp.text.lower()
if any(kw in body for kw in ["context", "token", "length", "too long", "exceed"]):
return None, "context_too_long"
# Other 400 errors — log and retry
print(f" HTTP 400 (attempt {attempt+1}): {resp.text[:200]}", file=sys.stderr)
if attempt < max_retries - 1:
time.sleep(2 ** attempt)
continue
return None, "http_400"
if resp.status_code == 503 or resp.status_code == 429:
wait = 2 ** (attempt + 1)
print(f" HTTP {resp.status_code} (attempt {attempt+1}), waiting {wait}s...",
file=sys.stderr)
if attempt < max_retries - 1:
time.sleep(wait)
else:
return None
continue
return None, "server_overload"
resp.raise_for_status()
# Parse JSON response
try:
data = resp.json()
except (json.JSONDecodeError, ValueError) as e:
print(f" JSON parse error (attempt {attempt+1}): {e}", file=sys.stderr)
print(f" Response body: {resp.text[:300]}", file=sys.stderr)
if attempt < max_retries - 1:
time.sleep(2 ** attempt)
continue
return None, "json_parse"
# Extract content from response
try:
content = data["choices"][0]["message"]["content"]
if content is None:
print(f" Null content in response (attempt {attempt+1})", file=sys.stderr)
if attempt < max_retries - 1:
time.sleep(1)
continue
return None, "null_content"
return content.strip(), None
except (KeyError, IndexError) as e:
print(f" Unexpected JSON structure (attempt {attempt+1}): {e}", file=sys.stderr)
print(f" Keys: {list(data.keys()) if isinstance(data, dict) else type(data)}",
file=sys.stderr)
if attempt < max_retries - 1:
time.sleep(1)
continue
return None, "json_structure"
except requests.exceptions.Timeout:
wait = 2 ** (attempt + 1)
print(f" Timeout (attempt {attempt+1}), waiting {wait}s...", file=sys.stderr)
if attempt < max_retries - 1:
time.sleep(wait)
continue
return None, "timeout"
except requests.exceptions.ConnectionError as e:
wait = 2 ** (attempt + 2) # longer wait for connection errors
print(f" Connection error (attempt {attempt+1}): {e}", file=sys.stderr)
if attempt < max_retries - 1:
time.sleep(wait)
continue
return None, "connection"
except Exception as e:
print(f" Unexpected error (attempt {attempt+1}): {type(e).__name__}: {e}",
file=sys.stderr)
if attempt < max_retries - 1:
time.sleep(2 ** attempt)
continue
return None, "unexpected"
return None, "exhausted_retries"
def format_chain(chain_edges):
"""Format chain_edges list into readable string for LLM context."""
def format_chain(chain_edges, truncate=False):
"""Format chain_edges list into readable string for LLM context.
If truncate=True, omit weights and surface_text to reduce token count.
"""
if not chain_edges:
return "(no chain data)"
parts = []
@@ -93,6 +192,9 @@ def format_chain(chain_edges):
start = edge.get("start", "?")
rel = edge.get("relation", "?")
end = edge.get("end", "?")
if truncate:
parts.append(f"{start} --{rel}--> {end}")
else:
weight = edge.get("weight", 0)
parts.append(f"{start} --{rel}--> {end} (w:{weight:.1f})")
return ", ".join(parts)
@@ -103,9 +205,33 @@ def format_slots(slots):
return ", ".join(f"{k}={v}" for k, v in slots.items())
def build_messages(entry, truncate_chain=False):
"""Build the messages list for a single entry."""
raw_text = entry.get("raw_text", "")
meta_template = entry.get("meta_template", "")
chain = format_chain(entry.get("chain", []), truncate=truncate_chain)
slots = format_slots(entry.get("slots", {}))
user_prompt = (
f"Meta-template: {meta_template}\n"
f"Relationship chain: {chain}\n"
f"Slot fills: {slots}\n"
f"Raw saying: {raw_text}"
)
return [
{"role": "system", "content": SYSTEM_PROMPT},
{"role": "user", "content": user_prompt},
]
def load_already_processed(output_path):
"""Load set of raw_text strings already processed (for resume)."""
"""Load set of raw_text strings already processed (for resume).
Also returns counts of each status for accurate progress reporting.
"""
processed = set()
counts = {"polished": 0, "discarded": 0, "error": 0}
if output_path.exists():
with open(output_path, encoding="utf-8") as f:
for line in f:
@@ -115,9 +241,12 @@ def load_already_processed(output_path):
try:
entry = json.loads(line)
processed.add(entry.get("raw_text", ""))
status = entry.get("status", "")
if status in counts:
counts[status] += 1
except json.JSONDecodeError:
continue
return processed
return processed, counts
def main():
@@ -141,15 +270,21 @@ def main():
for line in f:
line = line.strip()
if line:
try:
raw_entries.append(json.loads(line))
except json.JSONDecodeError as e:
print(f"Warning: skipping malformed input line: {e}", file=sys.stderr)
print(f"Loaded {len(raw_entries)} raw entries from {input_path}")
# Check what's already been processed
already_processed = load_already_processed(output_path)
already_processed, prev_counts = load_already_processed(output_path)
remaining = [e for e in raw_entries if e.get("raw_text", "") not in already_processed]
print(f"Already processed: {len(already_processed)}")
print(f"Already processed: {len(already_processed)} "
f"(polished={prev_counts['polished']}, "
f"discarded={prev_counts['discarded']}, "
f"errors={prev_counts['error']})")
print(f"Remaining: {len(remaining)}")
if not remaining:
@@ -159,55 +294,105 @@ def main():
discards = 0
polished = 0
errors = 0
error_types = {}
consecutive_errors = 0
start_time = time.time()
try:
with open(output_path, "a", encoding="utf-8") as out:
for i, entry in enumerate(remaining):
# First attempt with full chain data
messages = build_messages(entry, truncate_chain=False)
response, error_type = llm_chat_completion(messages)
# If context too long, retry with truncated chain
if error_type == "context_too_long":
print(f" #{i+1}: context too long, retrying with truncated chain...",
file=sys.stderr)
messages = build_messages(entry, truncate_chain=True)
response, error_type = llm_chat_completion(messages)
# If still too long, try with just the raw text
if error_type == "context_too_long":
print(f" #{i+1}: still too long, retrying with minimal prompt...",
file=sys.stderr)
raw_text = entry.get("raw_text", "")
meta_template = entry.get("meta_template", "")
chain = format_chain(entry.get("chain", []))
slots = format_slots(entry.get("slots", {}))
user_prompt = (
f"Meta-template: {meta_template}\n"
f"Relationship chain: {chain}\n"
f"Slot fills: {slots}\n"
f"Raw saying: {raw_text}"
)
messages = [
{"role": "system", "content": SYSTEM_PROMPT},
{"role": "user", "content": user_prompt},
{"role": "user", "content": f"Raw saying: {raw_text}"},
]
response = llm_chat_completion(messages)
response, error_type = llm_chat_completion(messages)
if response is None:
entry["status"] = "error"
entry["error_type"] = error_type or "unknown"
errors += 1
consecutive_errors += 1
error_types[error_type] = error_types.get(error_type, 0) + 1
# If we get 20 consecutive errors, something is seriously wrong
if consecutive_errors >= 20:
print(f"\nFATAL: {consecutive_errors} consecutive errors. "
f"Last error type: {error_type}", file=sys.stderr)
print("Flushing output and stopping. Re-run to resume.", file=sys.stderr)
out.write(json.dumps(entry, ensure_ascii=False) + "\n")
out.flush()
sys.exit(1)
elif response.strip().upper() == "DISCARD":
entry["status"] = "discarded"
discards += 1
consecutive_errors = 0
else:
entry["polished_text"] = response.strip()
# Sanity check the response
cleaned = response.strip()
# Sometimes the LLM wraps in quotes
if cleaned.startswith('"') and cleaned.endswith('"'):
cleaned = cleaned[1:-1]
# Sometimes the LLM prefixes with "Polished:" or similar
for prefix in ["Polished:", "polished:", "Output:", "Result:"]:
if cleaned.startswith(prefix):
cleaned = cleaned[len(prefix):].strip()
entry["polished_text"] = cleaned
entry["status"] = "polished"
polished += 1
consecutive_errors = 0
out.write(json.dumps(entry, ensure_ascii=False) + "\n")
if (i + 1) % 100 == 0:
# Flush every 10 entries for fine-grained resume safety
if (i + 1) % 10 == 0:
out.flush()
# Progress report every 100 entries
if (i + 1) % 100 == 0:
total_done = len(already_processed) + i + 1
elapsed = time.time() - start_time
rate = (i + 1) / elapsed
eta_sec = (len(remaining) - (i + 1)) / rate if rate > 0 else 0
eta_min = eta_sec / 60
print(f" [{total_done}/{len(raw_entries)}] "
f"polished={polished}, discarded={discards}, errors={errors}")
f"polished={polished}, discarded={discards}, errors={errors} "
f"({rate:.1f}/s, ETA {eta_min:.0f}m)")
time.sleep(0.1)
total_done = len(already_processed) + len(remaining)
print(f"\nDone: {total_done} total entries processed.")
except KeyboardInterrupt:
print(f"\nInterrupted at entry {i+1}/{len(remaining)}. "
f"Progress saved — re-run to resume.", file=sys.stderr)
# Final report
elapsed = time.time() - start_time
total_done = len(already_processed) + polished + discards + errors
print(f"\nSession complete: {polished + discards + errors} entries processed "
f"in {elapsed/60:.1f} minutes.")
print(f" Polished: {polished}")
print(f" Discarded: {discards}")
print(f" Errors: {errors}")
print(f" Discard rate: {discards/(polished+discards)*100:.1f}%" if (polished+discards) else " N/A")
if error_types:
print(f" Error breakdown: {error_types}")
if polished + discards > 0:
print(f" Discard rate: {discards/(polished+discards)*100:.1f}%")
print(f" Total across all sessions: {total_done}/{len(raw_entries)}")
print(f"Output: {output_path}")