corpus generation (work from mid-February)
parent 8c8a058301
commit 356b62c6ea
16 changed files with 25872 additions and 38 deletions
213 scripts/compute_corpus_stats.py Normal file
@@ -0,0 +1,213 @@
#!/usr/bin/env python3
"""Compute corpus statistics and validation metrics.

Reads corpus files and computes counts, distributions, coverage, and balance warnings.

Usage:
    python scripts/compute_corpus_stats.py
    python scripts/compute_corpus_stats.py --corpus-dir corpus/
"""

import argparse
import csv
import json
import sys
from collections import Counter
from pathlib import Path

SCRIPT_DIR = Path(__file__).parent
PROJECT_DIR = SCRIPT_DIR.parent
DATA_DIR = PROJECT_DIR / "data"


def load_jsonl(path):
    """Load a JSONL file."""
    entries = []
    if not path.exists():
        return entries
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                entries.append(json.loads(line))
    return entries
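
# Illustrative sketch of the assumed record shape (not prescribed by this
# script, which reads whatever JSON objects the earlier pipeline stages wrote):
#     >>> load_jsonl(Path("corpus/corpus_polished.jsonl"))[0]
#     {"polished_text": "...", "meta_template": "...", "slots": {...}, "status": "polished"}
# Blank lines are skipped, and a missing file yields [] rather than raising.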


def classify_input_type(inp):
    """Classify the input framing type of a training pair."""
    if inp.startswith("Tell me something about"):
        return "word_seeded"
    elif inp.startswith("Tell me a saying about"):
        return "category_seeded"
    elif inp.startswith("What would a"):
        return "persona_seeded"
    elif inp.startswith("Give me a") and "proverb" in inp:
        return "template_seeded"
    elif any(inp.startswith(p) for p in [
        "Tell me some folk", "What do they", "Give me a proverb",
        "Share some", "What's a good"
    ]):
        # Note: the "Give me a proverb" prefix here can never match, because
        # the template_seeded branch above already catches any "Give me a..."
        # input that contains "proverb".
        return "open_ended"
    else:
        return "fictional"


def main():
    parser = argparse.ArgumentParser(description="Compute corpus statistics.")
    parser.add_argument("--corpus-dir", default=str(PROJECT_DIR / "corpus"),
                        help="Corpus directory")
    parser.add_argument("--output", default=None,
                        help="Output JSON file (default: corpus_dir/corpus_stats.json)")
    args = parser.parse_args()

    corpus_dir = Path(args.corpus_dir)
    output_path = Path(args.output) if args.output else corpus_dir / "corpus_stats.json"

    # Load all corpus files
    raw = load_jsonl(corpus_dir / "corpus_raw.jsonl")
    polished = load_jsonl(corpus_dir / "corpus_polished.jsonl")
    filtered = load_jsonl(corpus_dir / "corpus_filtered.jsonl")
    training = load_jsonl(corpus_dir / "training_pairs.jsonl")

    # Load vocab for coverage analysis
    vocab_words = set()
    vocab_path = DATA_DIR / "folksy_vocab.csv"
    if vocab_path.exists():
        with open(vocab_path, newline="", encoding="utf-8") as f:
            for row in csv.DictReader(f):
                vocab_words.add(row["word"])

    stats = {}

    # --- Raw corpus stats ---
    stats["raw_count"] = len(raw)
    raw_by_template = Counter(e.get("meta_template", "unknown") for e in raw)
    stats["raw_by_template"] = dict(sorted(raw_by_template.items()))

    # --- Polish stats ---
    polished_entries = [e for e in polished if e.get("status") == "polished"]
    discarded_entries = [e for e in polished if e.get("status") == "discarded"]
    error_entries = [e for e in polished if e.get("status") == "error"]

    stats["polished_count"] = len(polished_entries)
    stats["discarded_during_polish"] = len(discarded_entries)
    stats["errors_during_polish"] = len(error_entries)
    if polished_entries or discarded_entries:
        total_processed = len(polished_entries) + len(discarded_entries)
        stats["polish_discard_rate"] = f"{len(discarded_entries)/total_processed*100:.1f}%"

    polish_by_template = Counter(e.get("meta_template", "unknown") for e in polished_entries)
    stats["polished_by_template"] = dict(sorted(polish_by_template.items()))

    discard_by_template = Counter(e.get("meta_template", "unknown") for e in discarded_entries)
    stats["discarded_by_template"] = dict(sorted(discard_by_template.items()))

    # --- Filter stats ---
    stats["filtered_count"] = len(filtered)

    filter_by_template = Counter(e.get("meta_template", "unknown") for e in filtered)
    stats["filtered_by_template"] = dict(sorted(filter_by_template.items()))

    # Filter discard count
    stats["discarded_during_filter"] = len(polished_entries) - len(filtered)

    # --- Training pairs stats ---
    stats["training_pair_count"] = len(training)

    training_by_template = Counter(e.get("meta_template", "unknown") for e in training)
    stats["training_by_template"] = dict(sorted(training_by_template.items()))

    input_type_counts = Counter(classify_input_type(e.get("input", "")) for e in training)
    stats["training_by_input_type"] = dict(sorted(input_type_counts.items()))

    # --- Coverage analysis ---
    used_words = set()
    for entry in filtered:
        slots = entry.get("slots", {})
        for v in slots.values():
            word = v.lower().replace(" ", "_")
            if word in vocab_words:
                used_words.add(word)

    stats["unique_slot_words_used"] = len(used_words)
    stats["total_vocab_words"] = len(vocab_words)
    stats["vocab_coverage"] = f"{len(used_words)/len(vocab_words)*100:.1f}%" if vocab_words else "N/A"

    never_used = sorted(vocab_words - used_words)
    stats["words_never_used"] = never_used
    stats["words_never_used_count"] = len(never_used)

    # --- Saying length stats ---
    lengths = []
    for entry in filtered:
        text = entry.get("polished_text", "")
        if text:
            lengths.append(len(text.split()))

    if lengths:
        stats["avg_saying_length_words"] = round(sum(lengths) / len(lengths), 1)
        stats["min_saying_length_words"] = min(lengths)
        stats["max_saying_length_words"] = max(lengths)

    # --- Balance warnings ---
    warnings = []
    if filtered:
        total_filtered = len(filtered)
        for template, count in filter_by_template.items():
            pct = count / total_filtered * 100
            if pct < 10:
                warnings.append(
                    f"WARNING: {template} has only {count} entries ({pct:.1f}%) — "
                    f"below 10% threshold. Generate more raw sayings for this family."
                )

    if training:
        total_training = len(training)
        for template, count in training_by_template.items():
            pct = count / total_training * 100
            if pct < 5:
                warnings.append(
                    f"WARNING: {template} has only {count} training pairs ({pct:.1f}%) — very underrepresented."
                )

    stats["balance_warnings"] = warnings

    # --- Write output ---
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(stats, f, indent=2, ensure_ascii=False)

    # --- Print summary ---
    print("=" * 60)
    print("CORPUS STATISTICS")
    print("=" * 60)

    print(f"\nRaw sayings: {stats['raw_count']}")
    print(f"Polished sayings: {stats['polished_count']}")
    print(f"Discarded (polish): {stats.get('discarded_during_polish', 0)} ({stats.get('polish_discard_rate', 'N/A')})")
    print(f"Discarded (filter): {stats.get('discarded_during_filter', 0)}")
    print(f"Final filtered: {stats['filtered_count']}")
    print(f"Training pairs: {stats['training_pair_count']}")

    print("\nDistribution by meta-template (filtered):")
    for t, c in sorted(filter_by_template.items()):
        pct = c / len(filtered) * 100 if filtered else 0
        print(f"  {t:30s} {c:5d} ({pct:5.1f}%)")

    print("\nDistribution by input framing type:")
    for t, c in sorted(input_type_counts.items()):
        print(f"  {t:20s} {c:5d}")

    print(f"\nVocab coverage: {stats['vocab_coverage']} ({stats['unique_slot_words_used']}/{stats['total_vocab_words']})")
    print(f"Average saying length: {stats.get('avg_saying_length_words', 'N/A')} words")

    if warnings:
        print("\nBalance warnings:")
        for w in warnings:
            print(f"  {w}")

    print(f"\nFull stats: {output_path}")


if __name__ == "__main__":
    main()
787 scripts/enhance_graph.py Normal file
@@ -0,0 +1,787 @@
#!/usr/bin/env python3
"""LLM-augmented graph enhancement for the folksy subgraph.

Three phases:
  Phase 1: Per-word relationship expansion
  Phase 2: Cross-word bridge discovery
  Phase 3: Property enrichment for false_equivalence templates

Usage:
    python scripts/enhance_graph.py --phase 1            # Run phase 1 only
    python scripts/enhance_graph.py --phase 2            # Run phase 2 only
    python scripts/enhance_graph.py --phase 3            # Run phase 3 only
    python scripts/enhance_graph.py --all                # Run all phases
    python scripts/enhance_graph.py --phase 1 --dry-run  # Print prompts without calling LLM
"""

import argparse
import csv
import os
import random
import re
import sys
import time
from collections import defaultdict
from datetime import datetime
from pathlib import Path

# Paths
SCRIPT_DIR = Path(__file__).parent
PROJECT_DIR = SCRIPT_DIR.parent
DATA_DIR = PROJECT_DIR / "data"

LLM_ENDPOINT = "http://192.168.1.100:8853/v1d/chat/completions"
LLM_MODEL = "THUDM-GLM4-32B"

VALID_RELATIONS = {
    "AtLocation", "MadeOf", "PartOf", "UsedFor", "HasA", "HasProperty",
    "Causes", "HasPrerequisite", "CapableOf", "ReceivesAction", "Desires",
    "CausesDesire", "LocatedNear", "CreatedBy", "MotivatedByGoal", "HasSubevent",
}

AUGMENTED_CSV = DATA_DIR / "folksy_relations_augmented.csv"
CANDIDATE_CSV = DATA_DIR / "candidate_additions.csv"
LOG_CSV = DATA_DIR / "enhancement_log.csv"

# ---------------------------------------------------------------------------
# Infrastructure
# ---------------------------------------------------------------------------

def llm_chat_completion(messages, max_retries=3):
    """Chat completion with retry logic."""
    import requests

    for attempt in range(max_retries):
        try:
            resp = requests.post(LLM_ENDPOINT, json={
                "model": LLM_MODEL,
                "messages": messages,
            }, timeout=120)
            resp.raise_for_status()
            data = resp.json()
            return data["choices"][0]["message"]["content"]
        except Exception as e:
            wait = 2 ** attempt
            print(f"  LLM call failed (attempt {attempt+1}/{max_retries}): {e}", file=sys.stderr)
            if attempt < max_retries - 1:
                print(f"  Retrying in {wait}s...", file=sys.stderr)
                time.sleep(wait)
            else:
                print("  Giving up on this word.", file=sys.stderr)
                return None
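
# Minimal usage sketch (assumes the LLM_ENDPOINT above is reachable; the
# return value is the assistant message string, or None once retries are
# exhausted):
#     reply = llm_chat_completion([
#         {"role": "system", "content": "You are a commonsense annotator."},
#         {"role": "user", "content": "Word: horse"},
#     ])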


def load_vocab():
    """Load folksy vocabulary."""
    vocab = {}
    with open(DATA_DIR / "folksy_vocab.csv", newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            word = row["word"]
            cats = [c.strip() for c in row["categories"].split(",") if c.strip()]
            vocab[word] = {
                "categories": cats,
                "tangibility": float(row.get("tangibility_score", 0)),
                "edge_count": int(row.get("conceptnet_edge_count", 0)),
            }
    return vocab


def load_relations():
    """Load existing relations (ConceptNet + any existing augmented)."""
    edges = defaultdict(list)  # (start, relation) -> [(end, weight, surface)]
    existing_triples = set()   # (start, end, relation) for dedup

    for path in [DATA_DIR / "folksy_relations.csv", AUGMENTED_CSV]:
        if not path.exists():
            continue
        with open(path, newline="", encoding="utf-8") as f:
            for row in csv.DictReader(f):
                sw = row["start_word"]
                ew = row["end_word"]
                rel = row["relation"]
                if not row["weight"]:
                    # Skip rows with a missing weight (corrupt or partial lines).
                    continue
                w = float(row["weight"])
                surf = row.get("surface_text", "")
                edges[(sw, rel)].append((ew, w, surf))
                existing_triples.add((sw, ew, rel))

    return edges, existing_triples


def load_checkpoint():
    """Load enhancement log to determine what's already been processed."""
    processed = set()  # (word, phase)
    if LOG_CSV.exists():
        with open(LOG_CSV, newline="", encoding="utf-8") as f:
            for row in csv.DictReader(f):
                processed.add((row["source_word"], row["phase"]))
    return processed


def append_log(word, phase, edges_generated, edges_accepted, edges_duplicate, edges_oov):
    """Append a row to the enhancement log."""
    write_header = not LOG_CSV.exists()
    with open(LOG_CSV, "a", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        if write_header:
            writer.writerow(["source_word", "phase", "timestamp",
                             "edges_generated", "edges_accepted", "edges_duplicate", "edges_oov"])
        writer.writerow([word, phase, datetime.now().isoformat(),
                         edges_generated, edges_accepted, edges_duplicate, edges_oov])


def append_augmented_edges(edges):
    """Append edges to the augmented relations CSV."""
    write_header = not AUGMENTED_CSV.exists()
    with open(AUGMENTED_CSV, "a", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        if write_header:
            writer.writerow(["start_word", "end_word", "relation", "weight", "surface_text", "source"])
        for e in edges:
            writer.writerow([e["start_word"], e["end_word"], e["relation"],
                             e["weight"], e["surface_text"], e["source"]])


def append_candidates(candidates):
    """Append candidate words to the candidate additions CSV."""
    write_header = not CANDIDATE_CSV.exists()
    with open(CANDIDATE_CSV, "a", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        if write_header:
            writer.writerow(["word", "suggested_by", "relation_context", "frequency"])
        for c in candidates:
            writer.writerow([c["word"], c["suggested_by"], c["relation_context"], c["frequency"]])


# ---------------------------------------------------------------------------
# Parsing
# ---------------------------------------------------------------------------

def parse_llm_relations(response_text, source_word):
    """Parse structured LLM output into edge dicts.

    Handles bullets, numbering, extra whitespace, multi-word targets.
    """
    edges = []
    if not response_text:
        return edges

    for line in response_text.strip().split("\n"):
        line = line.strip()
        if not line:
            continue

        # Strip leading bullets/numbers: "- ", "1. ", "* ", etc.
        line = re.sub(r"^[\d]+[.)]\s*", "", line)
        line = re.sub(r"^[-*•]\s*", "", line)
        line = line.strip()

        if not line or "NONE" in line.upper():
            continue

        # Match: RELATION_TYPE: target_word(s) | surface text
        match = re.match(r"^(\w+):\s*(.+?)\s*\|\s*(.+)$", line)
        if not match:
            continue

        relation, target_raw, surface = match.groups()
        relation = relation.strip()

        if relation not in VALID_RELATIONS:
            continue

        # Normalize target: lowercase, replace spaces with underscores for multi-word
        target = target_raw.strip().lower()
        target = re.sub(r"\s+", "_", target)

        # Skip self-loops
        if target == source_word:
            continue

        edges.append({
            "start_word": source_word,
            "end_word": target,
            "relation": relation,
            "weight": 0.8,
            "surface_text": surface.strip(),
            "source": "llm_augmented",
        })

    return edges
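
# Illustrative parse, using the example line from PHASE1_SYSTEM below:
#     >>> parse_llm_relations("AtLocation: barn | you find a horse in a barn", "horse")
#     [{'start_word': 'horse', 'end_word': 'barn', 'relation': 'AtLocation',
#       'weight': 0.8, 'surface_text': 'you find a horse in a barn',
#       'source': 'llm_augmented'}]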


def parse_bridge_response(response_text, word_a, word_b):
    """Parse bridge discovery LLM output."""
    edges = []
    if not response_text:
        return edges

    for line in response_text.strip().split("\n"):
        line = line.strip()
        if not line:
            continue

        # Strip common prefixes
        line = re.sub(r"^[\d]+[.)]\s*", "", line)
        line = re.sub(r"^[-*•]\s*", "", line)
        line = re.sub(r"^BRIDGE:\s*", "", line, flags=re.IGNORECASE)
        line = line.strip()

        if not line:
            continue

        # BRIDGE_WORD | relation_to_first: TYPE | relation_to_second: TYPE | explanation
        parts = [p.strip() for p in line.split("|")]
        if len(parts) < 3:
            continue

        bridge_word = parts[0].strip().lower().replace(" ", "_")

        # Parse relation_to_first
        rel1_match = re.search(r"(?:relation_to_first|first):\s*(\w+)", parts[1], re.IGNORECASE)
        rel2_match = re.search(r"(?:relation_to_second|second):\s*(\w+)", parts[2], re.IGNORECASE)

        if not rel1_match or not rel2_match:
            # Try simpler format: just the relation type
            rel1_match = re.match(r"(\w+)", parts[1].split(":")[-1].strip())
            rel2_match = re.match(r"(\w+)", parts[2].split(":")[-1].strip())

        if not rel1_match or not rel2_match:
            continue

        rel1 = rel1_match.group(1)
        rel2 = rel2_match.group(1)

        if rel1 not in VALID_RELATIONS or rel2 not in VALID_RELATIONS:
            continue

        explanation = parts[3].strip() if len(parts) > 3 else ""

        # Create edges: word_a -> bridge and bridge -> word_b
        edges.append({
            "start_word": word_a,
            "end_word": bridge_word,
            "relation": rel1,
            "weight": 0.8,
            "surface_text": explanation,
            "source": "llm_bridge",
        })
        edges.append({
            "start_word": bridge_word,
            "end_word": word_b,
            "relation": rel2,
            "weight": 0.8,
            "surface_text": explanation,
            "source": "llm_bridge",
        })

    return edges
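
# Illustrative parse, using the example from PHASE2_SYSTEM below: for
# word_a="cow", word_b="butter", the line
#     "milk | relation_to_first: CapableOf | relation_to_second: MadeOf | milk connects production to product"
# yields two edges, cow -CapableOf-> milk and milk -MadeOf-> butter, both with
# weight 0.8 and source "llm_bridge".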


def parse_property_response(response_text, word):
    """Parse property enrichment LLM output."""
    edges = []
    if not response_text:
        return edges

    for line in response_text.strip().split("\n"):
        line = line.strip()
        if not line:
            continue

        line = re.sub(r"^[\d]+[.)]\s*", "", line)
        line = re.sub(r"^[-*•]\s*", "", line)
        line = line.strip()

        if not line:
            continue

        # PROPERTY | explanation
        parts = [p.strip() for p in line.split("|")]
        if len(parts) < 1:
            continue

        prop = parts[0].strip().lower().replace(" ", "_")
        explanation = parts[1].strip() if len(parts) > 1 else f"{word} is {prop}"

        if not prop or prop == word:
            continue

        edges.append({
            "start_word": word,
            "end_word": prop,
            "relation": "HasProperty",
            "weight": 0.8,
            "surface_text": explanation,
            "source": "llm_property",
        })

    return edges


# ---------------------------------------------------------------------------
# Phase 1: Per-Word Expansion
# ---------------------------------------------------------------------------

PHASE1_SYSTEM = """You are a commonsense knowledge annotator. You will be given a concrete noun and its known relationships. Your job is to generate ADDITIONAL commonsense relationships that are missing.

Rules:
- Only generate relationships involving concrete, tangible things (animals, foods, tools, plants, buildings, weather, landscape, household objects)
- Every relationship must be something a typical adult would agree is true
- Do not repeat any relationship already listed as "known"
- Target words should be common English words (top 3000 frequency preferred)
- Output ONLY the structured format shown below, one relationship per line
- If you cannot think of good relationships for a given type, output NONE for that type
- Aim for 3-5 relationships per type where possible

Output format (one per line):
RELATION_TYPE: target_word | short natural phrasing

Example output:
AtLocation: barn | you find a horse in a barn
UsedFor: riding | a horse is used for riding
HasA: mane | a horse has a mane
CapableOf: gallop | a horse can gallop
MadeOf: NONE
PartOf: herd | a horse is part of a herd"""


PHASE1_USER = """Word: {word}
Categories: {categories}

Known relationships:
{existing_edges}

Generate additional relationships for these types:
- AtLocation (where is it found?)
- UsedFor (what is it used for?)
- HasA (what does it have / contain?)
- PartOf (what is it part of?)
- CapableOf (what can it do?)
- MadeOf (what is it made of?)
- HasPrerequisite (what do you need before you can have/use it?)
- Causes (what does it cause or lead to?)
- HasProperty (what adjectives describe it? — limit to physical/sensory properties)"""


def format_existing_edges(edges_dict, word):
    """Format existing edges for a word grouped by relation type."""
    relation_types = ["AtLocation", "UsedFor", "HasA", "PartOf", "CapableOf",
                      "MadeOf", "HasPrerequisite", "Causes", "HasProperty"]

    lines = []
    for rel in relation_types:
        targets = edges_dict.get((word, rel), [])
        if targets:
            formatted = ", ".join(f"{t[0]} (weight {t[1]:.1f})" for t in targets[:10])
            lines.append(f"{rel}: {formatted}")
        else:
            lines.append(f"{rel}: (none in database)")
    return "\n".join(lines)


def run_phase1(vocab, edges, existing_triples, checkpoint, dry_run=False):
    """Phase 1: Per-word relationship expansion."""
    words = sorted(vocab.keys())
    total = len(words)
    total_accepted = 0
    total_skipped = 0

    print(f"Phase 1: Processing {total} words...")

    for i, word in enumerate(words):
        if (word, "1") in checkpoint:
            total_skipped += 1
            continue

        categories = ", ".join(vocab[word]["categories"])
        existing = format_existing_edges(edges, word)

        user_prompt = PHASE1_USER.format(
            word=word, categories=categories, existing_edges=existing
        )

        messages = [
            {"role": "system", "content": PHASE1_SYSTEM},
            {"role": "user", "content": user_prompt},
        ]

        if dry_run:
            if i < 3:  # Show first 3 prompts
                print(f"\n--- Prompt for '{word}' ---")
                print(f"System: {PHASE1_SYSTEM[:200]}...")
                print(f"User:\n{user_prompt}")
            elif i == 3:
                print(f"\n... ({total - 3} more words) ...")
            continue

        response = llm_chat_completion(messages)
        parsed = parse_llm_relations(response, word) if response else []

        # Classify edges
        accepted = []
        candidates = []
        duplicates = 0

        for edge in parsed:
            triple = (edge["start_word"], edge["end_word"], edge["relation"])
            if triple in existing_triples:
                duplicates += 1
                continue

            existing_triples.add(triple)

            if edge["end_word"] in vocab:
                accepted.append(edge)
            else:
                candidates.append({
                    "word": edge["end_word"],
                    "suggested_by": word,
                    "relation_context": f"{edge['relation']}: {edge['surface_text']}",
                    "frequency": 1,
                })

        if accepted:
            append_augmented_edges(accepted)
            # Also update in-memory edges for subsequent words
            for e in accepted:
                edges[(e["start_word"], e["relation"])].append(
                    (e["end_word"], e["weight"], e["surface_text"]))

        if candidates:
            append_candidates(candidates)

        total_accepted += len(accepted)

        append_log(word, "1", len(parsed), len(accepted), duplicates, len(candidates))

        if (i + 1) % 50 == 0:
            print(f"  [{i+1}/{total}] {total_accepted} edges accepted so far")

        time.sleep(0.1)

    if dry_run:
        print(f"\nDry run complete. Would process {total - total_skipped} words.")
    else:
        print(f"\nPhase 1 complete: {total_accepted} new edges accepted.")


# ---------------------------------------------------------------------------
# Phase 2: Cross-Word Bridge Discovery
# ---------------------------------------------------------------------------

PHASE2_SYSTEM = """You are a commonsense knowledge annotator. You will be given two concrete nouns. Your job is to identify a BRIDGE word that connects them — something that relates to both.

Rules:
- The bridge word must be a common, concrete noun
- State the relationship type for each connection
- Valid relationship types: AtLocation, UsedFor, HasA, PartOf, CapableOf, MadeOf, HasPrerequisite, Causes, HasProperty, ReceivesAction, Desires, CausesDesire, LocatedNear, CreatedBy
- Output format: BRIDGE_WORD | relation_to_first: TYPE | relation_to_second: TYPE | explanation

Example:
Words: "cow" and "butter"
milk | relation_to_first: CapableOf | relation_to_second: MadeOf | milk connects production to product"""


PHASE2_USER = """Words: "{word_a}" and "{word_b}"
Categories: {word_a} is {categories_a}, {word_b} is {categories_b}
Find 1-3 bridge words that connect them."""


def build_reachability(vocab, edges):
    """Build 2-hop reachability from vocab words to other vocab words."""
    vocab_set = set(vocab.keys())
    reachable = defaultdict(set)  # word -> set of reachable vocab words

    for word in vocab:
        # Direct (1-hop) neighbors in vocab
        for (sw, rel), targets in edges.items():
            if sw == word:
                for (ew, w, s) in targets:
                    if ew in vocab_set and ew != word:
                        reachable[word].add(ew)
                    # 2-hop from this neighbor
                    for (sw2, rel2), targets2 in edges.items():
                        if sw2 == ew:
                            for (ew2, w2, s2) in targets2:
                                if ew2 in vocab_set and ew2 != word:
                                    reachable[word].add(ew2)

    return reachable
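
# Note on cost: the inner loops rescan the full edge dict for every vocab word
# and every neighbor, so this grows roughly with |vocab| * |edges| * degree.
# Fine at this scale; if the graph grows, pre-indexing edges by start word
# (one pass building {start: [(end, ...)]}) would make the 2-hop walk cheap.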


def run_phase2(vocab, edges, existing_triples, checkpoint, dry_run=False):
    """Phase 2: Cross-word bridge discovery."""
    print("Phase 2: Building reachability matrix...")
    reachable = build_reachability(vocab, edges)

    # Find low-connectivity words
    vocab_set = set(vocab.keys())
    low_connectivity = []
    for word in vocab:
        reach_count = len(reachable.get(word, set()))
        if reach_count < 10:
            low_connectivity.append((word, reach_count))

    low_connectivity.sort(key=lambda x: x[1])
    print(f"  {len(low_connectivity)} words with <10 reachable vocab words")

    # Build category index
    by_category = defaultdict(list)
    for word, info in vocab.items():
        for cat in info["categories"]:
            by_category[cat].append(word)

    total_accepted = 0
    pairs_processed = 0
    total_skipped = 0

    for word, reach_count in low_connectivity:
        if (word, "2") in checkpoint:
            total_skipped += 1
            continue

        word_cats = vocab[word]["categories"]
        word_reachable = reachable.get(word, set())

        # Find same-category words that are unreachable
        unreachable = []
        for cat in word_cats:
            for peer in by_category.get(cat, []):
                if peer != word and peer not in word_reachable:
                    unreachable.append(peer)

        if not unreachable:
            append_log(word, "2", 0, 0, 0, 0)
            continue

        # Sample 5-10 unreachable peers
        sample = random.sample(unreachable, min(10, len(unreachable)))

        accepted_for_word = 0

        for peer in sample:
            pair_key = f"{word}:{peer}"
            # Note: append_log only ever records per-word entries, so this
            # pair-level check never matches; phase 2 resumability is
            # effectively per word, not per pair.
            if (pair_key, "2") in checkpoint:
                continue

            categories_a = ", ".join(vocab[word]["categories"])
            categories_b = ", ".join(vocab[peer]["categories"])

            user_prompt = PHASE2_USER.format(
                word_a=word, word_b=peer,
                categories_a=categories_a, categories_b=categories_b,
            )

            messages = [
                {"role": "system", "content": PHASE2_SYSTEM},
                {"role": "user", "content": user_prompt},
            ]

            if dry_run:
                if pairs_processed < 3:
                    print(f"\n--- Bridge prompt: '{word}' <-> '{peer}' ---")
                    print(f"User:\n{user_prompt}")
                elif pairs_processed == 3:
                    print("\n... (more pairs) ...")
                pairs_processed += 1
                continue

            response = llm_chat_completion(messages)
            parsed = parse_bridge_response(response, word, peer) if response else []

            accepted = []
            duplicates = 0
            oov = 0

            for edge in parsed:
                triple = (edge["start_word"], edge["end_word"], edge["relation"])
                if triple in existing_triples:
                    duplicates += 1
                    continue
                existing_triples.add(triple)

                # For bridge edges, both endpoints should ideally be in vocab
                if edge["start_word"] in vocab_set and edge["end_word"] in vocab_set:
                    accepted.append(edge)
                elif edge["start_word"] in vocab_set or edge["end_word"] in vocab_set:
                    # At least one end in vocab — still useful
                    accepted.append(edge)
                else:
                    oov += 1

            if accepted:
                append_augmented_edges(accepted)
                for e in accepted:
                    edges[(e["start_word"], e["relation"])].append(
                        (e["end_word"], e["weight"], e["surface_text"]))
                accepted_for_word += len(accepted)

            pairs_processed += 1
            time.sleep(0.1)

        total_accepted += accepted_for_word
        append_log(word, "2", 0, accepted_for_word, 0, 0)

        if pairs_processed % 20 == 0:
            print(f"  {pairs_processed} pairs processed, {total_accepted} edges accepted")

    if dry_run:
        print(f"\nDry run complete. Would process {pairs_processed} word pairs.")
    else:
        print(f"\nPhase 2 complete: {total_accepted} bridge edges accepted from {pairs_processed} pairs.")


# ---------------------------------------------------------------------------
# Phase 3: Property Enrichment
# ---------------------------------------------------------------------------

PHASE3_SYSTEM = """You are a commonsense knowledge annotator. Given a concrete noun, list its most distinctive physical or sensory properties — things you could see, touch, hear, smell, or taste. Also list behavioral properties for animals.

Rules:
- Only physical/sensory/behavioral properties, not abstract qualities
- Properties should DISTINGUISH this thing from similar things in its category
- Output one property per line as: PROPERTY | brief explanation
- Aim for 5-8 properties"""


PHASE3_USER = """Word: {word}
Category: {categories}
Other words in same category: {peers}

What properties distinguish {word} from the others listed?"""


def run_phase3(vocab, edges, existing_triples, checkpoint, dry_run=False):
    """Phase 3: Property enrichment for false_equivalence templates."""
    by_category = defaultdict(list)
    for word, info in vocab.items():
        for cat in info["categories"]:
            by_category[cat].append(word)

    words = sorted(vocab.keys())
    total = len(words)
    total_accepted = 0
    total_skipped = 0

    print(f"Phase 3: Property enrichment for {total} words...")

    for i, word in enumerate(words):
        if (word, "3") in checkpoint:
            total_skipped += 1
            continue

        word_cats = vocab[word]["categories"]
        categories = ", ".join(word_cats)

        # Gather same-category peers (sample of 10)
        peers = set()
        for cat in word_cats:
            for peer in by_category.get(cat, []):
                if peer != word:
                    peers.add(peer)
        peer_sample = random.sample(list(peers), min(10, len(peers))) if peers else []

        if not peer_sample:
            append_log(word, "3", 0, 0, 0, 0)
            continue

        user_prompt = PHASE3_USER.format(
            word=word, categories=categories,
            peers=", ".join(peer_sample),
        )

        messages = [
            {"role": "system", "content": PHASE3_SYSTEM},
            {"role": "user", "content": user_prompt},
        ]

        if dry_run:
            if i < 3:
                print(f"\n--- Property prompt for '{word}' ---")
                print(f"User:\n{user_prompt}")
            elif i == 3:
                print(f"\n... ({total - 3} more words) ...")
            continue

        response = llm_chat_completion(messages)
        parsed = parse_property_response(response, word) if response else []

        accepted = []
        duplicates = 0

        for edge in parsed:
            triple = (edge["start_word"], edge["end_word"], edge["relation"])
            if triple in existing_triples:
                duplicates += 1
                continue
            existing_triples.add(triple)
            accepted.append(edge)

        if accepted:
            append_augmented_edges(accepted)
            for e in accepted:
                edges[(e["start_word"], e["relation"])].append(
                    (e["end_word"], e["weight"], e["surface_text"]))

        total_accepted += len(accepted)
        append_log(word, "3", len(parsed), len(accepted), duplicates, 0)

        if (i + 1) % 50 == 0:
            print(f"  [{i+1}/{total}] {total_accepted} properties accepted so far")

        time.sleep(0.1)

    if dry_run:
        print(f"\nDry run complete. Would process {total - total_skipped} words.")
    else:
        print(f"\nPhase 3 complete: {total_accepted} new HasProperty edges accepted.")


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def main():
    parser = argparse.ArgumentParser(
        description="LLM-augmented graph enhancement for folksy subgraph."
    )
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--phase", type=int, choices=[1, 2, 3],
                       help="Run a specific phase (1, 2, or 3)")
    group.add_argument("--all", action="store_true",
                       help="Run all three phases in sequence")
    parser.add_argument("--dry-run", action="store_true",
                        help="Print prompts without calling LLM")

    args = parser.parse_args()

    vocab = load_vocab()
    edges, existing_triples = load_relations()
    checkpoint = load_checkpoint()

    print(f"Loaded {len(vocab)} vocab words, {len(existing_triples)} existing edge triples.")
    print(f"Checkpoint: {len(checkpoint)} (word, phase) pairs already processed.")

    phases = [args.phase] if args.phase else [1, 2, 3]

    for phase in phases:
        print(f"\n{'='*60}")
        print(f"Running Phase {phase}")
        print(f"{'='*60}")

        if phase == 1:
            run_phase1(vocab, edges, existing_triples, checkpoint, args.dry_run)
        elif phase == 2:
            run_phase2(vocab, edges, existing_triples, checkpoint, args.dry_run)
        elif phase == 3:
            run_phase3(vocab, edges, existing_triples, checkpoint, args.dry_run)

        # Reload checkpoint after each phase for resumability
        checkpoint = load_checkpoint()

    print("\nDone.")


if __name__ == "__main__":
    main()
512 scripts/expand_vocab.py Normal file
@@ -0,0 +1,512 @@
#!/usr/bin/env python3
"""Expand folksy vocabulary with high-quality candidates from LLM suggestions.

Reads candidate_additions.csv (words suggested by the LLM during phase 1 that
weren't in the vocab), filters for quality, uses the LLM to assign categories,
and appends the survivors to folksy_vocab.csv.

After running this, re-run `enhance_graph.py --phase 1` to generate edges
for the new words (the checkpoint will skip already-processed words).

Usage:
    python scripts/expand_vocab.py                    # Full run
    python scripts/expand_vocab.py --dry-run          # Show what would be added
    python scripts/expand_vocab.py --min-citations 8  # Stricter threshold
"""

import argparse
import csv
import json
import re
import shutil
import sys
import time
from collections import Counter, defaultdict
from datetime import datetime
from pathlib import Path

SCRIPT_DIR = Path(__file__).parent
PROJECT_DIR = SCRIPT_DIR.parent
DATA_DIR = PROJECT_DIR / "data"

LLM_ENDPOINT = "http://192.168.1.100:8853/v1d/chat/completions"
LLM_MODEL = "THUDM-GLM4-32B"

VOCAB_CSV = DATA_DIR / "folksy_vocab.csv"
CANDIDATE_CSV = DATA_DIR / "candidate_additions.csv"

# Valid categories from the existing vocabulary
VALID_CATEGORIES = {
    "animal", "beverage", "bird", "building", "clothing", "container", "crop",
    "fabric", "fish", "flower", "food", "fruit", "furniture", "grain", "herb",
    "insect", "instrument", "landscape", "material", "metal", "mineral",
    "organism", "plant", "rock", "seed", "shelter", "spice", "stone",
    "structure", "tool", "tree", "vegetable", "vehicle", "water", "weapon", "wood",
}

# ---------------------------------------------------------------------------
# Exclusion lists
# ---------------------------------------------------------------------------

# Abstract concepts, emotions, processes — not concrete enough for folksy vocab
EXCLUDE_ABSTRACT = {
    "ecosystem", "satisfaction", "fullness", "warmth", "fear", "relaxation",
    "growth", "interest", "nature", "protection", "digestion", "injury",
    "decoration", "construction", "landscape", "noise", "sound", "energy",
    "nourishment", "nutrition", "pollination", "sustainability", "tradition",
    "biodiversity", "symbolism", "elegance", "resilience", "patience",
    "beauty", "abundance", "fertility", "creativity", "harmony", "comfort",
    "curiosity", "companionship", "loyalty", "aggression", "alertness",
    "camouflage", "predation", "migration", "hibernation", "decomposition",
    "erosion", "combustion", "fermentation", "oxidation", "corrosion",
    "photosynthesis", "respiration", "evaporation", "precipitation",
    "transpiration", "germination", "excitement", "enjoyment", "satiety",
    "stability", "organization", "fragrance", "moisture", "wildlife",
    "preservation", "conversation", "inspiration", "storage", "observation",
    "hydration", "destruction", "entertainment", "education", "knowledge",
    "safety", "practice", "research", "skill", "space", "license",
    "collection", "habitat", "pollution", "health", "vibration", "wonder",
    "awe", "refreshment", "irritation", "happiness", "joy", "damage",
    "death", "pain", "thirst", "alarm", "contents", "ingredients",
    "electricity", "oxygen", "navigation", "recreation", "meditation",
    "celebration", "communication", "imagination", "devotion",
    "ambition", "endurance", "independence", "discipline", "cooperation",
    "sweetness", "aroma", "flavor", "texture",
    "smell", "color", "surface", "bottom", "edge",
    "nutrients", "study", "outfit", "upholstery",
}

# Scientific/technical — not folksy enough for folk wisdom
EXCLUDE_TECHNICAL = {
    "cellulose", "exoskeleton", "protein", "tissue", "cells", "alloy",
    "enzyme", "chlorophyll", "genome", "photon",
    "organism", "molecule", "compound", "polymer", "isotope",
    "ecosystem", "metabolism", "catalyst", "membrane", "chromosome",
    "cell", "nutrient", "ingredient", "material", "content",
}

# Collective/institutional nouns — not concrete individual things
EXCLUDE_INSTITUTIONAL = {
    "orchestra", "fleet", "arsenal", "toolkit", "collection",
    "restaurant", "museum", "university", "corporation", "organization",
    "musician", "breakfast", "dinner", "meal", "dish", "sandwich",
    "seafood", "refrigerator", "garage", "basement", "park",
}

# Adjectives and properties — useful as HasProperty targets but not as vocab words
EXCLUDE_ADJECTIVES = {
    "small", "large", "heavy", "colorful", "green", "brown", "hard",
    "white", "round", "sharp", "sturdy", "long", "soft", "flat",
    "sweet", "bitter", "smooth", "rough", "bright", "dark", "dry",
    "wet", "thick", "thin", "warm", "cold", "hot", "tall", "short",
    "red", "blue", "yellow", "black", "grey", "gray", "pink",
    "fragrant", "loud", "spicy", "sour", "tough", "delicate", "strong",
    "weak", "light", "dense", "portable", "lightweight", "transparent",
    "opaque", "flexible", "rigid", "brittle", "elastic", "porous",
    "compact", "edible", "toxic", "aromatic", "nocturnal", "aquatic",
    "durable", "cylindrical", "wooden", "shiny", "solid", "narrow",
    "metallic", "pungent", "juicy", "fast", "powerful", "woody",
    "fibrous", "savory", "liquid", "enclosed", "rectangular", "wild",
    "feathered", "leafy", "crunchy", "dangerous", "fuzzy", "slimy",
    "natural", "waterproof", "electronic",
}

# Words that are clearly verbs or gerunds
EXCLUDE_VERBS = {
    "eating", "cooking", "growing", "fishing", "hunting", "flying",
    "mining", "flavoring", "singing", "blooming", "holding", "baking",
    "ripening", "opening", "cutting", "protecting", "seasoning",
    "storing", "building", "swimming", "brewing", "weaving", "carving",
    "climbing", "digging", "plowing", "sewing", "spinning", "tanning",
    "swim", "run", "grow", "eat", "hunt", "peck", "bite", "dive",
    "crawl", "cut", "shine", "sparkle",
}


def singularize(word):
    """Best-effort singularization. Returns (singular, was_plural)."""
    # Irregular plurals
    irregulars = {
        "teeth": "tooth", "feet": "foot", "geese": "goose", "mice": "mouse",
        "lice": "louse", "dice": "die", "oxen": "ox", "children": "child",
        "leaves": "leaf", "loaves": "loaf", "halves": "half", "knives": "knife",
        "lives": "life", "wives": "wife", "wolves": "wolf", "shelves": "shelf",
        "calves": "calf",
    }
    if word in irregulars:
        return irregulars[word], True

    # -ves -> -f (already covered some above, catch remaining)
    if word.endswith("ves"):
        candidate = word[:-3] + "f"
        return candidate, True

    # -ies -> -y
    if word.endswith("ies") and len(word) > 4:
        return word[:-3] + "y", True

    # -ses, -xes, -zes, -ches, -shes -> drop -es
    if word.endswith(("ses", "xes", "zes", "ches", "shes")):
        return word[:-2], True

    # -s (but not -ss, -us, -is)
    if word.endswith("s") and not word.endswith(("ss", "us", "is")):
        return word[:-1], True

    return word, False
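
# Illustrative behavior of the heuristics above:
#     >>> singularize("geese")
#     ('goose', True)        # irregular table
#     >>> singularize("berries")
#     ('berry', True)        # -ies -> -y
#     >>> singularize("boxes")
#     ('box', True)          # -xes -> drop -es
#     >>> singularize("grass")
#     ('grass', False)       # -ss endings are left alone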


def is_plural_of_existing(word, existing_vocab):
    """Check if word is likely a plural form of an existing vocab word."""
    # word + s
    if word.endswith("s") and word[:-1] in existing_vocab:
        return True
    # word + es
    if word.endswith("es") and word[:-2] in existing_vocab:
        return True
    # word ending ies -> y
    if word.endswith("ies") and word[:-3] + "y" in existing_vocab:
        return True
    # word ending ves -> f/fe
    if word.endswith("ves"):
        if word[:-3] + "f" in existing_vocab:
            return True
        if word[:-3] + "fe" in existing_vocab:
            return True
    return False
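
# Illustrative: with "wolf" in the vocab, is_plural_of_existing("wolves", vocab)
# is True via the -ves -> -f rule, so "wolves" would be rejected as a duplicate.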


def is_plural_of_candidate(word, accepted_words):
    """Check if word is a plural of another candidate, or vice versa."""
    # Is this word a plural of something accepted?
    if word.endswith("s") and word[:-1] in accepted_words:
        return True
    if word.endswith("es") and word[:-2] in accepted_words:
        return True
    if word.endswith("ies") and word[:-3] + "y" in accepted_words:
        return True
    # Is something accepted a plural of this word?
    if word + "s" in accepted_words:
        return True
    if word + "es" in accepted_words:
        return True
    if word.endswith("f") and word[:-1] + "ves" in accepted_words:
        return True
    if word.endswith("fe") and word[:-2] + "ves" in accepted_words:
        return True
    return False


# ---------------------------------------------------------------------------
# LLM categorization
# ---------------------------------------------------------------------------

CATEGORIZE_SYSTEM = """You are a vocabulary categorizer. Given a list of concrete nouns, assign each one to one or more categories from this fixed list:

animal, beverage, bird, building, clothing, container, crop, fabric, fish, flower, food, fruit, furniture, grain, herb, insect, instrument, landscape, material, metal, mineral, organism, plant, rock, seed, shelter, spice, stone, structure, tool, tree, vegetable, vehicle, water, weapon, wood

Rules:
- Use ONLY categories from the list above
- A word can have multiple categories (e.g., "brick" -> material, stone)
- If a word fits none of the categories well, output SKIP
- Output format: word: category1, category2
- One word per line"""

CATEGORIZE_USER = """Categorize these words:
{word_list}"""


def llm_chat_completion(messages, max_retries=3):
    """Chat completion with retry logic."""
    import requests

    for attempt in range(max_retries):
        try:
            resp = requests.post(LLM_ENDPOINT, json={
                "model": LLM_MODEL,
                "messages": messages,
            }, timeout=120)
            resp.raise_for_status()
            data = resp.json()
            return data["choices"][0]["message"]["content"]
        except Exception as e:
            wait = 2 ** attempt
            print(f"  LLM call failed (attempt {attempt+1}/{max_retries}): {e}",
                  file=sys.stderr)
            if attempt < max_retries - 1:
                print(f"  Retrying in {wait}s...", file=sys.stderr)
                time.sleep(wait)
            else:
                print("  Giving up on this batch.", file=sys.stderr)
                return None


def parse_categories(response_text, valid_words):
    """Parse LLM categorization response."""
    result = {}
    if not response_text:
        return result

    for line in response_text.strip().split("\n"):
        line = line.strip()
        if not line:
            continue

        # Strip bullets/numbers
        line = re.sub(r"^[\d]+[.)]\s*", "", line)
        line = re.sub(r"^[-*•]\s*", "", line)
        line = line.strip()

        # Match: word: cat1, cat2
        match = re.match(r"^(\w+)\s*:\s*(.+)$", line)
        if not match:
            continue

        word = match.group(1).strip().lower()
        cats_raw = match.group(2).strip()

        if "SKIP" in cats_raw.upper():
            continue

        cats = []
        for c in cats_raw.split(","):
            c = c.strip().lower()
            if c in VALID_CATEGORIES:
                cats.append(c)

        if word in valid_words and cats:
            result[word] = cats

    return result
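
# Illustrative parse (assumed LLM response line, per CATEGORIZE_SYSTEM above):
#     >>> parse_categories("brick: material, stone", {"brick"})
#     {'brick': ['material', 'stone']}
# Lines marked SKIP, unknown categories, and words outside valid_words are dropped.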


def categorize_words(words, batch_size=25):
    """Categorize words using the LLM in batches."""
    all_categories = {}
    word_set = set(words)

    for i in range(0, len(words), batch_size):
        batch = words[i:i + batch_size]
        word_list = "\n".join(f"- {w}" for w in batch)

        messages = [
            {"role": "system", "content": CATEGORIZE_SYSTEM},
            {"role": "user", "content": CATEGORIZE_USER.format(word_list=word_list)},
        ]

        response = llm_chat_completion(messages)
        parsed = parse_categories(response, word_set)
        all_categories.update(parsed)

        categorized = len(parsed)
        print(f"  Batch {i // batch_size + 1}: {categorized}/{len(batch)} categorized")
        time.sleep(0.1)

    return all_categories


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def main():
    parser = argparse.ArgumentParser(
        description="Expand folksy vocabulary with LLM-suggested candidates."
    )
    parser.add_argument("--min-citations", type=int, default=5,
                        help="Minimum number of vocab words that suggested this candidate (default: 5)")
    parser.add_argument("--dry-run", action="store_true",
                        help="Show what would be added without modifying files")
    parser.add_argument("--no-llm", action="store_true",
                        help="Skip LLM categorization (use placeholder categories)")

    args = parser.parse_args()

    # Load existing vocab
    existing_vocab = {}
    with open(VOCAB_CSV, newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            existing_vocab[row["word"]] = row
    existing_words = set(existing_vocab.keys())
    print(f"Existing vocabulary: {len(existing_words)} words")

    # Load candidates
    candidates = []
    with open(CANDIDATE_CSV, newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            candidates.append(row)

    # Aggregate: count unique sources per candidate word
    word_sources = defaultdict(set)
    for c in candidates:
        word_sources[c["word"]].add(c["suggested_by"])

    print(f"Total candidate rows: {len(candidates)}")
    print(f"Unique candidate words: {len(word_sources)}")

    # Normalize plurals: merge citation counts into singular forms
    normalized_sources = defaultdict(set)
    for word, sources in word_sources.items():
        singular, was_plural = singularize(word)
        # Merge into the singular form
        normalized_sources[singular].update(sources)
    # Replace word_sources with the normalized version
    word_sources = dict(normalized_sources)
    print(f"After singularization: {len(word_sources)} unique candidates")

    # Filter
    accepted = []
    reject_reasons = Counter()

    # Sort by citation count descending for consistent ordering
    sorted_candidates = sorted(word_sources.items(), key=lambda x: len(x[1]), reverse=True)
    accepted_set = set()

    for word, sources in sorted_candidates:
        citation_count = len(sources)

        # Minimum citation threshold
        if citation_count < args.min_citations:
            reject_reasons["below_threshold"] += 1
            continue

        # No multi-word (underscore) candidates
        if "_" in word:
            reject_reasons["multi_word"] += 1
            continue

        # Already in vocab
        if word in existing_words:
            reject_reasons["already_in_vocab"] += 1
            continue

        # Exclude abstracts
        if word in EXCLUDE_ABSTRACT:
            reject_reasons["abstract"] += 1
            continue

        # Exclude adjectives
        if word in EXCLUDE_ADJECTIVES:
            reject_reasons["adjective"] += 1
            continue

        # Exclude verbs/gerunds
        if word in EXCLUDE_VERBS:
            reject_reasons["verb_gerund"] += 1
            continue

        # Exclude technical/scientific
        if word in EXCLUDE_TECHNICAL:
            reject_reasons["technical"] += 1
            continue

        # Exclude institutional/collective
        if word in EXCLUDE_INSTITUTIONAL:
            reject_reasons["institutional"] += 1
            continue

        # Gerund pattern catch-all (but allow exceptions)
        if word.endswith("ing") and word not in {"ring", "spring", "string", "wing", "ceiling"}:
            reject_reasons["gerund_pattern"] += 1
            continue

        # Exclude plurals of existing vocab
        if is_plural_of_existing(word, existing_words):
            reject_reasons["plural_of_existing"] += 1
            continue

        # Exclude plurals of already-accepted candidates
        if is_plural_of_candidate(word, accepted_set):
            reject_reasons["plural_of_candidate"] += 1
            continue

        # Single character
        if len(word) < 2:
            reject_reasons["too_short"] += 1
            continue

        accepted.append((word, citation_count))
        accepted_set.add(word)

    print("\nFiltering results:")
    print(f"  Accepted: {len(accepted)}")
    for reason, count in reject_reasons.most_common():
        print(f"  Rejected ({reason}): {count}")

    if not accepted:
        print("\nNo candidates passed filtering.")
        return

    # Show accepted words
    print(f"\nAccepted candidates ({len(accepted)}):")
    for word, count in accepted:
        print(f"  {word:25s} cited by {count:3d} vocab words")

    if args.dry_run:
        print(f"\nDry run complete. Would add {len(accepted)} words to vocabulary.")
        return

    # Categorize with LLM
    words_to_categorize = [w for w, _ in accepted]

    if args.no_llm:
        print("\nSkipping LLM categorization (--no-llm). Using 'material' as placeholder.")
        categories = {w: ["material"] for w in words_to_categorize}
    else:
        print(f"\nCategorizing {len(words_to_categorize)} words with LLM...")
        categories = categorize_words(words_to_categorize)

    # Words the LLM couldn't categorize get skipped
    uncategorized = [w for w in words_to_categorize if w not in categories]
    if uncategorized:
        print(f"\n  {len(uncategorized)} words could not be categorized (skipped):")
        for w in uncategorized:
            print(f"    {w}")

    # Build new vocab entries
    new_entries = []
    for word, citation_count in accepted:
        if word not in categories:
            continue
        cats = categories[word]
        new_entries.append({
            "word": word,
            "categories": ",".join(cats),
            "tangibility_score": "0.80",
            "conceptnet_edge_count": "0",
            "frequency_rank": "0",
        })

    if not new_entries:
        print("\nNo entries to add after categorization.")
        return

    # Backup existing vocab
    backup_path = VOCAB_CSV.with_suffix(f".csv.bak.{datetime.now().strftime('%Y%m%d_%H%M%S')}")
    shutil.copy2(VOCAB_CSV, backup_path)
    print(f"\nBacked up vocabulary to {backup_path.name}")

    # Append to vocab CSV
    with open(VOCAB_CSV, "a", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=["word", "categories", "tangibility_score",
                                               "conceptnet_edge_count", "frequency_rank"])
        for entry in new_entries:
            writer.writerow(entry)

    print(f"\nAdded {len(new_entries)} words to {VOCAB_CSV.name}")
    print(f"New vocabulary size: {len(existing_words) + len(new_entries)}")

    # Summary by category
    cat_counts = Counter()
    for entry in new_entries:
        for c in entry["categories"].split(","):
            cat_counts[c.strip()] += 1
    print("\nNew words by category:")
    for cat, count in cat_counts.most_common():
        print(f"  {cat:20s} {count:3d}")

    print("\nNext step: run 'python scripts/enhance_graph.py --phase 1' to generate edges for new words.")


if __name__ == "__main__":
    main()
|
||||
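For reference, the plural checks called above can be as simple as suffix heuristics. A minimal sketch, assuming the real is_plural_of_existing / is_plural_of_candidate helpers (defined earlier in this script) work along these lines; the helper name below is hypothetical:

def _looks_like_plural_of(word, known_words):
    """Hypothetical sketch of the plural heuristics assumed above."""
    if word.endswith("ies") and word[:-3] + "y" in known_words:  # berries -> berry
        return True
    if word.endswith("es") and word[:-2] in known_words:  # boxes -> box
        return True
    if word.endswith("s") and word[:-1] in known_words:  # ropes -> rope
        return True
    return False
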
177
scripts/filter_corpus.py
Normal file
@ -0,0 +1,177 @@
#!/usr/bin/env python3
"""Quality filtering for polished folksy sayings.

Reads corpus_polished.jsonl, applies quality filters, and outputs the filtered
corpus plus a discard analysis.

Usage:
    python scripts/filter_corpus.py
    python scripts/filter_corpus.py --input corpus/corpus_polished.jsonl --output corpus/corpus_filtered.jsonl
"""

import argparse
import csv
import json
import sys
from collections import Counter
from difflib import SequenceMatcher
from pathlib import Path

SCRIPT_DIR = Path(__file__).parent
PROJECT_DIR = SCRIPT_DIR.parent
CORPUS_DIR = PROJECT_DIR / "corpus"


def quality_filter(entry):
    """Apply quality filters to a polished entry.

    Returns (passed, reason) tuple.
    """
    text = entry.get("polished_text", "")
    if not text:
        return False, "no_polished_text"

    words = text.split()

    # Length check
    if len(words) > 25:
        return False, "too_long"
    if len(words) < 5:
        return False, "too_short"

    # Must contain at least 2 of the original slot-fill nouns
    slot_words = set(entry.get("slots", {}).values())
    words_present = sum(1 for w in slot_words if w.lower() in text.lower())
    if words_present < 2:
        return False, "lost_key_nouns"
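    # Note: this is a substring check, so short slot words can also match
    # inside longer words (e.g. "ox" inside "box"); a loose test that errs
    # toward keeping entries.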

    # No raw ConceptNet artifacts (multi-word underscore phrases)
    if "_" in text:
        return False, "conceptnet_artifact"

    # No broken templates (unfilled slots)
    if "{" in text or "}" in text:
        return False, "unfilled_slot"

    return True, "pass"


def is_near_duplicate(text_a, text_b, threshold=0.75):
    """Check if two texts are near-duplicates."""
    return SequenceMatcher(None, text_a.lower(), text_b.lower()).ratio() > threshold

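# Illustrative ratios: near-identical sayings score high, e.g.
#   SequenceMatcher(None, "a stitch in time saves nine",
#                   "a stitch in time saves ten").ratio()  # roughly 0.9 -> duplicate
# while structurally different sayings land well under the 0.75 default.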


def deduplicate_within_family(entries):
    """Remove near-duplicates within each meta-template family.

    Returns (kept, removed) lists.
    """
    by_family = {}
    for entry in entries:
        family = entry.get("meta_template", "unknown")
        by_family.setdefault(family, []).append(entry)

    kept = []
    removed = []

    for family, family_entries in by_family.items():
        family_kept = []
        for entry in family_entries:
            text = entry.get("polished_text", "")
            is_dup = False
            for existing in family_kept:
                if is_near_duplicate(text, existing.get("polished_text", "")):
                    is_dup = True
                    break
            if is_dup:
                removed.append((entry, "near_duplicate"))
            else:
                family_kept.append(entry)
        kept.extend(family_kept)

    return kept, removed

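# Design note: deduplication is greedy (the first entry kept wins) and
# pairwise within each family, so the cost is roughly O(n^2) per template
# family; presumably acceptable at this corpus scale.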


def main():
    parser = argparse.ArgumentParser(description="Quality filtering for polished folksy sayings.")
    parser.add_argument("--input", default=str(CORPUS_DIR / "corpus_polished.jsonl"),
                        help="Input polished JSONL file")
    parser.add_argument("--output", default=str(CORPUS_DIR / "corpus_filtered.jsonl"),
                        help="Output filtered JSONL file")
    parser.add_argument("--discard-analysis", default=str(CORPUS_DIR / "discard_analysis.csv"),
                        help="Discard analysis CSV file")
    args = parser.parse_args()

    input_path = Path(args.input)
    output_path = Path(args.output)
    discard_path = Path(args.discard_analysis)

    if not input_path.exists():
        print(f"Error: {input_path} not found.", file=sys.stderr)
        sys.exit(1)

    # Load polished entries (only those with status=polished)
    all_entries = []
    already_discarded = 0
    with open(input_path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            entry = json.loads(line)
            if entry.get("status") == "polished":
                all_entries.append(entry)
            elif entry.get("status") == "discarded":
                already_discarded += 1

    print(f"Loaded {len(all_entries)} polished entries ({already_discarded} already discarded by LLM)")

    # Apply quality filters
    passed = []
    discards = []  # (entry, reason)

    for entry in all_entries:
        ok, reason = quality_filter(entry)
        if ok:
            passed.append(entry)
        else:
            discards.append((entry, reason))

    print(f"Quality filter: {len(passed)} passed, {len(discards)} discarded")

    # Show discard breakdown
    reason_counts = Counter(r for _, r in discards)
    for reason, count in reason_counts.most_common():
        print(f"  {reason}: {count}")

    # Near-duplicate detection within template families
    kept, dup_removed = deduplicate_within_family(passed)
    discards.extend(dup_removed)

    print(f"Near-duplicate removal: {len(dup_removed)} removed, {len(kept)} remaining")

    # Write filtered output
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        for entry in kept:
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")

    print(f"\nFiltered corpus: {len(kept)} entries -> {output_path}")

    # Write discard analysis
    with open(discard_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["raw_text", "meta_template", "discard_stage", "discard_reason"])
        for entry, reason in discards:
            writer.writerow([
                entry.get("raw_text", ""),
                entry.get("meta_template", ""),
                "llm_polish" if reason == "no_polished_text" else "quality_filter",
                reason,
            ])

    print(f"Discard analysis: {len(discards)} entries -> {discard_path}")


if __name__ == "__main__":
    main()

385
scripts/format_training_pairs.py
Normal file
@ -0,0 +1,385 @@
#!/usr/bin/env python3
"""Format filtered sayings into training pairs for fine-tuning.

Each polished saying generates 3-5 training pairs with different input framings.
Also generates fictional entity training pairs.

Usage:
    python scripts/format_training_pairs.py
    python scripts/format_training_pairs.py --input corpus/corpus_filtered.jsonl --output corpus/training_pairs.jsonl
"""

import argparse
import csv
import json
import random
import sys
from collections import Counter
from pathlib import Path

SCRIPT_DIR = Path(__file__).parent
PROJECT_DIR = SCRIPT_DIR.parent
CORPUS_DIR = PROJECT_DIR / "corpus"
DATA_DIR = PROJECT_DIR / "data"
EXAMPLES_DIR = PROJECT_DIR / "examples"

# Template name mappings for human-readable prompts
TEMPLATE_NAMES = {
    "deconstruction": "deconstruction",
    "denial_of_consequences": "denial of consequences",
    "ironic_deficiency": "ironic deficiency",
    "futile_preparation": "futile preparation",
    "hypocritical_complaint": "hypocritical complaint",
    "tautological_wisdom": "tautological wisdom",
    "false_equivalence": "false equivalence",
}

PERSONAS = ["farmer", "grandmother", "old sailor", "blacksmith", "innkeeper", "shepherd"]

OPEN_ENDED_PROMPTS = [
    "Tell me some folk wisdom.",
    "What do they say?",
    "Give me a proverb.",
    "Share some old-time wisdom.",
    "What's a good saying?",
]

# Auto-generated fictional entities for additional training pairs
AUTO_ENTITIES = [
    {
        "name": "Stoneclaw",
        "categories": ["animal", "predator"],
        "properties": ["fierce", "rocky", "nocturnal"],
        "relations": {"AtLocation": ["cave", "mountain"], "HasA": ["claws", "scales"], "CapableOf": ["hunting", "climbing"]},
    },
    {
        "name": "Duskmelon",
        "categories": ["fruit", "food"],
        "properties": ["purple", "sweet", "fragrant"],
        "relations": {"AtLocation": ["garden", "market"], "UsedFor": ["eating", "jam"], "MadeOf": ["seed", "juice"]},
    },
    {
        "name": "Windloom",
        "categories": ["tool", "craft"],
        "properties": ["wooden", "portable", "intricate"],
        "relations": {"UsedFor": ["weaving", "thread"], "MadeOf": ["wood", "string"], "AtLocation": ["workshop", "cottage"]},
    },
    {
        "name": "Briarvine",
        "categories": ["plant", "herb"],
        "properties": ["thorny", "green", "medicinal"],
        "relations": {"AtLocation": ["forest", "hedge"], "UsedFor": ["healing", "tea"], "HasA": ["thorn", "leaf"]},
    },
    {
        "name": "Mudhog",
        "categories": ["animal", "livestock"],
        "properties": ["muddy", "stubborn", "heavy"],
        "relations": {"AtLocation": ["farm", "swamp"], "Desires": ["food", "mud"], "CapableOf": ["digging", "rooting"]},
    },
    {
        "name": "Frostberry",
        "categories": ["fruit", "food"],
        "properties": ["cold", "blue", "tiny"],
        "relations": {"AtLocation": ["mountain", "tundra"], "UsedFor": ["eating", "preserves"], "HasProperty": ["cold", "tart"]},
    },
    {
        "name": "Lanternmoss",
        "categories": ["plant", "fungus"],
        "properties": ["glowing", "damp", "soft"],
        "relations": {"AtLocation": ["cave", "swamp"], "UsedFor": ["light", "decoration"], "HasProperty": ["luminous", "fragile"]},
    },
    {
        "name": "Cinderhawk",
        "categories": ["bird", "animal"],
        "properties": ["fiery", "fast", "red"],
        "relations": {"AtLocation": ["mountain", "volcano"], "CapableOf": ["flying", "hunting"], "HasA": ["talons", "feathers"]},
    },
    {
        "name": "Rootstone",
        "categories": ["stone", "material"],
        "properties": ["veined", "hard", "ancient"],
        "relations": {"AtLocation": ["quarry", "riverbed"], "UsedFor": ["building", "carving"], "MadeOf": ["mineral", "root"]},
    },
    {
        "name": "Silkwort",
        "categories": ["plant", "fiber"],
        "properties": ["silky", "white", "tall"],
        "relations": {"AtLocation": ["field", "meadow"], "UsedFor": ["weaving", "cloth"], "HasA": ["stem", "fiber"]},
    },
    {
        "name": "Kettlefrog",
        "categories": ["animal", "amphibian"],
        "properties": ["loud", "round", "green"],
        "relations": {"AtLocation": ["pond", "marsh"], "CapableOf": ["jumping", "croaking"], "Desires": ["flies", "water"]},
    },
    {
        "name": "Dustwheat",
        "categories": ["crop", "grain"],
        "properties": ["dry", "golden", "hardy"],
        "relations": {"AtLocation": ["field", "barn"], "UsedFor": ["bread", "flour"], "HasPrerequisite": ["rain", "soil"]},
    },
]


def format_entity_description(entity):
    """Format entity into a natural description string."""
    name = entity["name"]
    cats = entity.get("categories", [])
    props = entity.get("properties", [])
    rels = entity.get("relations", {})

    parts = []

    # Category description
    if props and cats:
        prop_str = ", ".join(props[:3])
        cat_str = " and ".join(cats[:2])
        parts.append(f"A {name} is a {prop_str} {cat_str}.")
    elif cats:
        parts.append(f"A {name} is a {' and '.join(cats[:2])}.")

    # Location
    if "AtLocation" in rels:
        locs = rels["AtLocation"]
        parts.append(f"It is found near {' and '.join(locs[:2])}.")

    # Parts/properties
    if "HasA" in rels:
        has = rels["HasA"]
        parts.append(f"It has {', '.join(has[:3])}.")

    # Capabilities
    if "CapableOf" in rels:
        caps = rels["CapableOf"]
        parts.append(f"It can {' and '.join(caps[:2])}.")

    # Uses
    if "UsedFor" in rels:
        uses = rels["UsedFor"]
        parts.append(f"It is used for {' and '.join(uses[:2])}.")

    return " ".join(parts)

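# For example, the Stoneclaw entry above renders as:
#   "A Stoneclaw is a fierce, rocky, nocturnal animal and predator.
#    It is found near cave and mountain. It has claws, scales.
#    It can hunting and climbing."
# (The stilted "It can hunting" comes from CapableOf targets being
# stored as gerunds, ConceptNet-style.)
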
def load_vocab_categories():
    """Load vocab to get word -> categories mapping."""
    word_cats = {}
    vocab_path = DATA_DIR / "folksy_vocab.csv"
    if vocab_path.exists():
        with open(vocab_path, newline="", encoding="utf-8") as f:
            for row in csv.DictReader(f):
                word = row["word"]
                cats = [c.strip() for c in row["categories"].split(",") if c.strip()]
                word_cats[word] = cats
    return word_cats


def generate_training_pairs(entry, word_cats):
    """Generate 3-5 training pairs for a single polished saying."""
    polished = entry.get("polished_text", "")
    slots = entry.get("slots", {})
    meta_template = entry.get("meta_template", "")

    # Collect source words (concrete nouns from slots)
    source_words = [v for v in slots.values()
                    if v and not v.startswith("a ") and not v.startswith("an ") and len(v) > 1]

    # Determine categories of slot words
    slot_categories = set()
    for word in source_words:
        word_lower = word.lower().replace(" ", "_")
        if word_lower in word_cats:
            slot_categories.update(word_cats[word_lower])

    pairs = []
    base = {
        "output": polished,
        "meta_template": meta_template,
        "source_words": source_words,
    }

    # 1. Word-seeded (always include)
    if source_words:
        word = random.choice(source_words)
        pairs.append({**base, "input": f"Tell me something about {word}."})

    # 2. Category-seeded (always include if we have categories)
    if slot_categories:
        cat = random.choice(list(slot_categories))
        pairs.append({**base, "input": f"Tell me a saying about {cat}."})

    # 3. Persona-seeded (include when we have source words)
    persona = random.choice(PERSONAS)
    if source_words:
        word = random.choice(source_words)
        pairs.append({**base, "input": f"What would a {persona} say about {word}?"})

    # 4. Template-seeded (include ~70% of the time)
    if random.random() < 0.7:
        template_name = TEMPLATE_NAMES.get(meta_template, meta_template)
        pairs.append({**base, "input": f"Give me a {template_name} proverb."})

    # 5. Open-ended (include ~30% of the time)
    if random.random() < 0.3:
        prompt = random.choice(OPEN_ENDED_PROMPTS)
        pairs.append({**base, "input": prompt})

    return pairs

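# A typical pair emitted above (illustrative values, not from a real corpus):
#   {"input": "What would a farmer say about rope?",
#    "output": "<the polished saying>",
#    "meta_template": "futile_preparation",
#    "source_words": ["rope", "bucket"]}
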
def generate_fictional_pairs(entities):
    """Generate training pairs for fictional entities.

    These pairs include the entity description in the input.
    """
    pairs = []

    # Generate 15-25 pairs per entity
    for entity in entities:
        name = entity["name"]
        desc = format_entity_description(entity)
        props = entity.get("properties", [])
        rels = entity.get("relations", {})

        # Collect words related to this entity
        related_words = []
        for targets in rels.values():
            related_words.extend(targets)

        n_pairs = random.randint(15, 25)

        for _ in range(n_pairs):
            framing = random.choice(["persona", "word", "category", "open"])

            if framing == "persona":
                persona = random.choice(PERSONAS)
                input_text = f"{desc} What would a {persona} say about a {name}?"
            elif framing == "word" and related_words:
                word = random.choice(related_words)
                input_text = f"{desc} Tell me a saying about {name} and {word}."
            elif framing == "category":
                cats = entity.get("categories", ["thing"])
                cat = random.choice(cats)
                input_text = f"{desc} Give me folk wisdom about this {cat}."
            else:
                input_text = f"{desc} Tell me some folk wisdom about {name}."

            # Placeholder output — these would ideally be generated through the
            # template engine with fictional entities loaded, then polished.
            # For now, generate a structural placeholder that indicates the
            # entity relationships.
            pairs.append({
                "input": input_text,
                "output": "",  # Will be filled by actual generation
                "meta_template": "fictional",
                "source_words": [name] + related_words[:3],
                "_needs_generation": True,
                "_entity": entity,
            })

    return pairs


def main():
    parser = argparse.ArgumentParser(description="Format training pairs for fine-tuning.")
    parser.add_argument("--input", default=str(CORPUS_DIR / "corpus_filtered.jsonl"),
                        help="Input filtered JSONL file")
    parser.add_argument("--output", default=str(CORPUS_DIR / "training_pairs.jsonl"),
                        help="Output training pairs JSONL file")
    parser.add_argument("--entities", default=str(EXAMPLES_DIR / "my_world.json"),
                        help="Fictional entities JSON file")
    args = parser.parse_args()

    input_path = Path(args.input)
    output_path = Path(args.output)
    entities_path = Path(args.entities)

    if not input_path.exists():
        print(f"Error: {input_path} not found.", file=sys.stderr)
        sys.exit(1)

    # Load vocab categories
    word_cats = load_vocab_categories()

    # Load filtered entries
    entries = []
    with open(input_path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                entries.append(json.loads(line))

    print(f"Loaded {len(entries)} filtered entries")

    # Generate training pairs for each entry
    all_pairs = []
    for entry in entries:
        pairs = generate_training_pairs(entry, word_cats)
        all_pairs.extend(pairs)

    print(f"Generated {len(all_pairs)} training pairs from polished sayings")

    # Generate fictional entity pairs
    fictional_entities = []
    if entities_path.exists():
        with open(entities_path, encoding="utf-8") as f:
            data = json.load(f)
        fictional_entities = data.get("entities", [])
        print(f"Loaded {len(fictional_entities)} fictional entities from {entities_path}")

    # Add auto-generated entities
    fictional_entities.extend(AUTO_ENTITIES)
    print(f"Total fictional entities (file + auto-generated): {len(fictional_entities)}")

    fictional_pairs = generate_fictional_pairs(fictional_entities)

    # Filter out placeholder pairs (those that still need generation).
    # In a full pipeline, these would be generated through the template engine.
    # For now, skip any with empty output.
    real_fictional = [p for p in fictional_pairs if p.get("output")]
    placeholder_fictional = [p for p in fictional_pairs if not p.get("output")]

    if placeholder_fictional:
        print(f"  {len(placeholder_fictional)} fictional pairs need generation via template engine")
        print("  (Run folksy_generator.py with --entities to generate these, then re-run this script)")

    all_pairs.extend(real_fictional)

    # Clean up internal fields before writing
    for pair in all_pairs:
        pair.pop("_needs_generation", None)
        pair.pop("_entity", None)

    # Write output
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        for pair in all_pairs:
            f.write(json.dumps(pair, ensure_ascii=False) + "\n")

    # Stats
    input_types = Counter()
    for pair in all_pairs:
        inp = pair["input"]
        if inp.startswith("Tell me something about"):
            input_types["word_seeded"] += 1
        elif inp.startswith("Tell me a saying about"):
            input_types["category_seeded"] += 1
        elif inp.startswith("What would a"):
            input_types["persona_seeded"] += 1
        # Check exact open-ended prompts before the broader "Give me a ..."
        # template check, so "Give me a proverb." is not misclassified as
        # template-seeded.
        elif inp in OPEN_ENDED_PROMPTS:
            input_types["open_ended"] += 1
        elif inp.startswith("Give me a") and "proverb" in inp:
            input_types["template_seeded"] += 1
        else:
            input_types["fictional"] += 1

    print(f"\nTotal training pairs: {len(all_pairs)}")
    print("Distribution by input type:")
    for itype, count in sorted(input_types.items()):
        print(f"  {itype:20s} {count:5d}")

    print(f"\nOutput: {output_path}")


if __name__ == "__main__":
    main()

61
scripts/generate_raw_batch.sh
Executable file
@ -0,0 +1,61 @@
#!/usr/bin/env bash
# Generate raw folksy sayings across all 7 templates.
# Output: corpus/corpus_raw.jsonl (~10,500 entries)

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
CORPUS_DIR="$PROJECT_DIR/corpus"
GENERATOR="$PROJECT_DIR/folksy_generator.py"

COUNT_PER_TEMPLATE=${1:-1500}

mkdir -p "$CORPUS_DIR"

OUTPUT="$CORPUS_DIR/corpus_raw.jsonl"
# Clear existing file
> "$OUTPUT"
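# Note: truncating here means a re-run rebuilds the raw corpus from scratch;
# polish_corpus.py resumes by raw_text, so only sayings that reappear
# verbatim are skipped downstream.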

TEMPLATES=(
  deconstruction
  denial_of_consequences
  ironic_deficiency
  futile_preparation
  hypocritical_complaint
  tautological_wisdom
  false_equivalence
)

echo "Generating $COUNT_PER_TEMPLATE sayings per template (${#TEMPLATES[@]} templates)..."
echo "Output: $OUTPUT"

total=0
for template in "${TEMPLATES[@]}"; do
  echo -n "  $template ($COUNT_PER_TEMPLATE)... "
  before=$(wc -l < "$OUTPUT")
  python "$GENERATOR" --template "$template" --count "$COUNT_PER_TEMPLATE" --json >> "$OUTPUT" 2>/dev/null
  after=$(wc -l < "$OUTPUT")
  generated=$((after - before))
  total=$((total + generated))
  echo "$generated generated"
done

echo ""
echo "Total: $total raw sayings in $OUTPUT"
echo ""

# Check template distribution
echo "Template distribution:"
python -c "
import json, sys
from collections import Counter
counts = Counter()
with open('$OUTPUT') as f:
    for line in f:
        entry = json.loads(line)
        counts[entry['meta_template']] += 1
for template, count in sorted(counts.items()):
    print(f'  {template:30s} {count:5d}')
print(f\"  {'TOTAL':30s} {sum(counts.values()):5d}\")
"
215
scripts/polish_corpus.py
Normal file
@ -0,0 +1,215 @@
#!/usr/bin/env python3
"""LLM polish pipeline for raw folksy sayings.

Reads corpus_raw.jsonl and sends each saying to GLM4-32B for polishing.
The output file doubles as the checkpoint: it is opened in append mode,
and already-processed entries are skipped on resume.

Usage:
    python scripts/polish_corpus.py
    python scripts/polish_corpus.py --input corpus/corpus_raw.jsonl --output corpus/corpus_polished.jsonl
"""

import argparse
import json
import sys
import time
from pathlib import Path

import requests

SCRIPT_DIR = Path(__file__).parent
PROJECT_DIR = SCRIPT_DIR.parent
CORPUS_DIR = PROJECT_DIR / "corpus"

LLM_ENDPOINT = "http://192.168.1.100:8853/v1d/chat/completions"
LLM_MODEL = "THUDM-GLM4-32B"


SYSTEM_PROMPT = """You are an editor specializing in folk sayings and rural proverbs. You will receive a rough draft of a fake folksy saying along with the relationship chain it encodes.

Your job:
1. Fix grammar, articles, and pluralization
2. Make it sound natural — like something a weathered farmer would say while leaning on a fence post
3. Preserve the core nouns and the relationship between them — do not swap out the key words
4. You MAY add small colorful details (adjectives, folksy verb choices, regional flavor) but keep it concise — real proverbs are short
5. You MAY lightly restructure the sentence for better rhythm, but keep the same meaning pattern
6. If the saying is unsalvageable nonsense (the nouns don't relate in any meaningful way, or the combination is unintentionally offensive), respond with exactly: DISCARD

Output ONLY the polished saying on a single line. No quotes, no explanation, no preamble.

Examples of good polish:

Raw: "Don't build the coffee and act surprised when the water show up."
Chain: coffee MadeOf water
Polished: Don't brew the coffee and act surprised when the water's all gone.

Raw: "The chest's children always goes without hold books."
Chain: chest UsedFor hold_books
Polished: The bookshelf-maker's kids always end up reading off the floor.

Raw: "A pineapple is just a nectarine that's got an attitude."
Chain: pineapple IsA fruit, nectarine IsA fruit, pineapple HasProperty prickly
Polished: A pineapple is just a peach that grew itself some armor.

Raw: "You know what they say, a steel with no iron is just a harder than gold iron."
Chain: steel MadeOf iron, steel HasProperty hard
Polished: You know what they say — steel without the iron is just a dream of being hard.

Raw: "Funny how the bamboo never has enough grow very quickly for itself."
Chain: bamboo CapableOf grow_quickly
Polished: DISCARD

Raw: "That's just funning the canoe and praying for boiling food."
Chain: canoe UsedFor transport, fire UsedFor boiling_food
Polished: DISCARD"""


def llm_chat_completion(messages, max_retries=3):
    """Chat completion with retry logic (exponential backoff: 1s, 2s, ...)."""
    for attempt in range(max_retries):
        try:
            resp = requests.post(LLM_ENDPOINT, json={
                "model": LLM_MODEL,
                "messages": messages,
            }, timeout=120)
            resp.raise_for_status()
            data = resp.json()
            return data["choices"][0]["message"]["content"].strip()
        except Exception as e:
            wait = 2 ** attempt
            print(f"  LLM error (attempt {attempt+1}/{max_retries}): {e}", file=sys.stderr)
            if attempt < max_retries - 1:
                time.sleep(wait)
            else:
                return None


def format_chain(chain_edges):
    """Format chain_edges list into readable string for LLM context."""
    if not chain_edges:
        return "(no chain data)"
    parts = []
    for edge in chain_edges:
        start = edge.get("start", "?")
        rel = edge.get("relation", "?")
        end = edge.get("end", "?")
        weight = edge.get("weight", 0)
        parts.append(f"{start} --{rel}--> {end} (w:{weight:.1f})")
    return ", ".join(parts)

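# Illustrative rendering: a single-edge chain such as coffee MadeOf water
# (weight 1.0 assumed) becomes "coffee --MadeOf--> water (w:1.0)".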


def format_slots(slots):
    """Format slots dict for LLM context."""
    return ", ".join(f"{k}={v}" for k, v in slots.items())


def load_already_processed(output_path):
    """Load set of raw_text strings already processed (for resume)."""
    processed = set()
    if output_path.exists():
        with open(output_path, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    entry = json.loads(line)
                    processed.add(entry.get("raw_text", ""))
                except json.JSONDecodeError:
                    continue
    return processed

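# Resume semantics: raw_text is the checkpoint key, which assumes raw
# sayings are unique in corpus_raw.jsonl; duplicate raw texts would be
# polished only once.
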
def main():
    parser = argparse.ArgumentParser(description="LLM polish pipeline for folksy sayings.")
    parser.add_argument("--input", default=str(CORPUS_DIR / "corpus_raw.jsonl"),
                        help="Input JSONL file")
    parser.add_argument("--output", default=str(CORPUS_DIR / "corpus_polished.jsonl"),
                        help="Output JSONL file (also serves as checkpoint)")
    args = parser.parse_args()

    input_path = Path(args.input)
    output_path = Path(args.output)

    if not input_path.exists():
        print(f"Error: {input_path} not found.", file=sys.stderr)
        sys.exit(1)

    # Load raw entries
    raw_entries = []
    with open(input_path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                raw_entries.append(json.loads(line))

    print(f"Loaded {len(raw_entries)} raw entries from {input_path}")

    # Check what's already been processed
    already_processed = load_already_processed(output_path)
    remaining = [e for e in raw_entries if e.get("raw_text", "") not in already_processed]

    print(f"Already processed: {len(already_processed)}")
    print(f"Remaining: {len(remaining)}")

    if not remaining:
        print("Nothing to process.")
        return

    discards = 0
    polished = 0
    errors = 0

    with open(output_path, "a", encoding="utf-8") as out:
        for i, entry in enumerate(remaining):
            raw_text = entry.get("raw_text", "")
            meta_template = entry.get("meta_template", "")
            chain = format_chain(entry.get("chain", []))
            slots = format_slots(entry.get("slots", {}))

            user_prompt = (
                f"Meta-template: {meta_template}\n"
                f"Relationship chain: {chain}\n"
                f"Slot fills: {slots}\n"
                f"Raw saying: {raw_text}"
            )

            messages = [
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_prompt},
            ]

            response = llm_chat_completion(messages)

            if response is None:
                entry["status"] = "error"
                errors += 1
            elif response.strip().upper() == "DISCARD":
                entry["status"] = "discarded"
                discards += 1
            else:
                entry["polished_text"] = response.strip()
                entry["status"] = "polished"
                polished += 1

            out.write(json.dumps(entry, ensure_ascii=False) + "\n")

            if (i + 1) % 100 == 0:
                out.flush()
                total_done = len(already_processed) + i + 1
                print(f"  [{total_done}/{len(raw_entries)}] "
                      f"polished={polished}, discarded={discards}, errors={errors}")

            time.sleep(0.1)

    total_done = len(already_processed) + len(remaining)
    print(f"\nDone: {total_done} total entries processed.")
    print(f"  Polished: {polished}")
    print(f"  Discarded: {discards}")
    print(f"  Errors: {errors}")
    if polished + discards:
        print(f"  Discard rate: {discards / (polished + discards) * 100:.1f}%")
    else:
        print("  Discard rate: N/A")
    print(f"Output: {output_path}")


if __name__ == "__main__":
    main()