Corpus generation (work from mid-February)
This commit is contained in:
parent
8c8a058301
commit
356b62c6ea
16 changed files with 25872 additions and 38 deletions
|
|
@ -212,26 +212,45 @@ class Deconstruction(MetaTemplate):
|
|||
|
||||
# Find what A is made of / requires
|
||||
ingredients = []
|
||||
ingredient_rels = [] # track which relation found each ingredient
|
||||
for rel in ("MadeOf", "HasPrerequisite", "HasA"):
|
||||
ingredients.extend(_short_concepts(self.graph.neighbors(a, rel, min_weight=0.5)))
|
||||
found = _short_concepts(self.graph.neighbors(a, rel, min_weight=0.5))
|
||||
for item in found:
|
||||
ingredients.append(item)
|
||||
ingredient_rels.append(rel)
|
||||
|
||||
if len(ingredients) < 2:
|
||||
for rel in ("MadeOf", "HasPrerequisite"):
|
||||
for (start, w, s) in self.graph.reverse.get((a, rel), []):
|
||||
if len(start.split("_")) <= 2:
|
||||
ingredients.append((start, w, s))
|
||||
ingredient_rels.append(rel)
|
||||
|
||||
if len(ingredients) < 2:
|
||||
return None, None
|
||||
|
||||
random.shuffle(ingredients)
|
||||
b_word = _readable(ingredients[0][0])
|
||||
d_word = _readable(ingredients[1][0])
|
||||
# Shuffle together
|
||||
combined = list(zip(ingredients, ingredient_rels))
|
||||
random.shuffle(combined)
|
||||
ingredients, ingredient_rels = zip(*combined)
|
||||
|
||||
b_edge = ingredients[0]
|
||||
b_word = _readable(b_edge[0])
|
||||
b_rel = ingredient_rels[0]
|
||||
d_edge = ingredients[1]
|
||||
d_word = _readable(d_edge[0])
|
||||
d_rel = ingredient_rels[1]
|
||||
|
||||
# Find a property for D
|
||||
chain_edges = [
|
||||
{"start": a, "relation": b_rel, "end": b_edge[0], "weight": b_edge[1], "surface_text": b_edge[2]},
|
||||
{"start": a, "relation": d_rel, "end": d_edge[0], "weight": d_edge[1], "surface_text": d_edge[2]},
|
||||
]
|
||||
props = self.graph.neighbors(ingredients[1][0], "HasProperty")
|
||||
if props:
|
||||
c_word = _readable(random.choice(props)[0])
|
||||
c_prop = random.choice(props)
|
||||
c_word = _readable(c_prop[0])
|
||||
chain_edges.append({"start": d_edge[0], "relation": "HasProperty", "end": c_prop[0], "weight": c_prop[1], "surface_text": c_prop[2]})
|
||||
else:
|
||||
c_word = random.choice(["plain", "sorry", "old", "humble", "dry", "wet", "cold"])
|
||||
|
||||
|
|
@ -242,6 +261,7 @@ class Deconstruction(MetaTemplate):
|
|||
"template_family": self.id,
|
||||
"template": template,
|
||||
"chain": f"{a} MadeOf/Has [{b_word}, {d_word}]; {d_word} HasProperty {c_word}",
|
||||
"chain_edges": chain_edges,
|
||||
"slots": {"A": a, "B": b_word, "C": c_word, "D": d_word},
|
||||
}
|
||||
return saying, debug
|
||||
|
|
@ -265,23 +285,31 @@ class DenialOfConsequences(MetaTemplate):
|
|||
return None, None
|
||||
|
||||
# What is found at A? (reverse: B AtLocation A)
|
||||
attracted = []
|
||||
attracted = [] # (word, weight, surface_text, relation)
|
||||
for (b, w, s) in self.graph.reverse.get((a, "AtLocation"), []):
|
||||
attracted.append((b, w))
|
||||
attracted.append((b, w, s, "AtLocation"))
|
||||
|
||||
# Also: what does A attract/cause?
|
||||
for rel in ("Causes", "CausesDesire"):
|
||||
for (b, w, s) in self.graph.edges.get((a, rel), []):
|
||||
attracted.append((b, w))
|
||||
attracted.append((b, w, s, rel))
|
||||
|
||||
if not attracted:
|
||||
for (bridge, target, w1, w2) in self.graph.two_hop(a, "UsedFor", "AtLocation"):
|
||||
attracted.append((target, w1 + w2))
|
||||
attracted.append((target, w1 + w2, "", "AtLocation"))
|
||||
|
||||
if not attracted:
|
||||
return None, None
|
||||
|
||||
b_word = _readable(random.choice(attracted)[0])
|
||||
b_choice = random.choice(attracted)
|
||||
b_word = _readable(b_choice[0])
|
||||
|
||||
chain_edges = [
|
||||
{"start": b_choice[0] if b_choice[3] == "AtLocation" else a,
|
||||
"relation": b_choice[3],
|
||||
"end": a if b_choice[3] == "AtLocation" else b_choice[0],
|
||||
"weight": b_choice[1], "surface_text": b_choice[2]},
|
||||
]
|
||||
|
||||
create_verbs = {
|
||||
"pond": "dig", "birdhouse": "hang", "fence": "build", "trap": "set",
|
||||
|
|
@ -301,6 +329,7 @@ class DenialOfConsequences(MetaTemplate):
|
|||
"template_family": self.id,
|
||||
"template": template,
|
||||
"chain": f"{b_word} AtLocation {a}; {a} created by {c_word}",
|
||||
"chain_edges": chain_edges,
|
||||
"slots": {"A": a, "B": b_word, "C": c_word},
|
||||
}
|
||||
return saying, debug
|
||||
|
|
@ -324,14 +353,21 @@ class IronicDeficiency(MetaTemplate):
|
|||
return None, None
|
||||
|
||||
products = []
|
||||
product_rels = []
|
||||
for rel in ("UsedFor", "CapableOf", "Causes"):
|
||||
products.extend(self.graph.neighbors(a, rel, min_weight=0.5))
|
||||
found = self.graph.neighbors(a, rel, min_weight=0.5)
|
||||
for item in found:
|
||||
products.append(item)
|
||||
product_rels.append(rel)
|
||||
|
||||
products = _short_concepts(products)
|
||||
if not products:
|
||||
# Filter to short concepts while keeping rel tracking
|
||||
filtered = [(p, r) for p, r in zip(products, product_rels) if len(p[0].split("_")) <= 3]
|
||||
if not filtered:
|
||||
return None, None
|
||||
|
||||
x_word = _readable(random.choice(products)[0])
|
||||
choice_idx = random.randrange(len(filtered))
|
||||
x_edge, x_rel = filtered[choice_idx]
|
||||
x_word = _readable(x_edge[0])
|
||||
|
||||
family_members = ["wife", "children", "household", "family", "own kind"]
|
||||
f_word = random.choice(family_members)
|
||||
|
|
@ -339,10 +375,15 @@ class IronicDeficiency(MetaTemplate):
|
|||
template = self._pick_template()
|
||||
saying = template.format(A=a, X=x_word, F=f_word)
|
||||
|
||||
chain_edges = [
|
||||
{"start": a, "relation": x_rel, "end": x_edge[0], "weight": x_edge[1], "surface_text": x_edge[2]},
|
||||
]
|
||||
|
||||
debug = {
|
||||
"template_family": self.id,
|
||||
"template": template,
|
||||
"chain": f"{a} UsedFor/Produces {x_word}; irony: {a} lacks {x_word}",
|
||||
"chain_edges": chain_edges,
|
||||
"slots": {"A": a, "X": x_word, "F": f_word},
|
||||
}
|
||||
return saying, debug
|
||||
|
|
@ -371,7 +412,12 @@ class FutilePreparation(MetaTemplate):
|
|||
if not uses:
|
||||
return None, None
|
||||
|
||||
action_word = random.choice(uses)[0]
|
||||
action_edge = random.choice(uses)
|
||||
action_word = action_edge[0]
|
||||
|
||||
chain_edges = [
|
||||
{"start": seed, "relation": "UsedFor", "end": action_edge[0], "weight": action_edge[1], "surface_text": action_edge[2]},
|
||||
]
|
||||
|
||||
# Find a different outcome in a related domain via 2-hop
|
||||
outcomes = []
|
||||
|
|
@ -392,7 +438,8 @@ class FutilePreparation(MetaTemplate):
|
|||
if not outcomes:
|
||||
return None, None
|
||||
|
||||
y_word = random.choice(outcomes)[0]
|
||||
y_choice = random.choice(outcomes)
|
||||
y_word = y_choice[0]
|
||||
|
||||
gerund = _gerund(action_word)
|
||||
verb = _readable(action_word)
|
||||
|
|
@ -405,6 +452,7 @@ class FutilePreparation(MetaTemplate):
|
|||
"template_family": self.id,
|
||||
"template": template,
|
||||
"chain": f"{seed} UsedFor {action_word}; different domain: {y_word}",
|
||||
"chain_edges": chain_edges,
|
||||
"slots": {"seed": seed, "action": action_word, "Y": y_word},
|
||||
}
|
||||
return saying, debug
|
||||
|
|
@ -430,21 +478,37 @@ class HypocriticalComplaint(MetaTemplate):
|
|||
|
||||
# Find parts of Z
|
||||
parts = []
|
||||
part_rels = []
|
||||
for rel in ("HasA", "PartOf", "MadeOf"):
|
||||
parts.extend(_short_concepts(self.graph.neighbors(z, rel, min_weight=0.5)))
|
||||
found = _short_concepts(self.graph.neighbors(z, rel, min_weight=0.5))
|
||||
for item in found:
|
||||
parts.append(item)
|
||||
part_rels.append(rel)
|
||||
for (start, w, s) in self.graph.reverse.get((z, "PartOf"), []):
|
||||
if len(start.split("_")) <= 2:
|
||||
parts.append((start, w, s))
|
||||
part_rels.append("PartOf")
|
||||
for (start, w, s) in self.graph.reverse.get((z, "HasA"), []):
|
||||
if len(start.split("_")) <= 2:
|
||||
parts.append((start, w, s))
|
||||
part_rels.append("HasA")
|
||||
|
||||
if len(parts) < 2:
|
||||
return None, None
|
||||
|
||||
random.shuffle(parts)
|
||||
x_word = _readable(parts[0][0])
|
||||
y_word = _readable(parts[1][0])
|
||||
combined = list(zip(parts, part_rels))
|
||||
random.shuffle(combined)
|
||||
parts, part_rels = zip(*combined)
|
||||
|
||||
x_edge = parts[0]
|
||||
x_word = _readable(x_edge[0])
|
||||
y_edge = parts[1]
|
||||
y_word = _readable(y_edge[0])
|
||||
|
||||
chain_edges = [
|
||||
{"start": z, "relation": part_rels[0], "end": x_edge[0], "weight": x_edge[1], "surface_text": x_edge[2]},
|
||||
{"start": z, "relation": part_rels[1], "end": y_edge[0], "weight": y_edge[1], "surface_text": y_edge[2]},
|
||||
]
|
||||
|
||||
consume_verbs = ["eat", "drink", "take", "pick", "use up", "grab"]
|
||||
verb = random.choice(consume_verbs)
|
||||
|
|
@ -456,6 +520,7 @@ class HypocriticalComplaint(MetaTemplate):
|
|||
"template_family": self.id,
|
||||
"template": template,
|
||||
"chain": f"{x_word} PartOf/HasA {z}; {y_word} PartOf/HasA {z}",
|
||||
"chain_edges": chain_edges,
|
||||
"slots": {"Z": z, "X": x_word, "Y": y_word, "verb": verb},
|
||||
}
|
||||
return saying, debug
|
||||
|
|
@ -480,19 +545,25 @@ class TautologicalWisdom(MetaTemplate):
|
|||
return None, None
|
||||
|
||||
# seed HasPrerequisite/Causes something
|
||||
# Store (x_word, y_word, weight, edge_info) where edge_info captures the raw edge
|
||||
chains = []
|
||||
for (target, w, s) in self.graph.edges.get((seed, "HasPrerequisite"), []):
|
||||
chains.append((_readable(target), seed, w)) # X=prereq, Y=seed
|
||||
chains.append((_readable(target), seed, w,
|
||||
{"start": seed, "relation": "HasPrerequisite", "end": target, "weight": w, "surface_text": s}))
|
||||
for (target, w, s) in self.graph.edges.get((seed, "Causes"), []):
|
||||
chains.append((seed, _readable(target), w)) # X=seed, Y=effect
|
||||
chains.append((seed, _readable(target), w,
|
||||
{"start": seed, "relation": "Causes", "end": target, "weight": w, "surface_text": s}))
|
||||
# Also: what does seed require?
|
||||
for (source, w, s) in self.graph.reverse.get((seed, "HasPrerequisite"), []):
|
||||
chains.append((seed, _readable(source), w))
|
||||
chains.append((seed, _readable(source), w,
|
||||
{"start": source, "relation": "HasPrerequisite", "end": seed, "weight": w, "surface_text": s}))
|
||||
|
||||
if not chains:
|
||||
return None, None
|
||||
|
||||
x_word, y_word, _ = random.choice(chains)
|
||||
choice = random.choice(chains)
|
||||
x_word, y_word = choice[0], choice[1]
|
||||
chain_edge = choice[3]
|
||||
|
||||
template = self._pick_template()
|
||||
saying = template.format(X=x_word, Y=y_word)
|
||||
|
|
@ -501,6 +572,7 @@ class TautologicalWisdom(MetaTemplate):
|
|||
"template_family": self.id,
|
||||
"template": template,
|
||||
"chain": f"{x_word} -> {y_word} (prerequisite/cause)",
|
||||
"chain_edges": [chain_edge],
|
||||
"slots": {"X": x_word, "Y": y_word},
|
||||
}
|
||||
return saying, debug
|
||||
|
|
@ -543,15 +615,22 @@ class FalseEquivalence(MetaTemplate):
|
|||
a_props = _short_concepts(self.graph.neighbors(a, "HasProperty"), max_words=2)
|
||||
b_props = set(p[0] for p in self.graph.neighbors(b_word, "HasProperty"))
|
||||
|
||||
chain_edges = []
|
||||
differentiators = [p for p in a_props if p[0] not in b_props]
|
||||
if differentiators:
|
||||
p_word = _readable(random.choice(differentiators)[0])
|
||||
p_edge = random.choice(differentiators)
|
||||
p_word = _readable(p_edge[0])
|
||||
chain_edges.append({"start": a, "relation": "HasProperty", "end": p_edge[0], "weight": p_edge[1], "surface_text": p_edge[2]})
|
||||
elif a_props:
|
||||
p_word = _readable(random.choice(a_props)[0])
|
||||
p_edge = random.choice(a_props)
|
||||
p_word = _readable(p_edge[0])
|
||||
chain_edges.append({"start": a, "relation": "HasProperty", "end": p_edge[0], "weight": p_edge[1], "surface_text": p_edge[2]})
|
||||
else:
|
||||
a_caps = self.graph.neighbors(a, "CapableOf")
|
||||
if a_caps:
|
||||
p_word = _readable(random.choice(a_caps)[0])
|
||||
p_edge = random.choice(a_caps)
|
||||
p_word = _readable(p_edge[0])
|
||||
chain_edges.append({"start": a, "relation": "CapableOf", "end": p_edge[0], "weight": p_edge[1], "surface_text": p_edge[2]})
|
||||
else:
|
||||
p_word = random.choice(["ambition", "an attitude", "a plan", "patience"])
|
||||
|
||||
|
|
@ -562,6 +641,7 @@ class FalseEquivalence(MetaTemplate):
|
|||
"template_family": self.id,
|
||||
"template": template,
|
||||
"chain": f"{a} IsA same category as {b_word}; {a} HasProperty {p_word}",
|
||||
"chain_edges": chain_edges,
|
||||
"slots": {"A": a, "B": b_word, "P": p_word},
|
||||
}
|
||||
return saying, debug
|
||||
|
|
@ -621,7 +701,10 @@ TEMPLATE_REGISTRY = {
|
|||
|
||||
def generate_one(graph, template_id=None, seed_word=None, seed_category=None,
|
||||
debug=False, max_retries=20):
|
||||
"""Generate a single folksy saying."""
|
||||
"""Generate a single folksy saying.
|
||||
|
||||
When debug=True, always returns (saying, debug_dict) with chain_edges included.
|
||||
"""
|
||||
for _ in range(max_retries):
|
||||
if template_id:
|
||||
tid = template_id
|
||||
|
|
@ -631,7 +714,7 @@ def generate_one(graph, template_id=None, seed_word=None, seed_category=None,
|
|||
cls = TEMPLATE_REGISTRY.get(tid)
|
||||
if not cls:
|
||||
print(f"Unknown template: {tid}", file=sys.stderr)
|
||||
return None
|
||||
return None, None
|
||||
|
||||
tmpl = cls(graph)
|
||||
saying, dbg = tmpl.generate(seed_word=seed_word, seed_category=seed_category)
|
||||
|
|
@ -643,6 +726,16 @@ def generate_one(graph, template_id=None, seed_word=None, seed_category=None,
|
|||
return None, None
|
||||
|
||||
|
||||
def _get_seed_word(dbg):
|
||||
"""Extract the primary seed word from debug slots for dedup tracking."""
|
||||
slots = dbg.get("slots", {})
|
||||
# Templates use different slot names for the seed
|
||||
for key in ("A", "Z", "seed", "X"):
|
||||
if key in slots:
|
||||
return slots[key]
|
||||
return None
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Generate folksy fake-proverbs using ConceptNet relationships."
|
||||
|
|
@ -655,8 +748,13 @@ def main():
|
|||
parser.add_argument("--count", "-n", type=int, default=1, help="Number of sayings to generate")
|
||||
parser.add_argument("--output", "-o", help="Output file (default: stdout)")
|
||||
parser.add_argument("--debug", "-d", action="store_true", help="Show relationship chain debug info")
|
||||
parser.add_argument("--json", action="store_true", help="Output JSONL format with full metadata")
|
||||
parser.add_argument("--vocab", help="Path to folksy_vocab.csv")
|
||||
parser.add_argument("--relations", help="Path to folksy_relations.csv")
|
||||
parser.add_argument("--pure-conceptnet", action="store_true",
|
||||
help="Skip loading augmented relations file")
|
||||
parser.add_argument("--llm-weight-boost", type=float, default=0.0,
|
||||
help="Boost weight of LLM-augmented edges with weight < 1.0 (default: 0.0)")
|
||||
parser.add_argument("--list-templates", action="store_true", help="List available templates")
|
||||
parser.add_argument("--list-categories", action="store_true", help="List available categories")
|
||||
|
||||
|
|
@ -679,6 +777,30 @@ def main():
|
|||
print("Run scripts/extract_from_conceptnet.py first to generate data files.", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
# Load augmented relations if available
|
||||
if not args.pure_conceptnet:
|
||||
augmented_path = DATA_DIR / "folksy_relations_augmented.csv"
|
||||
if augmented_path.exists():
|
||||
boost = args.llm_weight_boost
|
||||
with open(augmented_path, newline="", encoding="utf-8") as f:
|
||||
reader = csv.DictReader(f)
|
||||
count = 0
|
||||
for row in reader:
|
||||
sw = row["start_word"]
|
||||
ew = row["end_word"]
|
||||
rel = row["relation"]
|
||||
w = float(row["weight"])
|
||||
if w < 1.0 and boost:
|
||||
w = min(w + boost, 1.0)
|
||||
surf = row.get("surface_text", "")
|
||||
graph.edges[(sw, rel)].append((ew, w, surf))
|
||||
graph.reverse[(ew, rel)].append((sw, w, surf))
|
||||
graph.all_edges[sw].append((ew, rel, w))
|
||||
graph.all_edges[ew].append((sw, rel, w))
|
||||
count += 1
|
||||
if count:
|
||||
print(f"Loaded {count} augmented edges.", file=sys.stderr)
|
||||
|
||||
if args.list_categories:
|
||||
for cat in sorted(graph.by_category.keys()):
|
||||
print(f" {cat:20s} ({len(graph.by_category[cat])} words)")
|
||||
|
|
@ -688,26 +810,96 @@ def main():
|
|||
if args.entities:
|
||||
graph.merge_fictional(args.entities)
|
||||
|
||||
# JSON mode implies debug internally
|
||||
use_debug = args.debug or args.json
|
||||
|
||||
# Generate
|
||||
out = open(args.output, "w", encoding="utf-8") if args.output else sys.stdout
|
||||
try:
|
||||
for i in range(args.count):
|
||||
if args.count > 1:
|
||||
# Deduplication tracking for batch mode
|
||||
seen_text = set()
|
||||
seen_slots = set()
|
||||
seed_usage = defaultdict(int)
|
||||
generated = 0
|
||||
max_outer_attempts = args.count * 10 # generous outer limit
|
||||
attempts = 0
|
||||
|
||||
while generated < args.count and attempts < max_outer_attempts:
|
||||
attempts += 1
|
||||
saying, dbg = generate_one(
|
||||
graph,
|
||||
template_id=args.template,
|
||||
seed_word=args.seed,
|
||||
seed_category=args.category,
|
||||
debug=use_debug,
|
||||
)
|
||||
if not saying:
|
||||
continue
|
||||
|
||||
# Dedup checks (failures don't count against retry limit)
|
||||
if saying in seen_text:
|
||||
continue
|
||||
|
||||
if dbg:
|
||||
slots_key = (dbg["template_family"], frozenset(dbg["slots"].items()))
|
||||
if slots_key in seen_slots:
|
||||
continue
|
||||
|
||||
seed_w = _get_seed_word(dbg)
|
||||
if seed_w and seed_usage[seed_w] >= 30:
|
||||
continue
|
||||
if seed_w:
|
||||
seed_usage[seed_w] += 1
|
||||
seen_slots.add(slots_key)
|
||||
|
||||
seen_text.add(saying)
|
||||
generated += 1
|
||||
|
||||
if args.json and dbg:
|
||||
record = {
|
||||
"raw_text": saying,
|
||||
"meta_template": dbg["template_family"],
|
||||
"surface_template": dbg["template"],
|
||||
"slots": dbg["slots"],
|
||||
"chain": dbg.get("chain_edges", []),
|
||||
}
|
||||
out.write(json.dumps(record, ensure_ascii=False) + "\n")
|
||||
else:
|
||||
out.write(saying + "\n")
|
||||
if args.debug and dbg:
|
||||
out.write(f" [DEBUG] family={dbg['template_family']}\n")
|
||||
out.write(f" [DEBUG] chain: {dbg['chain']}\n")
|
||||
out.write(f" [DEBUG] slots: {dbg['slots']}\n")
|
||||
out.write("\n")
|
||||
else:
|
||||
# Single generation (no dedup needed)
|
||||
saying, dbg = generate_one(
|
||||
graph,
|
||||
template_id=args.template,
|
||||
seed_word=args.seed,
|
||||
seed_category=args.category,
|
||||
debug=args.debug,
|
||||
debug=use_debug,
|
||||
)
|
||||
if saying:
|
||||
out.write(saying + "\n")
|
||||
if args.debug and dbg:
|
||||
out.write(f" [DEBUG] family={dbg['template_family']}\n")
|
||||
out.write(f" [DEBUG] chain: {dbg['chain']}\n")
|
||||
out.write(f" [DEBUG] slots: {dbg['slots']}\n")
|
||||
out.write("\n")
|
||||
if args.json and dbg:
|
||||
record = {
|
||||
"raw_text": saying,
|
||||
"meta_template": dbg["template_family"],
|
||||
"surface_template": dbg["template"],
|
||||
"slots": dbg["slots"],
|
||||
"chain": dbg.get("chain_edges", []),
|
||||
}
|
||||
out.write(json.dumps(record, ensure_ascii=False) + "\n")
|
||||
else:
|
||||
out.write(saying + "\n")
|
||||
if args.debug and dbg:
|
||||
out.write(f" [DEBUG] family={dbg['template_family']}\n")
|
||||
out.write(f" [DEBUG] chain: {dbg['chain']}\n")
|
||||
out.write(f" [DEBUG] slots: {dbg['slots']}\n")
|
||||
out.write("\n")
|
||||
else:
|
||||
out.write(f"(failed to generate saying #{i+1} after retries)\n")
|
||||
out.write("(failed to generate saying after retries)\n")
|
||||
finally:
|
||||
if args.output:
|
||||
out.close()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue