Initial 'folksy idiom' generator

This commit is contained in:
John McCardle 2026-02-15 14:04:25 -05:00
commit 8c8a058301
11 changed files with 14485 additions and 0 deletions

View file

@ -0,0 +1,103 @@
#!/usr/bin/env python3
"""
Extract ConceptNet relationships between words in the folksy vocabulary.
Reads folksy_vocab.csv, queries PostgreSQL conceptnet5 database, and writes
folksy_relations.csv with columns: start_word, end_word, relation, weight, surface_text
"""
import csv
import psycopg2
# Input: vocabulary CSV; must contain a "word" column (one word per row).
INPUT_PATH = "/home/john/Development/folksy-generator/data/folksy_vocab.csv"
# Output: extracted relationships CSV (start_word, end_word, relation, weight, surface_text).
OUTPUT_PATH = "/home/john/Development/folksy-generator/data/folksy_relations.csv"
# ConceptNet relation names to extract; each becomes a "/r/<name>" URI when
# querying the `relations` table.
RELATION_TYPES = [
    "UsedFor", "AtLocation", "CapableOf", "HasA", "PartOf", "Causes",
    "CausesDesire", "HasPrerequisite", "ReceivesAction", "Desires",
    "LocatedNear", "CreatedBy", "MadeOf", "HasProperty", "MotivatedByGoal",
    "HasSubevent",
]
def _read_words(path):
    """Return the non-empty, stripped 'word' values from the vocab CSV at *path*.

    Blank rows are skipped so they cannot produce bogus "/c/en/" node URIs.
    """
    with open(path, "r", newline="") as f:
        return [w for row in csv.DictReader(f) if (w := row["word"].strip())]


def _write_relations(path, results):
    """Write (start_word, end_word, relation, weight, surface_text) rows to *path*."""
    with open(path, "w", newline="") as f:
        writer = csv.writer(f, quoting=csv.QUOTE_ALL)
        writer.writerow(["start_word", "end_word", "relation", "weight", "surface_text"])
        writer.writerows(results)


def main(input_path=INPUT_PATH, output_path=OUTPUT_PATH):
    """Extract ConceptNet relationships between words in the folksy vocabulary.

    Reads the vocabulary from *input_path*, queries the local PostgreSQL
    `conceptnet5` database for edges whose start and end nodes are both in the
    vocabulary (restricted to RELATION_TYPES, weight >= 1.0), and writes the
    results to *output_path* sorted by weight descending.

    Parameters default to the module-level INPUT_PATH / OUTPUT_PATH so existing
    callers are unaffected.
    """
    # Step 1: Read the word list from the vocabulary CSV.
    words = _read_words(input_path)
    print(f"Read {len(words)} words from {input_path}")

    # Build node and relation URIs in ConceptNet's path scheme.
    word_uris = [f"/c/en/{w}" for w in words]
    relation_uris = [f"/r/{r}" for r in RELATION_TYPES]

    conn = psycopg2.connect(dbname="conceptnet5")
    try:
        cur = conn.cursor()
        try:
            # Step 2: Look up all node IDs for these words. Some vocabulary
            # words may have no ConceptNet node at all; report them.
            cur.execute("SELECT id, uri FROM nodes WHERE uri = ANY(%s)", (word_uris,))
            node_rows = cur.fetchall()
            uri_to_id = {uri: nid for nid, uri in node_rows}
            id_to_uri = {nid: uri for nid, uri in node_rows}
            found_words = [uri.replace("/c/en/", "") for uri in uri_to_id]
            missing_words = set(words) - set(found_words)
            print(f"Found {len(uri_to_id)} node IDs out of {len(words)} words")
            if missing_words:
                print(f"Missing {len(missing_words)} words: {sorted(missing_words)[:20]}...")
            node_ids = list(uri_to_id.values())

            # Step 3: Look up relation IDs for the relation types we care about.
            cur.execute("SELECT id, uri FROM relations WHERE uri = ANY(%s)", (relation_uris,))
            rel_rows = cur.fetchall()
            rel_id_to_name = {rid: uri.replace("/r/", "") for rid, uri in rel_rows}
            rel_ids = list(rel_id_to_name.keys())
            print(f"Found {len(rel_ids)} relation types: {sorted(rel_id_to_name.values())}")

            # Step 4: Query edges where both start and end are in our folksy
            # node set, relation is one of our types, and weight >= 1.0.
            cur.execute(
                """
                SELECT e.start_id, e.end_id, e.relation_id, e.weight, e.data->>'surfaceText'
                FROM edges e
                WHERE e.start_id = ANY(%s)
                  AND e.end_id = ANY(%s)
                  AND e.relation_id = ANY(%s)
                  AND e.weight >= 1.0
                ORDER BY e.weight DESC
                """,
                (node_ids, node_ids, rel_ids),
            )
            rows = cur.fetchall()
            print(f"Found {len(rows)} edges")
        finally:
            # Always release DB resources, even if a query fails.
            cur.close()
    finally:
        conn.close()

    # Step 5: Convert node IDs back to word strings.
    results = [
        (
            id_to_uri[start_id].replace("/c/en/", ""),
            id_to_uri[end_id].replace("/c/en/", ""),
            rel_id_to_name[relation_id],
            weight,
            surface_text or "",
        )
        for start_id, end_id, relation_id, weight, surface_text in rows
    ]

    # Step 6: Write output CSV (already sorted by weight descending by the query).
    _write_relations(output_path, results)
    print(f"Wrote {len(results)} relationships to {output_path}")
# Run the extraction when invoked as a script (not on import).
if __name__ == "__main__":
    main()