#!/usr/bin/env python3
"""Extract ConceptNet relationships between words in the folksy vocabulary.

Reads folksy_vocab.csv, queries the PostgreSQL ``conceptnet5`` database,
and writes folksy_relations.csv with columns:
start_word, end_word, relation, weight, surface_text
"""
import csv

import psycopg2

INPUT_PATH = "/home/john/Development/folksy-generator/data/folksy_vocab.csv"
OUTPUT_PATH = "/home/john/Development/folksy-generator/data/folksy_relations.csv"

# Only keep edges at or above this ConceptNet weight.
MIN_EDGE_WEIGHT = 1.0

# ConceptNet relation types of interest (names under the /r/ URI namespace).
RELATION_TYPES = [
    "UsedFor", "AtLocation", "CapableOf", "HasA", "PartOf", "Causes",
    "CausesDesire", "HasPrerequisite", "ReceivesAction", "Desires",
    "LocatedNear", "CreatedBy", "MadeOf", "HasProperty",
    "MotivatedByGoal", "HasSubevent",
]


def _read_words(path):
    """Return the stripped 'word' column values from the CSV at *path*."""
    with open(path, "r", newline="") as f:
        return [row["word"].strip() for row in csv.DictReader(f)]


def _fetch_node_maps(cur, words):
    """Look up node IDs for the English concept URI of each word.

    Returns ``(uri_to_id, id_to_uri)``. Words with no matching node row
    are reported to stdout (first 20 shown) but otherwise skipped.
    """
    word_uris = [f"/c/en/{w}" for w in words]
    cur.execute("SELECT id, uri FROM nodes WHERE uri = ANY(%s)", (word_uris,))
    node_rows = cur.fetchall()
    uri_to_id = {uri: nid for nid, uri in node_rows}
    id_to_uri = {nid: uri for nid, uri in node_rows}

    found_words = [uri.replace("/c/en/", "") for uri in uri_to_id]
    missing_words = set(words) - set(found_words)
    print(f"Found {len(uri_to_id)} node IDs out of {len(words)} words")
    if missing_words:
        print(f"Missing {len(missing_words)} words: {sorted(missing_words)[:20]}...")
    return uri_to_id, id_to_uri


def _fetch_relation_map(cur):
    """Return {relation_id: relation_name} for RELATION_TYPES."""
    relation_uris = [f"/r/{r}" for r in RELATION_TYPES]
    cur.execute("SELECT id, uri FROM relations WHERE uri = ANY(%s)", (relation_uris,))
    rel_id_to_name = {rid: uri.replace("/r/", "") for rid, uri in cur.fetchall()}
    # NOTE: the original source had this message broken across a literal
    # line break inside the f-string; emitted here as one valid statement.
    print(f"Found {len(rel_id_to_name)} relation types: {sorted(rel_id_to_name.values())}")
    return rel_id_to_name


def _fetch_edges(cur, node_ids, rel_ids):
    """Fetch edges between folksy nodes, filtered by relation and weight.

    Both endpoints must be in *node_ids*, the relation in *rel_ids*, and
    the weight >= MIN_EDGE_WEIGHT. Rows come back sorted by weight desc.
    """
    cur.execute(
        """
        SELECT e.start_id, e.end_id, e.relation_id, e.weight,
               e.data->>'surfaceText'
        FROM edges e
        WHERE e.start_id = ANY(%s)
          AND e.end_id = ANY(%s)
          AND e.relation_id = ANY(%s)
          AND e.weight >= %s
        ORDER BY e.weight DESC
        """,
        (node_ids, node_ids, rel_ids, MIN_EDGE_WEIGHT),
    )
    rows = cur.fetchall()
    print(f"Found {len(rows)} edges")
    return rows


def _write_relations(path, results):
    """Write (start_word, end_word, relation, weight, surface_text) rows."""
    with open(path, "w", newline="") as f:
        writer = csv.writer(f, quoting=csv.QUOTE_ALL)
        writer.writerow(["start_word", "end_word", "relation", "weight", "surface_text"])
        writer.writerows(results)
    print(f"Wrote {len(results)} relationships to {path}")


def main():
    # Step 1: Read the word list from folksy_vocab.csv.
    words = _read_words(INPUT_PATH)
    print(f"Read {len(words)} words from {INPUT_PATH}")

    conn = psycopg2.connect(dbname="conceptnet5")
    try:
        with conn.cursor() as cur:
            # Steps 2-3: Resolve words and relation types to DB ids.
            uri_to_id, id_to_uri = _fetch_node_maps(cur, words)
            rel_id_to_name = _fetch_relation_map(cur)

            # Step 4: Pull qualifying edges (already sorted by weight desc).
            rows = _fetch_edges(
                cur, list(uri_to_id.values()), list(rel_id_to_name.keys())
            )
    finally:
        # Ensure the connection is released even if a query fails.
        conn.close()

    # Step 5: Convert node/relation ids back to human-readable strings.
    results = [
        (
            id_to_uri[start_id].replace("/c/en/", ""),
            id_to_uri[end_id].replace("/c/en/", ""),
            rel_id_to_name[relation_id],
            weight,
            surface_text or "",
        )
        for start_id, end_id, relation_id, weight, surface_text in rows
    ]

    # Step 6: Write the output CSV (order preserved from the query).
    _write_relations(OUTPUT_PATH, results)


if __name__ == "__main__":
    main()