# folksy_idioms/scripts/enhance_graph.py
#!/usr/bin/env python3
"""LLM-augmented graph enhancement for the folksy subgraph.
Three phases:
Phase 1: Per-word relationship expansion
Phase 2: Cross-word bridge discovery
Phase 3: Property enrichment for false_equivalence templates
Usage:
python scripts/enhance_graph.py --phase 1 # Run phase 1 only
python scripts/enhance_graph.py --phase 2 # Run phase 2 only
python scripts/enhance_graph.py --phase 3 # Run phase 3 only
python scripts/enhance_graph.py --all # Run all phases
python scripts/enhance_graph.py --phase 1 --dry-run # Print prompts without calling LLM
"""
import argparse
import csv
import os
import random
import re
import sys
import time
from collections import defaultdict
from datetime import datetime
from pathlib import Path
# Paths
SCRIPT_DIR = Path(__file__).parent
PROJECT_DIR = SCRIPT_DIR.parent
DATA_DIR = PROJECT_DIR / "data"
# OpenAI-compatible chat-completions endpoint used for every LLM call.
# NOTE(review): the "v1d" path segment looks unusual (vs the usual "/v1/") —
# confirm against the serving endpoint.
LLM_ENDPOINT = "http://192.168.1.100:8853/v1d/chat/completions"
LLM_MODEL = "THUDM-GLM4-32B"
# ConceptNet-style relation types accepted from LLM output; anything outside
# this set is silently dropped by the parsers below.
VALID_RELATIONS = {
    "AtLocation", "MadeOf", "PartOf", "UsedFor", "HasA", "HasProperty",
    "Causes", "HasPrerequisite", "CapableOf", "ReceivesAction", "Desires",
    "CausesDesire", "LocatedNear", "CreatedBy", "MotivatedByGoal", "HasSubevent",
}
# Output files: accepted edges, out-of-vocabulary target suggestions, and the
# append-only run log used for checkpoint/resume.
AUGMENTED_CSV = DATA_DIR / "folksy_relations_augmented.csv"
CANDIDATE_CSV = DATA_DIR / "candidate_additions.csv"
LOG_CSV = DATA_DIR / "enhancement_log.csv"
# ---------------------------------------------------------------------------
# Infrastructure
# ---------------------------------------------------------------------------
def llm_chat_completion(messages, max_retries=3):
    """Call the chat-completions endpoint with exponential-backoff retries.

    Args:
        messages: list of {"role", "content"} dicts in OpenAI chat format.
        max_retries: total number of attempts before giving up.

    Returns:
        The assistant message content (str), or None if every attempt failed
        (or max_retries <= 0).
    """
    # Imported lazily so the module can be imported (e.g. for --dry-run)
    # without requests installed.
    import requests

    for attempt in range(max_retries):
        try:
            resp = requests.post(
                LLM_ENDPOINT,
                json={"model": LLM_MODEL, "messages": messages},
                timeout=120,
            )
            resp.raise_for_status()
            data = resp.json()
            return data["choices"][0]["message"]["content"]
        except Exception as e:
            # Broad catch is deliberate: connection errors, HTTP errors,
            # invalid JSON, and unexpected response shapes are all retried
            # the same way.
            wait = 2 ** attempt
            print(f" LLM call failed (attempt {attempt+1}/{max_retries}): {e}", file=sys.stderr)
            if attempt < max_retries - 1:
                print(f" Retrying in {wait}s...", file=sys.stderr)
                time.sleep(wait)
            else:
                print(" Giving up on this word.", file=sys.stderr)
    # BUGFIX(minor): make the all-attempts-failed result explicit instead of
    # falling off the end of the function; also covers max_retries <= 0.
    return None
def load_vocab():
    """Load the folksy vocabulary from data/folksy_vocab.csv.

    Returns:
        dict mapping word -> {"categories": [str], "tangibility": float,
        "edge_count": int}. Missing or empty numeric cells default to 0.
    """
    vocab = {}
    with open(DATA_DIR / "folksy_vocab.csv", newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            word = row["word"]
            cats = [c.strip() for c in row["categories"].split(",") if c.strip()]
            vocab[word] = {
                "categories": cats,
                # BUGFIX: `or 0` guards against empty cells — the previous
                # float(row.get(..., 0)) still raised ValueError on "".
                "tangibility": float(row.get("tangibility_score") or 0),
                "edge_count": int(row.get("conceptnet_edge_count") or 0),
            }
    return vocab
def load_relations():
    """Load existing relations (ConceptNet + any existing augmented)."""
    edges = defaultdict(list)  # (start, relation) -> [(end, weight, surface)]
    existing_triples = set()   # (start, end, relation) for dedup
    sources = (DATA_DIR / "folksy_relations.csv", AUGMENTED_CSV)
    for path in sources:
        if not path.exists():
            continue
        with open(path, newline="", encoding="utf-8") as fh:
            for record in csv.DictReader(fh):
                start = record["start_word"]
                end = record["end_word"]
                relation = record["relation"]
                # Rows with a blank weight are treated as corrupt and skipped.
                if not record["weight"]:
                    continue
                edges[(start, relation)].append(
                    (end, float(record["weight"]), record.get("surface_text", "")))
                existing_triples.add((start, end, relation))
    return edges, existing_triples
def load_checkpoint():
    """Load enhancement log to determine what's already been processed.

    Returns:
        set of (source_word, phase) string pairs; empty when no log exists.
    """
    if not LOG_CSV.exists():
        return set()
    with open(LOG_CSV, newline="", encoding="utf-8") as fh:
        return {(entry["source_word"], entry["phase"])
                for entry in csv.DictReader(fh)}
def append_log(word, phase, edges_generated, edges_accepted, edges_duplicate, edges_oov):
    """Append a row to the enhancement log (writing the header on first use)."""
    header_needed = not LOG_CSV.exists()
    with open(LOG_CSV, "a", newline="", encoding="utf-8") as fh:
        out = csv.writer(fh)
        if header_needed:
            out.writerow([
                "source_word", "phase", "timestamp",
                "edges_generated", "edges_accepted", "edges_duplicate", "edges_oov",
            ])
        out.writerow([
            word, phase, datetime.now().isoformat(),
            edges_generated, edges_accepted, edges_duplicate, edges_oov,
        ])
def append_augmented_edges(edges):
    """Append edge dicts to the augmented relations CSV (header on first use)."""
    columns = ["start_word", "end_word", "relation", "weight", "surface_text", "source"]
    header_needed = not AUGMENTED_CSV.exists()
    with open(AUGMENTED_CSV, "a", newline="", encoding="utf-8") as fh:
        out = csv.writer(fh)
        if header_needed:
            out.writerow(columns)
        for edge in edges:
            out.writerow([edge[col] for col in columns])
def append_candidates(candidates):
    """Append candidate-word dicts to the candidate additions CSV."""
    columns = ["word", "suggested_by", "relation_context", "frequency"]
    header_needed = not CANDIDATE_CSV.exists()
    with open(CANDIDATE_CSV, "a", newline="", encoding="utf-8") as fh:
        out = csv.writer(fh)
        if header_needed:
            out.writerow(columns)
        for cand in candidates:
            out.writerow([cand[col] for col in columns])
# ---------------------------------------------------------------------------
# Parsing
# ---------------------------------------------------------------------------
def parse_llm_relations(response_text, source_word):
    """Parse structured LLM output into edge dicts.

    Expected line format: ``RELATION_TYPE: target word(s) | surface text``.
    Handles bullets, numbering, extra whitespace, and multi-word targets.

    Args:
        response_text: raw LLM completion text (may be None/empty).
        source_word: vocab word the prompt was about; becomes start_word.

    Returns:
        List of edge dicts with weight 0.8 and source "llm_augmented".
    """
    edges = []
    if not response_text:
        return edges
    for line in response_text.strip().split("\n"):
        line = line.strip()
        if not line:
            continue
        # Strip leading bullets/numbers: "- ", "1. ", "* ", etc.
        line = re.sub(r"^[\d]+[.)]\s*", "", line)
        line = re.sub(r"^[-*•]\s*", "", line)
        line = line.strip()
        if not line:
            continue
        # Match: RELATION_TYPE: target_word(s) | surface text
        # (this also rejects "RELATION: NONE" lines, which have no "|" part)
        match = re.match(r"^(\w+):\s*(.+?)\s*\|\s*(.+)$", line)
        if not match:
            continue
        relation, target_raw, surface = match.groups()
        relation = relation.strip()
        if relation not in VALID_RELATIONS:
            continue
        # Normalize target: lowercase, replace spaces with underscores for multi-word
        target = re.sub(r"\s+", "_", target_raw.strip().lower())
        # BUGFIX: previously any line *containing* the substring "NONE"
        # anywhere (a target like "nonedible", a surface mentioning "none")
        # was discarded wholesale. Skip only when the target itself is the
        # NONE marker.
        if target == "none":
            continue
        # Skip self-loops
        if target == source_word:
            continue
        edges.append({
            "start_word": source_word,
            "end_word": target,
            "relation": relation,
            "weight": 0.8,
            "surface_text": surface.strip(),
            "source": "llm_augmented",
        })
    return edges
def parse_bridge_response(response_text, word_a, word_b):
    """Parse bridge discovery LLM output.

    Expected line format:
    ``BRIDGE_WORD | relation_to_first: TYPE | relation_to_second: TYPE | explanation``

    Each accepted bridge yields two edges: word_a -> bridge and
    bridge -> word_b, both weight 0.8 with source "llm_bridge".
    """
    if not response_text:
        return []
    edges = []
    for raw in response_text.strip().split("\n"):
        text = raw.strip()
        if not text:
            continue
        # Strip common prefixes: numbering, bullets, a literal "BRIDGE:" tag.
        text = re.sub(r"^[\d]+[.)]\s*", "", text)
        text = re.sub(r"^[-*•]\s*", "", text)
        text = re.sub(r"^BRIDGE:\s*", "", text, flags=re.IGNORECASE)
        text = text.strip()
        if not text:
            continue
        fields = [field.strip() for field in text.split("|")]
        if len(fields) < 3:
            continue
        bridge_word = fields[0].strip().lower().replace(" ", "_")
        # Labeled format first ("relation_to_first: TYPE" / "first: TYPE").
        first = re.search(r"(?:relation_to_first|first):\s*(\w+)", fields[1], re.IGNORECASE)
        second = re.search(r"(?:relation_to_second|second):\s*(\w+)", fields[2], re.IGNORECASE)
        if not first or not second:
            # Fall back to the bare format: just the relation type (possibly
            # after an arbitrary "label:" prefix).
            first = re.match(r"(\w+)", fields[1].split(":")[-1].strip())
            second = re.match(r"(\w+)", fields[2].split(":")[-1].strip())
            if not first or not second:
                continue
        rel_to_first = first.group(1)
        rel_to_second = second.group(1)
        if rel_to_first not in VALID_RELATIONS or rel_to_second not in VALID_RELATIONS:
            continue
        explanation = fields[3].strip() if len(fields) > 3 else ""
        shared = {"weight": 0.8, "surface_text": explanation, "source": "llm_bridge"}
        edges.append({"start_word": word_a, "end_word": bridge_word,
                      "relation": rel_to_first, **shared})
        edges.append({"start_word": bridge_word, "end_word": word_b,
                      "relation": rel_to_second, **shared})
    return edges
def parse_property_response(response_text, word):
    """Parse property enrichment LLM output.

    Expected line format: ``PROPERTY | brief explanation``; the explanation
    is optional and defaults to "{word} is {prop}".

    Returns HasProperty edge dicts with weight 0.8 and source "llm_property".
    """
    if not response_text:
        return []
    edges = []
    for raw in response_text.strip().split("\n"):
        text = raw.strip()
        if not text:
            continue
        # Drop leading numbering and bullets.
        text = re.sub(r"^[\d]+[.)]\s*", "", text)
        text = re.sub(r"^[-*•]\s*", "", text)
        text = text.strip()
        if not text:
            continue
        fields = [field.strip() for field in text.split("|")]
        prop = fields[0].lower().replace(" ", "_")
        explanation = fields[1] if len(fields) > 1 else f"{word} is {prop}"
        # Skip empty properties and trivial self-descriptions.
        if not prop or prop == word:
            continue
        edges.append({
            "start_word": word,
            "end_word": prop,
            "relation": "HasProperty",
            "weight": 0.8,
            "surface_text": explanation,
            "source": "llm_property",
        })
    return edges
# ---------------------------------------------------------------------------
# Phase 1: Per-Word Expansion
# ---------------------------------------------------------------------------
# Phase 1 system prompt: sets the annotator role, acceptance rules, and the
# "RELATION_TYPE: target | phrasing" output format parsed by
# parse_llm_relations().
PHASE1_SYSTEM = """You are a commonsense knowledge annotator. You will be given a concrete noun and its known relationships. Your job is to generate ADDITIONAL commonsense relationships that are missing.
Rules:
- Only generate relationships involving concrete, tangible things (animals, foods, tools, plants, buildings, weather, landscape, household objects)
- Every relationship must be something a typical adult would agree is true
- Do not repeat any relationship already listed as "known"
- Target words should be common English words (top 3000 frequency preferred)
- Output ONLY the structured format shown below, one relationship per line
- If you cannot think of good relationships for a given type, output NONE for that type
- Aim for 3-5 relationships per type where possible
Output format (one per line):
RELATION_TYPE: target_word | short natural phrasing
Example output:
AtLocation: barn | you find a horse in a barn
UsedFor: riding | a horse is used for riding
HasA: mane | a horse has a mane
CapableOf: gallop | a horse can gallop
MadeOf: NONE
PartOf: herd | a horse is part of a herd"""
# Phase 1 user template; filled by run_phase1() with the word, its
# categories, and the formatted list of already-known edges.
PHASE1_USER = """Word: {word}
Categories: {categories}
Known relationships:
{existing_edges}
Generate additional relationships for these types:
- AtLocation (where is it found?)
- UsedFor (what is it used for?)
- HasA (what does it have / contain?)
- PartOf (what is it part of?)
- CapableOf (what can it do?)
- MadeOf (what is it made of?)
- HasPrerequisite (what do you need before you can have/use it?)
- Causes (what does it cause or lead to?)
- HasProperty (what adjectives describe it? — limit to physical/sensory properties)"""
def format_existing_edges(edges_dict, word):
    """Format existing edges for a word grouped by relation type.

    Emits one line per Phase-1 relation type: either up to ten known targets
    with their weights, or "(none in database)".
    """
    relation_order = ("AtLocation", "UsedFor", "HasA", "PartOf", "CapableOf",
                      "MadeOf", "HasPrerequisite", "Causes", "HasProperty")
    rendered = []
    for relation in relation_order:
        known = edges_dict.get((word, relation), [])
        if not known:
            rendered.append(f"{relation}: (none in database)")
            continue
        listing = ", ".join(
            f"{end} (weight {weight:.1f})" for end, weight, *_ in known[:10])
        rendered.append(f"{relation}: {listing}")
    return "\n".join(rendered)
def run_phase1(vocab, edges, existing_triples, checkpoint, dry_run=False):
    """Phase 1: Per-word relationship expansion.

    For each vocab word not already checkpointed, prompts the LLM for extra
    relationships, dedupes against existing triples, appends in-vocab edges
    to the augmented CSV and out-of-vocab targets to the candidate CSV, and
    logs per-word stats so the run can resume.

    Args:
        vocab: word -> metadata dict from load_vocab().
        edges: (start, relation) -> [(end, weight, surface)]; mutated in
            place so later prompts see newly accepted edges as "known".
        existing_triples: set of (start, end, relation); mutated in place.
        checkpoint: set of (word, phase) pairs already processed.
        dry_run: if True, print sample prompts; no LLM calls or file writes.
    """
    words = sorted(vocab.keys())
    total = len(words)
    total_accepted = 0
    total_skipped = 0
    print(f"Phase 1: Processing {total} words...")
    for i, word in enumerate(words):
        # Resume support: skip words already logged for phase 1.
        if (word, "1") in checkpoint:
            total_skipped += 1
            continue
        categories = ", ".join(vocab[word]["categories"])
        existing = format_existing_edges(edges, word)
        user_prompt = PHASE1_USER.format(
            word=word, categories=categories, existing_edges=existing
        )
        messages = [
            {"role": "system", "content": PHASE1_SYSTEM},
            {"role": "user", "content": user_prompt},
        ]
        if dry_run:
            if i < 3:  # Show first 3 prompts
                print(f"\n--- Prompt for '{word}' ---")
                print(f"System: {PHASE1_SYSTEM[:200]}...")
                print(f"User:\n{user_prompt}")
            elif i == 3:
                print(f"\n... ({total - 3} more words) ...")
            continue
        response = llm_chat_completion(messages)
        parsed = parse_llm_relations(response, word) if response else []
        # Classify edges: duplicate, accepted (target in vocab), or candidate
        # (target out of vocab).
        accepted = []
        candidates = []
        duplicates = 0
        for edge in parsed:
            triple = (edge["start_word"], edge["end_word"], edge["relation"])
            if triple in existing_triples:
                duplicates += 1
                continue
            existing_triples.add(triple)
            if edge["end_word"] in vocab:
                accepted.append(edge)
            else:
                # Out-of-vocab target: recorded as a candidate addition
                # instead of an edge.
                candidates.append({
                    "word": edge["end_word"],
                    "suggested_by": word,
                    "relation_context": f"{edge['relation']}: {edge['surface_text']}",
                    "frequency": 1,
                })
        if accepted:
            append_augmented_edges(accepted)
            # Also update in-memory edges for subsequent words
            for e in accepted:
                edges[(e["start_word"], e["relation"])].append(
                    (e["end_word"], e["weight"], e["surface_text"]))
        if candidates:
            append_candidates(candidates)
        total_accepted += len(accepted)
        append_log(word, "1", len(parsed), len(accepted), duplicates, len(candidates))
        if (i + 1) % 50 == 0:
            print(f" [{i+1}/{total}] {total_accepted} edges accepted so far")
        # Brief pause between words to avoid hammering the LLM server.
        time.sleep(0.1)
    if dry_run:
        print(f"\nDry run complete. Would process {total - total_skipped} words.")
    else:
        print(f"\nPhase 1 complete: {total_accepted} new edges accepted.")
# ---------------------------------------------------------------------------
# Phase 2: Cross-Word Bridge Discovery
# ---------------------------------------------------------------------------
# Phase 2 system prompt: asks for bridge words in the pipe-delimited format
# parsed by parse_bridge_response().
# NOTE(review): the relation list here omits MotivatedByGoal and HasSubevent,
# which ARE in VALID_RELATIONS — confirm whether that is intentional.
PHASE2_SYSTEM = """You are a commonsense knowledge annotator. You will be given two concrete nouns. Your job is to identify a BRIDGE word that connects them — something that relates to both.
Rules:
- The bridge word must be a common, concrete noun
- State the relationship type for each connection
- Valid relationship types: AtLocation, UsedFor, HasA, PartOf, CapableOf, MadeOf, HasPrerequisite, Causes, HasProperty, ReceivesAction, Desires, CausesDesire, LocatedNear, CreatedBy
- Output format: BRIDGE_WORD | relation_to_first: TYPE | relation_to_second: TYPE | explanation
Example:
Words: "cow" and "butter"
milk | relation_to_first: CapableOf | relation_to_second: MadeOf | milk connects production to product"""
# Phase 2 user template; filled by run_phase2() with the word pair and their
# category lists.
PHASE2_USER = """Words: "{word_a}" and "{word_b}"
Categories: {word_a} is {categories_a}, {word_b} is {categories_b}
Find 1-3 bridge words that connect them."""
def build_reachability(vocab, edges):
    """Build 2-hop reachability from vocab words to other vocab words.

    A vocab word B is reachable from A when B is a direct edge target of A,
    or a direct target of one of A's *in-vocab* direct targets — the 2-hop
    step only expands through intermediate nodes that are themselves in
    vocab, matching the original scan-based implementation.

    Args:
        vocab: word -> metadata dict (only the keys are used here).
        edges: (start, relation) -> [(end, weight, surface)].

    Returns:
        defaultdict(set): word -> set of reachable vocab words (never
        containing the word itself).
    """
    vocab_set = set(vocab.keys())
    # PERF: precompute start -> {targets} once. The previous version rescanned
    # the entire edge dict for every word AND again for every neighbor,
    # i.e. roughly O(V * E^2); this is O(E + V * deg).
    successors = defaultdict(set)
    for (start, _rel), targets in edges.items():
        for end, _weight, _surface in targets:
            successors[start].add(end)
    reachable = defaultdict(set)
    for word in vocab:
        for neighbor in successors.get(word, ()):
            # Direct (1-hop) neighbors must be in vocab to count (and to be
            # expanded for the second hop).
            if neighbor not in vocab_set or neighbor == word:
                continue
            reachable[word].add(neighbor)
            # 2-hop targets, expanded only through the in-vocab neighbor.
            for second in successors.get(neighbor, ()):
                if second in vocab_set and second != word:
                    reachable[word].add(second)
    return reachable
def run_phase2(vocab, edges, existing_triples, checkpoint, dry_run=False):
    """Phase 2: Cross-word bridge discovery.

    Finds vocab words that can reach fewer than 10 other vocab words within
    2 hops, pairs each with sampled same-category peers it cannot reach, and
    asks the LLM for "bridge" words connecting each pair. Accepted bridge
    edges are appended to the augmented CSV; one log row per word provides
    checkpointing.

    Args:
        vocab: word -> metadata dict from load_vocab().
        edges: (start, relation) -> [(end, weight, surface)]; mutated in place.
        existing_triples: set of (start, end, relation); mutated in place.
        checkpoint: set of (word, phase) pairs already processed.
        dry_run: if True, print sample prompts; no LLM calls or file writes.
    """
    print("Phase 2: Building reachability matrix...")
    reachable = build_reachability(vocab, edges)
    # Find low-connectivity words
    vocab_set = set(vocab.keys())
    low_connectivity = []
    for word in vocab:
        reach_count = len(reachable.get(word, set()))
        if reach_count < 10:
            low_connectivity.append((word, reach_count))
    # Most isolated words first.
    low_connectivity.sort(key=lambda x: x[1])
    print(f" {len(low_connectivity)} words with <10 reachable vocab words")
    # Build category index
    by_category = defaultdict(list)
    for word, info in vocab.items():
        for cat in info["categories"]:
            by_category[cat].append(word)
    total_accepted = 0
    pairs_processed = 0
    total_skipped = 0
    for word, reach_count in low_connectivity:
        # Resume support: skip words already logged for phase 2.
        if (word, "2") in checkpoint:
            total_skipped += 1
            continue
        word_cats = vocab[word]["categories"]
        word_reachable = reachable.get(word, set())
        # Find same-category words that are unreachable
        unreachable = []
        for cat in word_cats:
            for peer in by_category.get(cat, []):
                if peer != word and peer not in word_reachable:
                    unreachable.append(peer)
        if not unreachable:
            # Nothing to bridge; log a zero row so the word is checkpointed.
            append_log(word, "2", 0, 0, 0, 0)
            continue
        # Sample 5-10 unreachable peers
        sample = random.sample(unreachable, min(10, len(unreachable)))
        accepted_for_word = 0
        for peer in sample:
            pair_key = f"{word}:{peer}"
            # NOTE(review): append_log only ever records (word, phase) keys,
            # so this pair-level checkpoint lookup appears to never match —
            # confirm whether pair keys were meant to be logged as well.
            if (pair_key, "2") in checkpoint:
                continue
            categories_a = ", ".join(vocab[word]["categories"])
            categories_b = ", ".join(vocab[peer]["categories"])
            user_prompt = PHASE2_USER.format(
                word_a=word, word_b=peer,
                categories_a=categories_a, categories_b=categories_b,
            )
            messages = [
                {"role": "system", "content": PHASE2_SYSTEM},
                {"role": "user", "content": user_prompt},
            ]
            if dry_run:
                if pairs_processed < 3:
                    print(f"\n--- Bridge prompt: '{word}' <-> '{peer}' ---")
                    print(f"User:\n{user_prompt}")
                elif pairs_processed == 3:
                    print(f"\n... (more pairs) ...")
                pairs_processed += 1
                continue
            response = llm_chat_completion(messages)
            parsed = parse_bridge_response(response, word, peer) if response else []
            accepted = []
            duplicates = 0
            oov = 0
            for edge in parsed:
                triple = (edge["start_word"], edge["end_word"], edge["relation"])
                if triple in existing_triples:
                    duplicates += 1
                    continue
                existing_triples.add(triple)
                # For bridge edges, both endpoints should ideally be in vocab
                if edge["start_word"] in vocab_set and edge["end_word"] in vocab_set:
                    accepted.append(edge)
                elif edge["start_word"] in vocab_set or edge["end_word"] in vocab_set:
                    # At least one end in vocab — still useful
                    accepted.append(edge)
                else:
                    oov += 1
            if accepted:
                append_augmented_edges(accepted)
                for e in accepted:
                    edges[(e["start_word"], e["relation"])].append(
                        (e["end_word"], e["weight"], e["surface_text"]))
            accepted_for_word += len(accepted)
            pairs_processed += 1
            # Brief pause between pairs to avoid hammering the LLM server.
            time.sleep(0.1)
        total_accepted += accepted_for_word
        append_log(word, "2", 0, accepted_for_word, 0, 0)
        if (pairs_processed) % 20 == 0:
            print(f" {pairs_processed} pairs processed, {total_accepted} edges accepted")
    if dry_run:
        print(f"\nDry run complete. Would process {pairs_processed} word pairs.")
    else:
        print(f"\nPhase 2 complete: {total_accepted} bridge edges accepted from {pairs_processed} pairs.")
# ---------------------------------------------------------------------------
# Phase 3: Property Enrichment
# ---------------------------------------------------------------------------
# Phase 3 system prompt: asks for distinguishing physical/sensory properties
# in the "PROPERTY | explanation" format parsed by parse_property_response().
PHASE3_SYSTEM = """You are a commonsense knowledge annotator. Given a concrete noun, list its most distinctive physical or sensory properties — things you could see, touch, hear, smell, or taste. Also list behavioral properties for animals.
Rules:
- Only physical/sensory/behavioral properties, not abstract qualities
- Properties should DISTINGUISH this thing from similar things in its category
- Output one property per line as: PROPERTY | brief explanation
- Aim for 5-8 properties"""
# Phase 3 user template; filled by run_phase3() with the word, its
# categories, and a sample of same-category peers to contrast against.
PHASE3_USER = """Word: {word}
Category: {categories}
Other words in same category: {peers}
What properties distinguish {word} from the others listed?"""
def run_phase3(vocab, edges, existing_triples, checkpoint, dry_run=False):
    """Phase 3: Property enrichment for false_equivalence templates.

    For each vocab word, samples up to 10 same-category peers and asks the
    LLM for properties that distinguish the word from those peers. Parsed
    properties become HasProperty edges appended to the augmented CSV.

    Args:
        vocab: word -> metadata dict from load_vocab().
        edges: (start, relation) -> [(end, weight, surface)]; mutated in place.
        existing_triples: set of (start, end, relation); mutated in place.
        checkpoint: set of (word, phase) pairs already processed.
        dry_run: if True, print sample prompts; no LLM calls or file writes.
    """
    # Index vocab words by category so peers can be sampled per word.
    by_category = defaultdict(list)
    for word, info in vocab.items():
        for cat in info["categories"]:
            by_category[cat].append(word)
    words = sorted(vocab.keys())
    total = len(words)
    total_accepted = 0
    total_skipped = 0
    print(f"Phase 3: Property enrichment for {total} words...")
    for i, word in enumerate(words):
        # Resume support: skip words already logged for phase 3.
        if (word, "3") in checkpoint:
            total_skipped += 1
            continue
        word_cats = vocab[word]["categories"]
        categories = ", ".join(word_cats)
        # Gather same-category peers (sample of 10)
        peers = set()
        for cat in word_cats:
            for peer in by_category.get(cat, []):
                if peer != word:
                    peers.add(peer)
        peer_sample = random.sample(list(peers), min(10, len(peers))) if peers else []
        if not peer_sample:
            # No peers to contrast against; log a zero row to checkpoint it.
            append_log(word, "3", 0, 0, 0, 0)
            continue
        user_prompt = PHASE3_USER.format(
            word=word, categories=categories,
            peers=", ".join(peer_sample),
        )
        messages = [
            {"role": "system", "content": PHASE3_SYSTEM},
            {"role": "user", "content": user_prompt},
        ]
        if dry_run:
            if i < 3:
                print(f"\n--- Property prompt for '{word}' ---")
                print(f"User:\n{user_prompt}")
            elif i == 3:
                print(f"\n... ({total - 3} more words) ...")
            continue
        response = llm_chat_completion(messages)
        parsed = parse_property_response(response, word) if response else []
        # Keep only properties not already recorded as triples.
        accepted = []
        duplicates = 0
        for edge in parsed:
            triple = (edge["start_word"], edge["end_word"], edge["relation"])
            if triple in existing_triples:
                duplicates += 1
                continue
            existing_triples.add(triple)
            accepted.append(edge)
        if accepted:
            append_augmented_edges(accepted)
            for e in accepted:
                edges[(e["start_word"], e["relation"])].append(
                    (e["end_word"], e["weight"], e["surface_text"]))
        total_accepted += len(accepted)
        append_log(word, "3", len(parsed), len(accepted), duplicates, 0)
        if (i + 1) % 50 == 0:
            print(f" [{i+1}/{total}] {total_accepted} properties accepted so far")
        # Brief pause between words to avoid hammering the LLM server.
        time.sleep(0.1)
    if dry_run:
        print(f"\nDry run complete. Would process {total - total_skipped} words.")
    else:
        print(f"\nPhase 3 complete: {total_accepted} new HasProperty edges accepted.")
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main():
    """CLI entry point: parse arguments, load data, run the selected phases."""
    parser = argparse.ArgumentParser(
        description="LLM-augmented graph enhancement for folksy subgraph."
    )
    selector = parser.add_mutually_exclusive_group(required=True)
    selector.add_argument("--phase", type=int, choices=[1, 2, 3],
                          help="Run a specific phase (1, 2, or 3)")
    selector.add_argument("--all", action="store_true",
                          help="Run all three phases in sequence")
    parser.add_argument("--dry-run", action="store_true",
                        help="Print prompts without calling LLM")
    args = parser.parse_args()

    vocab = load_vocab()
    edges, existing_triples = load_relations()
    checkpoint = load_checkpoint()
    print(f"Loaded {len(vocab)} vocab words, {len(existing_triples)} existing edge triples.")
    print(f"Checkpoint: {len(checkpoint)} (word, phase) pairs already processed.")

    # --all (or any falsy --phase) runs every phase in order.
    phases = [args.phase] if args.phase else [1, 2, 3]
    runners = {1: run_phase1, 2: run_phase2, 3: run_phase3}
    for phase in phases:
        banner = "=" * 60
        print(f"\n{banner}")
        print(f"Running Phase {phase}")
        print(f"{banner}")
        runners[phase](vocab, edges, existing_triples, checkpoint, args.dry_run)
        # Reload checkpoint after each phase for resumability
        checkpoint = load_checkpoint()
    print("\nDone.")


if __name__ == "__main__":
    main()