feat: Implement enhanced action economy for LLM agent orchestration (#156)
- Add action economy system with free (LOOK, SPEAK) vs turn-ending (GO, WAIT, TAKE) actions
- Implement LOOK action with detailed descriptions for doors, objects, entities, directions
- Add SPEAK/ANNOUNCE speech system with room-wide and proximity-based message delivery
- Create multi-tile pathing with FOV interrupt detection (path cancels when new entity visible)
- Implement TAKE action with adjacency requirement and clear error messages
- Add conversation history and error feedback loop so agents learn from failed actions
- Create structured simulation logging for offline viewer replay
- Document offline viewer requirements in OFFLINE_VIEWER_SPEC.md
- Fix import path in 1_multi_agent_demo.py for standalone execution

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
parent 85e90088d5
commit 335efc5514
6 changed files with 2232 additions and 2 deletions
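As a rough sketch of the action economy described above (free LOOK/SPEAK versus turn-ending GO/TAKE/WAIT, with speech allowed once per turn), a single agent turn can be thought of as the loop below. The helper names (choose_action, execute_free, execute_turn_ending) are placeholders for illustration only; the actual logic lives in the enhanced_orchestrator / enhanced_executor modules that the demo imports.

# Sketch only -- placeholder helpers, not code from this commit.
FREE_ACTIONS = {"LOOK", "SPEAK", "ANNOUNCE"}
TURN_ENDING_ACTIONS = {"GO", "TAKE", "WAIT"}

def run_one_agent_turn(agent, choose_action, execute_free, execute_turn_ending):
    """One turn: any number of free actions, speech at most once, one turn-ending action."""
    has_spoken = False
    while True:
        action, args = choose_action(agent, has_spoken)  # e.g. ask the LLM for the next action
        if action in FREE_ACTIONS:
            if action in ("SPEAK", "ANNOUNCE"):
                if has_spoken:
                    continue  # only one free speech per turn; ask again
                has_spoken = True
            execute_free(agent, action, args)  # LOOK/SPEAK do not end the turn
        elif action in TURN_ENDING_ACTIONS:
            execute_turn_ending(agent, action, args)  # GO/TAKE/WAIT ends the turn
            return
        else:
            execute_turn_ending(agent, "WAIT", ())  # unrecognised action -> treat as WAIT (assumption)
            return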
tests/vllm_demo/4_enhanced_action_demo.py (new file, 436 additions)
@@ -0,0 +1,436 @@
#!/usr/bin/env python3
"""
Enhanced Action Demo
====================

Demonstrates the enhanced action economy system:
- Free actions (LOOK, SPEAK/ANNOUNCE) vs turn-ending (MOVE, WAIT)
- Points of interest targeting for LOOK/MOVE
- Speech system with room-wide ANNOUNCE and proximity SPEAK
- Multi-tile path continuation with FOV interrupts
- Enhanced logging for offline viewer replay

This implements the turn-based LLM agent orchestration from issue #156.
"""

import sys
import os
# Add the vllm_demo directory to path for imports
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

import mcrfpy
from mcrfpy import automation
import requests
import base64

from world_graph import (
    WorldGraph, Room, Door, WorldObject, Direction, AgentInfo,
    create_two_room_scenario, create_button_door_scenario
)
from action_parser import parse_action
from enhanced_executor import EnhancedExecutor
from enhanced_orchestrator import EnhancedOrchestrator, EnhancedSimulationLog

# Configuration
VLLM_URL = "http://192.168.1.100:8100/v1/chat/completions"
SCREENSHOT_DIR = "/tmp/vllm_enhanced_demo"
LOG_PATH = "/tmp/vllm_enhanced_demo/simulation_log.json"
MAX_TURNS = 3

# Sprites
FLOOR_TILE = 0
WALL_TILE = 40
WIZARD_SPRITE = 84
KNIGHT_SPRITE = 96
RAT_SPRITE = 123


class Agent:
    """Agent with WorldGraph integration."""

    def __init__(self, name: str, display_name: str, entity, world: WorldGraph):
        self.name = name
        self.display_name = display_name
        self.entity = entity
        self.world = world
        self.message_history = []

    @property
    def pos(self) -> tuple:
        return (int(self.entity.pos[0]), int(self.entity.pos[1]))

    @property
    def current_room(self) -> str:
        room = self.world.room_at(*self.pos)
        return room.name if room else None

    def get_context(self, visible_agents: list) -> dict:
        """Build context for LLM query."""
        room_name = self.current_room
        agent_infos = [
            AgentInfo(
                name=a.name,
                display_name=a.display_name,
                position=a.pos,
                is_player=(a.name == self.name)
            )
            for a in visible_agents
        ]
        return {
            "location": self.world.describe_room(room_name, agent_infos, self.name),
            "available_actions": self.world.get_available_actions(room_name),
            "recent_messages": self.message_history[-5:],
        }


def file_to_base64(path: str) -> str:
    """Convert file to base64 string."""
    with open(path, 'rb') as f:
        return base64.b64encode(f.read()).decode('utf-8')


def llm_query(agent, screenshot_path: str, context: dict) -> str:
    """
    Query VLLM for agent action with enhanced context.

    Includes points of interest, action economy hints, error feedback,
    and conversation history.
    """
    system_prompt = f"""You are {agent.display_name} exploring a dungeon.
You receive visual and text information about your surroundings.

ACTION ECONOMY:
- LOOK <target>: Free action. Examine something, then choose another action.
- SPEAK "<message>" or ANNOUNCE "<message>": Free action (once per turn). Then choose another action.
- GO <direction>: Ends your turn. Move one tile in that direction (NORTH/SOUTH/EAST/WEST).
- TAKE <item>: Ends your turn. Pick up an item you are standing next to.
- WAIT: Ends your turn without moving.

IMPORTANT: You can only TAKE items that are adjacent to you (1 tile away). If something is far away, GO towards it first.

You can LOOK or SPEAK, then still take a turn-ending action (GO, TAKE, or WAIT) in the same turn.
Always end your final response with: Action: <YOUR_ACTION>"""

    # Build enhanced prompt
    parts = [context["location"]]

    # Add received messages
    if context.get("messages"):
        parts.append("\nMessages received this turn:")
        for msg in context["messages"]:
            sender = msg.get("sender", "someone")
            content = msg.get("content", "")
            parts.append(f' {sender} says: "{content}"')

    # Add points of interest
    if context.get("poi_prompt"):
        parts.append(f"\n{context['poi_prompt']}")

    # Add available actions
    actions_str = ", ".join(context.get("available_actions", []))
    parts.append(f"\nAvailable actions: {actions_str}")

    # Add action economy hint
    if context.get("has_spoken"):
        parts.append("\n[You have already spoken this turn - you can still GO, TAKE, or WAIT]")

    # Add error feedback from last failed action
    if context.get("last_error"):
        parts.append(f"\n[ERROR: {context['last_error']}]")
        parts.append("[Your last action failed. Please try a different action.]")

    # Add conversation history from this turn
    if context.get("conversation_history"):
        parts.append("\n[Previous attempts this turn:")
        for exch in context["conversation_history"]:
            action_str = f"{exch.get('action_type', '?')} {exch.get('action_args', '')}"
            if exch.get("error"):
                parts.append(f" - You tried: {action_str} -> FAILED: {exch['error']}")
            else:
                parts.append(f" - You did: {action_str}")
        parts.append("]")

    parts.append("\n[Screenshot attached showing your current view]")
    parts.append("\nWhat do you do? Brief reasoning (1-2 sentences), then Action: <action>")

    user_prompt = "\n".join(parts)

    messages = [
        {"role": "system", "content": system_prompt},
        {
            "role": "user",
            "content": [
                {"type": "text", "text": user_prompt},
                {"type": "image_url", "image_url": {
                    "url": "data:image/png;base64," + file_to_base64(screenshot_path)
                }}
            ]
        }
    ]

    try:
        resp = requests.post(VLLM_URL, json={'messages': messages}, timeout=60)
        data = resp.json()
        if "error" in data:
            return f"[VLLM Error: {data['error']}]"
        return data.get('choices', [{}])[0].get('message', {}).get('content', 'No response')
    except Exception as e:
        return f"[Connection Error: {e}]"


def setup_scene(world: WorldGraph):
    """Create McRogueFace scene from WorldGraph."""
    mcrfpy.createScene("enhanced_demo")
    mcrfpy.setScene("enhanced_demo")
    ui = mcrfpy.sceneUI("enhanced_demo")

    texture = mcrfpy.Texture("assets/kenney_TD_MR_IP.png", 16, 16)

    grid = mcrfpy.Grid(
        grid_size=(25, 15),
        texture=texture,
        pos=(5, 5),
        size=(1014, 700)
    )
    grid.fill_color = mcrfpy.Color(20, 20, 30)
    grid.zoom = 2.0
    ui.append(grid)

    # Initialize all as walls
    for x in range(25):
        for y in range(15):
            p = grid.at(x, y)
            p.tilesprite = WALL_TILE
            p.walkable = False
            p.transparent = False

    # Carve rooms from WorldGraph
    for room in world.rooms.values():
        for rx in range(room.x, room.x + room.width):
            for ry in range(room.y, room.y + room.height):
                if 0 <= rx < 25 and 0 <= ry < 15:
                    p = grid.at(rx, ry)
                    p.tilesprite = FLOOR_TILE
                    p.walkable = True
                    p.transparent = True

    # Place doors
    for door in world.doors:
        dx, dy = door.position
        if 0 <= dx < 25 and 0 <= dy < 15:
            p = grid.at(dx, dy)
            p.tilesprite = FLOOR_TILE
            p.walkable = not door.locked
            p.transparent = True

    # FOV layer
    fov_layer = grid.add_layer('color', z_index=10)
    fov_layer.fill(mcrfpy.Color(0, 0, 0, 255))

    return grid, fov_layer, texture
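

# Illustrative sketch (not used by this demo): the fov_layer returned above is
# what gets recomputed while an agent walks a multi-tile path. The "FOV
# interrupt" rule from the module docstring -- cancel the remaining path as soon
# as something new becomes visible -- boils down to a set comparison like the
# placeholder below; the real check belongs to the orchestrator side and is not
# shown in this file.
def _path_should_interrupt(previously_visible: set, now_visible: set) -> bool:
    """Return True when any entity is visible now that was not visible before."""
    return bool(now_visible - previously_visible)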


def create_agents(grid, world: WorldGraph, texture) -> list:
    """Create agents in their starting rooms."""
    agents = []

    # Wizard in guard_room (left)
    room_a = world.rooms["guard_room"]
    wizard = mcrfpy.Entity(
        grid_pos=room_a.center,
        texture=texture,
        sprite_index=WIZARD_SPRITE
    )
    wizard.name = "wizard"
    grid.entities.append(wizard)
    agents.append(Agent("Wizard", "a wizard", wizard, world))

    # Knight in armory (right)
    room_b = world.rooms["armory"]
    knight = mcrfpy.Entity(
        grid_pos=room_b.center,
        texture=texture,
        sprite_index=KNIGHT_SPRITE
    )
    knight.name = "knight"
    grid.entities.append(knight)
    agents.append(Agent("Knight", "a knight", knight, world))

    return agents


def add_rat(grid, world: WorldGraph, texture, position: tuple):
    """Add a rat entity at the specified position."""
    rat = mcrfpy.Entity(
        grid_pos=position,
        texture=texture,
        sprite_index=RAT_SPRITE
    )
    rat.name = "rat"
    grid.entities.append(rat)
    return rat


def run_demo():
    """Run enhanced action demo."""
    print("=" * 70)
    print("Enhanced Action Demo")
    print("=" * 70)
    print("""
Features demonstrated:
- LOOK as free action (doesn't end turn)
- SPEAK/ANNOUNCE as free action (once per turn)
- Points of interest targeting
- Enhanced logging for offline viewer
""")

    os.makedirs(SCREENSHOT_DIR, exist_ok=True)

    # Create world
    print("Creating world...")
    world = create_two_room_scenario()
    print(f" Rooms: {list(world.rooms.keys())}")
    print(f" Objects: {list(world.objects.keys())}")

    # Setup scene
    print("\nSetting up scene...")
    grid, fov_layer, texture = setup_scene(world)

    # Create agents
    print("\nCreating agents...")
    agents = create_agents(grid, world, texture)

    # Add a rat near the door for interest
    rat = add_rat(grid, world, texture, (9, 4))
    print(" Added rat at (9, 4)")

    for agent in agents:
        print(f" {agent.name} at {agent.pos} in {agent.current_room}")

    # Create enhanced orchestrator
    print("\nInitializing enhanced orchestrator...")
    orchestrator = EnhancedOrchestrator(
        grid=grid,
        fov_layer=fov_layer,
        world=world,
        agents=agents,
        screenshot_dir=SCREENSHOT_DIR,
        llm_query_fn=llm_query
    )

    # Run simulation
    print(f"\nRunning simulation ({MAX_TURNS} turns)...")
    log = orchestrator.run_simulation(max_turns=MAX_TURNS)

    # Save enhanced log
    log.save(LOG_PATH)

    # Print summary
    print("\n" + "=" * 70)
    print("SIMULATION SUMMARY")
    print("=" * 70)

    for turn in range(1, orchestrator.turn_number + 1):
        print(log.get_turn_summary(turn))

    # Print speech log
    if log.speech_log:
        print("\n" + "-" * 40)
        print("SPEECH LOG")
        print("-" * 40)
        for entry in log.speech_log:
            print(f" Turn {entry['turn']}: {entry['speaker']} {entry['type']}s: \"{entry['content'][:50]}...\"")
            if entry['recipients']:
                print(f" -> Heard by: {', '.join(entry['recipients'])}")

    print("\n" + "=" * 70)
    print("Demo Complete")
    print("=" * 70)
    print(f"\nScreenshots saved to: {SCREENSHOT_DIR}/")
    print(f"Simulation log saved to: {LOG_PATH}")
    print("\nLog structure (for offline viewer):")
    print(" - metadata: simulation info")
    print(" - steps[]: per-agent-turn records with:")
    print(" - screenshot_path, position, room")
    print(" - llm_prompt_user, llm_response")
    print(" - free_actions[] (LOOK, SPEAK)")
    print(" - final_action (MOVE, WAIT)")
    print(" - speech_log[]: all speech events")

    return True
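

# Illustrative sketch (assumed field names, taken from the summary printed above
# and the replay code below): a single step record in the saved JSON might look
# roughly like the dict here. The authoritative schema is whatever
# EnhancedSimulationLog actually writes; every value is a placeholder.
_EXAMPLE_STEP_RECORD = {
    "turn": 1,
    "agent_id": "Wizard",
    "room": "guard_room",
    "position_start": [7, 4],
    "position_end": [8, 4],
    "screenshot_path": "/tmp/vllm_enhanced_demo/step_001.png",  # placeholder name
    "pending_messages": [],
    "llm_was_queried": True,
    "llm_prompt_user": "...",
    "llm_response": "...",
    "free_actions": [{"action_type": "LOOK", "args": ["door"]}],
    "final_action_type": "GO",
    "final_action_args": ["EAST"],
    "final_action_success": True,
    "final_action_message": "...",
}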


def replay_log(log_path: str):
    """
    Replay a simulation from log file.

    This is a text-based preview of what the offline viewer would show.
    """
    print(f"Loading simulation from: {log_path}")

    try:
        log = EnhancedSimulationLog.load(log_path)
    except FileNotFoundError:
        print(f"Log file not found: {log_path}")
        return

    print("\n" + "=" * 70)
    print("SIMULATION REPLAY")
    print("=" * 70)
    print(f"Turns: {log.metadata.get('total_turns', '?')}")
    print(f"Agents: {', '.join(log.metadata.get('agent_names', []))}")
    print(f"Rooms: {', '.join(log.metadata.get('world_rooms', []))}")

    for step in log.steps:
        print(f"\n{'='*40}")
        print(f"Turn {step.turn}: {step.agent_id}")
        print(f"{'='*40}")
        print(f"Position: {step.position_start} -> {step.position_end}")
        print(f"Room: {step.room}")

        if step.pending_messages:
            print("\nMessages received:")
            for msg in step.pending_messages:
                print(f" {msg.get('sender')}: \"{msg.get('content', '')[:40]}...\"")

        if step.llm_was_queried:
            print("\nLLM Response (truncated):")
            print(f" {step.llm_response[:200]}...")
        else:
            print("\n[Path continuation - no LLM query]")

        if step.free_actions:
            print("\nFree actions:")
            for fa in step.free_actions:
                print(f" - {fa['action_type']}: {fa.get('args', ())}")

        status = "OK" if step.final_action_success else "FAIL"
        print(f"\nFinal: {step.final_action_type} {step.final_action_args} [{status}]")
        print(f" {step.final_action_message}")

    # Speech summary
    if log.speech_log:
        print("\n" + "=" * 40)
        print("ALL SPEECH")
        print("=" * 40)
        for entry in log.speech_log:
            print(f"Turn {entry['turn']}: {entry['speaker']} -> {entry['recipients']}")
            print(f" \"{entry['content']}\"")


if __name__ == "__main__":
    # Check for replay mode
    if len(sys.argv) > 1 and sys.argv[1] == "--replay":
        log_file = sys.argv[2] if len(sys.argv) > 2 else LOG_PATH
        replay_log(log_file)
        sys.exit(0)

    # Normal execution
    try:
        success = run_demo()
        print("\nPASS" if success else "\nFAIL")
        sys.exit(0 if success else 1)
    except Exception:
        import traceback
        traceback.print_exc()
        sys.exit(1)
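
# Quick reference (derived from the __main__ block above; the demo must run in an
# environment where the mcrfpy module is importable, e.g. under McRogueFace):
#   4_enhanced_action_demo.py                      -> run the live demo against VLLM_URL
#   4_enhanced_action_demo.py --replay             -> text replay of LOG_PATH
#   4_enhanced_action_demo.py --replay <log.json>  -> text replay of a specific log file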