McRogueFace/tests/vllm_demo/4_enhanced_action_demo.py
John McCardle 335efc5514 feat: Implement enhanced action economy for LLM agent orchestration (#156)
- Add action economy system with free (LOOK, SPEAK) vs turn-ending (GO, WAIT, TAKE) actions
- Implement LOOK action with detailed descriptions for doors, objects, entities, directions
- Add SPEAK/ANNOUNCE speech system with room-wide and proximity-based message delivery
- Create multi-tile pathing with FOV interrupt detection (path cancels when new entity visible)
- Implement TAKE action with adjacency requirement and clear error messages
- Add conversation history and error feedback loop so agents learn from failed actions
- Create structured simulation logging for offline viewer replay
- Document offline viewer requirements in OFFLINE_VIEWER_SPEC.md
- Fix import path in 1_multi_agent_demo.py for standalone execution

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-28 20:50:00 -05:00

#!/usr/bin/env python3
"""
Enhanced Action Demo
====================
Demonstrates the enhanced action economy system:
- Free actions (LOOK, SPEAK/ANNOUNCE) vs turn-ending (MOVE, WAIT)
- Points of interest targeting for LOOK/MOVE
- Speech system with room-wide ANNOUNCE and proximity SPEAK
- Multi-tile path continuation with FOV interrupts
- Enhanced logging for offline viewer replay
This implements the turn-based LLM agent orchestration from issue #156.
"""
import sys
import os

# Add the vllm_demo directory to path for imports
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

import mcrfpy
from mcrfpy import automation
import requests
import base64

from world_graph import (
    WorldGraph, Room, Door, WorldObject, Direction, AgentInfo,
    create_two_room_scenario, create_button_door_scenario
)
from action_parser import parse_action
from enhanced_executor import EnhancedExecutor
from enhanced_orchestrator import EnhancedOrchestrator, EnhancedSimulationLog

# Configuration
VLLM_URL = "http://192.168.1.100:8100/v1/chat/completions"
SCREENSHOT_DIR = "/tmp/vllm_enhanced_demo"
LOG_PATH = "/tmp/vllm_enhanced_demo/simulation_log.json"
MAX_TURNS = 3

# Sprites
FLOOR_TILE = 0
WALL_TILE = 40
WIZARD_SPRITE = 84
KNIGHT_SPRITE = 96
RAT_SPRITE = 123

class Agent:
    """Agent with WorldGraph integration."""

    def __init__(self, name: str, display_name: str, entity, world: WorldGraph):
        self.name = name
        self.display_name = display_name
        self.entity = entity
        self.world = world
        self.message_history = []

    @property
    def pos(self) -> tuple:
        return (int(self.entity.pos[0]), int(self.entity.pos[1]))

    @property
    def current_room(self) -> str:
        room = self.world.room_at(*self.pos)
        return room.name if room else None

    def get_context(self, visible_agents: list) -> dict:
        """Build context for LLM query."""
        room_name = self.current_room
        agent_infos = [
            AgentInfo(
                name=a.name,
                display_name=a.display_name,
                position=a.pos,
                is_player=(a.name == self.name)
            )
            for a in visible_agents
        ]
        return {
            "location": self.world.describe_room(room_name, agent_infos, self.name),
            "available_actions": self.world.get_available_actions(room_name),
            "recent_messages": self.message_history[-5:],
        }

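# Note: llm_query() below also reads keys such as "messages", "poi_prompt", "has_spoken",
# "last_error", and "conversation_history". Those are presumably merged into this context
# by EnhancedOrchestrator before each query. A hand-written, hypothetical example of the
# resulting dict (values are illustrative only):
#   {
#       "location": "You are in the guard_room. A door leads EAST...",
#       "available_actions": ["LOOK door", "GO EAST", "WAIT"],
#       "recent_messages": [],
#       "last_error": None,
#   }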
def file_to_base64(path: str) -> str:
    """Convert file to base64 string."""
    with open(path, 'rb') as f:
        return base64.b64encode(f.read()).decode('utf-8')

def llm_query(agent, screenshot_path: str, context: dict) -> str:
    """
    Query VLLM for agent action with enhanced context.

    Includes points of interest, action economy hints, error feedback,
    and conversation history.
    """
    system_prompt = f"""You are {agent.display_name} exploring a dungeon.
You receive visual and text information about your surroundings.

ACTION ECONOMY:
- LOOK <target>: Free action. Examine something, then choose another action.
- SPEAK "<message>" or ANNOUNCE "<message>": Free action (once per turn). Then choose another action.
- GO <direction>: Ends your turn. Move one tile in that direction (NORTH/SOUTH/EAST/WEST).
- TAKE <item>: Ends your turn. Pick up an item you are standing next to.
- WAIT: Ends your turn without moving.

IMPORTANT: You can only TAKE items that are adjacent to you (1 tile away). If something is far away, GO towards it first.
You can LOOK or SPEAK, then still MOVE in the same turn.

Always end your final response with: Action: <YOUR_ACTION>"""
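
    # Illustrative, hand-written example of the response format this prompt is designed
    # to elicit (not actual model output); the trailing 'Action: ...' line is what the
    # action parser is expected to pick up:
    #   "The rat by the door might startle the knight, so I will warn him before moving.
    #    Action: SPEAK \"Careful, there is a rat by the door!\""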
    # Build enhanced prompt
    parts = [context["location"]]

    # Add received messages
    if context.get("messages"):
        parts.append("\nMessages received this turn:")
        for msg in context["messages"]:
            sender = msg.get("sender", "someone")
            content = msg.get("content", "")
            parts.append(f' {sender} says: "{content}"')

    # Add points of interest
    if context.get("poi_prompt"):
        parts.append(f"\n{context['poi_prompt']}")

    # Add available actions
    actions_str = ", ".join(context.get("available_actions", []))
    parts.append(f"\nAvailable actions: {actions_str}")

    # Add action economy hint
    if context.get("has_spoken"):
        parts.append("\n[You have already spoken this turn - you can still MOVE or WAIT]")

    # Add error feedback from last failed action
    if context.get("last_error"):
        parts.append(f"\n[ERROR: {context['last_error']}]")
        parts.append("[Your last action failed. Please try a different action.]")

    # Add conversation history from this turn
    if context.get("conversation_history"):
        parts.append("\n[Previous attempts this turn:")
        for exch in context["conversation_history"]:
            action_str = f"{exch.get('action_type', '?')} {exch.get('action_args', '')}"
            if exch.get("error"):
                parts.append(f" - You tried: {action_str} -> FAILED: {exch['error']}")
            else:
                parts.append(f" - You did: {action_str}")
        parts.append("]")

    parts.append("\n[Screenshot attached showing your current view]")
    parts.append("\nWhat do you do? Brief reasoning (1-2 sentences), then Action: <action>")

    user_prompt = "\n".join(parts)

    messages = [
        {"role": "system", "content": system_prompt},
        {
            "role": "user",
            "content": [
                {"type": "text", "text": user_prompt},
                {"type": "image_url", "image_url": {
                    "url": "data:image/png;base64," + file_to_base64(screenshot_path)
                }}
            ]
        }
    ]

    try:
        resp = requests.post(VLLM_URL, json={'messages': messages}, timeout=60)
        data = resp.json()
        if "error" in data:
            return f"[VLLM Error: {data['error']}]"
        return data.get('choices', [{}])[0].get('message', {}).get('content', 'No response')
    except Exception as e:
        return f"[Connection Error: {e}]"

def setup_scene(world: WorldGraph):
    """Create McRogueFace scene from WorldGraph."""
    mcrfpy.createScene("enhanced_demo")
    mcrfpy.setScene("enhanced_demo")
    ui = mcrfpy.sceneUI("enhanced_demo")

    texture = mcrfpy.Texture("assets/kenney_TD_MR_IP.png", 16, 16)

    grid = mcrfpy.Grid(
        grid_size=(25, 15),
        texture=texture,
        pos=(5, 5),
        size=(1014, 700)
    )
    grid.fill_color = mcrfpy.Color(20, 20, 30)
    grid.zoom = 2.0
    ui.append(grid)

    # Initialize all as walls
    for x in range(25):
        for y in range(15):
            p = grid.at(x, y)
            p.tilesprite = WALL_TILE
            p.walkable = False
            p.transparent = False

    # Carve rooms from WorldGraph
    for room in world.rooms.values():
        for rx in range(room.x, room.x + room.width):
            for ry in range(room.y, room.y + room.height):
                if 0 <= rx < 25 and 0 <= ry < 15:
                    p = grid.at(rx, ry)
                    p.tilesprite = FLOOR_TILE
                    p.walkable = True
                    p.transparent = True

    # Place doors
    for door in world.doors:
        dx, dy = door.position
        if 0 <= dx < 25 and 0 <= dy < 15:
            p = grid.at(dx, dy)
            p.tilesprite = FLOOR_TILE
            p.walkable = not door.locked
            p.transparent = True

    # FOV layer
    fov_layer = grid.add_layer('color', z_index=10)
    fov_layer.fill(mcrfpy.Color(0, 0, 0, 255))

    return grid, fov_layer, texture

def create_agents(grid, world: WorldGraph, texture) -> list:
    """Create agents in their starting rooms."""
    agents = []

    # Wizard in guard_room (left)
    room_a = world.rooms["guard_room"]
    wizard = mcrfpy.Entity(
        grid_pos=room_a.center,
        texture=texture,
        sprite_index=WIZARD_SPRITE
    )
    wizard.name = "wizard"
    grid.entities.append(wizard)
    agents.append(Agent("Wizard", "a wizard", wizard, world))

    # Knight in armory (right)
    room_b = world.rooms["armory"]
    knight = mcrfpy.Entity(
        grid_pos=room_b.center,
        texture=texture,
        sprite_index=KNIGHT_SPRITE
    )
    knight.name = "knight"
    grid.entities.append(knight)
    agents.append(Agent("Knight", "a knight", knight, world))

    return agents

def add_rat(grid, world: WorldGraph, texture, position: tuple):
    """Add a rat entity at the specified position."""
    rat = mcrfpy.Entity(
        grid_pos=position,
        texture=texture,
        sprite_index=RAT_SPRITE
    )
    rat.name = "rat"
    grid.entities.append(rat)
    return rat

def run_demo():
    """Run enhanced action demo."""
    print("=" * 70)
    print("Enhanced Action Demo")
    print("=" * 70)
    print("""
Features demonstrated:
- LOOK as free action (doesn't end turn)
- SPEAK/ANNOUNCE as free action (once per turn)
- Points of interest targeting
- Enhanced logging for offline viewer
""")

    os.makedirs(SCREENSHOT_DIR, exist_ok=True)

    # Create world
    print("Creating world...")
    world = create_two_room_scenario()
    print(f" Rooms: {list(world.rooms.keys())}")
    print(f" Objects: {list(world.objects.keys())}")

    # Setup scene
    print("\nSetting up scene...")
    grid, fov_layer, texture = setup_scene(world)

    # Create agents
    print("\nCreating agents...")
    agents = create_agents(grid, world, texture)

    # Add a rat near the door for interest
    rat = add_rat(grid, world, texture, (9, 4))
    print(f" Added rat at (9, 4)")

    for agent in agents:
        print(f" {agent.name} at {agent.pos} in {agent.current_room}")

    # Create enhanced orchestrator
    print("\nInitializing enhanced orchestrator...")
    orchestrator = EnhancedOrchestrator(
        grid=grid,
        fov_layer=fov_layer,
        world=world,
        agents=agents,
        screenshot_dir=SCREENSHOT_DIR,
        llm_query_fn=llm_query
    )

    # Run simulation
    print(f"\nRunning simulation ({MAX_TURNS} turns)...")
    log = orchestrator.run_simulation(max_turns=MAX_TURNS)

    # Save enhanced log
    log.save(LOG_PATH)

    # Print summary
    print("\n" + "=" * 70)
    print("SIMULATION SUMMARY")
    print("=" * 70)
    for turn in range(1, orchestrator.turn_number + 1):
        print(log.get_turn_summary(turn))

    # Print speech log
    if log.speech_log:
        print("\n" + "-" * 40)
        print("SPEECH LOG")
        print("-" * 40)
        for entry in log.speech_log:
            print(f" Turn {entry['turn']}: {entry['speaker']} {entry['type']}s: \"{entry['content'][:50]}...\"")
            if entry['recipients']:
                print(f" -> Heard by: {', '.join(entry['recipients'])}")

    print("\n" + "=" * 70)
    print("Demo Complete")
    print("=" * 70)
    print(f"\nScreenshots saved to: {SCREENSHOT_DIR}/")
    print(f"Simulation log saved to: {LOG_PATH}")
    print("\nLog structure (for offline viewer):")
    print(" - metadata: simulation info")
    print(" - steps[]: per-agent-turn records with:")
    print(" - screenshot_path, position, room")
    print(" - llm_prompt_user, llm_response")
    print(" - free_actions[] (LOOK, SPEAK)")
    print(" - final_action (MOVE, WAIT)")
    print(" - speech_log[]: all speech events")

    return True

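# Illustrative (hand-written, not captured from a run) shape of a single steps[] record,
# using only field names that appear in the prints above and in replay_log() below:
#   {
#       "turn": 1,
#       "agent_id": "Wizard",
#       "room": "guard_room",
#       "position_start": [3, 4], "position_end": [4, 4],
#       "llm_was_queried": true,
#       "llm_prompt_user": "...", "llm_response": "... Action: GO EAST",
#       "free_actions": [{"action_type": "LOOK", "args": ["door"]}],
#       "final_action_type": "MOVE", "final_action_args": ["EAST"],
#       "final_action_success": true, "final_action_message": "Moved east.",
#       "screenshot_path": "/tmp/vllm_enhanced_demo/..."
#   }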
def replay_log(log_path: str):
    """
    Replay a simulation from log file.

    This is a text-based preview of what the offline viewer would show.
    """
    print(f"Loading simulation from: {log_path}")

    try:
        log = EnhancedSimulationLog.load(log_path)
    except FileNotFoundError:
        print(f"Log file not found: {log_path}")
        return

    print("\n" + "=" * 70)
    print("SIMULATION REPLAY")
    print("=" * 70)
    print(f"Turns: {log.metadata.get('total_turns', '?')}")
    print(f"Agents: {', '.join(log.metadata.get('agent_names', []))}")
    print(f"Rooms: {', '.join(log.metadata.get('world_rooms', []))}")

    for step in log.steps:
        print(f"\n{'='*40}")
        print(f"Turn {step.turn}: {step.agent_id}")
        print(f"{'='*40}")
        print(f"Position: {step.position_start} -> {step.position_end}")
        print(f"Room: {step.room}")

        if step.pending_messages:
            print(f"\nMessages received:")
            for msg in step.pending_messages:
                print(f" {msg.get('sender')}: \"{msg.get('content', '')[:40]}...\"")

        if step.llm_was_queried:
            print(f"\nLLM Response (truncated):")
            print(f" {step.llm_response[:200]}...")
        else:
            print(f"\n[Path continuation - no LLM query]")

        if step.free_actions:
            print(f"\nFree actions:")
            for fa in step.free_actions:
                print(f" - {fa['action_type']}: {fa.get('args', ())}")

        status = "OK" if step.final_action_success else "FAIL"
        print(f"\nFinal: {step.final_action_type} {step.final_action_args} [{status}]")
        print(f" {step.final_action_message}")

    # Speech summary
    if log.speech_log:
        print("\n" + "=" * 40)
        print("ALL SPEECH")
        print("=" * 40)
        for entry in log.speech_log:
            print(f"Turn {entry['turn']}: {entry['speaker']} -> {entry['recipients']}")
            print(f" \"{entry['content']}\"")

if __name__ == "__main__":
    # Check for replay mode
    if len(sys.argv) > 1 and sys.argv[1] == "--replay":
        log_file = sys.argv[2] if len(sys.argv) > 2 else LOG_PATH
        replay_log(log_file)
        sys.exit(0)

    # Normal execution
    try:
        success = run_demo()
        print("\nPASS" if success else "\nFAIL")
        sys.exit(0 if success else 1)
    except Exception as e:
        import traceback
        traceback.print_exc()
        sys.exit(1)