feat: Implement enhanced action economy for LLM agent orchestration (#156)
- Add action economy system with free (LOOK, SPEAK) vs turn-ending (GO, WAIT, TAKE) actions
- Implement LOOK action with detailed descriptions for doors, objects, entities, directions
- Add SPEAK/ANNOUNCE speech system with room-wide and proximity-based message delivery
- Create multi-tile pathing with FOV interrupt detection (path cancels when new entity visible)
- Implement TAKE action with adjacency requirement and clear error messages
- Add conversation history and error feedback loop so agents learn from failed actions
- Create structured simulation logging for offline viewer replay
- Document offline viewer requirements in OFFLINE_VIEWER_SPEC.md
- Fix import path in 1_multi_agent_demo.py for standalone execution

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
parent 85e90088d5
commit 335efc5514
6 changed files with 2232 additions and 2 deletions
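As a rough sketch of the action economy described above (free LOOK/SPEAK versus turn-ending GO/TAKE/WAIT, with speech allowed once per turn), a single agent turn can be thought of as the loop below. The helper names (choose_action, execute_free, execute_turn_ending) are placeholders for illustration only; the actual logic lives in the enhanced_orchestrator / enhanced_executor modules that the demo imports.

# Sketch only -- placeholder helpers, not code from this commit.
FREE_ACTIONS = {"LOOK", "SPEAK", "ANNOUNCE"}
TURN_ENDING_ACTIONS = {"GO", "TAKE", "WAIT"}

def run_one_agent_turn(agent, choose_action, execute_free, execute_turn_ending):
    """One turn: any number of free actions, speech at most once, one turn-ending action."""
    has_spoken = False
    while True:
        action, args = choose_action(agent, has_spoken)  # e.g. ask the LLM for the next action
        if action in FREE_ACTIONS:
            if action in ("SPEAK", "ANNOUNCE"):
                if has_spoken:
                    continue  # only one free speech per turn; ask again
                has_spoken = True
            execute_free(agent, action, args)  # LOOK/SPEAK do not end the turn
        elif action in TURN_ENDING_ACTIONS:
            execute_turn_ending(agent, action, args)  # GO/TAKE/WAIT ends the turn
            return
        else:
            execute_turn_ending(agent, "WAIT", ())  # unrecognised action -> treat as WAIT (assumption)
            return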
tests/vllm_demo/4_enhanced_action_demo.py (new file, 436 additions)
@@ -0,0 +1,436 @@
#!/usr/bin/env python3
"""
Enhanced Action Demo
====================

Demonstrates the enhanced action economy system:
- Free actions (LOOK, SPEAK/ANNOUNCE) vs turn-ending (MOVE, WAIT)
- Points of interest targeting for LOOK/MOVE
- Speech system with room-wide ANNOUNCE and proximity SPEAK
- Multi-tile path continuation with FOV interrupts
- Enhanced logging for offline viewer replay

This implements the turn-based LLM agent orchestration from issue #156.
"""

import sys
import os
# Add the vllm_demo directory to path for imports
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

import mcrfpy
from mcrfpy import automation
import requests
import base64

from world_graph import (
    WorldGraph, Room, Door, WorldObject, Direction, AgentInfo,
    create_two_room_scenario, create_button_door_scenario
)
from action_parser import parse_action
from enhanced_executor import EnhancedExecutor
from enhanced_orchestrator import EnhancedOrchestrator, EnhancedSimulationLog

# Configuration
VLLM_URL = "http://192.168.1.100:8100/v1/chat/completions"
SCREENSHOT_DIR = "/tmp/vllm_enhanced_demo"
LOG_PATH = "/tmp/vllm_enhanced_demo/simulation_log.json"
MAX_TURNS = 3

# Sprites
FLOOR_TILE = 0
WALL_TILE = 40
WIZARD_SPRITE = 84
KNIGHT_SPRITE = 96
RAT_SPRITE = 123


class Agent:
    """Agent with WorldGraph integration."""

    def __init__(self, name: str, display_name: str, entity, world: WorldGraph):
        self.name = name
        self.display_name = display_name
        self.entity = entity
        self.world = world
        self.message_history = []

    @property
    def pos(self) -> tuple:
        return (int(self.entity.pos[0]), int(self.entity.pos[1]))

    @property
    def current_room(self) -> str:
        room = self.world.room_at(*self.pos)
        return room.name if room else None

    def get_context(self, visible_agents: list) -> dict:
        """Build context for LLM query."""
        room_name = self.current_room
        agent_infos = [
            AgentInfo(
                name=a.name,
                display_name=a.display_name,
                position=a.pos,
                is_player=(a.name == self.name)
            )
            for a in visible_agents
        ]
        return {
            "location": self.world.describe_room(room_name, agent_infos, self.name),
            "available_actions": self.world.get_available_actions(room_name),
            "recent_messages": self.message_history[-5:],
        }


def file_to_base64(path: str) -> str:
    """Convert file to base64 string."""
    with open(path, 'rb') as f:
        return base64.b64encode(f.read()).decode('utf-8')


def llm_query(agent, screenshot_path: str, context: dict) -> str:
    """
    Query VLLM for agent action with enhanced context.

    Includes points of interest, action economy hints, error feedback,
    and conversation history.
    """
    system_prompt = f"""You are {agent.display_name} exploring a dungeon.
You receive visual and text information about your surroundings.

ACTION ECONOMY:
- LOOK <target>: Free action. Examine something, then choose another action.
- SPEAK "<message>" or ANNOUNCE "<message>": Free action (once per turn). Then choose another action.
- GO <direction>: Ends your turn. Move one tile in that direction (NORTH/SOUTH/EAST/WEST).
- TAKE <item>: Ends your turn. Pick up an item you are standing next to.
- WAIT: Ends your turn without moving.

IMPORTANT: You can only TAKE items that are adjacent to you (1 tile away). If something is far away, GO towards it first.

You can LOOK or SPEAK, then still take a turn-ending action (GO, TAKE, or WAIT) in the same turn.
Always end your final response with: Action: <YOUR_ACTION>"""

    # Build enhanced prompt
    parts = [context["location"]]

    # Add received messages
    if context.get("messages"):
        parts.append("\nMessages received this turn:")
        for msg in context["messages"]:
            sender = msg.get("sender", "someone")
            content = msg.get("content", "")
            parts.append(f' {sender} says: "{content}"')

    # Add points of interest
    if context.get("poi_prompt"):
        parts.append(f"\n{context['poi_prompt']}")

    # Add available actions
    actions_str = ", ".join(context.get("available_actions", []))
    parts.append(f"\nAvailable actions: {actions_str}")

    # Add action economy hint
    if context.get("has_spoken"):
        parts.append("\n[You have already spoken this turn - you can still GO, TAKE, or WAIT]")

    # Add error feedback from last failed action
    if context.get("last_error"):
        parts.append(f"\n[ERROR: {context['last_error']}]")
        parts.append("[Your last action failed. Please try a different action.]")

    # Add conversation history from this turn
    if context.get("conversation_history"):
        parts.append("\n[Previous attempts this turn:")
        for exch in context["conversation_history"]:
            action_str = f"{exch.get('action_type', '?')} {exch.get('action_args', '')}"
            if exch.get("error"):
                parts.append(f" - You tried: {action_str} -> FAILED: {exch['error']}")
            else:
                parts.append(f" - You did: {action_str}")
        parts.append("]")

    parts.append("\n[Screenshot attached showing your current view]")
    parts.append("\nWhat do you do? Brief reasoning (1-2 sentences), then Action: <action>")

    user_prompt = "\n".join(parts)

    messages = [
        {"role": "system", "content": system_prompt},
        {
            "role": "user",
            "content": [
                {"type": "text", "text": user_prompt},
                {"type": "image_url", "image_url": {
                    "url": "data:image/png;base64," + file_to_base64(screenshot_path)
                }}
            ]
        }
    ]

    try:
        resp = requests.post(VLLM_URL, json={'messages': messages}, timeout=60)
        data = resp.json()
        if "error" in data:
            return f"[VLLM Error: {data['error']}]"
        return data.get('choices', [{}])[0].get('message', {}).get('content', 'No response')
    except Exception as e:
        return f"[Connection Error: {e}]"


def setup_scene(world: WorldGraph):
    """Create McRogueFace scene from WorldGraph."""
    mcrfpy.createScene("enhanced_demo")
    mcrfpy.setScene("enhanced_demo")
    ui = mcrfpy.sceneUI("enhanced_demo")

    texture = mcrfpy.Texture("assets/kenney_TD_MR_IP.png", 16, 16)

    grid = mcrfpy.Grid(
        grid_size=(25, 15),
        texture=texture,
        pos=(5, 5),
        size=(1014, 700)
    )
    grid.fill_color = mcrfpy.Color(20, 20, 30)
    grid.zoom = 2.0
    ui.append(grid)

    # Initialize all as walls
    for x in range(25):
        for y in range(15):
            p = grid.at(x, y)
            p.tilesprite = WALL_TILE
            p.walkable = False
            p.transparent = False

    # Carve rooms from WorldGraph
    for room in world.rooms.values():
        for rx in range(room.x, room.x + room.width):
            for ry in range(room.y, room.y + room.height):
                if 0 <= rx < 25 and 0 <= ry < 15:
                    p = grid.at(rx, ry)
                    p.tilesprite = FLOOR_TILE
                    p.walkable = True
                    p.transparent = True

    # Place doors
    for door in world.doors:
        dx, dy = door.position
        if 0 <= dx < 25 and 0 <= dy < 15:
            p = grid.at(dx, dy)
            p.tilesprite = FLOOR_TILE
            p.walkable = not door.locked
            p.transparent = True

    # FOV layer
    fov_layer = grid.add_layer('color', z_index=10)
    fov_layer.fill(mcrfpy.Color(0, 0, 0, 255))

    return grid, fov_layer, texture
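

# Illustrative sketch (not used by this demo): the fov_layer returned above is
# what gets recomputed while an agent walks a multi-tile path. The "FOV
# interrupt" rule from the module docstring -- cancel the remaining path as soon
# as something new becomes visible -- boils down to a set comparison like the
# placeholder below; the real check belongs to the orchestrator side and is not
# shown in this file.
def _path_should_interrupt(previously_visible: set, now_visible: set) -> bool:
    """Return True when any entity is visible now that was not visible before."""
    return bool(now_visible - previously_visible)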


def create_agents(grid, world: WorldGraph, texture) -> list:
    """Create agents in their starting rooms."""
    agents = []

    # Wizard in guard_room (left)
    room_a = world.rooms["guard_room"]
    wizard = mcrfpy.Entity(
        grid_pos=room_a.center,
        texture=texture,
        sprite_index=WIZARD_SPRITE
    )
    wizard.name = "wizard"
    grid.entities.append(wizard)
    agents.append(Agent("Wizard", "a wizard", wizard, world))

    # Knight in armory (right)
    room_b = world.rooms["armory"]
    knight = mcrfpy.Entity(
        grid_pos=room_b.center,
        texture=texture,
        sprite_index=KNIGHT_SPRITE
    )
    knight.name = "knight"
    grid.entities.append(knight)
    agents.append(Agent("Knight", "a knight", knight, world))

    return agents


def add_rat(grid, world: WorldGraph, texture, position: tuple):
    """Add a rat entity at the specified position."""
    rat = mcrfpy.Entity(
        grid_pos=position,
        texture=texture,
        sprite_index=RAT_SPRITE
    )
    rat.name = "rat"
    grid.entities.append(rat)
    return rat


def run_demo():
    """Run enhanced action demo."""
    print("=" * 70)
    print("Enhanced Action Demo")
    print("=" * 70)
    print("""
Features demonstrated:
- LOOK as free action (doesn't end turn)
- SPEAK/ANNOUNCE as free action (once per turn)
- Points of interest targeting
- Enhanced logging for offline viewer
""")

    os.makedirs(SCREENSHOT_DIR, exist_ok=True)

    # Create world
    print("Creating world...")
    world = create_two_room_scenario()
    print(f" Rooms: {list(world.rooms.keys())}")
    print(f" Objects: {list(world.objects.keys())}")

    # Setup scene
    print("\nSetting up scene...")
    grid, fov_layer, texture = setup_scene(world)

    # Create agents
    print("\nCreating agents...")
    agents = create_agents(grid, world, texture)

    # Add a rat near the door for interest
    rat = add_rat(grid, world, texture, (9, 4))
    print(" Added rat at (9, 4)")

    for agent in agents:
        print(f" {agent.name} at {agent.pos} in {agent.current_room}")

    # Create enhanced orchestrator
    print("\nInitializing enhanced orchestrator...")
    orchestrator = EnhancedOrchestrator(
        grid=grid,
        fov_layer=fov_layer,
        world=world,
        agents=agents,
        screenshot_dir=SCREENSHOT_DIR,
        llm_query_fn=llm_query
    )

    # Run simulation
    print(f"\nRunning simulation ({MAX_TURNS} turns)...")
    log = orchestrator.run_simulation(max_turns=MAX_TURNS)

    # Save enhanced log
    log.save(LOG_PATH)

    # Print summary
    print("\n" + "=" * 70)
    print("SIMULATION SUMMARY")
    print("=" * 70)

    for turn in range(1, orchestrator.turn_number + 1):
        print(log.get_turn_summary(turn))

    # Print speech log
    if log.speech_log:
        print("\n" + "-" * 40)
        print("SPEECH LOG")
        print("-" * 40)
        for entry in log.speech_log:
            print(f" Turn {entry['turn']}: {entry['speaker']} {entry['type']}s: \"{entry['content'][:50]}...\"")
            if entry['recipients']:
                print(f" -> Heard by: {', '.join(entry['recipients'])}")

    print("\n" + "=" * 70)
    print("Demo Complete")
    print("=" * 70)
    print(f"\nScreenshots saved to: {SCREENSHOT_DIR}/")
    print(f"Simulation log saved to: {LOG_PATH}")
    print("\nLog structure (for offline viewer):")
    print(" - metadata: simulation info")
    print(" - steps[]: per-agent-turn records with:")
    print(" - screenshot_path, position, room")
    print(" - llm_prompt_user, llm_response")
    print(" - free_actions[] (LOOK, SPEAK)")
    print(" - final_action (MOVE, WAIT)")
    print(" - speech_log[]: all speech events")

    return True
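

# Illustrative sketch (assumed field names, taken from the summary printed above
# and the replay code below): a single step record in the saved JSON might look
# roughly like the dict here. The authoritative schema is whatever
# EnhancedSimulationLog actually writes; every value is a placeholder.
_EXAMPLE_STEP_RECORD = {
    "turn": 1,
    "agent_id": "Wizard",
    "room": "guard_room",
    "position_start": [7, 4],
    "position_end": [8, 4],
    "screenshot_path": "/tmp/vllm_enhanced_demo/step_001.png",  # placeholder name
    "pending_messages": [],
    "llm_was_queried": True,
    "llm_prompt_user": "...",
    "llm_response": "...",
    "free_actions": [{"action_type": "LOOK", "args": ["door"]}],
    "final_action_type": "GO",
    "final_action_args": ["EAST"],
    "final_action_success": True,
    "final_action_message": "...",
}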


def replay_log(log_path: str):
    """
    Replay a simulation from log file.

    This is a text-based preview of what the offline viewer would show.
    """
    print(f"Loading simulation from: {log_path}")

    try:
        log = EnhancedSimulationLog.load(log_path)
    except FileNotFoundError:
        print(f"Log file not found: {log_path}")
        return

    print("\n" + "=" * 70)
    print("SIMULATION REPLAY")
    print("=" * 70)
    print(f"Turns: {log.metadata.get('total_turns', '?')}")
    print(f"Agents: {', '.join(log.metadata.get('agent_names', []))}")
    print(f"Rooms: {', '.join(log.metadata.get('world_rooms', []))}")

    for step in log.steps:
        print(f"\n{'='*40}")
        print(f"Turn {step.turn}: {step.agent_id}")
        print(f"{'='*40}")
        print(f"Position: {step.position_start} -> {step.position_end}")
        print(f"Room: {step.room}")

        if step.pending_messages:
            print("\nMessages received:")
            for msg in step.pending_messages:
                print(f" {msg.get('sender')}: \"{msg.get('content', '')[:40]}...\"")

        if step.llm_was_queried:
            print("\nLLM Response (truncated):")
            print(f" {step.llm_response[:200]}...")
        else:
            print("\n[Path continuation - no LLM query]")

        if step.free_actions:
            print("\nFree actions:")
            for fa in step.free_actions:
                print(f" - {fa['action_type']}: {fa.get('args', ())}")

        status = "OK" if step.final_action_success else "FAIL"
        print(f"\nFinal: {step.final_action_type} {step.final_action_args} [{status}]")
        print(f" {step.final_action_message}")

    # Speech summary
    if log.speech_log:
        print("\n" + "=" * 40)
        print("ALL SPEECH")
        print("=" * 40)
        for entry in log.speech_log:
            print(f"Turn {entry['turn']}: {entry['speaker']} -> {entry['recipients']}")
            print(f" \"{entry['content']}\"")


if __name__ == "__main__":
    # Check for replay mode
    if len(sys.argv) > 1 and sys.argv[1] == "--replay":
        log_file = sys.argv[2] if len(sys.argv) > 2 else LOG_PATH
        replay_log(log_file)
        sys.exit(0)

    # Normal execution
    try:
        success = run_demo()
        print("\nPASS" if success else "\nFAIL")
        sys.exit(0 if success else 1)
    except Exception:
        import traceback
        traceback.print_exc()
        sys.exit(1)
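
# Quick reference (derived from the __main__ block above; the demo must run in an
# environment where the mcrfpy module is importable, e.g. under McRogueFace):
#   4_enhanced_action_demo.py                      -> run the live demo against VLLM_URL
#   4_enhanced_action_demo.py --replay             -> text replay of LOG_PATH
#   4_enhanced_action_demo.py --replay <log.json>  -> text replay of a specific log file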