# Hours 3-4: Integration and Multi-Turn Demo **Issues**: #154, #155, #156 (integration) **Goal**: Complete turn-based simulation with proper context and logging **Dependencies**: Hour 1 (Action Parser/Executor), Hour 2 (WorldGraph) --- ## Hour 3: Integration ### Goal Wire WorldGraph into the demo so agents receive proper IF-style descriptions. ### Deliverables 1. `2_integrated_demo.py` - New demo combining WorldGraph + Action execution 2. Enhanced `ActionExecutor` with room-aware movement --- ### File: `2_integrated_demo.py` ```python #!/usr/bin/env python3 """ Integrated VLLM Demo ==================== Combines: - WorldGraph for structured room descriptions (#155) - Action parsing and execution (#156) - Per-agent perspective rendering This is the foundation for multi-turn simulation. """ import mcrfpy from mcrfpy import automation import sys import os import requests import base64 from world_graph import WorldGraph, Room, Door, WorldObject, Direction, AgentInfo, create_two_room_scenario from action_parser import parse_action, ActionType from action_executor import ActionExecutor # Configuration VLLM_URL = "http://192.168.1.100:8100/v1/chat/completions" SCREENSHOT_DIR = "/tmp/vllm_integrated" # Sprite constants FLOOR_TILE = 0 WALL_TILE = 40 WIZARD_SPRITE = 84 KNIGHT_SPRITE = 96 class Agent: """Agent wrapper with WorldGraph integration.""" def __init__(self, name: str, display_name: str, entity, world: WorldGraph): self.name = name self.display_name = display_name self.entity = entity self.world = world self.message_history = [] # For speech system @property def pos(self) -> tuple: return (int(self.entity.pos[0]), int(self.entity.pos[1])) @property def current_room(self) -> str: room = self.world.room_at(*self.pos) return room.name if room else None def get_context(self, visible_agents: list) -> dict: """Build complete context for LLM query.""" room_name = self.current_room # Convert to AgentInfo for WorldGraph agent_infos = [ AgentInfo(a.name, a.display_name, a.pos, is_player=(a.name == self.name)) for a in visible_agents ] return { "location": self.world.describe_room( room_name, visible_agents=agent_infos, observer_name=self.name ), "available_actions": self.world.get_available_actions(room_name), "recent_messages": self.message_history[-5:], } def file_to_base64(file_path): with open(file_path, 'rb') as f: return base64.b64encode(f.read()).decode('utf-8') def llm_chat_completion(messages: list): try: response = requests.post(VLLM_URL, json={'messages': messages}, timeout=60) return response.json() except requests.exceptions.RequestException as e: return {"error": str(e)} def message_with_image(text, image_path): image_data = file_to_base64(image_path) return { "role": "user", "content": [ {"type": "text", "text": text}, {"type": "image_url", "image_url": {"url": "data:image/png;base64," + image_data}} ] } def setup_scene(world: WorldGraph): """Create scene from WorldGraph.""" mcrfpy.createScene("integrated_demo") mcrfpy.setScene("integrated_demo") ui = mcrfpy.sceneUI("integrated_demo") texture = mcrfpy.Texture("assets/kenney_TD_MR_IP.png", 16, 16) # Create grid sized for the world grid = mcrfpy.Grid( grid_size=(25, 15), texture=texture, pos=(5, 5), size=(1014, 700) ) grid.fill_color = mcrfpy.Color(20, 20, 30) grid.zoom = 2.0 ui.append(grid) # Initialize all as walls for x in range(25): for y in range(15): point = grid.at(x, y) point.tilesprite = WALL_TILE point.walkable = False point.transparent = False # Carve out rooms from WorldGraph for room in world.rooms.values(): for rx in range(room.x, room.x + room.width): for ry in range(room.y, room.y + room.height): if 0 <= rx < 25 and 0 <= ry < 15: point = grid.at(rx, ry) point.tilesprite = FLOOR_TILE point.walkable = True point.transparent = True # Place doors for door in world.doors: dx, dy = door.position if 0 <= dx < 25 and 0 <= dy < 15: point = grid.at(dx, dy) point.tilesprite = FLOOR_TILE point.walkable = not door.locked point.transparent = True # Create FOV layer fov_layer = grid.add_layer('color', z_index=10) fov_layer.fill(mcrfpy.Color(0, 0, 0, 255)) return grid, fov_layer def create_agents(grid, world: WorldGraph, texture) -> list: """Create agent entities in their starting rooms.""" agents = [] # Agent A: Wizard in guard_room guard_room = world.rooms["guard_room"] wizard_entity = mcrfpy.Entity( grid_pos=guard_room.center, texture=texture, sprite_index=WIZARD_SPRITE ) grid.entities.append(wizard_entity) agents.append(Agent("Wizard", "a wizard", wizard_entity, world)) # Agent B: Knight in armory armory = world.rooms["armory"] knight_entity = mcrfpy.Entity( grid_pos=armory.center, texture=texture, sprite_index=KNIGHT_SPRITE ) grid.entities.append(knight_entity) agents.append(Agent("Knight", "a knight", knight_entity, world)) return agents def switch_perspective(grid, fov_layer, agent): """Switch view to agent's perspective.""" fov_layer.fill(mcrfpy.Color(0, 0, 0, 255)) fov_layer.apply_perspective( entity=agent.entity, visible=mcrfpy.Color(0, 0, 0, 0), discovered=mcrfpy.Color(40, 40, 60, 180), unknown=mcrfpy.Color(0, 0, 0, 255) ) agent.entity.update_visibility() px, py = agent.pos grid.center = (px * 16 + 8, py * 16 + 8) def get_visible_agents(grid, observer, all_agents) -> list: """Get agents visible to the observer.""" visible = [] for agent in all_agents: if agent.name == observer.name: continue ax, ay = agent.pos if grid.is_in_fov(ax, ay): visible.append(agent) return visible def query_agent_llm(agent, screenshot_path, context) -> str: """Query VLLM for agent's action.""" system_prompt = f"""You are {agent.display_name} in a roguelike dungeon game. You see the world through screenshots and receive text descriptions. Your goal is to explore and interact with your environment. Always end your response with a clear action declaration: "Action: " """ # Build the user prompt with WorldGraph context actions_str = ", ".join(context["available_actions"]) user_prompt = f"""{context["location"]} Available actions: {actions_str} Look at the screenshot showing your current view. The dark areas are outside your field of vision. What would you like to do? State your reasoning briefly (1-2 sentences), then declare your action. Example: "I see a key on the ground that might be useful. Action: TAKE brass_key" """ messages = [ {"role": "system", "content": system_prompt}, message_with_image(user_prompt, screenshot_path) ] resp = llm_chat_completion(messages) if "error" in resp: return f"[VLLM Error: {resp['error']}]" return resp.get('choices', [{}])[0].get('message', {}).get('content', 'No response') def run_single_turn(grid, fov_layer, agents, executor, turn_num): """Execute one turn for all agents.""" print(f"\n{'='*70}") print(f"TURN {turn_num}") print("=" * 70) results = [] for agent in agents: print(f"\n--- {agent.name}'s Turn ---") print(f"Position: {agent.pos} | Room: {agent.current_room}") # Switch perspective switch_perspective(grid, fov_layer, agent) mcrfpy.step(0.016) # Screenshot screenshot_path = os.path.join(SCREENSHOT_DIR, f"turn{turn_num}_{agent.name.lower()}.png") automation.screenshot(screenshot_path) # Get context using WorldGraph visible = get_visible_agents(grid, agent, agents) context = agent.get_context(visible + [agent]) # Include self for filtering print(f"Context: {context['location']}") print(f"Actions: {context['available_actions']}") # Query LLM print(f"\nQuerying VLLM...") response = query_agent_llm(agent, screenshot_path, context) print(f"Response: {response[:200]}...") # Parse and execute action = parse_action(response) print(f"Parsed: {action.type.value} {action.args}") result = executor.execute(agent, action) print(f"Result: {'SUCCESS' if result.success else 'FAILED'} - {result.message}") results.append({ "agent": agent.name, "context": context, "response": response, "action": action, "result": result }) return results def run_demo(): """Main demo: single integrated turn.""" print("=" * 70) print("Integrated WorldGraph + Action Demo") print("=" * 70) os.makedirs(SCREENSHOT_DIR, exist_ok=True) # Create world from WorldGraph world = create_two_room_scenario() # Setup scene grid, fov_layer = setup_scene(world) # Create agents texture = mcrfpy.Texture("assets/kenney_TD_MR_IP.png", 16, 16) agents = create_agents(grid, world, texture) # Create executor executor = ActionExecutor(grid) # Run one turn results = run_single_turn(grid, fov_layer, agents, executor, turn_num=1) print("\n" + "=" * 70) print("Demo Complete") print("=" * 70) return True if __name__ == "__main__": try: success = run_demo() sys.exit(0 if success else 1) except Exception as e: import traceback traceback.print_exc() sys.exit(1) ``` --- ## Hour 4: Multi-Turn Demo ### Goal Run multiple turns with simulation logging for replay. ### Deliverables 1. `turn_orchestrator.py` - Turn management and logging 2. `3_multi_turn_demo.py` - Complete multi-turn simulation 3. `simulation_log.json` - Saved output for replay --- ### File: `turn_orchestrator.py` ```python """ Turn Orchestrator ================= Manages multi-turn simulation with logging for replay. """ import json import os from dataclasses import dataclass, asdict from typing import List, Dict, Any, Optional from datetime import datetime from world_graph import WorldGraph from action_parser import Action, ActionType, parse_action from action_executor import ActionExecutor, ActionResult @dataclass class SimulationStep: """Record of one agent's turn.""" turn: int agent_id: str agent_position: tuple room: str perception: Dict[str, Any] # Context shown to LLM llm_response: str # Raw LLM output parsed_action_type: str # Action type as string parsed_action_args: tuple # Action arguments result_success: bool result_message: str new_position: Optional[tuple] = None path: Optional[List[tuple]] = None # For animation timestamp: str = "" def __post_init__(self): if not self.timestamp: self.timestamp = datetime.now().isoformat() @dataclass class SimulationLog: """Complete simulation record.""" metadata: Dict[str, Any] steps: List[SimulationStep] def save(self, path: str): """Save log to JSON file.""" data = { "metadata": self.metadata, "steps": [asdict(s) for s in self.steps] } with open(path, 'w') as f: json.dump(data, f, indent=2, default=str) @classmethod def load(cls, path: str) -> 'SimulationLog': """Load log from JSON file.""" with open(path) as f: data = json.load(f) steps = [SimulationStep(**s) for s in data["steps"]] return cls(metadata=data["metadata"], steps=steps) class TurnOrchestrator: """ Orchestrates multi-turn simulation. Handles: - Turn sequencing - Perspective switching - LLM queries - Action execution - Simulation logging """ def __init__(self, grid, fov_layer, world: WorldGraph, agents: list, screenshot_dir: str, llm_query_fn): self.grid = grid self.fov_layer = fov_layer self.world = world self.agents = agents self.screenshot_dir = screenshot_dir self.llm_query_fn = llm_query_fn # Function to query LLM self.executor = ActionExecutor(grid) self.turn_number = 0 self.steps: List[SimulationStep] = [] os.makedirs(screenshot_dir, exist_ok=True) def run_turn(self) -> List[SimulationStep]: """Execute one full turn (all agents act once).""" self.turn_number += 1 turn_steps = [] for agent in self.agents: step = self._run_agent_turn(agent) turn_steps.append(step) self.steps.append(step) return turn_steps def run_simulation(self, max_turns: int = 10, stop_condition=None) -> SimulationLog: """ Run complete simulation. Args: max_turns: Maximum number of turns to run stop_condition: Optional callable(orchestrator) -> bool Returns: SimulationLog with all steps """ print(f"\nStarting simulation: max {max_turns} turns") print("=" * 50) for turn in range(max_turns): print(f"\n--- Turn {turn + 1}/{max_turns} ---") self.run_turn() # Check stop condition if stop_condition and stop_condition(self): print(f"Stop condition met at turn {turn + 1}") break # Create log log = SimulationLog( metadata={ "total_turns": self.turn_number, "num_agents": len(self.agents), "agent_names": [a.name for a in self.agents], "timestamp": datetime.now().isoformat(), "world_rooms": list(self.world.rooms.keys()), }, steps=self.steps ) return log def _run_agent_turn(self, agent) -> SimulationStep: """Execute one agent's turn.""" from mcrfpy import automation import mcrfpy # Switch perspective self._switch_perspective(agent) mcrfpy.step(0.016) # Screenshot screenshot_path = os.path.join( self.screenshot_dir, f"turn{self.turn_number}_{agent.name.lower()}.png" ) automation.screenshot(screenshot_path) # Build context visible_agents = self._get_visible_agents(agent) context = agent.get_context(visible_agents + [agent]) # Query LLM llm_response = self.llm_query_fn(agent, screenshot_path, context) # Parse and execute action = parse_action(llm_response) result = self.executor.execute(agent, action) # Log print(f" {agent.name}: {action.type.value} -> {result.message}") return SimulationStep( turn=self.turn_number, agent_id=agent.name, agent_position=agent.pos, room=agent.current_room, perception=context, llm_response=llm_response, parsed_action_type=action.type.value, parsed_action_args=action.args, result_success=result.success, result_message=result.message, new_position=result.new_position, path=result.path ) def _switch_perspective(self, agent): """Switch grid view to agent's perspective.""" import mcrfpy self.fov_layer.fill(mcrfpy.Color(0, 0, 0, 255)) self.fov_layer.apply_perspective( entity=agent.entity, visible=mcrfpy.Color(0, 0, 0, 0), discovered=mcrfpy.Color(40, 40, 60, 180), unknown=mcrfpy.Color(0, 0, 0, 255) ) agent.entity.update_visibility() px, py = agent.pos self.grid.center = (px * 16 + 8, py * 16 + 8) def _get_visible_agents(self, observer) -> list: """Get agents visible to observer.""" visible = [] for agent in self.agents: if agent.name == observer.name: continue ax, ay = agent.pos if self.grid.is_in_fov(ax, ay): visible.append(agent) return visible ``` --- ### File: `3_multi_turn_demo.py` ```python #!/usr/bin/env python3 """ Multi-Turn Simulation Demo ========================== Runs multiple turns of agent interaction with full logging. This is the Phase 1 implementation from issue #154. """ import mcrfpy from mcrfpy import automation import sys import os import requests import base64 from world_graph import create_two_room_scenario, AgentInfo from action_parser import parse_action from action_executor import ActionExecutor from turn_orchestrator import TurnOrchestrator, SimulationLog # Configuration VLLM_URL = "http://192.168.1.100:8100/v1/chat/completions" SCREENSHOT_DIR = "/tmp/vllm_multi_turn" LOG_PATH = "/tmp/vllm_multi_turn/simulation_log.json" MAX_TURNS = 5 # Sprites FLOOR_TILE = 0 WALL_TILE = 40 WIZARD_SPRITE = 84 KNIGHT_SPRITE = 96 class Agent: """Agent with WorldGraph integration.""" def __init__(self, name, display_name, entity, world): self.name = name self.display_name = display_name self.entity = entity self.world = world self.message_history = [] @property def pos(self): return (int(self.entity.pos[0]), int(self.entity.pos[1])) @property def current_room(self): room = self.world.room_at(*self.pos) return room.name if room else None def get_context(self, visible_agents): room_name = self.current_room agent_infos = [ AgentInfo(a.name, a.display_name, a.pos, is_player=(a.name == self.name)) for a in visible_agents ] return { "location": self.world.describe_room(room_name, agent_infos, self.name), "available_actions": self.world.get_available_actions(room_name), "recent_messages": self.message_history[-5:], } def file_to_base64(path): with open(path, 'rb') as f: return base64.b64encode(f.read()).decode('utf-8') def llm_query(agent, screenshot_path, context) -> str: """Query VLLM for agent action.""" system = f"""You are {agent.display_name} exploring a dungeon. You receive visual and text information about your surroundings. Always end with: Action: """ actions_str = ", ".join(context["available_actions"]) user = f"""{context["location"]} Available: {actions_str} [Screenshot attached showing your view] What do you do? Brief reasoning, then Action: """ messages = [ {"role": "system", "content": system}, { "role": "user", "content": [ {"type": "text", "text": user}, {"type": "image_url", "image_url": { "url": "data:image/png;base64," + file_to_base64(screenshot_path) }} ] } ] try: resp = requests.post(VLLM_URL, json={'messages': messages}, timeout=60) data = resp.json() if "error" in data: return f"[Error: {data['error']}]" return data.get('choices', [{}])[0].get('message', {}).get('content', 'No response') except Exception as e: return f"[Error: {e}]" def setup_scene(world): """Create scene from WorldGraph.""" mcrfpy.createScene("multi_turn") mcrfpy.setScene("multi_turn") ui = mcrfpy.sceneUI("multi_turn") texture = mcrfpy.Texture("assets/kenney_TD_MR_IP.png", 16, 16) grid = mcrfpy.Grid( grid_size=(25, 15), texture=texture, pos=(5, 5), size=(1014, 700) ) grid.fill_color = mcrfpy.Color(20, 20, 30) grid.zoom = 2.0 ui.append(grid) # Walls everywhere first for x in range(25): for y in range(15): p = grid.at(x, y) p.tilesprite = WALL_TILE p.walkable = False p.transparent = False # Carve rooms for room in world.rooms.values(): for rx in range(room.x, room.x + room.width): for ry in range(room.y, room.y + room.height): if 0 <= rx < 25 and 0 <= ry < 15: p = grid.at(rx, ry) p.tilesprite = FLOOR_TILE p.walkable = True p.transparent = True # Place doors for door in world.doors: dx, dy = door.position if 0 <= dx < 25 and 0 <= dy < 15: p = grid.at(dx, dy) p.tilesprite = FLOOR_TILE p.walkable = not door.locked p.transparent = True # FOV layer fov_layer = grid.add_layer('color', z_index=10) fov_layer.fill(mcrfpy.Color(0, 0, 0, 255)) return grid, fov_layer, texture def create_agents(grid, world, texture): """Create agents in starting positions.""" agents = [] # Wizard in guard_room room_a = world.rooms["guard_room"] wizard = mcrfpy.Entity(grid_pos=room_a.center, texture=texture, sprite_index=WIZARD_SPRITE) grid.entities.append(wizard) agents.append(Agent("Wizard", "a wizard", wizard, world)) # Knight in armory room_b = world.rooms["armory"] knight = mcrfpy.Entity(grid_pos=room_b.center, texture=texture, sprite_index=KNIGHT_SPRITE) grid.entities.append(knight) agents.append(Agent("Knight", "a knight", knight, world)) return agents def run_demo(): """Run multi-turn simulation.""" print("=" * 70) print("Multi-Turn Simulation Demo") print(f"Running {MAX_TURNS} turns with 2 agents") print("=" * 70) os.makedirs(SCREENSHOT_DIR, exist_ok=True) # Setup world = create_two_room_scenario() grid, fov_layer, texture = setup_scene(world) agents = create_agents(grid, world, texture) # Create orchestrator orchestrator = TurnOrchestrator( grid=grid, fov_layer=fov_layer, world=world, agents=agents, screenshot_dir=SCREENSHOT_DIR, llm_query_fn=llm_query ) # Run simulation log = orchestrator.run_simulation(max_turns=MAX_TURNS) # Save log log.save(LOG_PATH) print(f"\nSimulation log saved to: {LOG_PATH}") # Summary print("\n" + "=" * 70) print("SIMULATION SUMMARY") print("=" * 70) print(f"Total turns: {log.metadata['total_turns']}") print(f"Total steps: {len(log.steps)}") # Per-agent summary for agent_name in log.metadata['agent_names']: agent_steps = [s for s in log.steps if s.agent_id == agent_name] successes = sum(1 for s in agent_steps if s.result_success) print(f"\n{agent_name}:") print(f" Actions: {len(agent_steps)}") print(f" Successful: {successes}") print(f" Final position: {agent_steps[-1].new_position or agent_steps[-1].agent_position}") return True if __name__ == "__main__": try: success = run_demo() print("\nPASS" if success else "\nFAIL") sys.exit(0 if success else 1) except Exception as e: import traceback traceback.print_exc() sys.exit(1) ``` --- ## Success Criteria ### Hour 3 Integration - [ ] WorldGraph generates scene tiles correctly - [ ] Agents receive IF-style room descriptions from WorldGraph - [ ] Available actions list appears in LLM prompt - [ ] Actions are parsed and executed - [ ] Single turn completes successfully ### Hour 4 Multi-Turn - [ ] TurnOrchestrator cycles through all agents - [ ] Multiple turns run sequentially - [ ] SimulationLog captures all steps - [ ] Log saves to JSON correctly - [ ] Log can be loaded back - [ ] Summary shows agent actions and positions --- ## Example Output ``` ====================================================================== Multi-Turn Simulation Demo Running 5 turns with 2 agents ====================================================================== Starting simulation: max 5 turns ================================================== --- Turn 1/5 --- Wizard: GO EAST -> Moved east to (6, 4) Knight: WAIT -> Waited and observed surroundings --- Turn 2/5 --- Wizard: GO EAST -> Moved east to (7, 4) Knight: GO WEST -> Moved west to (14, 4) [... more turns ...] ====================================================================== SIMULATION SUMMARY ====================================================================== Total turns: 5 Total steps: 10 Wizard: Actions: 5 Successful: 4 Final position: (9, 4) Knight: Actions: 5 Successful: 3 Final position: (11, 4) Simulation log saved to: /tmp/vllm_multi_turn/simulation_log.json PASS ``` --- ## Next Steps (Future Sessions) After Hours 3-4 are complete: 1. **Speech System** - Add ANNOUNCE/SPEAK actions with message passing 2. **Button-Door Puzzle** - Use `create_button_door_scenario()` for coordination test 3. **Animated Replay** - Play back simulation with movement animations 4. **NPC Behaviors** - Add scripted entities (patrol, flee, etc.) 5. **Affordance Learning** - Track what agents discover about objects