Enhance testing harness stability and update repository documentation (#3983)

* Refactor skill turn harness, fix session serialization, and resolve E2E test failures

* Ignore symlinks during workspace copying and enforce sandbox boundaries in playbooks

* Refactor interaction loop to use clean async generator-based Event flow

* Introduce dedicated async generator test and improve autonomous tester instructions

* Enforce strict sandbox awareness and Step 8 policy import gates

* Track and display conversation context size next to turn headers

* Streamline token usage display to only appear in turn step headers

* Refactor token usage tracking to show actual active context size

* Implement progress tracking block and human recovery in test harness

* docs: document and categorize repository skills and tools

* docs: add maintenance instructions for updating FACTORIES.md tables

* docs: add missing data-catalog-policy-tag factory in FACTORIES.md

* docs: add missing networking stage sub-factories in FACTORIES.md

* docs: add systematic commands for discovering module/stage factories in FACTORIES.md

* docs: add missing vpcs factories in 0-org-setup and 2-project-factory stages
This commit is contained in:
Ludovico Magnocavallo
2026-05-24 12:25:50 +02:00
committed by GitHub
parent 81f72e8068
commit c24dae395b
13 changed files with 417 additions and 179 deletions

View File

@@ -66,6 +66,7 @@ python3 harness.py playbooks/my-playbook.yaml
- `--keep-workspace` (Optional): Preserve the temporary workspace directory (`/tmp/gemini_harness_*`) after execution to inspect files generated by the agent.
- `--agent-model <model>` (Optional): Override the model the agent uses (e.g., `gemini-2.5-pro`). Overrides playbook definition.
- `--evaluator-model <model>` (Optional): Override the model the test harness uses to grade and simulate (e.g., `gemini-2.5-flash`). Overrides playbook definition.
- `--max-deviations <number>` (Optional): Set the maximum number of minor deviations/mistakes (such as rule violations or incorrect tool calls) the agent can make during autonomous/hybrid mode before the harness fails the test run. Defaults to `3`.
- `--debug` (Optional): Enable verbose debug logging for the SDK (e.g., WebSocket traffic).
⚠️ **Security Warning regarding Logs:**
@@ -74,18 +75,27 @@ A default `.gitignore` is provided in the `logs/` directory to prevent committin
### Expected Output
The harness executes the CLI steps, evaluates the responses, and streams the results to the console:
The harness executes the playbook, rendering thoughts and tool calls in real-time, and streams the results to the console with active context usage stats:
```text
--- Tuning: FAST Setup PoC | Workspace: /tmp/gemini_harness_abc123 ---
[Step 1] Input: Hi, please activate the fast-setup-poc skill and let's configure FAST.
[Step 1] Output: Hi, let's configure FAST. Please provide your Google Cloud Project ID.
[Step 1]
Tester:
Hi, please activate the fast-setup-poc skill and let's configure FAST.
🧠 Thinking:
Let's activate the fast-setup-poc skill and check the requirements.
🛠️ [Tool Call]: list_directory(path=.)
...
✅ [PASS Step 1]: The agent greeted the user ('Hi'), confirmed it was configuring FAST, and asked for the Project ID. All parts of the objective were fulfilled.
[Step 2] [Context: 4,512]
Tester:
my-super-project-123
...
✅ [SUCCESS] Playbook 'FAST Setup PoC' completed successfully.
📄 Session JSON saved to: logs/FAST_Setup_PoC_session.json
📄 Markdown log saved to: logs/FAST_Setup_PoC_log.md
```
@@ -137,15 +147,12 @@ Playbooks are written in YAML. For autocompletion and validation in VS Code, add
If your playbook requires environment variables (e.g., secrets), declare them in the `env` array. You can then reference them in your `steps` using `${VAR_NAME}`. If a variable is declared but not found in the environment (or passed via `--env-file`), the harness will safely halt before execution.
To run the test in a specific directory (e.g., the repository root), specify `working_dir`. If omitted, a temporary isolated workspace is created.
```yaml
# yaml-language-server: $schema=../playbooks/playbook.schema.json
name: "My Test Playbook"
timeout: 120
agent_model: "gemini-2.5-pro"
evaluator_model: "gemini-2.5-flash"
working_dir: "." # Run in the directory where harness is executed
env:
- MY_API_KEY
steps:

View File

@@ -45,7 +45,24 @@ import tempfile
from dataclasses import dataclass, asdict
from datetime import datetime
from typing import Optional, Dict
from typing import Optional, Dict, Union, AsyncIterator
@dataclass
class ThinkingDeltaEvent:
  """Event yielded by `run_turn` for a step that carries a thinking delta."""
  # The step's `thinking_delta` value, passed through unmodified.
  text: str
@dataclass
class ToolCallEvent:
  """Event yielded by `run_turn` the first time a tool call id is seen."""
  # Tool name as reported on the tool call.
  name: str
  # Copy of the tool call's arguments (built with `dict(tc.args)`).
  args: dict
@dataclass
class ErrorEvent:
  """Event yielded by `run_turn` when a step finishes with ERROR status."""
  # The step's error text, or "Unknown step error" when the step has none.
  message: str
# Third-party imports
import click
@@ -189,6 +206,65 @@ class StreamingTrimmer:
self.whitespace_buffer = ""
class ConsoleRenderer:
  """Handles console formatting and streaming output for interaction turns.

  The renderer is stateful: `need_newline` is True while a "Thinking" block is
  open (its header was printed and the streamed text has not been terminated
  with a newline yet), and `at_start_of_line` is True when the next thinking
  fragment must be prefixed with the line indent.
  """

  # Tool-call argument keys that are left out of the one-line console summary.
  _HIDDEN_TOOL_ARGS = frozenset({
      "output",
      "results",
      "num_results",
      "diff_block",
      "exit_code",
      "combined_output",
      "image_name",
  })

  def __init__(self):
    self.trimmer = StreamingTrimmer()
    self.need_newline = False
    self.at_start_of_line = True

  def _close_thinking_block(self):
    """Terminates an open thinking block, if any.

    Shared by every method that prints something other than thinking text, so
    the flush / newline / reset sequence lives in exactly one place. It is a
    no-op when no block is open, which makes repeated calls safe.
    """
    if not self.need_newline:
      return
    self.trimmer.flush_remaining()
    print()
    self.need_newline = False

  def render_thinking(self, text: str):
    """Streams a thinking delta to the console.

    Args:
      text: The raw thinking delta; it is passed through the trimmer, which
        decides what (if anything) should be printed now.
    """
    to_print = self.trimmer.process_delta(text)
    if not to_print:
      return
    if not self.need_newline:
      # First visible output of this block: print the header once.
      print(f" {format_color('🧠 Thinking:', C_GRAY)}", flush=True)
      self.need_newline = True
      self.at_start_of_line = True
    for i, part in enumerate(to_print.split('\n')):
      if i > 0:
        # Every split boundary is a newline in the original text.
        print('\n', end='')
        self.at_start_of_line = True
      if part:
        if self.at_start_of_line:
          print(' ', end='')
          self.at_start_of_line = False
        print(format_color(part, C_GRAY), end='', flush=True)

  def render_tool_call(self, name: str, args: dict):
    """Prints a one-line summary of a tool call.

    Args:
      name: The tool name.
      args: The tool arguments; keys in `_HIDDEN_TOOL_ARGS` are omitted. A
        missing (None) mapping is treated as no arguments.
    """
    self._close_thinking_block()
    cleaned_args = {
        k: v
        for k, v in (args or {}).items()
        if k not in self._HIDDEN_TOOL_ARGS
    }
    args_str = ", ".join(f"{k}={v}" for k, v in cleaned_args.items())
    print(f" 🛠️ {format_color(f'[Tool Call]: {name}({args_str})', C_GRAY)}")

  def render_error(self, message: str):
    """Prints a step error on its own line.

    Args:
      message: The error text to display.
    """
    self._close_thinking_block()
    print(f" ❌ [Error]: {message}")

  def finalize(self):
    """Closes any thinking block still open at the end of a turn.

    Safe to call more than once: only the first call after thinking output
    prints the terminating newline.
    """
    self._close_thinking_block()
def init_markdown_log(md_log_path: str, playbook_name: str):
'''Initializes the markdown log file with a header.
@@ -386,12 +462,12 @@ def check_files_contain(files_contain: dict, workspace_dir: str) -> bool:
def check_tool_calls_contain(tool_calls_criteria: dict,
workspace_dir: str) -> bool:
executed_tool_calls: list) -> bool:
'''Checks if the agent's tool calls contain expected literal strings in their arguments.
Args:
tool_calls_criteria: A dictionary mapping tool names to lists of expected strings.
workspace_dir: The temporary workspace directory path.
executed_tool_calls: A list of recorded tool calls, each being a dict with 'name' and 'args'.
Returns:
True if all tool calls contain their expected strings, False otherwise.
@@ -400,32 +476,14 @@ def check_tool_calls_contain(tool_calls_criteria: dict,
return True
passed = True
workspace_name = os.path.basename(workspace_dir)
slugified_name = re.sub(r'[^a-zA-Z0-9]+', '-',
workspace_name).strip('-').lower()
session_files = glob.glob(
os.path.expanduser(
f'~/.gemini/tmp/{slugified_name}/chats/session-*.json'))
if not session_files:
print(
"❌ [CHECK FAILED]: Expected session JSON file not found in workspace for tool validation."
)
return False
session_files.sort(key=os.path.getmtime, reverse=True)
try:
with open(session_files[0], 'r') as f:
session_data = json.load(f)
extracted_calls: Dict[str, str] = {}
for m in session_data.get('messages', []):
for tc in m.get('toolCalls', []):
name = tc.get('name')
args_str = json.dumps(tc.get('args', {}))
if name not in extracted_calls:
extracted_calls[name] = ""
extracted_calls[name] += args_str + "\n"
for tc in executed_tool_calls:
name = tc['name']
args_str = json.dumps(tc['args'])
if name not in extracted_calls:
extracted_calls[name] = ""
extracted_calls[name] += args_str + "\n"
for tool_name, expected_strings in tool_calls_criteria.items():
if tool_name not in extracted_calls:
@@ -443,19 +501,21 @@ def check_tool_calls_contain(tool_calls_criteria: dict,
passed = False
except Exception as e:
print(f"❌ [CHECK FAILED]: Failed to parse session JSON: {e}")
print(f"❌ [CHECK FAILED]: Failed to process tool calls: {e}")
passed = False
return passed
def perform_deterministic_checks(success_criteria: dict, workspace_dir: str,
executed_tool_calls: list,
full_stdout: str) -> bool:
'''Evaluates the deterministic checks defined in the persona success_criteria.
Args:
success_criteria: The success_criteria dictionary from the playbook.
workspace_dir: The temporary workspace directory path.
executed_tool_calls: A list of recorded tool calls.
full_stdout: The combined stdout of all CLI invocations.
Returns:
@@ -468,7 +528,7 @@ def perform_deterministic_checks(success_criteria: dict, workspace_dir: str,
passed = False
if not check_tool_calls_contain(
success_criteria.get('tool_calls_contain', {}), workspace_dir):
success_criteria.get('tool_calls_contain', {}), executed_tool_calls):
passed = False
if not check_files_exist(success_criteria.get('files_exist', []),
@@ -490,74 +550,43 @@ def _view_file_directory_check(args: dict) -> bool:
return False
async def run_turn(agent: Agent, user_input: str) -> None:
"""Sends user input and streams steps in real-time, logging tool calls and errors."""
async def run_turn(
agent: Agent, user_input: str
) -> AsyncIterator[Union[ThinkingDeltaEvent, ToolCallEvent, ErrorEvent]]:
"""Sends user input and yields interaction events in real-time."""
await agent.conversation.send(user_input)
printed_calls = set()
need_newline = False
at_start_of_line = True
trimmer = StreamingTrimmer()
async for step_obj in agent.conversation.receive_steps():
if step_obj.thinking_delta:
to_print = trimmer.process_delta(step_obj.thinking_delta)
if to_print:
if not need_newline:
print(f" {format_color('🧠 Thinking:', C_GRAY)}", flush=True)
need_newline = True
at_start_of_line = True
parts = to_print.split('\n')
for i, part in enumerate(parts):
if i > 0:
print('\n', end='')
at_start_of_line = True
if part:
if at_start_of_line:
print(' ', end='')
at_start_of_line = False
print(format_color(part, C_GRAY), end='', flush=True)
yield ThinkingDeltaEvent(text=step_obj.thinking_delta)
if step_obj.type == agy_types.StepType.TOOL_CALL:
for tc in step_obj.tool_calls:
if tc.id not in printed_calls:
printed_calls.add(tc.id)
if need_newline:
trimmer.flush_remaining()
print()
need_newline = False
cleaned_args = {
k: v for k, v in tc.args.items() if k not in {
"output",
"results",
"num_results",
"diff_block",
"exit_code",
"combined_output",
"image_name",
}
}
args_str = ", ".join(f"{k}={v}" for k, v in cleaned_args.items())
print(
f" 🛠️ {format_color(f'[Tool Call]: {tc.name}({args_str})', C_GRAY)}"
)
if step_obj.status == agy_types.StepStatus.ERROR:
if need_newline:
trimmer.flush_remaining()
print()
need_newline = False
error_msg = step_obj.error or "Unknown step error"
print(f" ❌ [Error]: {error_msg}")
yield ToolCallEvent(name=tc.name, args=dict(tc.args))
if need_newline:
trimmer.flush_remaining()
print()
if step_obj.status == agy_types.StepStatus.ERROR:
yield ErrorEvent(message=step_obj.error or "Unknown step error")
def _get_usage_str(agent: Agent) -> str:
"""Safely retrieves the active context size from the agent's conversation."""
try:
usage = agent.conversation.last_turn_usage
if usage and usage.prompt_token_count is not None:
return f" [Context: {usage.prompt_token_count:,}]"
except Exception:
pass
return ""
async def run_hybrid_tuning_loop(playbook_path: str, log_dir: str,
skill_src: str = None,
keep_workspace: bool = False,
cli_agent_model: str = None,
cli_evaluator_model: str = None):
cli_evaluator_model: str = None,
max_deviations: int = 3):
'''Executes the test playbook and evaluates the agent's responses.
Args:
@@ -571,6 +600,15 @@ async def run_hybrid_tuning_loop(playbook_path: str, log_dir: str,
Returns:
True if the playbook passes completely, False if any step fails.
'''
# Initialize all finally-block dependencies at the very top to avoid NameErrors on early failure
log_prefix = "unknown_playbook"
conversation_history = []
executed_tool_calls = []
interaction_log = []
is_tmpdir = False
workspace_dir = os.getcwd()
original_cwd = os.getcwd()
evaluator_client = genai.Client()
log_dir = os.path.abspath(log_dir)
os.makedirs(log_dir, exist_ok=True)
@@ -594,25 +632,33 @@ async def run_hybrid_tuning_loop(playbook_path: str, log_dir: str,
tmpdir_config = playbook.get('tmpdir')
is_tmpdir = tmpdir_config is not None
original_cwd = os.getcwd()
if is_tmpdir:
workspace_dir = tempfile.mkdtemp(prefix='gemini_harness_')
open(os.path.join(workspace_dir, '.project_root'), 'w').close()
def _ignore_symlinks_and_patterns(directory, names):
ignore_func = shutil.ignore_patterns('.terraform', '.git', '.venv',
'venv', '__pycache__',
'.pytest_cache',
'skill-turn-harness')
ignored = set(ignore_func(directory, names))
for name in names:
if os.path.islink(os.path.join(directory, name)):
ignored.add(name)
return list(ignored)
link_paths = tmpdir_config.get('link_paths', [])
for path in link_paths:
src_abs = os.path.abspath(os.path.join(original_cwd, path))
dst_abs = os.path.join(workspace_dir, path)
os.makedirs(os.path.dirname(dst_abs), exist_ok=True)
try:
if os.path.isdir(src_abs):
shutil.copytree(
src_abs, dst_abs,
ignore=shutil.ignore_patterns('.terraform', '.git', '.venv',
'venv', '__pycache__',
'.pytest_cache',
'skill-turn-harness'))
if os.path.islink(src_abs):
print(f'🔗 Skipped symlink: {path}')
elif os.path.isdir(src_abs):
shutil.copytree(src_abs, dst_abs,
ignore=_ignore_symlinks_and_patterns)
print(f'📁 Copied directory: {path} -> {dst_abs}')
else:
shutil.copy2(src_abs, dst_abs)
@@ -633,13 +679,19 @@ async def run_hybrid_tuning_loop(playbook_path: str, log_dir: str,
full_stdout = ""
conversation_history = []
executed_tool_calls = []
step_index = 0
fallback_to_persona = False
# Configure SDK Agent
skills_paths = []
if skill_src:
skills_paths.append(os.path.abspath(skill_src))
if is_tmpdir:
# If sandboxed in tmpdir, point to the copied skill path inside the sandbox
skills_paths.append(
os.path.abspath(os.path.join(workspace_dir, skill_src)))
else:
skills_paths.append(os.path.abspath(skill_src))
# Allow all tools to emulate CLI -y/--dangerously-skip-permissions
policies = [
@@ -654,7 +706,12 @@ async def run_hybrid_tuning_loop(playbook_path: str, log_dir: str,
# Guideline text for the agent. The sandbox paragraph refers to the home
# directory generically: a developer-specific absolute path would be wrong on
# every other machine and would leak a personal path into the prompt.
standard_instructions = (
    "GUIDELINES:\n"
    "- Always check if a path is a directory before trying to view it. "
    "Use list_directory to inspect directories, never view_file.\n"
    "- You are running inside an isolated, sandboxed temporary workspace (e.g., /tmp/gemini_harness_*). "
    "Whenever creating local files, configuration directories (like custom-fast-config or fast-config), "
    "or checking defaults, you MUST do so strictly relative to your current workspace directory (CWD). "
    "NEVER try to directly read or write to the user's home directory or other external folders, as your file tools "
    "are sandboxed and will fail with permission/step errors.")
config = LocalAgentConfig(
model=agent_model,
@@ -668,6 +725,19 @@ async def run_hybrid_tuning_loop(playbook_path: str, log_dir: str,
try:
async with Agent(config) as agent:
async def _execute_turn(user_input_str: str):
  """Runs one agent turn, rendering its events and recording tool calls.

  Consumes the `run_turn` event stream: thinking deltas and errors are only
  rendered, while tool calls are rendered and also appended to
  `executed_tool_calls` as {'name', 'args'} dicts.

  Args:
    user_input_str: The message to send to the agent for this turn.
  """
  renderer = ConsoleRenderer()
  try:
    async for event in run_turn(agent, user_input_str):
      if isinstance(event, ThinkingDeltaEvent):
        renderer.render_thinking(event.text)
      elif isinstance(event, ToolCallEvent):
        executed_tool_calls.append({'name': event.name, 'args': event.args})
        renderer.render_tool_call(event.name, event.args)
      elif isinstance(event, ErrorEvent):
        renderer.render_error(event.message)
  finally:
    # Callers run this under asyncio.wait_for, which cancels the coroutine on
    # timeout. Finalize in all cases so a half-printed thinking line is always
    # terminated before the caller prints its own timeout/error message.
    renderer.finalize()
# --- PHASE 1: SCRIPTED STEPS ---
for step_dict in playbook_steps:
raw_user_input = step_dict['user_input']
@@ -681,13 +751,15 @@ async def run_hybrid_tuning_loop(playbook_path: str, log_dir: str,
step = StepData(step_index=step_index, user_input=subbed_user_input,
expected_outcome=subbed_expected_outcome)
turn_str = format_color(f'[Step {step.step_index + 1}]', C_BOLD_WHITE)
usage_str_start = _get_usage_str(agent) if step_index > 0 else ""
turn_str = format_color(
f'[Step {step.step_index + 1}]{usage_str_start}', C_BOLD_WHITE)
print(
f"\n{turn_str}\n{format_color('Tester:', C_BLUE)}\n{step.user_input.rstrip()}"
)
try:
await asyncio.wait_for(run_turn(agent, step.user_input),
await asyncio.wait_for(_execute_turn(step.user_input),
timeout=playbook_timeout)
step.skill_response = agent.conversation.last_response
except asyncio.TimeoutError:
@@ -817,15 +889,19 @@ async def run_hybrid_tuning_loop(playbook_path: str, log_dir: str,
if next_input:
print(f"{format_color('Tester:', C_BLUE)}\n{next_input.rstrip()}")
deviation_count = 0
for turn in range(max_turns):
if next_input:
turn_display = len(conversation_history) + 1
turn_str = format_color(f'[Autonomous Turn {turn_display}]',
C_BOLD_WHITE)
usage_str_start = _get_usage_str(agent)
turn_str = format_color(
f'[Autonomous Turn {turn_display}]{usage_str_start}',
C_BOLD_WHITE)
print(f"\n{turn_str}")
try:
await asyncio.wait_for(run_turn(agent, next_input),
await asyncio.wait_for(_execute_turn(next_input),
timeout=playbook_timeout)
agent_response = agent.conversation.last_response
except asyncio.TimeoutError:
@@ -899,20 +975,31 @@ async def run_hybrid_tuning_loop(playbook_path: str, log_dir: str,
parsed_eval = json.loads(eval_response.text)
if not parsed_eval['agent_followed_skill_rules']:
label = format_color('[AUTONOMOUS FAIL]', C_GRAY)
msg = format_color(parsed_eval['reasoning'], C_RED)
print(f"{label}: {msg}")
dump_failed_log(log_dir, log_prefix, interaction_log)
return False
deviation_count += 1
label = format_color('[AGENT DEVIATION]', C_YELLOW)
msg = format_color(
f"{parsed_eval['reasoning']} (Deviation {deviation_count}/{max_deviations})",
C_YELLOW)
print(f"⚠️ {label}: {msg}")
if deviation_count > max_deviations:
label_fail = format_color('[AUTONOMOUS FAIL]', C_GRAY)
msg_fail = format_color(
f"Exceeded max allowed deviations ({max_deviations}). Failing test.",
C_RED)
print(f"{label_fail}: {msg_fail}")
dump_failed_log(log_dir, log_prefix, interaction_log)
return False
fallback_to_persona = True # Flag as passed with warning since we recovered from a deviation
if parsed_eval['test_completed_successfully']:
elif parsed_eval['test_completed_successfully']:
label = format_color('[AUTONOMOUS SEMANTIC SUCCESS]', C_GRAY)
msg = format_color(parsed_eval['reasoning'], C_GREEN)
print(f"{label}: {msg}")
print("🔍 Performing deterministic checks...")
if perform_deterministic_checks(interpolated_success_criteria,
workspace_dir, full_stdout):
workspace_dir, executed_tool_calls,
full_stdout):
if fallback_to_persona:
label = format_color('[PASS WITH WARNINGS]', C_GRAY)
msg = format_color(
@@ -946,19 +1033,30 @@ async def run_hybrid_tuning_loop(playbook_path: str, log_dir: str,
dump_failed_log(log_dir, log_prefix, interaction_log)
return False
except Exception as e:
print(format_color(f'\n💥 [CRASH] Unexpected error: {e}', C_RED),
file=sys.stderr)
import traceback
traceback.print_exc()
dump_failed_log(log_dir, log_prefix, interaction_log)
return False
except KeyboardInterrupt:
print('\n🛑 [INTERRUPTED] Shutting down cleanly...')
dump_failed_log(log_dir, log_prefix, interaction_log)
return False
finally:
# Locate and copy the session json to the logs directory
# The SDK saves it in save_dir/chats/session-*.json
session_files = glob.glob(os.path.join(log_dir, 'chats', 'session-*.json'))
if session_files:
session_files.sort(key=os.path.getmtime, reverse=True)
session_log_path = os.path.join(log_dir, f'{log_prefix}_session.json')
shutil.copy2(session_files[0], session_log_path)
# Save the session trace json to the logs directory
session_log_path = os.path.join(log_dir, f'{log_prefix}_session.json')
session_data = {
"messages": conversation_history,
"toolCalls": executed_tool_calls
}
try:
with open(session_log_path, 'w') as f:
json.dump(session_data, f, indent=2)
print(f'📄 Session JSON saved to: {session_log_path}')
except Exception as e:
print(f'⚠️ [WARNING] Failed to write session JSON: {e}', file=sys.stderr)
if is_tmpdir:
os.chdir(original_cwd)
@@ -1006,13 +1104,20 @@ async def run_hybrid_tuning_loop(playbook_path: str, log_dir: str,
help=
'Override the model the test harness uses to grade (e.g., gemini-2.5-flash).',
)
@click.option(
'--max-deviations',
type=int,
default=3,
help=
'Number of deviations/mistakes the agent can make before failing (allows human recovery).',
)
@click.option(
'--debug',
is_flag=True,
help='Enable debug logging for the SDK.',
)
def main(playbook, log_dir, skill_src, env_file, keep_workspace, agent_model,
evaluator_model, debug):
evaluator_model, max_deviations, debug):
'''Hybrid Python SDK Test Harness.
Executes a YAML playbook using the Antigravity SDK and evaluates the
@@ -1037,7 +1142,7 @@ def main(playbook, log_dir, skill_src, env_file, keep_workspace, agent_model,
asyncio.run(
run_hybrid_tuning_loop(playbook, log_dir, skill_src, keep_workspace,
agent_model, evaluator_model))
agent_model, evaluator_model, max_deviations))
if __name__ == '__main__':

View File

@@ -70,7 +70,7 @@ persona:
- "domain: custom-apis.domain"
- "prefix: cust"
- "primary: u-custom-region1"
- "gcp-organization-admins: principal://iam.googleapis.com/locations/global/workforcePools/my-pool/subject/my-user@custom.cloud.domain"
- "principal://iam.googleapis.com/locations/global/workforcePools/my-pool/subject/my-user@custom.cloud.domain"
"custom-fast-config/providers/0-org-setup-providers.tf":
- "universe_domain"
- "custom-apis.domain"

View File

@@ -4,7 +4,7 @@ description: 'A simple skill to test if the agent correctly executes tools.'
---
# Instructions
You are a simple file-creating agent. When the user asks you to create a file, you MUST use the `write_file` tool to create a file named `output.txt` in the current directory.
You are a simple file-creating agent. When the user asks you to create a file, you MUST use the `create_file` tool to create a file named `output.txt` in the current directory.
The content of the file must be exactly: `Hello World`
Once you have successfully executed the tool, tell the user that the file has been created.

View File

@@ -14,19 +14,25 @@
# yaml-language-server: $schema=../../playbooks/playbook.schema.json
name: "Tool Call Test Autonomous"
timeout: 30
timeout: 120
tmpdir:
link_paths:
- tools/skill-turn-harness/test/fixtures/mock-tool-use-skill
persona:
initial_user_input: "Hi, please activate the tool-test-skill and create the file as instructed."
context: |
You are a user asking the agent to create a file.
Wait for the agent to confirm the file has been created.
max_turns: 3
If the agent asks any questions or says there is an error, repeat your request to create the file.
max_turns: 5
success_criteria:
llm_checks:
- "The agent confirmed the file was created."
tool_calls_contain:
write_file:
edit_file:
- "output.txt"
- "Hello World"
files_exist:
- "output.txt"
files_contain:
output.txt:
- "Hello World"

View File

@@ -196,6 +196,49 @@ steps:
assert 'SYSTEM_ERROR: Timeout' in content
@pytest.mark.asyncio
@patch('harness.Agent')
async def test_run_turn_generator(mock_agent_class):
  """run_turn yields a typed event for a thinking delta, a tool call and an error."""
  sdk = harness.agy_types

  # Fake SDK step stream: one thinking step, one tool-call step, one error step.
  async def mock_receive_steps():
    yield sdk.Step(type=sdk.StepType.UNKNOWN, status=sdk.StepStatus.DONE,
                   thinking_delta="Thinking about it")
    listing_call = sdk.ToolCall(id="tc-1", name="list_directory",
                                args={"path": "/tmp"})
    yield sdk.Step(type=sdk.StepType.TOOL_CALL, status=sdk.StepStatus.DONE,
                   tool_calls=[listing_call])
    yield sdk.Step(type=sdk.StepType.TEXT_RESPONSE,
                   status=sdk.StepStatus.ERROR, error="Something went wrong")

  conversation = MagicMock()
  conversation.send = AsyncMock()
  conversation.receive_steps.return_value = mock_receive_steps()

  agent = MagicMock()
  agent.conversation = conversation

  # Drain the async generator under test.
  events = [event async for event in harness.run_turn(agent, "Hi")]

  # Exactly one event per step, in stream order, with the right payloads.
  assert len(events) == 3
  thinking_event, tool_event, error_event = events

  assert isinstance(thinking_event, harness.ThinkingDeltaEvent)
  assert thinking_event.text == "Thinking about it"

  assert isinstance(tool_event, harness.ToolCallEvent)
  assert tool_event.name == "list_directory"
  assert tool_event.args == {"path": "/tmp"}

  assert isinstance(error_event, harness.ErrorEvent)
  assert error_event.message == "Something went wrong"
# --- Phase C: E2E Test ---
@@ -275,37 +318,3 @@ def test_e2e_tool_calls_contain(tmp_path):
session_files = list(tmp_path.glob('*_session.json'))
assert len(session_files) == 1
assert session_files[0].exists()
@pytest.mark.e2e
def test_e2e_working_dir(tmp_path):
'''
Runs an evaluation loop to verify working_dir functionality.
'''
fixtures_dir = os.path.join(os.path.dirname(__file__), 'fixtures')
skill_dir = os.path.join(fixtures_dir, 'mock-tool-use-skill')
# Create a specific subdirectory in tmp_path
workdir_target = tmp_path / "workdir_target"
workdir_target.mkdir()
# Dynamically create a playbook YAML file
playbook_content = f"""# yaml-language-server: $schema=../../playbooks/playbook.schema.json
name: "Tool Test with Workdir"
working_dir: "{workdir_target.resolve()}"
steps:
- user_input: "Hi, please activate tool-test-skill and create the file output.txt."
expected_outcome: "The agent confirms it has created the file."
"""
playbook_path = tmp_path / "playbook_workdir.yaml"
playbook_path.write_text(playbook_content)
result = asyncio.run(
harness.run_hybrid_tuning_loop(str(playbook_path), log_dir=str(tmp_path),
skill_src=skill_dir))
assert result is True
# Verify that output.txt was created INSIDE workdir_target
output_file = workdir_target / "output.txt"
assert output_file.exists()
assert output_file.read_text().strip() == "Hello World"