Enhance testing harness stability and update repository documentation (#3983)

* Refactor skill turn harness, fix session serialization, and resolve E2E test failures
* Ignore symlinks during workspace copying and enforce sandbox boundaries in playbooks
* Refactor interaction loop to use clean async generator-based Event flow
* Introduce dedicated async generator test and improve autonomous tester instructions
* Enforce strict sandbox awareness and Step 8 policy import gates
* Track and display conversation context size next to turn headers
* Streamline token usage display to only appear in turn step headers
* Refactor token usage tracking to show actual active context size
* Implement progress tracking block and human recovery in test harness
* docs: document and categorize repository skills and tools
* docs: add maintenance instructions for updating FACTORIES.md tables
* docs: add missing data-catalog-policy-tag factory in FACTORIES.md
* docs: add missing networking stage sub-factories in FACTORIES.md
* docs: add systematic commands for discovering module/stage factories in FACTORIES.md
* docs: add missing vpcs factories in 0-org-setup and 2-project-factory stages
2026-05-24 12:25:50 +02:00
parent 81f72e8068
commit c24dae395b
13 changed files with 417 additions and 179 deletions
--- a/tools/skill-turn-harness/README.md
+++ b/tools/skill-turn-harness/README.md
@@ -66,6 +66,7 @@ python3 harness.py playbooks/my-playbook.yaml
 - `--keep-workspace` (Optional): Preserve the temporary workspace directory (`/tmp/gemini_harness_*`) after execution to inspect files generated by the agent.
 - `--agent-model <model>` (Optional): Override the model the agent uses (e.g., `gemini-2.5-pro`). Overrides playbook definition.
 - `--evaluator-model <model>` (Optional): Override the model the test harness uses to grade and simulate (e.g., `gemini-2.5-flash`). Overrides playbook definition.
+- `--max-deviations <number>` (Optional): Maximum number of minor deviations (such as rule violations or incorrect tool calls) the agent may make in autonomous/hybrid mode. Each deviation is reported as a warning; the harness fails the test run once this number is exceeded. Defaults to `3`.
 - `--debug` (Optional): Enable verbose debug logging for the SDK (e.g., WebSocket traffic).

 ⚠️ **Security Warning regarding Logs:**
@@ -74,18 +75,27 @@ A default `.gitignore` is provided in the `logs/` directory to prevent committin

 ### Expected Output

-The harness executes the CLI steps, evaluates the responses, and streams the results to the console:
+The harness executes the playbook, rendering the agent's thoughts and tool calls in real time, and streams the results to the console, showing the active context size in the step headers:

 ```text
 --- Tuning: FAST Setup PoC | Workspace: /tmp/gemini_harness_abc123 ---

-[Step 1] Input: Hi, please activate the fast-setup-poc skill and let's configure FAST.
-[Step 1] Output: Hi, let's configure FAST. Please provide your Google Cloud Project ID.
+[Step 1]
+Tester:
+Hi, please activate the fast-setup-poc skill and let's configure FAST.
+  🧠 Thinking:
+  Let's activate the fast-setup-poc skill and check the requirements.
+  🛠️ [Tool Call]: list_directory(path=.)
+  ...
 ✅ [PASS Step 1]: The agent greeted the user ('Hi'), confirmed it was configuring FAST, and asked for the Project ID. All parts of the objective were fulfilled.

+[Step 2] [Context: 4,512]
+Tester:
+my-super-project-123
 ...

 ✅ [SUCCESS] Playbook 'FAST Setup PoC' completed successfully.
+📄 Session JSON saved to: logs/FAST_Setup_PoC_session.json
 📄 Markdown log saved to: logs/FAST_Setup_PoC_log.md
 ```

@@ -137,15 +147,12 @@ Playbooks are written in YAML. For autocompletion and validation in VS Code, add

 If your playbook requires environment variables (e.g., secrets), declare them in the `env` array. You can then reference them in your `steps` using `${VAR_NAME}`. If a variable is declared but not found in the environment (or passed via `--env-file`), the harness will safely halt before execution.

-To run the test in a specific directory (e.g., the repository root), specify `working_dir`. If omitted, a temporary isolated workspace is created.
-
 ```yaml
 # yaml-language-server: $schema=../playbooks/playbook.schema.json
 name: "My Test Playbook"
 timeout: 120
 agent_model: "gemini-2.5-pro"
 evaluator_model: "gemini-2.5-flash"
-working_dir: "." # Run in the directory where harness is executed
 env:
  - MY_API_KEY
 steps:
--- a/tools/skill-turn-harness/harness.py
+++ b/tools/skill-turn-harness/harness.py
@@ -45,7 +45,24 @@ import tempfile

 from dataclasses import dataclass, asdict
 from datetime import datetime
-from typing import Optional, Dict
+from typing import Optional, Dict, Union, AsyncIterator
+
+
+@dataclass
+class ThinkingDeltaEvent:
+  text: str
+
+
+@dataclass
+class ToolCallEvent:
+  name: str
+  args: dict
+
+
+@dataclass
+class ErrorEvent:
+  message: str
+

 # Third-party imports
 import click
@@ -189,6 +206,65 @@ class StreamingTrimmer:
    self.whitespace_buffer = ""


+class ConsoleRenderer:
+  """Handles console formatting and streaming output for interaction turns."""
+
+  def __init__(self):
+    self.trimmer = StreamingTrimmer()
+    self.need_newline = False
+    self.at_start_of_line = True
+
+  def render_thinking(self, text: str):
+    to_print = self.trimmer.process_delta(text)
+    if to_print:
+      if not self.need_newline:
+        print(f"  {format_color('🧠 Thinking:', C_GRAY)}", flush=True)
+        self.need_newline = True
+        self.at_start_of_line = True
+
+      parts = to_print.split('\n')
+      for i, part in enumerate(parts):
+        if i > 0:
+          print('\n', end='')
+          self.at_start_of_line = True
+        if part:
+          if self.at_start_of_line:
+            print('  ', end='')
+            self.at_start_of_line = False
+          print(format_color(part, C_GRAY), end='', flush=True)
+
+  def render_tool_call(self, name: str, args: dict):
+    if self.need_newline:
+      self.trimmer.flush_remaining()
+      print()
+      self.need_newline = False
+    cleaned_args = {
+        k: v for k, v in args.items() if k not in {
+            "output",
+            "results",
+            "num_results",
+            "diff_block",
+            "exit_code",
+            "combined_output",
+            "image_name",
+        }
+    }
+    args_str = ", ".join(f"{k}={v}" for k, v in cleaned_args.items())
+    print(f"  🛠️ {format_color(f'[Tool Call]: {name}({args_str})', C_GRAY)}")
+
+  def render_error(self, message: str):
+    if self.need_newline:
+      self.trimmer.flush_remaining()
+      print()
+      self.need_newline = False
+    print(f"  ❌ [Error]: {message}")
+
+  def finalize(self):
+    if self.need_newline:
+      self.trimmer.flush_remaining()
+      print()
+
+
 def init_markdown_log(md_log_path: str, playbook_name: str):
  '''Initializes the markdown log file with a header.

@@ -386,12 +462,12 @@ def check_files_contain(files_contain: dict, workspace_dir: str) -> bool:


 def check_tool_calls_contain(tool_calls_criteria: dict,
-                             workspace_dir: str) -> bool:
+                             executed_tool_calls: list) -> bool:
  '''Checks if the agent's tool calls contain expected literal strings in their arguments.

    Args:
      tool_calls_criteria: A dictionary mapping tool names to lists of expected strings.
-      workspace_dir: The temporary workspace directory path.
+      executed_tool_calls: A list of recorded tool calls, each being a dict with 'name' and 'args'.

    Returns:
      True if all tool calls contain their expected strings, False otherwise.
@@ -400,32 +476,14 @@ def check_tool_calls_contain(tool_calls_criteria: dict,
    return True

  passed = True
-  workspace_name = os.path.basename(workspace_dir)
-  slugified_name = re.sub(r'[^a-zA-Z0-9]+', '-',
-                          workspace_name).strip('-').lower()
-
-  session_files = glob.glob(
-      os.path.expanduser(
-          f'~/.gemini/tmp/{slugified_name}/chats/session-*.json'))
-  if not session_files:
-    print(
-        "❌ [CHECK FAILED]: Expected session JSON file not found in workspace for tool validation."
-    )
-    return False
-
-  session_files.sort(key=os.path.getmtime, reverse=True)
  try:
-    with open(session_files[0], 'r') as f:
-      session_data = json.load(f)
-
    extracted_calls: Dict[str, str] = {}
-    for m in session_data.get('messages', []):
-      for tc in m.get('toolCalls', []):
-        name = tc.get('name')
-        args_str = json.dumps(tc.get('args', {}))
-        if name not in extracted_calls:
-          extracted_calls[name] = ""
-        extracted_calls[name] += args_str + "\n"
+    for tc in executed_tool_calls:
+      name = tc['name']
+      args_str = json.dumps(tc['args'])
+      if name not in extracted_calls:
+        extracted_calls[name] = ""
+      extracted_calls[name] += args_str + "\n"

    for tool_name, expected_strings in tool_calls_criteria.items():
      if tool_name not in extracted_calls:
@@ -443,19 +501,21 @@ def check_tool_calls_contain(tool_calls_criteria: dict,
          passed = False

  except Exception as e:
-    print(f"❌ [CHECK FAILED]: Failed to parse session JSON: {e}")
+    print(f"❌ [CHECK FAILED]: Failed to process tool calls: {e}")
    passed = False

  return passed


 def perform_deterministic_checks(success_criteria: dict, workspace_dir: str,
+                                 executed_tool_calls: list,
                                 full_stdout: str) -> bool:
  '''Evaluates the deterministic checks defined in the persona success_criteria.

  Args:
    success_criteria: The success_criteria dictionary from the playbook.
    workspace_dir: The temporary workspace directory path.
+    executed_tool_calls: A list of recorded tool calls.
    full_stdout: The combined stdout of all CLI invocations.

  Returns:
@@ -468,7 +528,7 @@ def perform_deterministic_checks(success_criteria: dict, workspace_dir: str,
    passed = False

  if not check_tool_calls_contain(
-      success_criteria.get('tool_calls_contain', {}), workspace_dir):
+      success_criteria.get('tool_calls_contain', {}), executed_tool_calls):
    passed = False

  if not check_files_exist(success_criteria.get('files_exist', []),
@@ -490,74 +550,43 @@ def _view_file_directory_check(args: dict) -> bool:
  return False


-async def run_turn(agent: Agent, user_input: str) -> None:
-  """Sends user input and streams steps in real-time, logging tool calls and errors."""
+async def run_turn(
+    agent: Agent, user_input: str
+) -> AsyncIterator[Union[ThinkingDeltaEvent, ToolCallEvent, ErrorEvent]]:
+  """Sends user input and yields interaction events in real-time."""
  await agent.conversation.send(user_input)
  printed_calls = set()
-  need_newline = False
-  at_start_of_line = True
-  trimmer = StreamingTrimmer()
  async for step_obj in agent.conversation.receive_steps():
    if step_obj.thinking_delta:
-      to_print = trimmer.process_delta(step_obj.thinking_delta)
-      if to_print:
-        if not need_newline:
-          print(f"  {format_color('🧠 Thinking:', C_GRAY)}", flush=True)
-          need_newline = True
-          at_start_of_line = True
-
-        parts = to_print.split('\n')
-        for i, part in enumerate(parts):
-          if i > 0:
-            print('\n', end='')
-            at_start_of_line = True
-          if part:
-            if at_start_of_line:
-              print('  ', end='')
-              at_start_of_line = False
-            print(format_color(part, C_GRAY), end='', flush=True)
+      yield ThinkingDeltaEvent(text=step_obj.thinking_delta)

    if step_obj.type == agy_types.StepType.TOOL_CALL:
      for tc in step_obj.tool_calls:
        if tc.id not in printed_calls:
          printed_calls.add(tc.id)
-          if need_newline:
-            trimmer.flush_remaining()
-            print()
-            need_newline = False
-          cleaned_args = {
-              k: v for k, v in tc.args.items() if k not in {
-                  "output",
-                  "results",
-                  "num_results",
-                  "diff_block",
-                  "exit_code",
-                  "combined_output",
-                  "image_name",
-              }
-          }
-          args_str = ", ".join(f"{k}={v}" for k, v in cleaned_args.items())
-          print(
-              f"  🛠️ {format_color(f'[Tool Call]: {tc.name}({args_str})', C_GRAY)}"
-          )
-    if step_obj.status == agy_types.StepStatus.ERROR:
-      if need_newline:
-        trimmer.flush_remaining()
-        print()
-        need_newline = False
-      error_msg = step_obj.error or "Unknown step error"
-      print(f"  ❌ [Error]: {error_msg}")
+          yield ToolCallEvent(name=tc.name, args=dict(tc.args))

-  if need_newline:
-    trimmer.flush_remaining()
-    print()
+    if step_obj.status == agy_types.StepStatus.ERROR:
+      yield ErrorEvent(message=step_obj.error or "Unknown step error")
+
+
+def _get_usage_str(agent: Agent) -> str:
+  """Safely retrieves the active context size from the agent's conversation."""
+  try:
+    usage = agent.conversation.last_turn_usage
+    if usage and usage.prompt_token_count is not None:
+      return f" [Context: {usage.prompt_token_count:,}]"
+  except Exception:
+    pass
+  return ""


 async def run_hybrid_tuning_loop(playbook_path: str, log_dir: str,
                                 skill_src: str = None,
                                 keep_workspace: bool = False,
                                 cli_agent_model: str = None,
-                                 cli_evaluator_model: str = None):
+                                 cli_evaluator_model: str = None,
+                                 max_deviations: int = 3):
  '''Executes the test playbook and evaluates the agent's responses.

  Args:
@@ -571,6 +600,15 @@ async def run_hybrid_tuning_loop(playbook_path: str, log_dir: str,
  Returns:
    True if the playbook passes completely, False if any step fails.
  '''
+  # Initialize all finally-block dependencies at the very top to avoid NameErrors on early failure
+  log_prefix = "unknown_playbook"
+  conversation_history = []
+  executed_tool_calls = []
+  interaction_log = []
+  is_tmpdir = False
+  workspace_dir = os.getcwd()
+  original_cwd = os.getcwd()
+
  evaluator_client = genai.Client()
  log_dir = os.path.abspath(log_dir)
  os.makedirs(log_dir, exist_ok=True)
@@ -594,25 +632,33 @@ async def run_hybrid_tuning_loop(playbook_path: str, log_dir: str,

  tmpdir_config = playbook.get('tmpdir')
  is_tmpdir = tmpdir_config is not None
-  original_cwd = os.getcwd()

  if is_tmpdir:
    workspace_dir = tempfile.mkdtemp(prefix='gemini_harness_')
    open(os.path.join(workspace_dir, '.project_root'), 'w').close()

+    def _ignore_symlinks_and_patterns(directory, names):
+      ignore_func = shutil.ignore_patterns('.terraform', '.git', '.venv',
+                                           'venv', '__pycache__',
+                                           '.pytest_cache',
+                                           'skill-turn-harness')
+      ignored = set(ignore_func(directory, names))
+      for name in names:
+        if os.path.islink(os.path.join(directory, name)):
+          ignored.add(name)
+      return list(ignored)
+
    link_paths = tmpdir_config.get('link_paths', [])
    for path in link_paths:
      src_abs = os.path.abspath(os.path.join(original_cwd, path))
      dst_abs = os.path.join(workspace_dir, path)
      os.makedirs(os.path.dirname(dst_abs), exist_ok=True)
      try:
-        if os.path.isdir(src_abs):
-          shutil.copytree(
-              src_abs, dst_abs,
-              ignore=shutil.ignore_patterns('.terraform', '.git', '.venv',
-                                            'venv', '__pycache__',
-                                            '.pytest_cache',
-                                            'skill-turn-harness'))
+        if os.path.islink(src_abs):
+          print(f'🔗 Skipped symlink: {path}')
+        elif os.path.isdir(src_abs):
+          shutil.copytree(src_abs, dst_abs,
+                          ignore=_ignore_symlinks_and_patterns)
          print(f'📁 Copied directory: {path} -> {dst_abs}')
        else:
          shutil.copy2(src_abs, dst_abs)
@@ -633,13 +679,19 @@ async def run_hybrid_tuning_loop(playbook_path: str, log_dir: str,

  full_stdout = ""
  conversation_history = []
+  executed_tool_calls = []
  step_index = 0
  fallback_to_persona = False

  # Configure SDK Agent
  skills_paths = []
  if skill_src:
-    skills_paths.append(os.path.abspath(skill_src))
+    if is_tmpdir:
+      # If sandboxed in tmpdir, point to the copied skill path inside the sandbox
+      skills_paths.append(
+          os.path.abspath(os.path.join(workspace_dir, skill_src)))
+    else:
+      skills_paths.append(os.path.abspath(skill_src))

  # Allow all tools to emulate CLI -y/--dangerously-skip-permissions
  policies = [
@@ -654,7 +706,12 @@ async def run_hybrid_tuning_loop(playbook_path: str, log_dir: str,
  standard_instructions = (
      "GUIDELINES:\n"
      "- Always check if a path is a directory before trying to view it. "
-      "Use list_directory to inspect directories, never view_file.")
+      "Use list_directory to inspect directories, never view_file.\n"
+      "- You are running inside an isolated, sandboxed temporary workspace (e.g., /tmp/gemini_harness_*). "
+      "Whenever creating local files, configuration directories (like custom-fast-config or fast-config), "
+      "or checking defaults, you MUST do so strictly relative to your current workspace directory (CWD). "
+      "NEVER try to directly read or write to /home/ludomagno/ or other external folders, as your file tools "
+      "are sandboxed and will fail with permission/step errors.")

  config = LocalAgentConfig(
      model=agent_model,
@@ -668,6 +725,19 @@ async def run_hybrid_tuning_loop(playbook_path: str, log_dir: str,

  try:
    async with Agent(config) as agent:
+
+      async def _execute_turn(user_input_str: str):
+        renderer = ConsoleRenderer()
+        async for event in run_turn(agent, user_input_str):
+          if isinstance(event, ThinkingDeltaEvent):
+            renderer.render_thinking(event.text)
+          elif isinstance(event, ToolCallEvent):
+            executed_tool_calls.append({'name': event.name, 'args': event.args})
+            renderer.render_tool_call(event.name, event.args)
+          elif isinstance(event, ErrorEvent):
+            renderer.render_error(event.message)
+        renderer.finalize()
+
      # --- PHASE 1: SCRIPTED STEPS ---
      for step_dict in playbook_steps:
        raw_user_input = step_dict['user_input']
@@ -681,13 +751,15 @@ async def run_hybrid_tuning_loop(playbook_path: str, log_dir: str,
        step = StepData(step_index=step_index, user_input=subbed_user_input,
                        expected_outcome=subbed_expected_outcome)

-        turn_str = format_color(f'[Step {step.step_index + 1}]', C_BOLD_WHITE)
+        usage_str_start = _get_usage_str(agent) if step_index > 0 else ""
+        turn_str = format_color(
+            f'[Step {step.step_index + 1}]{usage_str_start}', C_BOLD_WHITE)
        print(
            f"\n{turn_str}\n{format_color('Tester:', C_BLUE)}\n{step.user_input.rstrip()}"
        )

        try:
-          await asyncio.wait_for(run_turn(agent, step.user_input),
+          await asyncio.wait_for(_execute_turn(step.user_input),
                                 timeout=playbook_timeout)
          step.skill_response = agent.conversation.last_response
        except asyncio.TimeoutError:
@@ -817,15 +889,19 @@ async def run_hybrid_tuning_loop(playbook_path: str, log_dir: str,
      if next_input:
        print(f"{format_color('Tester:', C_BLUE)}\n{next_input.rstrip()}")

+      deviation_count = 0
+
      for turn in range(max_turns):
        if next_input:
          turn_display = len(conversation_history) + 1
-          turn_str = format_color(f'[Autonomous Turn {turn_display}]',
-                                  C_BOLD_WHITE)
+          usage_str_start = _get_usage_str(agent)
+          turn_str = format_color(
+              f'[Autonomous Turn {turn_display}]{usage_str_start}',
+              C_BOLD_WHITE)
          print(f"\n{turn_str}")

          try:
-            await asyncio.wait_for(run_turn(agent, next_input),
+            await asyncio.wait_for(_execute_turn(next_input),
                                   timeout=playbook_timeout)
            agent_response = agent.conversation.last_response
          except asyncio.TimeoutError:
@@ -899,20 +975,31 @@ async def run_hybrid_tuning_loop(playbook_path: str, log_dir: str,
        parsed_eval = json.loads(eval_response.text)

        if not parsed_eval['agent_followed_skill_rules']:
-          label = format_color('[AUTONOMOUS FAIL]', C_GRAY)
-          msg = format_color(parsed_eval['reasoning'], C_RED)
-          print(f"❌ {label}: {msg}")
-          dump_failed_log(log_dir, log_prefix, interaction_log)
-          return False
+          deviation_count += 1
+          label = format_color('[AGENT DEVIATION]', C_YELLOW)
+          msg = format_color(
+              f"{parsed_eval['reasoning']} (Deviation {deviation_count}/{max_deviations})",
+              C_YELLOW)
+          print(f"⚠️ {label}: {msg}")
+          if deviation_count > max_deviations:
+            label_fail = format_color('[AUTONOMOUS FAIL]', C_GRAY)
+            msg_fail = format_color(
+                f"Exceeded max allowed deviations ({max_deviations}). Failing test.",
+                C_RED)
+            print(f"❌ {label_fail}: {msg_fail}")
+            dump_failed_log(log_dir, log_prefix, interaction_log)
+            return False
+          fallback_to_persona = True  # Flag as passed with warning since we recovered from a deviation

-        if parsed_eval['test_completed_successfully']:
+        elif parsed_eval['test_completed_successfully']:
          label = format_color('[AUTONOMOUS SEMANTIC SUCCESS]', C_GRAY)
          msg = format_color(parsed_eval['reasoning'], C_GREEN)
          print(f"✅ {label}: {msg}")
          print("🔍 Performing deterministic checks...")

          if perform_deterministic_checks(interpolated_success_criteria,
-                                          workspace_dir, full_stdout):
+                                          workspace_dir, executed_tool_calls,
+                                          full_stdout):
            if fallback_to_persona:
              label = format_color('[PASS WITH WARNINGS]', C_GRAY)
              msg = format_color(
@@ -946,19 +1033,30 @@ async def run_hybrid_tuning_loop(playbook_path: str, log_dir: str,
      dump_failed_log(log_dir, log_prefix, interaction_log)
      return False

+  except Exception as e:
+    print(format_color(f'\n💥 [CRASH] Unexpected error: {e}', C_RED),
+          file=sys.stderr)
+    import traceback
+    traceback.print_exc()
+    dump_failed_log(log_dir, log_prefix, interaction_log)
+    return False
  except KeyboardInterrupt:
    print('\n🛑 [INTERRUPTED] Shutting down cleanly...')
    dump_failed_log(log_dir, log_prefix, interaction_log)
    return False
  finally:
-    # Locate and copy the session json to the logs directory
-    # The SDK saves it in save_dir/chats/session-*.json
-    session_files = glob.glob(os.path.join(log_dir, 'chats', 'session-*.json'))
-    if session_files:
-      session_files.sort(key=os.path.getmtime, reverse=True)
-      session_log_path = os.path.join(log_dir, f'{log_prefix}_session.json')
-      shutil.copy2(session_files[0], session_log_path)
+    # Save the session trace json to the logs directory
+    session_log_path = os.path.join(log_dir, f'{log_prefix}_session.json')
+    session_data = {
+        "messages": conversation_history,
+        "toolCalls": executed_tool_calls
+    }
+    try:
+      with open(session_log_path, 'w') as f:
+        json.dump(session_data, f, indent=2)
      print(f'📄 Session JSON saved to: {session_log_path}')
+    except Exception as e:
+      print(f'⚠️ [WARNING] Failed to write session JSON: {e}', file=sys.stderr)

    if is_tmpdir:
      os.chdir(original_cwd)
@@ -1006,13 +1104,20 @@ async def run_hybrid_tuning_loop(playbook_path: str, log_dir: str,
    help=
    'Override the model the test harness uses to grade (e.g., gemini-2.5-flash).',
 )
+@click.option(
+    '--max-deviations',
+    type=int,
+    default=3,
+    help=
+    'Number of deviations/mistakes the agent can make before failing (allows human recovery).',
+)
@click.option(
    '--debug',
    is_flag=True,
    help='Enable debug logging for the SDK.',
 )
 def main(playbook, log_dir, skill_src, env_file, keep_workspace, agent_model,
-         evaluator_model, debug):
+         evaluator_model, max_deviations, debug):
  '''Hybrid Python SDK Test Harness.

  Executes a YAML playbook using the Antigravity SDK and evaluates the
@@ -1037,7 +1142,7 @@ def main(playbook, log_dir, skill_src, env_file, keep_workspace, agent_model,

  asyncio.run(
      run_hybrid_tuning_loop(playbook, log_dir, skill_src, keep_workspace,
-                             agent_model, evaluator_model))
+                             agent_model, evaluator_model, max_deviations))


 if __name__ == '__main__':
--- a/tools/skill-turn-harness/playbooks/fast/prerequisites/gcd-custom-manual-autonomous.yaml
+++ b/tools/skill-turn-harness/playbooks/fast/prerequisites/gcd-custom-manual-autonomous.yaml
@@ -70,7 +70,7 @@ persona:
        - "domain: custom-apis.domain"
        - "prefix: cust"
        - "primary: u-custom-region1"
-        - "gcp-organization-admins: principal://iam.googleapis.com/locations/global/workforcePools/my-pool/subject/my-user@custom.cloud.domain"
+        - "principal://iam.googleapis.com/locations/global/workforcePools/my-pool/subject/my-user@custom.cloud.domain"
      "custom-fast-config/providers/0-org-setup-providers.tf":
        - "universe_domain"
        - "custom-apis.domain"
--- a/tools/skill-turn-harness/test/fixtures/mock-tool-use-skill/SKILL.md
+++ b/tools/skill-turn-harness/test/fixtures/mock-tool-use-skill/SKILL.md
@@ -4,7 +4,7 @@ description: 'A simple skill to test if the agent correctly executes tools.'
 ---

 # Instructions
-You are a simple file-creating agent. When the user asks you to create a file, you MUST use the `write_file` tool to create a file named `output.txt` in the current directory.
+You are a simple file-creating agent. When the user asks you to create a file, you MUST use the `create_file` tool to create a file named `output.txt` in the current directory.
 The content of the file must be exactly: `Hello World`

 Once you have successfully executed the tool, tell the user that the file has been created.
--- a/tools/skill-turn-harness/test/fixtures/playbook_autonomous_tool_use.yaml
+++ b/tools/skill-turn-harness/test/fixtures/playbook_autonomous_tool_use.yaml
@@ -14,19 +14,25 @@

 # yaml-language-server: $schema=../../playbooks/playbook.schema.json
 name: "Tool Call Test Autonomous"
-timeout: 30
+timeout: 120
+tmpdir:
+  link_paths:
+    - tools/skill-turn-harness/test/fixtures/mock-tool-use-skill
 persona:
  initial_user_input: "Hi, please activate the tool-test-skill and create the file as instructed."
  context: |
    You are a user asking the agent to create a file.
    Wait for the agent to confirm the file has been created.
-  max_turns: 3
+    If the agent asks any questions or says there is an error, repeat your request to create the file.
+  max_turns: 5
  success_criteria:
    llm_checks:
      - "The agent confirmed the file was created."
    tool_calls_contain:
-      write_file:
+      edit_file:
        - "output.txt"
-        - "Hello World"
    files_exist:
      - "output.txt"
+    files_contain:
+      output.txt:
+        - "Hello World"
--- a/tools/skill-turn-harness/test/test_harness.py
+++ b/tools/skill-turn-harness/test/test_harness.py
@@ -196,6 +196,49 @@ steps:
  assert 'SYSTEM_ERROR: Timeout' in content


+@pytest.mark.asyncio
+@patch('harness.Agent')
+async def test_run_turn_generator(mock_agent_class):
+  # Mock steps returned by the SDK
+  async def mock_receive_steps():
+    yield harness.agy_types.Step(type=harness.agy_types.StepType.UNKNOWN,
+                                 status=harness.agy_types.StepStatus.DONE,
+                                 thinking_delta="Thinking about it")
+    yield harness.agy_types.Step(
+        type=harness.agy_types.StepType.TOOL_CALL,
+        status=harness.agy_types.StepStatus.DONE, tool_calls=[
+            harness.agy_types.ToolCall(id="tc-1", name="list_directory",
+                                       args={"path": "/tmp"})
+        ])
+    yield harness.agy_types.Step(type=harness.agy_types.StepType.TEXT_RESPONSE,
+                                 status=harness.agy_types.StepStatus.ERROR,
+                                 error="Something went wrong")
+
+  mock_conversation = MagicMock()
+  mock_conversation.send = AsyncMock()
+  mock_conversation.receive_steps.return_value = mock_receive_steps()
+
+  mock_agent = MagicMock()
+  mock_agent.conversation = mock_conversation
+
+  # Consume our new run_turn async generator
+  events = []
+  async for event in harness.run_turn(mock_agent, "Hi"):
+    events.append(event)
+
+  # Verify correct types and data are yielded
+  assert len(events) == 3
+  assert isinstance(events[0], harness.ThinkingDeltaEvent)
+  assert events[0].text == "Thinking about it"
+
+  assert isinstance(events[1], harness.ToolCallEvent)
+  assert events[1].name == "list_directory"
+  assert events[1].args == {"path": "/tmp"}
+
+  assert isinstance(events[2], harness.ErrorEvent)
+  assert events[2].message == "Something went wrong"
+
+
 # --- Phase C: E2E Test ---


@@ -275,37 +318,3 @@ def test_e2e_tool_calls_contain(tmp_path):
  session_files = list(tmp_path.glob('*_session.json'))
  assert len(session_files) == 1
  assert session_files[0].exists()
-
-
-@pytest.mark.e2e
-def test_e2e_working_dir(tmp_path):
-  '''
-  Runs an evaluation loop to verify working_dir functionality.
-  '''
-  fixtures_dir = os.path.join(os.path.dirname(__file__), 'fixtures')
-  skill_dir = os.path.join(fixtures_dir, 'mock-tool-use-skill')
-
-  # Create a specific subdirectory in tmp_path
-  workdir_target = tmp_path / "workdir_target"
-  workdir_target.mkdir()
-
-  # Dynamically create a playbook YAML file
-  playbook_content = f"""# yaml-language-server: $schema=../../playbooks/playbook.schema.json
-name: "Tool Test with Workdir"
-working_dir: "{workdir_target.resolve()}"
-steps:
-  - user_input: "Hi, please activate tool-test-skill and create the file output.txt."
-    expected_outcome: "The agent confirms it has created the file."
-"""
-  playbook_path = tmp_path / "playbook_workdir.yaml"
-  playbook_path.write_text(playbook_content)
-
-  result = asyncio.run(
-      harness.run_hybrid_tuning_loop(str(playbook_path), log_dir=str(tmp_path),
-                                     skill_src=skill_dir))
-
-  assert result is True
-  # Verify that output.txt was created INSIDE workdir_target
-  output_file = workdir_target / "output.txt"
-  assert output_file.exists()
-  assert output_file.read_text().strip() == "Hello World"