Files
hunfabric/tools/skill-turn-harness/test/test_harness.py
Ludovico Magnocavallo c24dae395b Enhance testing harness stability and update repository documentation (#3983)
* Refactor skill turn harness, fix session serialization, and resolve E2E test failures

* Ignore symlinks during workspace copying and enforce sandbox boundaries in playbooks

* Refactor interaction loop to use clean async generator-based Event flow

* Introduce dedicated async generator test and improve autonomous tester instructions

* Enforce strict sandbox awareness and Step 8 policy import gates

* Track and display conversation context size next to turn headers

* Streamline token usage display to only appear in turn step headers

* Refactor token usage tracking to show actual active context size

* Implement progress tracking block and human recovery in test harness

* docs: document and categorize repository skills and tools

* docs: add maintenance instructions for updating FACTORIES.md tables

* docs: add missing data-catalog-policy-tag factory in FACTORIES.md

* docs: add missing networking stage sub-factories in FACTORIES.md

* docs: add systematic commands for discovering module/stage factories in FACTORIES.md

* docs: add missing vpcs factories in 0-org-setup and 2-project-factory stages
2026-05-24 12:25:50 +02:00

321 lines
10 KiB
Python

# Copyright 2026 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import json
import subprocess
import asyncio
from unittest.mock import patch, MagicMock, AsyncMock, PropertyMock
import pytest
from dataclasses import asdict
import harness
# --- Phase A: Data & Logging Unit Tests ---
def test_parse_and_validate_env(monkeypatch):
  '''Check validation of the environment variables a playbook requires.

  A playbook listing TEST_KEY under 'env' must be rejected with ValueError
  while the variable is unset, and must return a mapping that contains the
  variable's value once it is set.
  '''
  playbook = {'env': ['TEST_KEY']}
  # Start from a known state: a TEST_KEY inherited from the invoking shell
  # would make the "missing" branch below impossible to exercise.
  monkeypatch.delenv('TEST_KEY', raising=False)

  # Missing key raises error
  with pytest.raises(ValueError,
                     match='Missing required environment variables: TEST_KEY'):
    harness.parse_and_validate_env(playbook)

  # Present key succeeds
  monkeypatch.setenv('TEST_KEY', '123')
  result = harness.parse_and_validate_env(playbook)
  assert result['TEST_KEY'] == '123'
def test_step_data_serialization():
  '''A StepData instance converts to a plain dict through asdict().'''
  evaluation = {'passed': True, 'reasoning': 'ok'}
  record = harness.StepData(step_index=0, user_input='hello',
                            expected_outcome='greet back',
                            skill_response='hi', parsed_eval=evaluation,
                            is_system_error=False)

  serialized = asdict(record)

  # Scalar fields must come through unchanged.
  expected_scalars = (
      ('step_index', 0),
      ('user_input', 'hello'),
      ('expected_outcome', 'greet back'),
  )
  for field_name, expected in expected_scalars:
    assert serialized[field_name] == expected
  # The nested evaluation dict must still be reachable by key.
  assert serialized['parsed_eval']['passed'] is True
def test_load_env_file(tmp_path):
  '''Check that load_env_file exports KEY=VALUE pairs from a dotenv file.

  The fixture file contains a comment line and a value that itself holds an
  '=' sign; the test expects everything after the first '=' to be kept as
  the value.
  '''
  env_file = tmp_path / '.env'
  env_file.write_text('FOO=bar\n# comment\nBAZ=qux=123\n')

  # patch.dict snapshots os.environ and restores it on exit, so the variables
  # loaded here cannot leak into tests that run later in the same session.
  with patch.dict(os.environ):
    # Drop inherited values so the assertions reflect only the file contents.
    os.environ.pop('FOO', None)
    os.environ.pop('BAZ', None)

    harness.load_env_file(str(env_file))

    assert os.environ.get('FOO') == 'bar'
    assert os.environ.get('BAZ') == 'qux=123'
def test_markdown_logging(tmp_path):
  '''Initialise a markdown log, append one step, and check what was written.'''
  log_file = tmp_path / 'test_log.md'
  log_path = str(log_file)
  verdict = {'passed': True, 'reasoning': 'Good job'}

  harness.init_markdown_log(log_path, 'Test Playbook')
  harness.log_step_to_markdown(md_log_path=log_path, step_index=0,
                               user_input='input 1',
                               expected_outcome='outcome 1',
                               skill_response='response 1',
                               parsed_eval=verdict)

  content = log_file.read_text()
  # Title, 1-based step header, the three labelled sections and the verdict.
  expected_fragments = (
      '# Interaction Log: Test Playbook',
      '## Step 1',
      '**User:**\n\ninput 1',
      '**Expected Outcome:**\n\noutcome 1',
      '**Agent:**\n\nresponse 1',
      '✅ PASS: Good job',
  )
  for fragment in expected_fragments:
    assert fragment in content
def test_dump_failed_log(tmp_path):
  '''dump_failed_log writes the interaction log to <prefix>_failed.json.'''
  prefix = 'test-playbook-prefix'
  entries = [{'step': 1, 'error': 'test'}]

  harness.dump_failed_log(str(tmp_path), prefix, entries)

  failed_file = tmp_path / f'{prefix}_failed.json'
  assert failed_file.exists()
  # The file must round-trip as JSON with the single entry intact.
  data = json.loads(failed_file.read_text())
  assert len(data) == 1
  assert data[0]['error'] == 'test'
# --- Phase B: Execution Unit Tests (Mocked) ---
@patch('harness.genai.Client')
@patch('harness.Agent')
def test_run_hybrid_tuning_loop_mocked_success(mock_agent_class,
                                               mock_client_class, tmp_path):
  '''Run a one-step playbook against a mocked agent and a mocked evaluator.

  The mocked conversation streams a single text response and the mocked
  genai client returns a passing JSON verdict. The loop is expected to
  return True, send the step's user input exactly once, and call the
  evaluator exactly once.
  '''
  # Mock Conversation
  mock_conversation = MagicMock()
  mock_conversation.send = AsyncMock()

  async def mock_receive_steps():
    yield harness.agy_types.Step(
        type=harness.agy_types.StepType.TEXT_RESPONSE,
        source=harness.agy_types.StepSource.MODEL,
        target=harness.agy_types.StepTarget.USER,
        status=harness.agy_types.StepStatus.DONE,
        content="Mocked Agent Response")

  mock_conversation.receive_steps.return_value = mock_receive_steps()
  # PropertyMock has to be attached to the mock's type, not the instance.
  type(mock_conversation).last_response = PropertyMock(
      return_value="Mocked Agent Response")

  # Mock Agent: entering the patched Agent as an async context manager
  # hands back the mock that carries the conversation above.
  mock_agent = MagicMock()
  mock_agent.conversation = mock_conversation
  mock_agent_class.return_value.__aenter__.return_value = mock_agent

  # Mock Evaluator
  mock_eval_client = MagicMock()
  mock_client_class.return_value = mock_eval_client
  mock_eval_response = MagicMock()
  mock_eval_response.text = '{"passed": true, "reasoning": "Mocked pass"}'
  mock_eval_client.models.generate_content.return_value = mock_eval_response

  # Playbook
  playbook_content = """
name: "Mocked Playbook"
steps:
  - user_input: "Hello"
    expected_outcome: "Greet"
"""
  playbook_file = tmp_path / "playbook.yaml"
  playbook_file.write_text(playbook_content)

  result = asyncio.run(
      harness.run_hybrid_tuning_loop(str(playbook_file), log_dir=str(tmp_path)))

  assert result is True
  mock_conversation.send.assert_called_once_with("Hello")
  mock_eval_client.models.generate_content.assert_called_once()
@patch('harness.genai.Client')
@patch('harness.Agent')
def test_run_hybrid_tuning_loop_mocked_timeout(mock_agent_class,
                                               mock_client_class, tmp_path):
  '''Check that a timeout while sending a step fails the run and is logged.

  The mocked conversation raises asyncio.TimeoutError from send(). The loop
  is expected to return False after a single send attempt and to leave one
  markdown log in log_dir containing 'SYSTEM_ERROR: Timeout'.
  '''
  # Mock genai.Client
  mock_client_class.return_value = MagicMock()

  mock_conversation = MagicMock()
  mock_conversation.send = AsyncMock(side_effect=asyncio.TimeoutError())

  async def empty_gen():
    # The unreachable yield makes this an async generator that produces
    # no steps at all.
    if False:
      yield

  mock_conversation.receive_steps.return_value = empty_gen()

  # Entering the patched Agent as an async context manager hands back the
  # mock that carries the failing conversation.
  mock_agent = MagicMock()
  mock_agent.conversation = mock_conversation
  mock_agent_class.return_value.__aenter__.return_value = mock_agent

  # Playbook
  playbook_content = """
name: "Mocked Playbook"
steps:
  - user_input: "Hello"
    expected_outcome: "Greet"
"""
  playbook_file = tmp_path / "playbook.yaml"
  playbook_file.write_text(playbook_content)

  result = asyncio.run(
      harness.run_hybrid_tuning_loop(str(playbook_file), log_dir=str(tmp_path)))

  assert result is False
  mock_conversation.send.assert_called_once_with("Hello")

  # Exactly one markdown log must exist and it must record the timeout.
  log_files = list(tmp_path.glob('*_log.md'))
  assert len(log_files) == 1
  content = log_files[0].read_text()
  assert 'SYSTEM_ERROR: Timeout' in content
@pytest.mark.asyncio
@patch('harness.Agent')
async def test_run_turn_generator(mock_agent_class):
  '''run_turn maps streamed SDK steps to thinking, tool-call and error events.'''
  types = harness.agy_types

  # Steps the mocked conversation streams back, in order.
  async def mock_receive_steps():
    yield types.Step(type=types.StepType.UNKNOWN, status=types.StepStatus.DONE,
                     thinking_delta="Thinking about it")
    listing_call = types.ToolCall(id="tc-1", name="list_directory",
                                  args={"path": "/tmp"})
    yield types.Step(type=types.StepType.TOOL_CALL,
                     status=types.StepStatus.DONE, tool_calls=[listing_call])
    yield types.Step(type=types.StepType.TEXT_RESPONSE,
                     status=types.StepStatus.ERROR,
                     error="Something went wrong")

  conversation = MagicMock()
  conversation.send = AsyncMock()
  conversation.receive_steps.return_value = mock_receive_steps()
  agent = MagicMock()
  agent.conversation = conversation

  # Drain the run_turn async generator into a list.
  events = [event async for event in harness.run_turn(agent, "Hi")]

  # One event per step, each with the matching type and payload.
  assert len(events) == 3
  thinking, tool_call, failure = events

  assert isinstance(thinking, harness.ThinkingDeltaEvent)
  assert thinking.text == "Thinking about it"

  assert isinstance(tool_call, harness.ToolCallEvent)
  assert tool_call.name == "list_directory"
  assert tool_call.args == {"path": "/tmp"}

  assert isinstance(failure, harness.ErrorEvent)
  assert failure.message == "Something went wrong"
# --- Phase C: E2E Test ---
@pytest.mark.e2e
def test_e2e_hybrid_tuning_loop(tmp_path):
  '''
  Runs the real evaluation loop with the mock-conversation-skill fixture and
  the scripted env-substitution playbook.
  Logs go to tmp_path so the actual workspace logs are not polluted.
  '''
  here = os.path.dirname(__file__)
  fixtures_dir = os.path.join(here, 'fixtures')
  env_file_path = os.path.join(fixtures_dir, '.env.test')
  playbook_path = os.path.join(fixtures_dir,
                               'playbook_scripted_env_substitution.yaml')
  skill_dir = os.path.join(fixtures_dir, 'mock-conversation-skill')

  # Prime os.environ from the fixture env file before the run.
  harness.load_env_file(env_file_path)

  loop = harness.run_hybrid_tuning_loop(playbook_path, log_dir=str(tmp_path),
                                        skill_src=skill_dir)
  result = asyncio.run(loop)
  assert result is True

  # Exactly one markdown log must have been created in the temp directory.
  log_files = list(tmp_path.glob('*_log.md'))
  assert len(log_files) == 1
  log_file = log_files[0]
  assert log_file.exists()

  content = log_file.read_text()
  assert '✅ PASS' in content
  # The placeholder must have been replaced by the value from the env file.
  assert 'dummy-secret-12345' in content
  assert '${MY_SECRET_ID}' not in content
@pytest.mark.e2e
def test_e2e_autonomous_tuning_loop(tmp_path):
  '''
  Runs the evaluation loop with the autonomous-conversation playbook against
  the mock-conversation-skill fixture.
  '''
  here = os.path.dirname(__file__)
  fixtures_dir = os.path.join(here, 'fixtures')
  env_file_path = os.path.join(fixtures_dir, '.env.test')
  playbook_path = os.path.join(fixtures_dir,
                               'playbook_autonomous_conversation.yaml')
  skill_dir = os.path.join(fixtures_dir, 'mock-conversation-skill')

  # Prime os.environ from the fixture env file before the run.
  harness.load_env_file(env_file_path)

  loop = harness.run_hybrid_tuning_loop(playbook_path, log_dir=str(tmp_path),
                                        skill_src=skill_dir)
  result = asyncio.run(loop)
  assert result is True

  log_files = list(tmp_path.glob('*_log.md'))
  assert len(log_files) == 1
  content = log_files[0].read_text()
  # The log must contain an autonomous turn header and the env-file value.
  assert '## Autonomous Turn 1' in content
  assert 'dummy-secret-12345' in content
@pytest.mark.e2e
def test_e2e_tool_calls_contain(tmp_path):
  '''
  Runs the autonomous tool-use playbook against the mock-tool-use-skill
  fixture to exercise the tool_calls_contain deterministic checks.
  '''
  here = os.path.dirname(__file__)
  fixtures_dir = os.path.join(here, 'fixtures')
  playbook_path = os.path.join(fixtures_dir,
                               'playbook_autonomous_tool_use.yaml')
  skill_dir = os.path.join(fixtures_dir, 'mock-tool-use-skill')

  loop = harness.run_hybrid_tuning_loop(playbook_path, log_dir=str(tmp_path),
                                        skill_src=skill_dir)
  result = asyncio.run(loop)
  assert result is True

  # Exactly one session JSON file must have been saved in log_dir.
  session_files = list(tmp_path.glob('*_session.json'))
  assert len(session_files) == 1
  assert session_files[0].exists()