* Refactor skill turn harness, fix session serialization, and resolve E2E test failures * Ignore symlinks during workspace copying and enforce sandbox boundaries in playbooks * Refactor interaction loop to use clean async generator-based Event flow * Introduce dedicated async generator test and improve autonomous tester instructions * Enforce strict sandbox awareness and Step 8 policy import gates * Track and display conversation context size next to turn headers * Streamline token usage display to only appear in turn step headers * Refactor token usage tracking to show actual active context size * Implement progress tracking block and human recovery in test harness * docs: document and categorize repository skills and tools * docs: add maintenance instructions for updating FACTORIES.md tables * docs: add missing data-catalog-policy-tag factory in FACTORIES.md * docs: add missing networking stage sub-factories in FACTORIES.md * docs: add systematic commands for discovering module/stage factories in FACTORIES.md * docs: add missing vpcs factories in 0-org-setup and 2-project-factory stages
321 lines
10 KiB
Python
321 lines
10 KiB
Python
# Copyright 2026 Google LLC
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# https://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
import os
|
|
import json
|
|
import subprocess
|
|
import asyncio
|
|
from unittest.mock import patch, MagicMock, AsyncMock, PropertyMock
|
|
import pytest
|
|
from dataclasses import asdict
|
|
|
|
import harness
|
|
|
|
# --- Phase A: Data & Logging Unit Tests ---
|
|
|
|
|
|
def test_parse_and_validate_env(monkeypatch):
    '''
    Checks that parse_and_validate_env enforces the playbook's env keys.

    With TEST_KEY unset the call must raise ValueError naming the key; once
    the variable is set, its value must be present in the returned mapping.
    '''
    playbook = {'env': ['TEST_KEY']}

    # Make sure the variable is really absent, so the failure branch does not
    # depend on whatever environment the test runner inherited.
    monkeypatch.delenv('TEST_KEY', raising=False)

    # Missing key raises error
    with pytest.raises(
            ValueError,
            match='Missing required environment variables: TEST_KEY'):
        harness.parse_and_validate_env(playbook)

    # Present key succeeds
    monkeypatch.setenv('TEST_KEY', '123')
    result = harness.parse_and_validate_env(playbook)
    assert result['TEST_KEY'] == '123'
|
|
|
|
|
|
def test_step_data_serialization():
    '''
    Builds a StepData record and checks its fields survive asdict().
    '''
    field_values = {
        'step_index': 0,
        'user_input': 'hello',
        'expected_outcome': 'greet back',
        'skill_response': 'hi',
        'parsed_eval': {'passed': True, 'reasoning': 'ok'},
        'is_system_error': False,
    }
    serialized = asdict(harness.StepData(**field_values))

    assert serialized['step_index'] == 0
    assert serialized['user_input'] == 'hello'
    assert serialized['expected_outcome'] == 'greet back'
    # The nested evaluation dict must be carried over as well.
    assert serialized['parsed_eval']['passed'] is True
|
|
|
|
|
|
def test_load_env_file(tmp_path):
    '''
    Checks that load_env_file exports KEY=VALUE pairs into os.environ.

    The fixture file contains a comment line and a value that itself holds
    an '=' sign; the second assertion expects everything after the first
    '=' to be kept as the value.
    '''
    env_file = tmp_path / '.env'
    env_file.write_text('FOO=bar\n# comment\nBAZ=qux=123\n')

    # patch.dict snapshots os.environ and restores it on exit, so the
    # variables loaded here cannot leak into other tests.
    with patch.dict(os.environ):
        # Start from a known state so pre-existing values cannot influence
        # (or mask) the result.
        os.environ.pop('FOO', None)
        os.environ.pop('BAZ', None)

        harness.load_env_file(str(env_file))

        assert os.environ.get('FOO') == 'bar'
        assert os.environ.get('BAZ') == 'qux=123'
|
|
|
|
|
|
def test_markdown_logging(tmp_path):
    '''
    Initialises a markdown log, appends one step, and checks the rendering.
    '''
    log_file = tmp_path / 'test_log.md'
    log_path = str(log_file)

    harness.init_markdown_log(log_path, 'Test Playbook')
    harness.log_step_to_markdown(
        md_log_path=log_path,
        step_index=0,
        user_input='input 1',
        expected_outcome='outcome 1',
        skill_response='response 1',
        parsed_eval={'passed': True, 'reasoning': 'Good job'},
    )

    content = log_file.read_text()

    # Each fragment must appear in the log; step_index=0 is expected to be
    # rendered under the heading "Step 1".
    expected_fragments = (
        '# Interaction Log: Test Playbook',
        '## Step 1',
        '**User:**\n\ninput 1',
        '**Expected Outcome:**\n\noutcome 1',
        '**Agent:**\n\nresponse 1',
        '✅ PASS: Good job',
    )
    for fragment in expected_fragments:
        assert fragment in content
|
|
|
|
|
|
def test_dump_failed_log(tmp_path):
    '''
    Checks that dump_failed_log writes the log to <prefix>_failed.json.
    '''
    entries = [{'step': 1, 'error': 'test'}]
    harness.dump_failed_log(str(tmp_path), 'test-playbook-prefix', entries)

    dumped = tmp_path / 'test-playbook-prefix_failed.json'
    assert dumped.exists()

    # The file must contain the single entry as JSON.
    loaded = json.loads(dumped.read_text())
    assert len(loaded) == 1
    assert loaded[0]['error'] == 'test'
|
|
|
|
|
|
# --- Phase B: Execution Unit Tests (Mocked) ---
|
|
|
|
|
|
@patch('harness.genai.Client')
@patch('harness.Agent')
def test_run_hybrid_tuning_loop_mocked_success(mock_agent_class,
                                               mock_client_class, tmp_path):
    '''
    Runs the tuning loop with the agent and evaluator client mocked.

    The mocked conversation yields one completed text response and the
    mocked evaluator returns a passing verdict, so the loop is expected to
    return True after sending the single playbook input exactly once.
    '''
    # Mock Conversation
    mock_conversation = MagicMock()
    mock_conversation.send = AsyncMock()

    async def mock_receive_steps():
        yield harness.agy_types.Step(
            type=harness.agy_types.StepType.TEXT_RESPONSE,
            source=harness.agy_types.StepSource.MODEL,
            target=harness.agy_types.StepTarget.USER,
            status=harness.agy_types.StepStatus.DONE,
            content="Mocked Agent Response")

    mock_conversation.receive_steps.return_value = mock_receive_steps()
    # last_response is stubbed as a property, so it is set on the mock's type.
    type(mock_conversation).last_response = PropertyMock(
        return_value="Mocked Agent Response")

    # Mock Agent: the object bound by `async with Agent(...)` is our mock.
    mock_agent = MagicMock()
    mock_agent.conversation = mock_conversation
    mock_agent_class.return_value.__aenter__.return_value = mock_agent

    # Mock Evaluator
    mock_eval_client = MagicMock()
    mock_client_class.return_value = mock_eval_client
    mock_eval_response = MagicMock()
    mock_eval_response.text = '{"passed": true, "reasoning": "Mocked pass"}'
    mock_eval_client.models.generate_content.return_value = mock_eval_response

    # Playbook (kept flush-left: YAML indentation is significant)
    playbook_content = """
name: "Mocked Playbook"
steps:
  - user_input: "Hello"
    expected_outcome: "Greet"
"""
    playbook_file = tmp_path / "playbook.yaml"
    playbook_file.write_text(playbook_content)

    # asyncio is imported at module level; no local import is needed here.
    result = asyncio.run(
        harness.run_hybrid_tuning_loop(str(playbook_file),
                                       log_dir=str(tmp_path)))

    assert result is True
    mock_conversation.send.assert_called_once_with("Hello")
    mock_eval_client.models.generate_content.assert_called_once()
|
|
|
|
|
|
@patch('harness.genai.Client')
@patch('harness.Agent')
def test_run_hybrid_tuning_loop_mocked_timeout(mock_agent_class,
                                               mock_client_class, tmp_path):
    '''
    Runs the tuning loop with a conversation whose send() times out.

    The loop is expected to return False and to record a
    'SYSTEM_ERROR: Timeout' entry in the single markdown log it writes.
    '''
    # Mock genai.Client
    mock_client_class.return_value = MagicMock()

    # asyncio is imported at module level; no local import is needed here.
    mock_conversation = MagicMock()
    mock_conversation.send = AsyncMock(side_effect=asyncio.TimeoutError())

    async def empty_gen():
        # The unreachable yield makes this an async generator that produces
        # no steps.
        if False:
            yield

    mock_conversation.receive_steps.return_value = empty_gen()

    # The object bound by `async with Agent(...)` is our mock.
    mock_agent = MagicMock()
    mock_agent.conversation = mock_conversation
    mock_agent_class.return_value.__aenter__.return_value = mock_agent

    # Playbook (kept flush-left: YAML indentation is significant)
    playbook_content = """
name: "Mocked Playbook"
steps:
  - user_input: "Hello"
    expected_outcome: "Greet"
"""
    playbook_file = tmp_path / "playbook.yaml"
    playbook_file.write_text(playbook_content)

    result = asyncio.run(
        harness.run_hybrid_tuning_loop(str(playbook_file),
                                       log_dir=str(tmp_path)))

    assert result is False
    mock_conversation.send.assert_called_once_with("Hello")

    log_files = list(tmp_path.glob('*_log.md'))
    assert len(log_files) == 1
    content = log_files[0].read_text()
    assert 'SYSTEM_ERROR: Timeout' in content
|
|
|
|
|
|
@pytest.mark.asyncio
@patch('harness.Agent')
async def test_run_turn_generator(mock_agent_class):
    '''
    Feeds three SDK steps through run_turn and checks the yielded events.

    A thinking delta, a tool call and an errored text response are expected
    to come out as ThinkingDeltaEvent, ToolCallEvent and ErrorEvent, in
    that order.
    '''
    sdk = harness.agy_types

    # Steps the mocked conversation will stream back.
    async def mock_receive_steps():
        yield sdk.Step(type=sdk.StepType.UNKNOWN,
                       status=sdk.StepStatus.DONE,
                       thinking_delta="Thinking about it")
        listing_call = sdk.ToolCall(id="tc-1", name="list_directory",
                                    args={"path": "/tmp"})
        yield sdk.Step(type=sdk.StepType.TOOL_CALL,
                       status=sdk.StepStatus.DONE,
                       tool_calls=[listing_call])
        yield sdk.Step(type=sdk.StepType.TEXT_RESPONSE,
                       status=sdk.StepStatus.ERROR,
                       error="Something went wrong")

    mock_conversation = MagicMock()
    mock_conversation.send = AsyncMock()
    mock_conversation.receive_steps.return_value = mock_receive_steps()

    mock_agent = MagicMock()
    mock_agent.conversation = mock_conversation

    # Drain the run_turn async generator into a list.
    events = [event async for event in harness.run_turn(mock_agent, "Hi")]

    # Verify correct types and data are yielded
    assert len(events) == 3
    thinking, tool_call, failure = events

    assert isinstance(thinking, harness.ThinkingDeltaEvent)
    assert thinking.text == "Thinking about it"

    assert isinstance(tool_call, harness.ToolCallEvent)
    assert tool_call.name == "list_directory"
    assert tool_call.args == {"path": "/tmp"}

    assert isinstance(failure, harness.ErrorEvent)
    assert failure.message == "Something went wrong"
|
|
|
|
|
|
# --- Phase C: E2E Test ---
|
|
|
|
|
|
@pytest.mark.e2e
def test_e2e_hybrid_tuning_loop(tmp_path):
    '''
    Runs the tuning loop end to end with the scripted env-substitution
    playbook against the mock-conversation-skill fixture.

    Logs go to tmp_path so the actual workspace logs are not polluted.
    '''
    here = os.path.dirname(__file__)
    fixtures_dir = os.path.join(here, 'fixtures')
    playbook_path = os.path.join(fixtures_dir,
                                 'playbook_scripted_env_substitution.yaml')
    skill_dir = os.path.join(fixtures_dir, 'mock-conversation-skill')

    # Prime os.environ from the fixture env file before the run.
    harness.load_env_file(os.path.join(fixtures_dir, '.env.test'))

    loop = harness.run_hybrid_tuning_loop(playbook_path,
                                          log_dir=str(tmp_path),
                                          skill_src=skill_dir)
    result = asyncio.run(loop)
    assert result is True

    # Exactly one markdown log must have been written to the temp directory.
    log_files = list(tmp_path.glob('*_log.md'))
    assert len(log_files) == 1
    log_file = log_files[0]
    assert log_file.exists()

    content = log_file.read_text()
    assert '✅ PASS' in content
    # The substituted value must be present and the raw placeholder gone.
    assert 'dummy-secret-12345' in content
    assert '${MY_SECRET_ID}' not in content
|
|
|
|
|
|
@pytest.mark.e2e
def test_e2e_autonomous_tuning_loop(tmp_path):
    '''
    Runs the tuning loop end to end with the autonomous conversation
    playbook against the mock-conversation-skill fixture.
    '''
    here = os.path.dirname(__file__)
    fixtures_dir = os.path.join(here, 'fixtures')
    playbook_path = os.path.join(fixtures_dir,
                                 'playbook_autonomous_conversation.yaml')
    skill_dir = os.path.join(fixtures_dir, 'mock-conversation-skill')

    # Prime os.environ from the fixture env file before the run.
    harness.load_env_file(os.path.join(fixtures_dir, '.env.test'))

    loop = harness.run_hybrid_tuning_loop(playbook_path,
                                          log_dir=str(tmp_path),
                                          skill_src=skill_dir)
    result = asyncio.run(loop)
    assert result is True

    log_files = list(tmp_path.glob('*_log.md'))
    assert len(log_files) == 1
    content = log_files[0].read_text()

    # Check that the autonomous turns were logged
    for fragment in ('## Autonomous Turn 1', 'dummy-secret-12345'):
        assert fragment in content
|
|
|
|
|
|
@pytest.mark.e2e
def test_e2e_tool_calls_contain(tmp_path):
    '''
    Runs the autonomous tool-use playbook against the mock-tool-use-skill
    fixture to verify tool_calls_contain deterministic checks.
    '''
    here = os.path.dirname(__file__)
    fixtures_dir = os.path.join(here, 'fixtures')
    playbook_path = os.path.join(fixtures_dir,
                                 'playbook_autonomous_tool_use.yaml')
    skill_dir = os.path.join(fixtures_dir, 'mock-tool-use-skill')

    loop = harness.run_hybrid_tuning_loop(playbook_path,
                                          log_dir=str(tmp_path),
                                          skill_src=skill_dir)
    result = asyncio.run(loop)

    assert result is True
    # Exactly one session JSON file must have been saved to the log directory.
    session_files = list(tmp_path.glob('*_session.json'))
    assert len(session_files) == 1
    assert session_files[0].exists()
|