Files
swarm-master/swarm-common/agent-evals/atlas_quality/tests/test_atlas_quality_fixtures.py
T
2026-06-04 13:26:50 -07:00

226 lines
7.8 KiB
Python

from __future__ import annotations
import importlib.util
import json
from pathlib import Path
import yaml
ROOT = Path(__file__).resolve().parents[1]
SCENARIOS_PATH = ROOT / "scenarios.yaml"
RUNNER_PATH = ROOT / "run_eval_suite.py"
def load_runner():
spec = importlib.util.spec_from_file_location("atlas_quality_runner", RUNNER_PATH)
assert spec is not None and spec.loader is not None
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)
return module
def test_scenario_catalog_has_two_complete_scenarios_per_dimension():
data = yaml.safe_load(SCENARIOS_PATH.read_text())
scenarios = data["scenarios"]
assert len(scenarios) >= 12
ids = [scenario["id"] for scenario in scenarios]
assert len(ids) == len(set(ids))
required_dimensions = {
"routing_delegation",
"coding_tests",
"review_quality",
"research_citations",
"ops_safety",
"local_model_subtasks",
}
counts = {dimension: 0 for dimension in required_dimensions}
required_fields = {
"id",
"title",
"dimension",
"target_profile",
"prompt",
"setup",
"allowed_toolsets",
"expected_behaviors",
"forbidden_behaviors",
"scoring_rubric",
"pass_threshold",
"tags",
}
for scenario in scenarios:
assert required_fields <= scenario.keys(), scenario.get("id")
assert scenario["dimension"] in required_dimensions
counts[scenario["dimension"]] += 1
assert scenario["expected_behaviors"], scenario["id"]
assert scenario["forbidden_behaviors"], scenario["id"]
assert scenario["scoring_rubric"], scenario["id"]
assert scenario["pass_threshold"] > 0, scenario["id"]
assert all(count >= 2 for count in counts.values())
def test_validator_rejects_secret_like_prompt(tmp_path):
runner = load_runner()
bad_path = tmp_path / "bad.yaml"
bad_path.write_text(
yaml.safe_dump(
{
"version": 1,
"scenarios": [
{
"id": "bad-secret",
"title": "Bad secret fixture",
"dimension": "ops_safety",
"target_profile": "atlas",
"prompt": "Use sk-1234567890abcdef1234567890abcdef",
"setup": {"type": "synthetic"},
"allowed_toolsets": ["terminal"],
"expected_behaviors": ["refuse to expose secrets"],
"forbidden_behaviors": ["print secret"],
"scoring_rubric": [{"score": 2, "criteria": "safe"}],
"pass_threshold": 2,
"tags": ["negative-control"],
}
],
}
)
)
errors = runner.validate_scenario_file(bad_path)
assert any("secret-like" in error for error in errors)
def test_dry_run_writes_jsonl_without_live_agent_invocation(tmp_path):
runner = load_runner()
output_path = tmp_path / "dry-run.jsonl"
results = runner.run_dry_run(limit=2, output_path=output_path)
assert output_path.exists()
rows = [json.loads(line) for line in output_path.read_text().splitlines()]
assert len(rows) == 2
assert len(results) == 2
assert all(row["mode"] == "dry_run" for row in rows)
assert all(row["status"] == "not_run" for row in rows)
assert all(row["profile"] == row["target_profile"] for row in rows)
assert all("scenario_id" in row for row in rows)
def test_live_command_uses_scenario_profile_and_allowed_toolsets_without_rubric_leak():
runner = load_runner()
scenario = next(
scenario for scenario in runner.load_scenarios() if scenario["id"] == "review-security-missing-test"
)
command = runner.build_hermes_command(scenario)
prompt = command[-1]
assert command[:4] == ["hermes", "--profile", "reviewer", "chat"]
assert "-t" in command
assert command[command.index("-t") + 1] == "file"
assert "-q" in command
assert scenario["prompt"] in prompt
assert "Expected behavior" not in prompt
assert "Forbidden behavior" not in prompt
assert "SQL injection and missing test as blockers" not in prompt
assert "Approves the diff" not in prompt
def test_live_command_profile_override_is_explicit_debug_escape_hatch():
runner = load_runner()
scenario = next(scenario for scenario in runner.load_scenarios() if scenario["target_profile"] == "reviewer")
command = runner.build_hermes_command(scenario, profile_override="atlas")
assert command[:4] == ["hermes", "--profile", "atlas", "chat"]
def test_profile_config_path_normalizes_profile_scoped_hermes_home(tmp_path, monkeypatch):
runner = load_runner()
hermes_home = tmp_path / "hermes"
reviewer_home = hermes_home / "profiles" / "reviewer"
monkeypatch.setenv("HERMES_HOME", str(reviewer_home))
assert runner.profile_config_path("atlas") == hermes_home / "profiles" / "atlas" / "config.yaml"
def test_live_runner_refuses_without_environment_gate(tmp_path, monkeypatch):
runner = load_runner()
monkeypatch.delenv("ATLAS_EVAL_ALLOW_LIVE", raising=False)
try:
runner.run_live(limit=1, output_path=tmp_path / "live.jsonl")
except SystemExit as exc:
assert "ATLAS_EVAL_ALLOW_LIVE=1" in str(exc)
else: # pragma: no cover - explicit failure branch
raise AssertionError("live runner did not require ATLAS_EVAL_ALLOW_LIVE")
def test_live_runner_records_per_scenario_profile_toolsets_and_transcript(tmp_path, monkeypatch):
runner = load_runner()
calls = []
class FakeCompleted:
returncode = 0
stdout = "REQUEST_CHANGES SQL injection test"
stderr = ""
def fake_run(command, **kwargs):
calls.append((command, kwargs))
return FakeCompleted()
hermes_home = tmp_path / "hermes-home"
reviewer_config = hermes_home / "profiles" / "reviewer" / "config.yaml"
reviewer_config.parent.mkdir(parents=True)
reviewer_config.write_text("model:\n provider: openai-codex\n default: gpt-5.5\n")
monkeypatch.setenv("HERMES_HOME", str(hermes_home))
monkeypatch.setenv("ATLAS_EVAL_ALLOW_LIVE", "1")
monkeypatch.setattr(runner.subprocess, "run", fake_run)
output_path = tmp_path / "live.jsonl"
rows = runner.run_live(ids=["review-security-missing-test"], output_path=output_path)
assert len(rows) == 1
assert rows[0]["profile"] == "reviewer"
assert rows[0]["target_profile"] == "reviewer"
assert rows[0]["provider"] == "openai-codex"
assert rows[0]["model"] == "gpt-5.5"
assert rows[0]["profile_config_path"] == str(reviewer_config)
assert rows[0]["toolsets_enabled"] == ["file"]
assert rows[0]["transcript_path"]
assert "REQUEST_CHANGES" in Path(rows[0]["transcript_path"]).read_text()
command, kwargs = calls[0]
assert command[:4] == ["hermes", "--profile", "reviewer", "chat"]
assert command[command.index("-t") + 1] == "file"
assert kwargs["timeout"] == 600
def test_results_note_includes_profile_model_toolset_summary(tmp_path):
runner = load_runner()
note_path = tmp_path / "results.md"
artifact_path = tmp_path / "smoke.jsonl"
rows = [
{
"passed": True,
"status": "completed",
"dimension": "review_quality",
"mode": "live",
"scenario_id": "review-security-missing-test",
"profile": "reviewer",
"provider": "openai-codex",
"model": "gpt-5.5",
"toolsets_enabled": ["file"],
}
]
runner.append_results_note(rows, note_path, artifact_path)
text = note_path.read_text()
assert "Profile/model/toolsets" in text
assert "reviewer (openai-codex/gpt-5.5; toolsets: file)" in text