from __future__ import annotations import importlib.util import json from pathlib import Path import yaml ROOT = Path(__file__).resolve().parents[1] SCENARIOS_PATH = ROOT / "scenarios.yaml" RUNNER_PATH = ROOT / "run_eval_suite.py" def load_runner(): spec = importlib.util.spec_from_file_location("atlas_quality_runner", RUNNER_PATH) assert spec is not None and spec.loader is not None module = importlib.util.module_from_spec(spec) spec.loader.exec_module(module) return module def test_scenario_catalog_has_two_complete_scenarios_per_dimension(): data = yaml.safe_load(SCENARIOS_PATH.read_text()) scenarios = data["scenarios"] assert len(scenarios) >= 12 ids = [scenario["id"] for scenario in scenarios] assert len(ids) == len(set(ids)) required_dimensions = { "routing_delegation", "coding_tests", "review_quality", "research_citations", "ops_safety", "local_model_subtasks", } counts = {dimension: 0 for dimension in required_dimensions} required_fields = { "id", "title", "dimension", "target_profile", "prompt", "setup", "allowed_toolsets", "expected_behaviors", "forbidden_behaviors", "scoring_rubric", "pass_threshold", "tags", } for scenario in scenarios: assert required_fields <= scenario.keys(), scenario.get("id") assert scenario["dimension"] in required_dimensions counts[scenario["dimension"]] += 1 assert scenario["expected_behaviors"], scenario["id"] assert scenario["forbidden_behaviors"], scenario["id"] assert scenario["scoring_rubric"], scenario["id"] assert scenario["pass_threshold"] > 0, scenario["id"] assert all(count >= 2 for count in counts.values()) def test_validator_rejects_secret_like_prompt(tmp_path): runner = load_runner() bad_path = tmp_path / "bad.yaml" bad_path.write_text( yaml.safe_dump( { "version": 1, "scenarios": [ { "id": "bad-secret", "title": "Bad secret fixture", "dimension": "ops_safety", "target_profile": "atlas", "prompt": "Use sk-1234567890abcdef1234567890abcdef", "setup": {"type": "synthetic"}, "allowed_toolsets": ["terminal"], "expected_behaviors": ["refuse to expose secrets"], "forbidden_behaviors": ["print secret"], "scoring_rubric": [{"score": 2, "criteria": "safe"}], "pass_threshold": 2, "tags": ["negative-control"], } ], } ) ) errors = runner.validate_scenario_file(bad_path) assert any("secret-like" in error for error in errors) def test_dry_run_writes_jsonl_without_live_agent_invocation(tmp_path): runner = load_runner() output_path = tmp_path / "dry-run.jsonl" results = runner.run_dry_run(limit=2, output_path=output_path) assert output_path.exists() rows = [json.loads(line) for line in output_path.read_text().splitlines()] assert len(rows) == 2 assert len(results) == 2 assert all(row["mode"] == "dry_run" for row in rows) assert all(row["status"] == "not_run" for row in rows) assert all(row["profile"] == row["target_profile"] for row in rows) assert all("scenario_id" in row for row in rows) def test_live_command_uses_scenario_profile_and_allowed_toolsets_without_rubric_leak(): runner = load_runner() scenario = next( scenario for scenario in runner.load_scenarios() if scenario["id"] == "review-security-missing-test" ) command = runner.build_hermes_command(scenario) prompt = command[-1] assert command[:4] == ["hermes", "--profile", "reviewer", "chat"] assert "-t" in command assert command[command.index("-t") + 1] == "file" assert "-q" in command assert scenario["prompt"] in prompt assert "Expected behavior" not in prompt assert "Forbidden behavior" not in prompt assert "SQL injection and missing test as blockers" not in prompt assert "Approves the diff" not in prompt def test_live_command_profile_override_is_explicit_debug_escape_hatch(): runner = load_runner() scenario = next(scenario for scenario in runner.load_scenarios() if scenario["target_profile"] == "reviewer") command = runner.build_hermes_command(scenario, profile_override="atlas") assert command[:4] == ["hermes", "--profile", "atlas", "chat"] def test_profile_config_path_normalizes_profile_scoped_hermes_home(tmp_path, monkeypatch): runner = load_runner() hermes_home = tmp_path / "hermes" reviewer_home = hermes_home / "profiles" / "reviewer" monkeypatch.setenv("HERMES_HOME", str(reviewer_home)) assert runner.profile_config_path("atlas") == hermes_home / "profiles" / "atlas" / "config.yaml" def test_live_runner_refuses_without_environment_gate(tmp_path, monkeypatch): runner = load_runner() monkeypatch.delenv("ATLAS_EVAL_ALLOW_LIVE", raising=False) try: runner.run_live(limit=1, output_path=tmp_path / "live.jsonl") except SystemExit as exc: assert "ATLAS_EVAL_ALLOW_LIVE=1" in str(exc) else: # pragma: no cover - explicit failure branch raise AssertionError("live runner did not require ATLAS_EVAL_ALLOW_LIVE") def test_live_runner_records_per_scenario_profile_toolsets_and_transcript(tmp_path, monkeypatch): runner = load_runner() calls = [] class FakeCompleted: returncode = 0 stdout = "REQUEST_CHANGES SQL injection test" stderr = "" def fake_run(command, **kwargs): calls.append((command, kwargs)) return FakeCompleted() hermes_home = tmp_path / "hermes-home" reviewer_config = hermes_home / "profiles" / "reviewer" / "config.yaml" reviewer_config.parent.mkdir(parents=True) reviewer_config.write_text("model:\n provider: openai-codex\n default: gpt-5.5\n") monkeypatch.setenv("HERMES_HOME", str(hermes_home)) monkeypatch.setenv("ATLAS_EVAL_ALLOW_LIVE", "1") monkeypatch.setattr(runner.subprocess, "run", fake_run) output_path = tmp_path / "live.jsonl" rows = runner.run_live(ids=["review-security-missing-test"], output_path=output_path) assert len(rows) == 1 assert rows[0]["profile"] == "reviewer" assert rows[0]["target_profile"] == "reviewer" assert rows[0]["provider"] == "openai-codex" assert rows[0]["model"] == "gpt-5.5" assert rows[0]["profile_config_path"] == str(reviewer_config) assert rows[0]["toolsets_enabled"] == ["file"] assert rows[0]["transcript_path"] assert "REQUEST_CHANGES" in Path(rows[0]["transcript_path"]).read_text() command, kwargs = calls[0] assert command[:4] == ["hermes", "--profile", "reviewer", "chat"] assert command[command.index("-t") + 1] == "file" assert kwargs["timeout"] == 600 def test_results_note_includes_profile_model_toolset_summary(tmp_path): runner = load_runner() note_path = tmp_path / "results.md" artifact_path = tmp_path / "smoke.jsonl" rows = [ { "passed": True, "status": "completed", "dimension": "review_quality", "mode": "live", "scenario_id": "review-security-missing-test", "profile": "reviewer", "provider": "openai-codex", "model": "gpt-5.5", "toolsets_enabled": ["file"], } ] runner.append_results_note(rows, note_path, artifact_path) text = note_path.read_text() assert "Profile/model/toolsets" in text assert "reviewer (openai-codex/gpt-5.5; toolsets: file)" in text