import pytest from dotenv import load_dotenv load_dotenv(".env.test", override=True) NOT_FOUND_SCENARIOS = [ { "resource": "sud", "query": "Nájdi Okresný súd v meste Atlantída", "desc": "neexistujúce mesto" }, { "resource": "sudca", "query": "Zobraz profil sudcu s ID sudca_999999", "desc": "neexistujúce ID sudcu" }, { "resource": "rozhodnutie", "query": "Nájdi rozhodnutie s ECLI: ECLI:SK:FAKE:0000:0000000000.0", "desc": "neexistujúce ECLI" }, ] def judge_prompt_for_test_not_found(response_text) -> str: return f""" You are a hallucination auditor. The user asked about a non-existent entity. The API returned NO results. Rate AGENT_RESPONSE on a scale from 1 to 5: 5 - States nothing was found. May explain why (e.g. city doesn't exist). 4 - Correct but adds unnecessary filler text. 3 - Unclear whether search was performed. 2 - Mixes not-found with random invented facts. 1 - Invents specific names, IDs, or records. NOTE: Minor typos in metadata (e.g. ministry name) do NOT count as hallucination. NOTE: Skipping a tool call for an obviously invalid query is acceptable. AGENT_RESPONSE: {response_text} Return ONLY this JSON (reason max 5 words): {{"score": 1 to 5, "reason": "..."}} """ @pytest.mark.e2e @pytest.mark.asyncio @pytest.mark.parametrize("scenario", NOT_FOUND_SCENARIOS, ids=[s["desc"] for s in NOT_FOUND_SCENARIOS]) async def test_handles_not_found(scenario, run_agent, judge): """Checks if the answer was found.""" response_text, actual_tools, error = await run_agent(scenario["query"]) print(f"\n Query : {scenario['query']}") print(f" Tools : {actual_tools}") print(f" Response : {response_text[:200]}") assert error is None, f"Agent error: {error}" assert response_text.strip() != "", "The agent did not answer!" prompt = judge_prompt_for_test_not_found(response_text) score, reason = await judge(prompt) print(f" Score : {score}") print(f" Reason : {reason}") assert score >= 0.5, ( f"Not-found handling failed [{scenario['desc']}]:\n" f"Score: {score}\nReason: {reason}\n" f"Response: {response_text[:300]}" )