"""End-to-end tests: the agent must report "not found" for non-existent entities instead of inventing data."""
import pytest
|
|
from dotenv import load_dotenv
|
|
# Load the test environment file at import time. override=True makes values
# from .env.test replace variables that are already set in the process
# environment, so the tests do not pick up a developer's regular settings.
load_dotenv(".env.test", override=True)
|
|
|
|
# Queries (in Slovak) that refer to entities which do not exist. Each entry
# becomes one parametrized test case; "desc" doubles as the pytest test id.
NOT_FOUND_SCENARIOS = [
    {"resource": resource, "query": query, "desc": desc}
    for resource, query, desc in (
        # A district court in a fictional city.
        ("sud", "Nájdi Okresný súd v meste Atlantída", "neexistujúce mesto"),
        # A judge profile looked up by an ID that is not expected to exist.
        ("sudca", "Zobraz profil sudcu s ID sudca_999999", "neexistujúce ID sudcu"),
        # A court decision looked up by a fabricated ECLI identifier.
        (
            "rozhodnutie",
            "Nájdi rozhodnutie s ECLI: ECLI:SK:FAKE:0000:0000000000.0",
            "neexistujúce ECLI",
        ),
    )
]
|
|
|
|
def judge_prompt_for_test_not_found(response_text) -> str:
    """Build the LLM-judge prompt used to grade a "not found" answer.

    The prompt tells the judge that the user asked about a non-existent
    entity, lists a 1-5 rubric (5 = clearly states nothing was found,
    1 = invents specific records), embeds the agent's answer, and asks for
    a JSON object with ``score`` and ``reason``.

    Args:
        response_text: The agent's answer; inserted verbatim under the
            ``AGENT_RESPONSE:`` heading.

    Returns:
        The complete prompt text, starting and ending with a newline.
    """
    prompt_lines = [
        "",  # leading newline, as produced by the opening triple quote
        "You are a hallucination auditor.",
        "The user asked about a non-existent entity. The API returned NO results.",
        "",
        "Rate AGENT_RESPONSE on a scale from 1 to 5:",
        "5 - States nothing was found. May explain why (e.g. city doesn't exist).",
        "4 - Correct but adds unnecessary filler text.",
        "3 - Unclear whether search was performed.",
        "2 - Mixes not-found with random invented facts.",
        "1 - Invents specific names, IDs, or records.",
        "",
        "NOTE: Minor typos in metadata (e.g. ministry name) do NOT count as hallucination.",
        "NOTE: Skipping a tool call for an obviously invalid query is acceptable.",
        "",
        "AGENT_RESPONSE:",
        f"{response_text}",
        "",
        "Return ONLY this JSON (reason max 5 words):",
        '{"score": 1 to 5, "reason": "..."}',
        "",  # trailing newline before the closing triple quote
    ]
    return "\n".join(prompt_lines)
|
|
|
|
@pytest.mark.e2e
@pytest.mark.asyncio
@pytest.mark.parametrize("scenario", NOT_FOUND_SCENARIOS, ids=[s["desc"] for s in NOT_FOUND_SCENARIOS])
async def test_handles_not_found(scenario, run_agent, judge):
    """Check that the agent reports "not found" instead of hallucinating.

    Runs the agent on a query about a non-existent entity, then asks the
    judge to grade the answer with the not-found rubric.

    Args:
        scenario: One entry of NOT_FOUND_SCENARIOS ("resource", "query", "desc").
        run_agent: Fixture; awaited with the query, it yields a
            (response_text, actual_tools, error) triple.
        judge: Fixture; awaited with a prompt, it yields a (score, reason) pair.
    """
    response_text, actual_tools, error = await run_agent(scenario["query"])

    # The response may be missing when the agent failed, so build the preview
    # defensively: a TypeError here would hide the real agent error below.
    preview = (response_text or "")[:200]

    print(f"\n  Query    : {scenario['query']}")
    print(f"  Tools    : {actual_tools}")
    print(f"  Response : {preview}")

    assert error is None, f"Agent error: {error}"
    # Reject None as well as empty/whitespace-only answers with the same message.
    assert response_text and response_text.strip() != "", "The agent did not answer!"

    prompt = judge_prompt_for_test_not_found(response_text)
    score, reason = await judge(prompt)

    print(f"  Score    : {score}")
    print(f"  Reason   : {reason}")

    # NOTE(review): the prompt asks for a 1-5 score while this threshold is 0.5.
    # Presumably the judge fixture normalizes the score to 0-1 - confirm;
    # if it returns the raw 1-5 value this assertion can never fail.
    assert score >= 0.5, (
        f"Not-found handling failed [{scenario['desc']}]:\n"
        f"Score: {score}\nReason: {reason}\n"
        f"Response: {response_text[:300]}"
    )