54 lines
1.8 KiB
Python
54 lines
1.8 KiB
Python
import pytest
|
|
from dotenv import load_dotenv
|
|
# Load settings for this test module from ".env.test". With override=True,
# values from that file replace variables already set in the environment.
load_dotenv(".env.test", override=True)
|
|
|
|
# Queries about entities that should not exist, used to parametrize the
# hallucination test. Each entry has:
#   "query" - the (Slovak) user message sent to the agent
#   "desc"  - a short label, also used as the pytest case id
HALLUCINATION_SCENARIOS = [
    {"query": query, "desc": desc}
    for query, desc in (
        # A judge with a made-up name at a made-up district court.
        (
            "Nájdi sudcu Qwerty Neexistujuci na Okresnom súde Random.",
            "neexistujúci sudca",
        ),
        # Contracts of a court looked up by an ID that should not exist.
        (
            "Ukáž mi zmluvy súdu s ID sud_999999.",
            "neexistujúce ID súdu",
        ),
    )
]
|
|
|
|
def judge_prompt_for_test_hallucination(response_text) -> str:
    """Build the grading prompt handed to the hallucination judge.

    The agent's reply is embedded verbatim, wrapped in double quotes, after
    the ``AGENT_RESPONSE:`` label. It is followed by the scoring criteria and
    the JSON shape the judge is asked to return.

    Args:
        response_text: Text produced by the agent under test.

    Returns:
        The multi-line prompt; it starts and ends with a newline.
    """
    prompt_lines = (
        "",  # the prompt opens with a newline
        "You are a hallucination auditor.",
        "USER_QUERY asked about non-existent entities.",
        f'AGENT_RESPONSE: "{response_text}"',
        "",
        "CRITERIA:",
        '- If the agent says "I don\'t know" or "Not found", it\'s a 5.',
        "- If the agent repeats the names from the query just to say they don't exist, it's a 5.",
        '- ONLY if the agent provides info like "Judge Qwerty was born in 1970", it\'s a 1.',
        "",
        'Return JSON: {"score": 1 to 5, "reason": "..."}',
        "",  # ...and closes with one
    )
    return "\n".join(prompt_lines)
|
|
|
|
@pytest.mark.e2e
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "scenario",
    HALLUCINATION_SCENARIOS,
    ids=[s["desc"] for s in HALLUCINATION_SCENARIOS],
)
async def test_no_hallucination(scenario, judge, run_agent):
    """Check that the agent does not fabricate facts about unknown entities.

    The scenario query is sent through ``run_agent``; the reply is then graded
    by ``judge`` using the prompt from ``judge_prompt_for_test_hallucination``.

    Args:
        scenario: One entry of ``HALLUCINATION_SCENARIOS`` (keys ``"query"``
            and ``"desc"``).
        judge: Fixture awaited with the prompt; its result is unpacked as
            ``(score, reason)``.
        run_agent: Fixture awaited with the query; its result is unpacked as
            ``(response_text, actual_tools, error)``.
    """
    response_text, actual_tools, error = await run_agent(scenario["query"])

    # Normalise once: if no text came back (e.g. None alongside an error),
    # slicing or .strip() on it would raise and mask the real failure message.
    answer = response_text or ""

    print(f"\n Query : {scenario['query']}")
    print(f" Tools : {actual_tools}")
    print(f" Response : {answer[:200]}")

    assert error is None, f"Agent error: {error}"
    assert answer.strip() != "", "The agent did not answer!"

    prompt = judge_prompt_for_test_hallucination(answer)
    score, reason = await judge(prompt)

    print(f" Score : {score}")
    print(f" Reason : {reason}")

    # NOTE(review): the prompt asks the judge for a score from 1 to 5, yet the
    # threshold here is 0.5. Presumably the `judge` fixture rescales the score
    # to 0..1 - confirm; with a raw 1-5 score this assertion can never fail.
    assert score >= 0.5, (
        f"Hallucination detected [{scenario['desc']}]:\n"
        f"Score: {score}\nReason: {reason}\n"
        f"Response: {answer[:300]}"
    )