# legal-ai-assistant/tests/e2e/test_not_found.py

import pytest
from dotenv import load_dotenv
# Load settings from ".env.test" at import time; override=True lets values
# from that file replace variables already present in the process environment.
load_dotenv(".env.test", override=True)

# Queries about entities that are expected not to exist. Each scenario holds
# the resource kind, the user query sent to the agent, and a short description
# that doubles as the parametrized test ID.
NOT_FOUND_SCENARIOS = [
    dict(
        resource="sud",
        query="Nájdi Okresný súd v meste Atlantída",
        desc="neexistujúce mesto",
    ),
    dict(
        resource="sudca",
        query="Zobraz profil sudcu s ID sudca_999999",
        desc="neexistujúce ID sudcu",
    ),
    dict(
        resource="rozhodnutie",
        query="Nájdi rozhodnutie s ECLI: ECLI:SK:FAKE:0000:0000000000.0",
        desc="neexistujúce ECLI",
    ),
]

def judge_prompt_for_test_not_found(response_text) -> str:
    """Build the judge prompt that grades a not-found answer for hallucination.

    Args:
        response_text: The agent's answer; it is interpolated verbatim under
            the ``AGENT_RESPONSE:`` heading.

    Returns:
        The full prompt text: a 1-5 scoring rubric, the agent response, and
        the JSON shape the judge is asked to return. The text starts and ends
        with a newline.
    """
    # One entry per output line; the empty first and last entries produce the
    # leading and trailing newline once everything is joined.
    prompt_lines = (
        "",
        "You are a hallucination auditor.",
        "The user asked about a non-existent entity. The API returned NO results.",
        "",
        "Rate AGENT_RESPONSE on a scale from 1 to 5:",
        "5 - States nothing was found. May explain why (e.g. city doesn't exist).",
        "4 - Correct but adds unnecessary filler text.",
        "3 - Unclear whether search was performed.",
        "2 - Mixes not-found with random invented facts.",
        "1 - Invents specific names, IDs, or records.",
        "",
        "NOTE: Minor typos in metadata (e.g. ministry name) do NOT count as hallucination.",
        "NOTE: Skipping a tool call for an obviously invalid query is acceptable.",
        "",
        "AGENT_RESPONSE:",
        f"{response_text}",
        "",
        "Return ONLY this JSON (reason max 5 words):",
        '{"score": 1 to 5, "reason": "..."}',
        "",
    )
    return "\n".join(prompt_lines)

@pytest.mark.e2e
@pytest.mark.asyncio
@pytest.mark.parametrize("scenario", NOT_FOUND_SCENARIOS, ids=[s["desc"] for s in NOT_FOUND_SCENARIOS])
async def test_handles_not_found(scenario, run_agent, judge):
    """Verify the agent reports "not found" instead of inventing data.

    The scenario's query is sent through the ``run_agent`` fixture, which
    yields ``(response_text, actual_tools, error)``. The response is then
    graded by the ``judge`` fixture using the hallucination rubric from
    ``judge_prompt_for_test_not_found``; ``judge`` yields ``(score, reason)``.

    Args:
        scenario: One entry of ``NOT_FOUND_SCENARIOS``.
        run_agent: Fixture (defined outside this file) that runs the agent.
        judge: Fixture (defined outside this file) that scores a prompt.
    """
    response_text, actual_tools, error = await run_agent(scenario["query"])

    # Diagnostics are printed before any assertion so they show up on failure.
    # Guard the slice: if the agent produced no text (None), slicing would
    # raise TypeError here and hide the real agent error asserted below.
    response_preview = (response_text or "")[:200]
    print(f"\n  Query    : {scenario['query']}")
    print(f"  Tools    : {actual_tools}")
    print(f"  Response : {response_preview}")

    assert error is None, f"Agent error: {error}"
    # A None response fails with the intended message rather than an
    # AttributeError from calling .strip() on None.
    assert response_text and response_text.strip(), "The agent did not answer!"

    prompt = judge_prompt_for_test_not_found(response_text)
    score, reason = await judge(prompt)

    print(f"  Score    : {score}")
    print(f"  Reason   : {reason}")

    # NOTE(review): the rubric asks the judge for a 1-5 score while this
    # threshold is 0.5 — presumably the judge fixture normalizes to 0..1;
    # confirm, otherwise this assertion can never fail.
    assert score >= 0.5, (
        f"Not-found handling failed [{scenario['desc']}]:\n"
        f"Score: {score}\nReason: {reason}\n"
        f"Response: {response_text[:300]}"
    )