# DiplomovaPraca/scripts/adc_scraper/parse_adc_json.py

"""Parse raw ADC HTML/JSONL into structured JSON for LightRAG ingestion."""

from __future__ import annotations

import argparse
import json
import re
from pathlib import Path
from typing import Any

from bs4 import BeautifulSoup


# Maps each output section name to the regex patterns that can mark the start
# of that section. extract_sections() applies them without regard to letter
# case, so every pattern is written in lowercase Slovak. English glosses are
# given next to each pattern.
SECTION_PATTERNS = {
    "contraindications": [
        r"nepoužívajte",  # "do not use"
        # "when ... not to use"
        # NOTE(review): the unbounded ".*" can span the whole document once the
        # text has been collapsed to a single line, so an early "kedy" may pull
        # the section start far forward -- confirm this is intended.
        r"kedy .* nepoužívať",
        r"kontraindik",  # stem of "contraindication(s)"
    ],
    "interactions": [
        r"iné lieky",  # "other medicines"
        r"vzájomné pôsobenie",  # "mutual action" (interaction)
        r"interakci",  # stem of "interaction(s)"
    ],
    "side_effects": [
        r"možné vedľajšie účinky",  # "possible side effects"
        r"nežiaduce účinky",  # "adverse effects"
        r"vedľajšie účinky",  # "side effects"
    ],
    "dosage": [
        r"ako používať",  # "how to use"
        r"dávkovanie",  # "dosage"
        r"spôsob podávania",  # "method of administration"
    ],
}


def html_to_text(html: str | None) -> str:
    """Convert an HTML fragment into plain, whitespace-normalized text.

    Args:
        html: Raw markup, or ``None``/empty when the record has no HTML.

    Returns:
        The visible text with ``<script>``, ``<style>`` and ``<noscript>``
        elements removed and whitespace collapsed by ``normalize_text``;
        an empty string when *html* is falsy.
    """
    if not html:
        return ""

    document = BeautifulSoup(html, "lxml")

    # Drop elements whose content is never human-readable page text.
    for element in document.find_all(["script", "style", "noscript"]):
        element.decompose()

    # Join text nodes with a single space so adjacent elements do not fuse.
    return normalize_text(document.get_text(" ", strip=True))


def normalize_text(text: str) -> str:
    """Collapse all whitespace in *text* to single spaces and trim the ends.

    Non-breaking spaces (U+00A0) are converted to ordinary spaces first, then
    every run of whitespace becomes one space.
    """
    without_nbsp = text.replace("\xa0", " ")
    return re.sub(r"\s+", " ", without_nbsp).strip()


def infer_name(source_url: str, text: str) -> str:
    """Guess a product name from the leaflet text or, failing that, the URL.

    Args:
        source_url: URL the record was scraped from; used only as a fallback.
        text: Plain text of the document.

    Returns:
        The text captured after the leaflet heading
        ("Písomná informácia pre používateľa") when that heading is found;
        otherwise the last URL path segment with ``.html`` and a trailing
        ``-<digits>`` suffix removed, dashes turned into spaces, title-cased.
    """
    heading = re.search(
        r"Písomná informácia pre používateľa\s+(.{3,160}?)(?:\s+Pozorne|\s+V tejto|\s+1\.)",
        text,
    )
    if heading is not None:
        return normalize_text(heading.group(1))

    # Fallback: derive a readable name from the final path component.
    last_segment = source_url.rstrip("/").rsplit("/", 1)[-1]
    stem = last_segment.replace(".html", "")
    stem = re.sub(r"-\d+$", "", stem)  # strip a trailing numeric suffix
    return stem.replace("-", " ").title()


def extract_sections(text: str) -> dict[str, str]:
    sections: dict[str, str] = {}
    lower = text.lower()

    starts: list[tuple[int, str]] = []
    for section_name, patterns in SECTION_PATTERNS.items():
        found_positions = []
        for pattern in patterns:
            match = re.search(pattern, lower)
            if match:
                found_positions.append(match.start())
        if found_positions:
            starts.append((min(found_positions), section_name))

    starts.sort()
    for idx, (start, section_name) in enumerate(starts):
        end = starts[idx + 1][0] if idx + 1 < len(starts) else min(len(text), start + 8000)
        sections[section_name] = text[start:end].strip()

    return sections


def iter_raw_records(path: Path) -> list[dict[str, Any]]:
    """Load raw scraper records from a ``.jsonl`` or JSON file.

    Args:
        path: Input file. A ``.jsonl`` suffix (any letter case) selects
            line-delimited parsing; anything else is read as one JSON document.

    Returns:
        For JSONL, one parsed object per non-blank line. For JSON, the
        top-level list as-is, the value under ``"records"`` when that key is
        present, or otherwise the single document wrapped in a list.
    """
    if path.suffix.lower() != ".jsonl":
        payload = json.loads(path.read_text(encoding="utf-8"))
        if isinstance(payload, list):
            return payload
        return payload["records"] if "records" in payload else [payload]

    with path.open(encoding="utf-8") as handle:
        # Blank (whitespace-only) lines are skipped rather than parsed.
        stripped_lines = (line.strip() for line in handle)
        return [json.loads(entry) for entry in stripped_lines if entry]


def parse_record(raw: dict[str, Any]) -> dict[str, Any]:
    """Build one structured record from a raw scraper record.

    Args:
        raw: Raw record. Text is taken from ``pribalovy_letak`` / ``spc`` when
            those keys hold a non-``None`` value, otherwise it is extracted
            from ``pil_html`` / ``spc_html``.

    Returns:
        A dict with the source URL, product name, PIL/SPC URLs and texts, the
        sections detected in the combined text, and a small metadata block.
    """

    def resolve_text(text_key: str, html_key: str) -> str:
        # Prefer already-extracted text; fall back to converting the HTML.
        ready_text = raw.get(text_key)
        if ready_text is not None:
            return normalize_text(str(ready_text))
        return html_to_text(raw.get(html_key))

    # First non-empty URL-like field wins; default to an empty string.
    source_url = raw.get("source_url") or raw.get("link") or raw.get("pil_url") or ""

    pil_text = resolve_text("pribalovy_letak", "pil_html")
    spc_text = resolve_text("spc", "spc_html")
    combined_text = f"{pil_text} {spc_text}".strip()

    # An explicit name in the raw record takes precedence over inference.
    name = raw.get("name") or infer_name(source_url, combined_text)
    scrape_meta = raw.get("metadata") or {}

    return {
        "source_url": source_url,
        "name": name,
        "pil_url": raw.get("pil_url"),
        "spc_url": raw.get("spc_url"),
        "pil_text": pil_text,
        "spc_text": spc_text,
        "sections": extract_sections(combined_text),
        "metadata": {
            "source": "adc.sk",
            "scraped_at": scrape_meta.get("scraped_at"),
            "parser": "scripts/adc_scraper/parse_adc_json.py",
        },
    }


def main() -> None:
    """Command-line entry point: parse raw ADC data and write structured JSON.

    Reads the file given by ``--input``, converts each raw record with
    ``parse_record``, skips records whose PIL and SPC texts are both empty
    unless ``--keep-empty`` is set, stops once ``--limit`` records have been
    kept, and writes the result as a JSON array to ``--out`` (creating parent
    directories as needed).
    """
    cli = argparse.ArgumentParser(description="Parse ADC raw data into structured JSON.")
    cli.add_argument("--input", type=Path, required=True)
    cli.add_argument("--out", type=Path, required=True)
    cli.add_argument("--limit", type=int, default=None)
    cli.add_argument(
        "--keep-empty",
        action="store_true",
        help="Keep records where both PIL and SPC text are empty.",
    )
    options = cli.parse_args()

    structured = []
    for raw in iter_raw_records(options.input):
        entry = parse_record(raw)
        has_text = bool(entry["pil_text"] or entry["spc_text"])
        if not has_text and not options.keep_empty:
            continue
        structured.append(entry)
        # The limit counts kept records, not records read.
        if options.limit and len(structured) >= options.limit:
            break

    destination = options.out
    destination.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(structured, ensure_ascii=False, indent=2)
    destination.write_text(payload, encoding="utf-8")
    print(f"Saved {len(structured)} structured records to {destination}")


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()