"""Parse raw ADC HTML/JSONL into structured JSON for LightRAG ingestion.""" from __future__ import annotations import argparse import json import re from pathlib import Path from typing import Any from bs4 import BeautifulSoup SECTION_PATTERNS = { "contraindications": [ r"nepoužívajte", r"kedy .* nepoužívať", r"kontraindik", ], "interactions": [ r"iné lieky", r"vzájomné pôsobenie", r"interakci", ], "side_effects": [ r"možné vedľajšie účinky", r"nežiaduce účinky", r"vedľajšie účinky", ], "dosage": [ r"ako používať", r"dávkovanie", r"spôsob podávania", ], } def html_to_text(html: str | None) -> str: if not html: return "" soup = BeautifulSoup(html, "lxml") for tag in soup(["script", "style", "noscript"]): tag.decompose() text = soup.get_text(" ", strip=True) return normalize_text(text) def normalize_text(text: str) -> str: text = text.replace("\xa0", " ") text = re.sub(r"\s+", " ", text) return text.strip() def infer_name(source_url: str, text: str) -> str: match = re.search(r"Písomná informácia pre používateľa\s+(.{3,160}?)(?:\s+Pozorne|\s+V tejto|\s+1\.)", text) if match: return normalize_text(match.group(1)) slug = source_url.rstrip("/").split("/")[-1].replace(".html", "") slug = re.sub(r"-\d+$", "", slug) return slug.replace("-", " ").title() def extract_sections(text: str) -> dict[str, str]: sections: dict[str, str] = {} lower = text.lower() starts: list[tuple[int, str]] = [] for section_name, patterns in SECTION_PATTERNS.items(): found_positions = [] for pattern in patterns: match = re.search(pattern, lower) if match: found_positions.append(match.start()) if found_positions: starts.append((min(found_positions), section_name)) starts.sort() for idx, (start, section_name) in enumerate(starts): end = starts[idx + 1][0] if idx + 1 < len(starts) else min(len(text), start + 8000) sections[section_name] = text[start:end].strip() return sections def iter_raw_records(path: Path) -> list[dict[str, Any]]: if path.suffix.lower() == ".jsonl": records = [] with path.open(encoding="utf-8") as f: for line in f: line = line.strip() if line: records.append(json.loads(line)) return records data = json.loads(path.read_text(encoding="utf-8")) if isinstance(data, list): return data if "records" in data: return data["records"] return [data] def parse_record(raw: dict[str, Any]) -> dict[str, Any]: source_url = raw.get("source_url") or raw.get("link") or raw.get("pil_url") or "" pil_text = raw.get("pribalovy_letak") if pil_text is None: pil_text = html_to_text(raw.get("pil_html")) else: pil_text = normalize_text(str(pil_text)) spc_text = raw.get("spc") if spc_text is None: spc_text = html_to_text(raw.get("spc_html")) else: spc_text = normalize_text(str(spc_text)) combined_text = f"{pil_text} {spc_text}".strip() name = raw.get("name") or infer_name(source_url, combined_text) return { "source_url": source_url, "name": name, "pil_url": raw.get("pil_url"), "spc_url": raw.get("spc_url"), "pil_text": pil_text, "spc_text": spc_text, "sections": extract_sections(combined_text), "metadata": { "source": "adc.sk", "scraped_at": (raw.get("metadata") or {}).get("scraped_at"), "parser": "scripts/adc_scraper/parse_adc_json.py", }, } def main() -> None: parser = argparse.ArgumentParser(description="Parse ADC raw data into structured JSON.") parser.add_argument("--input", type=Path, required=True) parser.add_argument("--out", type=Path, required=True) parser.add_argument("--limit", type=int, default=None) parser.add_argument( "--keep-empty", action="store_true", help="Keep records where both PIL and SPC text are empty.", ) args = parser.parse_args() raw_records = iter_raw_records(args.input) parsed = [] for record in raw_records: item = parse_record(record) if not args.keep_empty and not item["pil_text"] and not item["spc_text"]: continue parsed.append(item) if args.limit and len(parsed) >= args.limit: break args.out.parent.mkdir(parents=True, exist_ok=True) args.out.write_text(json.dumps(parsed, ensure_ascii=False, indent=2), encoding="utf-8") print(f"Saved {len(parsed)} structured records to {args.out}") if __name__ == "__main__": main()