# DiplomovaPraca/scripts/adc_scraper/parse_adc_json.py
# 2026-05-14 12:26:11 +02:00
#
# 168 lines
# 4.8 KiB
# Python

"""Parse raw ADC HTML/JSONL into structured JSON for LightRAG ingestion."""
from __future__ import annotations
import argparse
import json
import re
from pathlib import Path
from typing import Any
from bs4 import BeautifulSoup
# Regex fragments used by extract_sections() to locate the start of each
# logical section. Keys become the section names in the parsed output; each
# value lists alternative patterns for that section. The patterns are written
# in lowercase and are matched without regard to case.
SECTION_PATTERNS = {
    # Slovak: "do not use", "when ... not to use", "contraindic-".
    "contraindications": [
        r"nepoužívajte",
        r"kedy .* nepoužívať",
        r"kontraindik",
    ],
    # Slovak: "other medicines", "mutual interaction", "interacti-".
    "interactions": [
        r"iné lieky",
        r"vzájomné pôsobenie",
        r"interakci",
    ],
    # Slovak: "possible side effects", "adverse effects", "side effects".
    "side_effects": [
        r"možné vedľajšie účinky",
        r"nežiaduce účinky",
        r"vedľajšie účinky",
    ],
    # Slovak: "how to use", "dosage", "method of administration".
    "dosage": [
        r"ako používať",
        r"dávkovanie",
        r"spôsob podávania",
    ],
}
def html_to_text(html: str | None) -> str:
if not html:
return ""
soup = BeautifulSoup(html, "lxml")
for tag in soup(["script", "style", "noscript"]):
tag.decompose()
text = soup.get_text(" ", strip=True)
return normalize_text(text)
def normalize_text(text: str) -> str:
    """Collapse every whitespace run in *text* to a single space.

    Non-breaking spaces are turned into regular spaces first, and leading
    and trailing whitespace is removed from the result.
    """
    without_nbsp = text.replace("\xa0", " ")
    collapsed = re.sub(r"\s+", " ", without_nbsp)
    return collapsed.strip()
def infer_name(source_url: str, text: str) -> str:
    """Guess a product name from the leaflet text or, failing that, the URL.

    Args:
        source_url: URL the record was scraped from.
        text: Plain text of the leaflet(s).

    Returns:
        The text that follows the leaflet heading when that heading is
        found; otherwise a title-cased name built from the last URL path
        segment with ``.html`` and a trailing ``-<digits>`` removed.
    """
    heading = re.search(r"Písomná informácia pre používateľa\s+(.{3,160}?)(?:\s+Pozorne|\s+V tejto|\s+1\.)", text)
    if heading is not None:
        return normalize_text(heading.group(1))

    # Fallback: derive the name from the last path segment of the URL.
    last_segment = source_url.rstrip("/").rsplit("/", 1)[-1]
    stem = re.sub(r"-\d+$", "", last_segment.replace(".html", ""))
    return stem.replace("-", " ").title()
def extract_sections(text: str) -> dict[str, str]:
sections: dict[str, str] = {}
lower = text.lower()
starts: list[tuple[int, str]] = []
for section_name, patterns in SECTION_PATTERNS.items():
found_positions = []
for pattern in patterns:
match = re.search(pattern, lower)
if match:
found_positions.append(match.start())
if found_positions:
starts.append((min(found_positions), section_name))
starts.sort()
for idx, (start, section_name) in enumerate(starts):
end = starts[idx + 1][0] if idx + 1 < len(starts) else min(len(text), start + 8000)
sections[section_name] = text[start:end].strip()
return sections
def iter_raw_records(path: Path) -> list[dict[str, Any]]:
    """Load raw scraped records from a JSONL or JSON file.

    Args:
        path: File to read. A ``.jsonl`` suffix (any letter case) selects
            line-delimited JSON; anything else is parsed as one JSON
            document.

    Returns:
        For JSONL, one parsed object per non-blank line. For JSON, the
        document itself when it is a list, its ``"records"`` entry when
        present, or otherwise a one-element list holding the document.
    """
    if path.suffix.lower() == ".jsonl":
        with path.open(encoding="utf-8") as handle:
            stripped_lines = (raw_line.strip() for raw_line in handle)
            return [json.loads(entry) for entry in stripped_lines if entry]

    payload = json.loads(path.read_text(encoding="utf-8"))
    if isinstance(payload, list):
        return payload
    return payload["records"] if "records" in payload else [payload]
def parse_record(raw: dict[str, Any]) -> dict[str, Any]:
    """Turn one raw scraped record into the structured output shape.

    Args:
        raw: Raw record. The keys read are ``source_url``/``link``/
            ``pil_url``, ``pribalovy_letak``/``pil_html``, ``spc``/
            ``spc_html``, ``name``, ``spc_url`` and ``metadata``.

    Returns:
        A dict with the source URL, the product name (given or inferred),
        the PIL/SPC URLs and texts, the sections extracted from the combined
        text, and a metadata sub-dict.
    """

    def resolve_text(plain_key: str, html_key: str) -> str:
        # Use the plain-text field when it is present (even if empty);
        # otherwise fall back to converting the HTML field.
        plain = raw.get(plain_key)
        if plain is not None:
            return normalize_text(str(plain))
        return html_to_text(raw.get(html_key))

    source_url = raw.get("source_url") or raw.get("link") or raw.get("pil_url") or ""
    pil_text = resolve_text("pribalovy_letak", "pil_html")
    spc_text = resolve_text("spc", "spc_html")
    combined_text = f"{pil_text} {spc_text}".strip()

    name = raw.get("name")
    if not name:
        name = infer_name(source_url, combined_text)

    scrape_meta = raw.get("metadata") or {}
    metadata = {
        "source": "adc.sk",
        "scraped_at": scrape_meta.get("scraped_at"),
        "parser": "scripts/adc_scraper/parse_adc_json.py",
    }
    return {
        "source_url": source_url,
        "name": name,
        "pil_url": raw.get("pil_url"),
        "spc_url": raw.get("spc_url"),
        "pil_text": pil_text,
        "spc_text": spc_text,
        "sections": extract_sections(combined_text),
        "metadata": metadata,
    }
def main() -> None:
    """Command-line entry point: parse raw ADC data and write structured JSON.

    Reads the records from ``--input``, converts each with parse_record(),
    drops records whose PIL and SPC texts are both empty unless
    ``--keep-empty`` is given, stops once ``--limit`` records are kept, and
    writes the result to ``--out`` (creating parent directories as needed).
    """
    cli = argparse.ArgumentParser(description="Parse ADC raw data into structured JSON.")
    cli.add_argument("--input", type=Path, required=True)
    cli.add_argument("--out", type=Path, required=True)
    cli.add_argument("--limit", type=int, default=None)
    cli.add_argument(
        "--keep-empty",
        action="store_true",
        help="Keep records where both PIL and SPC text are empty.",
    )
    args = cli.parse_args()

    parsed: list[dict[str, Any]] = []
    for raw in iter_raw_records(args.input):
        structured = parse_record(raw)
        has_text = bool(structured["pil_text"] or structured["spc_text"])
        if not (args.keep_empty or has_text):
            continue
        parsed.append(structured)
        # A limit of None or 0 means "no limit".
        if args.limit and len(parsed) >= args.limit:
            break

    args.out.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(parsed, ensure_ascii=False, indent=2)
    args.out.write_text(serialized, encoding="utf-8")
    print(f"Saved {len(parsed)} structured records to {args.out}")


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()