168 lines
4.8 KiB
Python
168 lines
4.8 KiB
Python
"""Parse raw ADC HTML/JSONL into structured JSON for LightRAG ingestion."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import re
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
# Output section name -> regex fragments (Slovak, written in lower case).
# A section is considered to start at the earliest position where any of its
# fragments is found in the leaflet text.
SECTION_PATTERNS = {
    "contraindications": [r"nepoužívajte", r"kedy .* nepoužívať", r"kontraindik"],
    "interactions": [r"iné lieky", r"vzájomné pôsobenie", r"interakci"],
    "side_effects": [r"možné vedľajšie účinky", r"nežiaduce účinky", r"vedľajšie účinky"],
    "dosage": [r"ako používať", r"dávkovanie", r"spôsob podávania"],
}
|
|
|
|
|
|
def html_to_text(html: str | None) -> str:
    """Convert an HTML fragment to plain, whitespace-normalised text.

    Args:
        html: Raw HTML markup, or ``None``.

    Returns:
        The visible text with ``<script>``, ``<style>`` and ``<noscript>``
        elements removed, passed through ``normalize_text``. An empty string
        is returned when *html* is ``None`` or empty.
    """
    if html:
        document = BeautifulSoup(html, "lxml")
        # Drop elements whose content is never part of the readable text.
        for element in document.find_all(["script", "style", "noscript"]):
            element.decompose()
        return normalize_text(document.get_text(" ", strip=True))
    return ""
|
|
|
|
|
|
def normalize_text(text: str) -> str:
    """Collapse each run of whitespace in *text* to one space and trim the ends.

    Non-breaking spaces (U+00A0) are turned into ordinary spaces first.
    """
    without_nbsp = text.replace("\xa0", " ")
    return re.sub(r"\s+", " ", without_nbsp).strip()
|
|
|
|
|
|
def infer_name(source_url: str, text: str) -> str:
    """Derive a product name from the leaflet text, else from the URL slug.

    The text is searched for the leaflet title phrase followed by a 3-160
    character name that ends right before "Pozorne", "V tejto" or "1.".
    When that fails, the last path segment of *source_url* is used instead:
    ".html" and a trailing "-<digits>" suffix are removed, hyphens become
    spaces and the result is title-cased.

    Args:
        source_url: URL the record was scraped from (may be empty).
        text: Plain leaflet text to search for the title phrase.

    Returns:
        The inferred name; an empty string if both sources are empty.
    """
    heading = re.search(
        r"Písomná informácia pre používateľa\s+(.{3,160}?)(?:\s+Pozorne|\s+V tejto|\s+1\.)",
        text,
    )
    if heading is not None:
        return normalize_text(heading.group(1))

    # Fallback: build a readable name out of the final URL path segment.
    last_segment = source_url.rstrip("/").rsplit("/", 1)[-1]
    without_ext = last_segment.replace(".html", "")
    without_id = re.sub(r"-\d+$", "", without_ext)
    return without_id.replace("-", " ").title()
|
|
|
|
|
|
def extract_sections(
    text: str,
    patterns: dict[str, list[str]] | None = None,
) -> dict[str, str]:
    """Split leaflet text into named sections based on heading patterns.

    For every section, the earliest match of any of its patterns marks the
    section start. Sections are then ordered by start offset; each one runs
    up to the start of the next section, and the last one is capped at 8000
    characters.

    Args:
        text: Plain text to split.
        patterns: Mapping of section name to regex fragments. Defaults to the
            module-level ``SECTION_PATTERNS``.

    Returns:
        Mapping of section name to the stripped text slice. Sections whose
        patterns do not occur in *text* are omitted.
    """
    if patterns is None:
        patterns = SECTION_PATTERNS

    starts: list[tuple[int, str]] = []
    for section_name, section_patterns in patterns.items():
        found_positions = []
        for pattern in section_patterns:
            # Match case-insensitively on the original text rather than on
            # text.lower(): lower-casing can change the string length (for
            # example "İ" becomes two code points), which would make the match
            # offsets point at the wrong place when slicing ``text`` below.
            match = re.search(pattern, text, flags=re.IGNORECASE)
            if match:
                found_positions.append(match.start())
        if found_positions:
            starts.append((min(found_positions), section_name))

    starts.sort()

    sections: dict[str, str] = {}
    for idx, (start, section_name) in enumerate(starts):
        if idx + 1 < len(starts):
            end = starts[idx + 1][0]
        else:
            # Nothing follows the last section, so bound its length instead.
            end = min(len(text), start + 8000)
        sections[section_name] = text[start:end].strip()
    return sections
|
|
|
|
|
|
def iter_raw_records(path: Path) -> list[dict[str, Any]]:
    """Load raw records from a ``.jsonl`` or JSON file.

    * ``.jsonl`` suffix (case-insensitive): every non-blank line is parsed as
      one JSON value.
    * Any other suffix: the file is parsed as a single JSON document. A
      top-level list is returned as is, a document containing a "records"
      key yields that value, and anything else is wrapped in a one-element
      list.

    Args:
        path: File to read; decoded as UTF-8.

    Returns:
        The list of raw records.
    """
    if path.suffix.lower() != ".jsonl":
        payload = json.loads(path.read_text(encoding="utf-8"))
        if isinstance(payload, list):
            return payload
        return payload["records"] if "records" in payload else [payload]

    with path.open(encoding="utf-8") as handle:
        stripped_lines = (raw_line.strip() for raw_line in handle)
        return [json.loads(entry) for entry in stripped_lines if entry]
|
|
|
|
|
|
def parse_record(raw: dict[str, Any]) -> dict[str, Any]:
    """Turn one raw scraper record into the structured output record.

    PIL and SPC text are taken from the pre-extracted "pribalovy_letak" /
    "spc" fields when present (normalised via ``normalize_text``), otherwise
    they are extracted from "pil_html" / "spc_html". The name comes from
    ``raw["name"]`` or, failing that, is inferred from the combined text and
    the source URL.

    Args:
        raw: Raw record as loaded from the input file.

    Returns:
        A dict with "source_url", "name", "pil_url", "spc_url", "pil_text",
        "spc_text", "sections" and "metadata" keys.
    """

    def _resolve_text(text_key: str, html_key: str) -> str:
        # Prefer already-extracted text; fall back to converting the HTML.
        value = raw.get(text_key)
        if value is not None:
            return normalize_text(str(value))
        return html_to_text(raw.get(html_key))

    source_url = raw.get("source_url") or raw.get("link") or raw.get("pil_url") or ""
    pil_text = _resolve_text("pribalovy_letak", "pil_html")
    spc_text = _resolve_text("spc", "spc_html")
    combined_text = f"{pil_text} {spc_text}".strip()

    name = raw.get("name") or infer_name(source_url, combined_text)
    sections = extract_sections(combined_text)
    # "metadata" may be missing or null in the raw record.
    raw_metadata = raw.get("metadata") or {}

    return {
        "source_url": source_url,
        "name": name,
        "pil_url": raw.get("pil_url"),
        "spc_url": raw.get("spc_url"),
        "pil_text": pil_text,
        "spc_text": spc_text,
        "sections": sections,
        "metadata": {
            "source": "adc.sk",
            "scraped_at": raw_metadata.get("scraped_at"),
            "parser": "scripts/adc_scraper/parse_adc_json.py",
        },
    }
|
|
|
|
|
|
def main() -> None:
    """Command-line entry point: parse raw ADC data and write structured JSON.

    Reads the file given by ``--input``, converts each record with
    ``parse_record``, skips records whose PIL and SPC text are both empty
    (unless ``--keep-empty`` is set), stops once ``--limit`` records have been
    kept, and writes the result as an indented JSON array to ``--out``.
    """
    cli = argparse.ArgumentParser(description="Parse ADC raw data into structured JSON.")
    cli.add_argument("--input", type=Path, required=True)
    cli.add_argument("--out", type=Path, required=True)
    cli.add_argument("--limit", type=int, default=None)
    cli.add_argument(
        "--keep-empty",
        action="store_true",
        help="Keep records where both PIL and SPC text are empty.",
    )
    args = cli.parse_args()

    parsed = []
    for raw in iter_raw_records(args.input):
        entry = parse_record(raw)
        has_text = bool(entry["pil_text"] or entry["spc_text"])
        if args.keep_empty or has_text:
            parsed.append(entry)
            # A limit of None or 0 means "no limit".
            if args.limit and len(parsed) >= args.limit:
                break

    # Create the output directory on demand so a fresh path just works.
    args.out.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(parsed, ensure_ascii=False, indent=2)
    args.out.write_text(serialized, encoding="utf-8")
    print(f"Saved {len(parsed)} structured records to {args.out}")
|
|
|
|
|
|
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|