"""Download ADC PIL/SPC pages into a raw JSONL file.""" from __future__ import annotations import argparse import json import time from datetime import datetime, timezone from pathlib import Path from urllib.parse import urlparse import requests from bs4 import BeautifulSoup from tqdm import tqdm HEADERS = { "User-Agent": "DiplomaResearchBot/0.1 (+educational use; ADC leaflet KG)", } def load_links(path: Path) -> list[str]: data = json.loads(path.read_text(encoding="utf-8")) if isinstance(data, list): return [str(x) for x in data] return [str(x) for x in data.get("links", [])] def paired_leaflet_urls(url: str) -> dict[str, str]: """Return best-effort PIL/SPC URLs for an ADC product URL.""" urls: dict[str, str] = {} path = urlparse(url).path.lower() if "/pil/" in path: urls["pil_url"] = url urls["spc_url"] = url.replace("/pil/", "/spc/") elif "/spc/" in path: urls["spc_url"] = url urls["pil_url"] = url.replace("/spc/", "/pil/") else: urls["detail_url"] = url return urls def discover_leaflet_urls_from_detail(html: str, base_url: str) -> dict[str, str]: from urllib.parse import urljoin soup = BeautifulSoup(html, "lxml") result: dict[str, str] = {} for tag in soup.find_all("a", href=True): candidate = urljoin(base_url, tag["href"]) path = urlparse(candidate).path.lower() if "/databazy/produkty/pil/" in path: result["pil_url"] = candidate elif "/databazy/produkty/spc/" in path: result["spc_url"] = candidate return result def fetch(session: requests.Session, url: str, timeout: int) -> tuple[int, str]: response = session.get(url, headers=HEADERS, timeout=timeout) response.encoding = response.apparent_encoding or "utf-8" return response.status_code, response.text def main() -> None: parser = argparse.ArgumentParser(description="Download ADC PIL/SPC HTML pages.") parser.add_argument("--links", type=Path, required=True) parser.add_argument("--out", type=Path, required=True) parser.add_argument("--limit", type=int, default=None) parser.add_argument("--delay", type=float, default=0.5) parser.add_argument("--timeout", type=int, default=30) args = parser.parse_args() links = load_links(args.links) if args.limit: links = links[: args.limit] args.out.parent.mkdir(parents=True, exist_ok=True) session = requests.Session() with args.out.open("w", encoding="utf-8") as out: for source_url in tqdm(links, desc="ADC leaflets"): urls = paired_leaflet_urls(source_url) if "detail_url" in urls: status, html = fetch(session, urls["detail_url"], args.timeout) if status == 200: urls.update(discover_leaflet_urls_from_detail(html, urls["detail_url"])) time.sleep(args.delay) record = { "source_url": source_url, "pil_url": urls.get("pil_url"), "spc_url": urls.get("spc_url"), "pil_status": None, "spc_status": None, "pil_html": None, "spc_html": None, "metadata": { "source": "adc.sk", "scraped_at": datetime.now(timezone.utc).isoformat(), }, } for kind in ("pil", "spc"): url = urls.get(f"{kind}_url") if not url: continue try: status, html = fetch(session, url, args.timeout) record[f"{kind}_status"] = status if status == 200: record[f"{kind}_html"] = html except Exception as exc: record[f"{kind}_status"] = f"error: {exc}" time.sleep(args.delay) out.write(json.dumps(record, ensure_ascii=False) + "\n") print(f"Saved raw leaflets to {args.out}") if __name__ == "__main__": main()