"""Collect ADC product/PIL/SPC links from index or search pages. The script is intentionally conservative: it only stores discovered ADC product URLs and does not try to parse clinical content. The next pipeline step downloads the actual leaflet pages. """ from __future__ import annotations import argparse import json import time from collections import deque from pathlib import Path from urllib.parse import urljoin, urlparse import requests from bs4 import BeautifulSoup from tqdm import tqdm DEFAULT_HEADERS = { "User-Agent": "DiplomaResearchBot/0.1 (+educational use; ADC leaflet KG)", } def is_adc_url(url: str) -> bool: host = urlparse(url).netloc.lower() return host.endswith("adc.sk") def is_product_like_url(url: str) -> bool: path = urlparse(url).path.lower() return "/databazy/produkty/" in path and ( "/pil/" in path or "/spc/" in path or "/detail/" in path ) def extract_links(html: str, base_url: str) -> tuple[set[str], set[str]]: soup = BeautifulSoup(html, "lxml") product_links: set[str] = set() crawl_links: set[str] = set() for tag in soup.find_all("a", href=True): url = urljoin(base_url, tag["href"]).split("#", 1)[0] if not is_adc_url(url): continue if is_product_like_url(url): product_links.add(url) path = urlparse(url).path.lower() if "/databazy/produkty/" in path: crawl_links.add(url) return product_links, crawl_links def fetch(session: requests.Session, url: str, timeout: int) -> str: response = session.get(url, headers=DEFAULT_HEADERS, timeout=timeout) response.raise_for_status() response.encoding = response.apparent_encoding or "utf-8" return response.text def main() -> None: parser = argparse.ArgumentParser(description="Collect ADC product/PIL/SPC links.") parser.add_argument( "--start-url", action="append", required=True, help="ADC index/search URL. Can be supplied multiple times.", ) parser.add_argument("--out", type=Path, required=True, help="Output JSON file.") parser.add_argument("--max-pages", type=int, default=20) parser.add_argument("--delay", type=float, default=0.5) parser.add_argument("--timeout", type=int, default=30) args = parser.parse_args() queue: deque[str] = deque(args.start_url) visited: set[str] = set() product_links: set[str] = set() session = requests.Session() with tqdm(total=args.max_pages, desc="ADC pages") as progress: while queue and len(visited) < args.max_pages: url = queue.popleft() if url in visited: continue visited.add(url) try: html = fetch(session, url, args.timeout) except Exception as exc: tqdm.write(f"Skip {url}: {exc}") progress.update(1) continue found_products, found_crawl = extract_links(html, url) product_links.update(found_products) for link in sorted(found_crawl): if link not in visited and len(visited) + len(queue) < args.max_pages * 4: queue.append(link) progress.update(1) time.sleep(args.delay) args.out.parent.mkdir(parents=True, exist_ok=True) payload = { "source": "adc.sk", "visited_pages": sorted(visited), "links": sorted(product_links), } args.out.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8") print(f"Saved {len(product_links)} links to {args.out}") if __name__ == "__main__": main()