# DiplomovaPraca/scripts/adc_scraper/scrape_adc_index.py

"""Collect ADC product/PIL/SPC links from index or search pages.

The script is intentionally conservative: it only stores discovered ADC product
URLs and does not try to parse clinical content. The next pipeline step downloads
the actual leaflet pages.
"""

from __future__ import annotations

import argparse
import json
import time
from collections import deque
from pathlib import Path
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm


# HTTP headers passed with every request issued by fetch(); the User-Agent
# string identifies this crawler to the server.
DEFAULT_HEADERS = {
    "User-Agent": "DiplomaResearchBot/0.1 (+educational use; ADC leaflet KG)",
}


def is_adc_url(url: str) -> bool:
    """Return True when *url* points at adc.sk or one of its subdomains.

    The check is made on the parsed hostname (no port, no credentials), and
    requires either an exact match or a ``.adc.sk`` suffix, so look-alike
    domains such as ``notadc.sk`` are rejected while ``www.adc.sk:443`` is
    accepted. Relative or unparsable URLs yield False.
    """
    try:
        host = urlparse(url).hostname
    except ValueError:
        # urlparse rejects some malformed authorities (e.g. an unclosed "[").
        return False
    if not host:
        return False

    # `hostname` is already lower-cased; drop the optional root-label dot.
    host = host.rstrip(".")
    return host == "adc.sk" or host.endswith(".adc.sk")


def is_product_like_url(url: str) -> bool:
    """Tell whether *url* looks like a product PIL, SPC or detail page.

    Only the (lower-cased) path component is inspected: it must contain the
    product database prefix and at least one of the page-type segments.
    """
    lowered_path = urlparse(url).path.lower()
    if "/databazy/produkty/" not in lowered_path:
        return False
    page_markers = ("/pil/", "/spc/", "/detail/")
    return any(marker in lowered_path for marker in page_markers)


def extract_links(html: str, base_url: str) -> tuple[set[str], set[str]]:
    """Extract ADC links from one HTML page.

    Every ``<a href>`` is resolved against *base_url* and stripped of its
    fragment. Links outside adc.sk are ignored, and hrefs that cannot be
    parsed as URLs are skipped instead of aborting the whole page.

    Returns:
        A ``(product_links, crawl_links)`` pair. ``product_links`` holds URLs
        accepted by ``is_product_like_url``; ``crawl_links`` holds every ADC
        URL whose path contains ``/databazy/produkty/`` (product URLs
        included), i.e. the pages worth following further.
    """
    soup = BeautifulSoup(html, "lxml")
    product_links: set[str] = set()
    crawl_links: set[str] = set()

    for tag in soup.find_all("a", href=True):
        try:
            # Surrounding whitespace is not part of the URL; the fragment
            # does not change which document is fetched.
            url = urljoin(base_url, tag["href"].strip()).split("#", 1)[0]
            on_site = is_adc_url(url)
            path = urlparse(url).path.lower()
        except ValueError:
            # urllib raises ValueError for malformed authorities such as
            # "http://[bad"; one broken link must not lose the whole page.
            continue

        if not on_site:
            continue
        if is_product_like_url(url):
            product_links.add(url)
        if "/databazy/produkty/" in path:
            crawl_links.add(url)

    return product_links, crawl_links


def fetch(session: requests.Session, url: str, timeout: int) -> str:
    """Download *url* through *session* and return the decoded body text.

    The request carries ``DEFAULT_HEADERS`` and the given *timeout*.
    ``raise_for_status()`` is called on the response, so error statuses
    surface as exceptions. The body is decoded using the encoding detected
    from the content, with UTF-8 as the fallback when none is detected.
    """
    reply = session.get(url, timeout=timeout, headers=DEFAULT_HEADERS)
    reply.raise_for_status()

    detected = reply.apparent_encoding
    reply.encoding = detected if detected else "utf-8"
    return reply.text


def main() -> None:
    """Crawl ADC index/search pages breadth-first and save discovered links.

    Starting from the ``--start-url`` values, at most ``--max-pages`` pages
    are fetched. Links found on each page are classified by
    ``extract_links``; product links are accumulated and crawlable links are
    appended to a bounded queue. The result is written to ``--out`` as JSON
    with the keys ``source``, ``visited_pages`` and ``links``.

    A page that fails to download or parse is reported and skipped; the
    crawl continues with the next queued URL.
    """
    parser = argparse.ArgumentParser(description="Collect ADC product/PIL/SPC links.")
    parser.add_argument(
        "--start-url",
        action="append",
        required=True,
        help="ADC index/search URL. Can be supplied multiple times.",
    )
    parser.add_argument("--out", type=Path, required=True, help="Output JSON file.")
    parser.add_argument("--max-pages", type=int, default=20)
    parser.add_argument("--delay", type=float, default=0.5)
    parser.add_argument("--timeout", type=int, default=30)
    args = parser.parse_args()

    queue: deque[str] = deque(args.start_url)
    # Every URL that was ever enqueued. Without it, a link repeated on many
    # pages would occupy several slots of the bounded frontier.
    seen: set[str] = set(queue)
    visited: set[str] = set()
    product_links: set[str] = set()
    # Upper bound on visited + queued URLs, so the frontier cannot grow
    # without limit on link-heavy pages.
    frontier_cap = args.max_pages * 4

    # The session is a context manager; leaving the block releases its
    # pooled connections.
    with requests.Session() as session:
        with tqdm(total=args.max_pages, desc="ADC pages") as progress:
            while queue and len(visited) < args.max_pages:
                url = queue.popleft()
                if url in visited:
                    continue
                visited.add(url)

                try:
                    html = fetch(session, url, args.timeout)
                    found_products, found_crawl = extract_links(html, url)
                except Exception as exc:
                    # Deliberately broad: this is the best-effort boundary of
                    # the crawl, one bad page must not discard the results
                    # gathered so far.
                    tqdm.write(f"Skip {url}: {exc}")
                else:
                    product_links.update(found_products)
                    # Sorted for a deterministic crawl order between runs.
                    for link in sorted(found_crawl):
                        if link in seen:
                            continue
                        if len(visited) + len(queue) >= frontier_cap:
                            break
                        seen.add(link)
                        queue.append(link)

                progress.update(1)
                # Pause after failures as well, so a misbehaving server is
                # not hit with back-to-back requests.
                if args.delay > 0:
                    time.sleep(args.delay)

    args.out.parent.mkdir(parents=True, exist_ok=True)
    payload = {
        "source": "adc.sk",
        "visited_pages": sorted(visited),
        "links": sorted(product_links),
    }
    args.out.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"Saved {len(product_links)} links to {args.out}")


# Run the crawler only when executed as a script, not when imported.
if __name__ == "__main__":
    main()