DiplomovaPraca/scripts/adc_scraper/scrape_adc_index.py
2026-05-14 12:26:11 +02:00

120 lines
3.6 KiB
Python

"""Collect ADC product/PIL/SPC links from index or search pages.
The script is intentionally conservative: it only stores discovered ADC product
URLs and does not try to parse clinical content. The next pipeline step downloads
the actual leaflet pages.
"""
from __future__ import annotations
import argparse
import json
import time
from collections import deque
from pathlib import Path
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
# Request headers sent with every page download; the User-Agent identifies
# the crawler and its purpose to the server.
_USER_AGENT = "DiplomaResearchBot/0.1 (+educational use; ADC leaflet KG)"
DEFAULT_HEADERS = {"User-Agent": _USER_AGENT}
def is_adc_url(url: str) -> bool:
    """Return True when *url* points at adc.sk or one of its subdomains.

    The decision is made on the parsed hostname, so a port or userinfo
    component does not influence it, and look-alike domains that merely end
    with the same characters (for example ``evil-adc.sk``) are rejected.
    Relative URLs (no host) and URLs that cannot be parsed return False.
    """
    try:
        # .hostname is already lower-cased and stripped of port/userinfo.
        host = urlparse(url).hostname
    except ValueError:
        # urlparse raises on e.g. an unbalanced IPv6 bracket in the netloc.
        return False
    if not host:
        return False
    # Match on a label boundary: the bare domain or anything under ".adc.sk".
    return host == "adc.sk" or host.endswith(".adc.sk")
def is_product_like_url(url: str) -> bool:
    """Tell whether the path of *url* looks like a product document page.

    The lower-cased path must contain ``/databazy/produkty/`` together with at
    least one of the segments ``/pil/``, ``/spc/`` or ``/detail/``. Only the
    path is inspected; query string and fragment are ignored.
    """
    lowered_path = urlparse(url).path.lower()
    if "/databazy/produkty/" not in lowered_path:
        return False
    document_segments = ("/pil/", "/spc/", "/detail/")
    return any(segment in lowered_path for segment in document_segments)
def extract_links(html: str, base_url: str) -> tuple[set[str], set[str]]:
    """Collect ADC links from the anchors of one HTML page.

    Every ``<a href>`` is resolved against *base_url* and its fragment is
    dropped. Links outside adc.sk are ignored.

    Returns:
        A pair ``(product_links, crawl_links)``. ``product_links`` holds the
        URLs accepted by ``is_product_like_url``; ``crawl_links`` holds every
        ADC URL whose path contains ``/databazy/produkty/`` (a superset of the
        product links, used to continue the crawl).
    """
    soup = BeautifulSoup(html, "lxml")
    product_links: set[str] = set()
    crawl_links: set[str] = set()
    for tag in soup.find_all("a", href=True):
        try:
            # Strip surrounding whitespace: it is not part of the URL and would
            # otherwise end up percent-encoded in the request path.
            absolute = urljoin(base_url, tag["href"].strip())
        except ValueError:
            # urljoin raises on malformed hrefs (e.g. a broken IPv6 bracket);
            # one bad anchor must not abort processing of the whole page.
            continue
        url = absolute.split("#", 1)[0]
        if not is_adc_url(url):
            continue
        if is_product_like_url(url):
            product_links.add(url)
        if "/databazy/produkty/" in urlparse(url).path.lower():
            crawl_links.add(url)
    return product_links, crawl_links
def fetch(session: requests.Session, url: str, timeout: int) -> str:
    """Download *url* through *session* and return the decoded body text.

    The request carries ``DEFAULT_HEADERS``. A non-success HTTP status raises
    via ``raise_for_status``. The body is decoded with the encoding detected
    from the content, falling back to UTF-8 when detection yields nothing.
    """
    reply = session.get(url, timeout=timeout, headers=DEFAULT_HEADERS)
    reply.raise_for_status()
    detected = reply.apparent_encoding
    reply.encoding = detected if detected else "utf-8"
    return reply.text
def main() -> None:
    """Crawl ADC listing pages breadth-first and save the product links found.

    Command-line options:
        --start-url  Seed URL; may be supplied multiple times.
        --out        JSON file to write (parent directories are created).
        --max-pages  Maximum number of pages to request (default 20).
        --delay      Pause in seconds between consecutive requests (default 0.5).
        --timeout    Per-request timeout in seconds (default 30).

    The output JSON has the keys ``source``, ``visited_pages`` (every URL that
    was requested, including those whose download failed) and ``links`` (the
    product URLs discovered).
    """
    parser = argparse.ArgumentParser(description="Collect ADC product/PIL/SPC links.")
    parser.add_argument(
        "--start-url",
        action="append",
        required=True,
        help="ADC index/search URL. Can be supplied multiple times.",
    )
    parser.add_argument("--out", type=Path, required=True, help="Output JSON file.")
    parser.add_argument("--max-pages", type=int, default=20)
    parser.add_argument("--delay", type=float, default=0.5)
    parser.add_argument("--timeout", type=int, default=30)
    args = parser.parse_args()

    # De-duplicate the seeds while keeping their command-line order.
    start_urls = list(dict.fromkeys(args.start_url))
    queue: deque[str] = deque(start_urls)
    # Every URL ever placed in the queue (waiting or already requested).
    # Checking against it keeps duplicates out of the queue so they cannot
    # use up the frontier budget below.
    enqueued: set[str] = set(start_urls)
    visited: set[str] = set()
    product_links: set[str] = set()
    # The frontier is capped at four times the page budget to bound memory.
    frontier_cap = args.max_pages * 4

    # The session is a context manager so its connections are always released.
    with requests.Session() as session, tqdm(
        total=args.max_pages, desc="ADC pages"
    ) as progress:
        while queue and len(visited) < args.max_pages:
            url = queue.popleft()
            if visited:
                # Pause before every request except the first one, so the
                # delay also applies after a failed download and no time is
                # wasted sleeping after the final page.
                time.sleep(args.delay)
            visited.add(url)
            try:
                html = fetch(session, url, args.timeout)
            except Exception as exc:
                # Best effort: report the page and carry on with the crawl.
                tqdm.write(f"Skip {url}: {exc}")
            else:
                found_products, found_crawl = extract_links(html, url)
                product_links.update(found_products)
                # Sorted for a deterministic crawl order across runs.
                for link in sorted(found_crawl):
                    if link not in enqueued and len(enqueued) < frontier_cap:
                        enqueued.add(link)
                        queue.append(link)
            progress.update(1)

    args.out.parent.mkdir(parents=True, exist_ok=True)
    payload = {
        "source": "adc.sk",
        "visited_pages": sorted(visited),
        "links": sorted(product_links),
    }
    args.out.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"Saved {len(product_links)} links to {args.out}")
# Run the crawler only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()