DiplomovaPraca/scripts/adc_scraper/scrape_adc_leaflets.py
2026-05-14 12:26:11 +02:00

125 lines
4.0 KiB
Python

"""Download ADC PIL/SPC pages into a raw JSONL file."""
from __future__ import annotations
import argparse
import json
import time
from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import urlparse
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
# HTTP headers passed along with each request made by ``fetch``; the
# User-Agent string identifies this scraper to the remote server.
HEADERS = {"User-Agent": "DiplomaResearchBot/0.1 (+educational use; ADC leaflet KG)"}
def load_links(path: Path) -> list[str]:
    """Read product URLs from a JSON file.

    Two layouts are accepted: a bare JSON array of links, or a JSON object
    whose ``"links"`` key holds that array. Every entry is coerced to ``str``.

    Args:
        path: Location of the UTF-8 encoded JSON file.

    Returns:
        The links in file order. Empty when the object has no ``"links"``
        key or the key is ``null``.

    Raises:
        ValueError: If the top-level JSON value is neither an array nor an
            object, or if ``"links"`` is present but is not an array.
    """
    data = json.loads(path.read_text(encoding="utf-8"))
    if isinstance(data, dict):
        # ``or []`` also covers an explicit ``"links": null``, which would
        # otherwise fail when iterated.
        data = data.get("links") or []
    if not isinstance(data, list):
        # Without this check a string value would be split into characters
        # and a number would fail with an unrelated-looking error.
        raise ValueError(
            f"{path}: expected a JSON array of links or an object with a "
            f"'links' array, got {type(data).__name__}"
        )
    return [str(item) for item in data]
def paired_leaflet_urls(url: str) -> dict[str, str]:
    """Return best-effort PIL/SPC URLs for an ADC product URL.

    The URL path is inspected case-insensitively. When it contains a
    ``/pil/`` segment the URL is taken as the PIL page and the SPC URL is
    derived by swapping that segment for ``/spc/``; the reverse is done for
    a ``/spc/`` URL. Any other URL is returned under ``"detail_url"`` only.

    Args:
        url: Product URL as listed in the links file.

    Returns:
        A dict with either both ``"pil_url"`` and ``"spc_url"``, or only
        ``"detail_url"``.
    """
    import re  # function-scope: the module does not import ``re`` at top level

    def swapped(found: str, wanted: str) -> str:
        # The substitution must ignore case just like the check below does;
        # otherwise a URL such as ".../PIL/..." passes the check but is
        # returned unchanged as its own counterpart.
        return re.sub(re.escape(found), wanted, url, flags=re.IGNORECASE)

    path = urlparse(url).path.lower()
    if "/pil/" in path:
        return {"pil_url": url, "spc_url": swapped("/pil/", "/spc/")}
    if "/spc/" in path:
        return {"spc_url": url, "pil_url": swapped("/spc/", "/pil/")}
    return {"detail_url": url}
def discover_leaflet_urls_from_detail(html: str, base_url: str) -> dict[str, str]:
    """Collect PIL/SPC links from the anchors of a product detail page.

    Every ``<a href=...>`` is resolved against ``base_url`` and its
    lowercased path is checked for the PIL and SPC product prefixes.

    Args:
        html: Markup of the detail page.
        base_url: URL the page was fetched from, used to resolve relative
            links.

    Returns:
        A dict holding ``"pil_url"`` and/or ``"spc_url"`` for the kinds that
        were found. When several anchors match the same kind, the last one
        in document order is kept.
    """
    from urllib.parse import urljoin

    # Checked in this order; an anchor is assigned to the first prefix it matches.
    markers = (
        ("pil_url", "/databazy/produkty/pil/"),
        ("spc_url", "/databazy/produkty/spc/"),
    )

    found: dict[str, str] = {}
    document = BeautifulSoup(html, "lxml")
    for anchor in document.find_all("a", href=True):
        absolute = urljoin(base_url, anchor["href"])
        lowered_path = urlparse(absolute).path.lower()
        for key, marker in markers:
            if marker in lowered_path:
                found[key] = absolute
                break
    return found
def fetch(session: requests.Session, url: str, timeout: int) -> tuple[int, str]:
    """GET ``url`` through ``session`` and return ``(status_code, text)``.

    The request carries the module-level ``HEADERS``. Before the body is
    decoded, the response encoding is overridden with the response's
    ``apparent_encoding``, or with UTF-8 when that value is empty. The status
    code is returned as-is; no check for success is made here.

    Args:
        session: Session used to issue the request.
        url: Address to download.
        timeout: Timeout handed to ``session.get``.
    """
    reply = session.get(url, headers=HEADERS, timeout=timeout)
    detected = reply.apparent_encoding
    reply.encoding = detected if detected else "utf-8"
    return reply.status_code, reply.text
def _fetch_or_error(
    session: requests.Session, url: str, timeout: int
) -> tuple[int | str, str | None]:
    """Fetch ``url`` without raising; a failure becomes an ``"error: ..."`` status."""
    try:
        return fetch(session, url, timeout)
    except Exception as exc:  # best-effort scrape: one bad URL must not end the run
        return f"error: {exc}", None


def main() -> None:
    """Download the PIL/SPC page pair for every input link into a JSONL file.

    Command-line options:
        --links    JSON file with product URLs (read by ``load_links``).
        --out      Destination JSONL file; opened in write mode, so an
                   existing file is overwritten.
        --limit    Process only the first N links; 0 processes none.
        --delay    Seconds to sleep after each HTTP request (default 0.5).
        --timeout  Per-request timeout in seconds (default 30).

    For each link one JSON object is written with the keys ``source_url``,
    ``pil_url``, ``spc_url``, ``pil_status``, ``spc_status``, ``pil_html``,
    ``spc_html`` and ``metadata``. A status is the HTTP status code, or an
    ``"error: ..."`` string when the request failed; HTML is stored only for
    status 200. When the link is neither a PIL nor an SPC URL, it is fetched
    as a detail page to discover the leaflet links, and the outcome of that
    request is stored under ``metadata["detail_status"]``.
    """
    parser = argparse.ArgumentParser(description="Download ADC PIL/SPC HTML pages.")
    parser.add_argument("--links", type=Path, required=True)
    parser.add_argument("--out", type=Path, required=True)
    parser.add_argument("--limit", type=int, default=None)
    parser.add_argument("--delay", type=float, default=0.5)
    parser.add_argument("--timeout", type=int, default=30)
    args = parser.parse_args()

    # Reject bad values before any request is made: a negative limit would
    # silently slice from the end of the list, and a negative delay would
    # only fail later inside time.sleep().
    if args.limit is not None and args.limit < 0:
        parser.error("--limit must be non-negative")
    if args.delay < 0:
        parser.error("--delay must be non-negative")

    links = load_links(args.links)
    # ``is not None`` rather than truthiness, so ``--limit 0`` means "no links"
    # instead of "no limit".
    if args.limit is not None:
        links = links[: args.limit]

    args.out.parent.mkdir(parents=True, exist_ok=True)
    # The session is a context manager, so its connections are released even
    # if the loop is interrupted.
    with requests.Session() as session, args.out.open("w", encoding="utf-8") as out:
        for source_url in tqdm(links, desc="ADC leaflets"):
            urls = paired_leaflet_urls(source_url)

            detail_requested = "detail_url" in urls
            detail_status: int | str | None = None
            if detail_requested:
                detail_status, html = _fetch_or_error(
                    session, urls["detail_url"], args.timeout
                )
                if detail_status == 200:
                    urls.update(
                        discover_leaflet_urls_from_detail(html, urls["detail_url"])
                    )
                time.sleep(args.delay)

            record = {
                "source_url": source_url,
                "pil_url": urls.get("pil_url"),
                "spc_url": urls.get("spc_url"),
                "pil_status": None,
                "spc_status": None,
                "pil_html": None,
                "spc_html": None,
                "metadata": {
                    "source": "adc.sk",
                    "scraped_at": datetime.now(timezone.utc).isoformat(),
                },
            }
            if detail_requested:
                # Additive key: keeps a failed detail lookup visible in the output.
                record["metadata"]["detail_status"] = detail_status

            for kind in ("pil", "spc"):
                url = urls.get(f"{kind}_url")
                if not url:
                    continue
                status, html = _fetch_or_error(session, url, args.timeout)
                record[f"{kind}_status"] = status
                if status == 200:
                    record[f"{kind}_html"] = html
                time.sleep(args.delay)

            out.write(json.dumps(record, ensure_ascii=False) + "\n")

    print(f"Saved raw leaflets to {args.out}")
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()