# File viewer header (extraction residue): 125 lines, 4.0 KiB, Python
"""Download ADC PIL/SPC pages into a raw JSONL file."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import time
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from urllib.parse import urlparse
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
from tqdm import tqdm
|
|
|
|
|
|
# HTTP request headers for outgoing requests; the User-Agent string
# identifies this scraper and its educational purpose.
HEADERS: dict[str, str] = {
    "User-Agent": "DiplomaResearchBot/0.1 (+educational use; ADC leaflet KG)",
}
|
|
|
|
|
|
def load_links(path: Path) -> list[str]:
    """Load the list of product URLs from a JSON file.

    Two layouts are accepted: a bare JSON array of links, or a JSON object
    whose ``"links"`` key holds that array. A missing or ``null`` ``"links"``
    entry is treated as an empty list.

    Args:
        path: Location of the UTF-8 encoded JSON file.

    Returns:
        Every link converted to ``str``, in file order.

    Raises:
        ValueError: If the top-level value is neither an array nor an object,
            or if ``"links"`` is present but is not an array.
    """
    data = json.loads(path.read_text(encoding="utf-8"))

    if isinstance(data, list):
        links = data
    elif isinstance(data, dict):
        links = data.get("links")
        if links is None:
            # Absent key or explicit null: nothing to download.
            links = []
    else:
        raise ValueError(
            f"{path}: expected a JSON array or an object with a 'links' array, "
            f"got {type(data).__name__}"
        )

    if not isinstance(links, list):
        # Guards against e.g. a single string, which would otherwise be
        # silently split into one "link" per character.
        raise ValueError(
            f"{path}: 'links' must be a JSON array, got {type(links).__name__}"
        )

    return [str(item) for item in links]
|
|
|
|
|
|
def paired_leaflet_urls(url: str) -> dict[str, str]:
    """Return best-effort PIL/SPC URLs for an ADC product URL.

    If the URL path contains a ``/pil/`` or ``/spc/`` segment (matched
    case-insensitively), the counterpart URL is derived by swapping that
    segment. Otherwise the URL is returned as a detail page to be inspected
    by the caller.

    Args:
        url: Product URL taken from the links file.

    Returns:
        ``{"pil_url": ..., "spc_url": ...}`` when the path identifies a
        leaflet page (the input URL is stored unchanged under its own kind),
        or ``{"detail_url": url}`` when it does not.
    """
    import re  # function-scope import: only needed for the case-insensitive swap

    parsed = urlparse(url)
    lowered_path = parsed.path.lower()

    # "/pil/" is checked first, matching the precedence of the original
    # if/elif chain when a path happens to contain both segments.
    for kind, other in (("pil", "spc"), ("spc", "pil")):
        if f"/{kind}/" not in lowered_path:
            continue
        # Swap inside the path only and ignore case, so the substitution
        # agrees with the case-insensitive detection above and never rewrites
        # a look-alike segment in the query string or fragment.
        swapped_path = re.sub(
            f"/{kind}/", f"/{other}/", parsed.path, flags=re.IGNORECASE
        )
        return {
            f"{kind}_url": url,
            f"{other}_url": parsed._replace(path=swapped_path).geturl(),
        }

    return {"detail_url": url}
|
|
|
|
|
|
def discover_leaflet_urls_from_detail(html: str, base_url: str) -> dict[str, str]:
    """Scan a detail page for links to its PIL and SPC documents.

    Every ``<a href=...>`` in ``html`` is resolved against ``base_url`` and
    its lower-cased path is checked for the PIL or SPC path marker. When
    several anchors match the same kind, the last one in the document wins.

    Args:
        html: Markup of the detail page.
        base_url: URL the page was fetched from, used to resolve relative hrefs.

    Returns:
        A dict holding ``"pil_url"`` and/or ``"spc_url"`` for the kinds that
        were found; empty when no anchor matches.
    """
    from urllib.parse import urljoin

    # Checked in order; an anchor is attributed to the first marker it matches.
    markers = (
        ("pil_url", "/databazy/produkty/pil/"),
        ("spc_url", "/databazy/produkty/spc/"),
    )

    found: dict[str, str] = {}
    for anchor in BeautifulSoup(html, "lxml").find_all("a", href=True):
        absolute = urljoin(base_url, anchor["href"])
        lowered_path = urlparse(absolute).path.lower()
        for key, marker in markers:
            if marker in lowered_path:
                found[key] = absolute
                break
    return found
|
|
|
|
|
|
def fetch(session: requests.Session, url: str, timeout: int) -> tuple[int, str]:
    """GET ``url`` through ``session`` and return its status code and text.

    The request is sent with the module-level ``HEADERS``. Before the body is
    decoded, the response encoding is overridden with the response's
    ``apparent_encoding``, falling back to UTF-8 when that value is empty.

    Args:
        session: Session used to issue the request.
        url: Address to download.
        timeout: Timeout value forwarded to ``session.get``.

    Returns:
        A ``(status_code, text)`` pair. Non-200 statuses are returned as-is;
        no exception is raised for them here.
    """
    reply = session.get(url, headers=HEADERS, timeout=timeout)
    detected = reply.apparent_encoding
    reply.encoding = detected if detected else "utf-8"
    return reply.status_code, reply.text
|
|
|
|
|
|
def main() -> None:
    """Command-line entry point: download PIL/SPC pages into a JSONL file.

    For every link in the ``--links`` file the PIL and SPC URLs are derived
    (directly from the URL, or by scanning the detail page for leaflet
    links), both pages are fetched, and one JSON record per source link is
    written to ``--out``. A failed download never aborts the run: PIL/SPC
    failures are stored in the record's ``*_status`` field as
    ``"error: ..."``, and a failed detail-page fetch is reported on the
    console while the record is still written with empty leaflet fields.
    """
    parser = argparse.ArgumentParser(description="Download ADC PIL/SPC HTML pages.")
    parser.add_argument(
        "--links", type=Path, required=True,
        help="JSON file with a list of links, or an object with a 'links' list",
    )
    parser.add_argument(
        "--out", type=Path, required=True,
        help="output JSONL file (overwritten if it exists)",
    )
    parser.add_argument(
        "--limit", type=int, default=None,
        help="process only the first N links (default: all)",
    )
    parser.add_argument(
        "--delay", type=float, default=0.5,
        help="seconds to sleep after each request",
    )
    parser.add_argument(
        "--timeout", type=int, default=30,
        help="per-request timeout passed to the HTTP client",
    )
    args = parser.parse_args()

    # Compare against None so that an explicit "--limit 0" means "no links"
    # instead of being mistaken for "no limit"; a negative value would
    # silently slice from the end of the list, so reject it up front.
    if args.limit is not None and args.limit < 0:
        parser.error("--limit must be a non-negative integer")

    links = load_links(args.links)
    if args.limit is not None:
        links = links[: args.limit]

    args.out.parent.mkdir(parents=True, exist_ok=True)

    # The session is a context manager so it is closed even when the loop
    # is interrupted.
    with requests.Session() as session, args.out.open("w", encoding="utf-8") as out:
        for source_url in tqdm(links, desc="ADC leaflets"):
            urls = paired_leaflet_urls(source_url)

            if "detail_url" in urls:
                detail_url = urls["detail_url"]
                try:
                    status, html = fetch(session, detail_url, args.timeout)
                except Exception as exc:
                    # Best-effort, like the PIL/SPC fetches below: one
                    # unreachable detail page must not abort the whole run.
                    tqdm.write(f"Detail page fetch failed for {detail_url}: {exc}")
                else:
                    if status == 200:
                        urls.update(discover_leaflet_urls_from_detail(html, detail_url))
                time.sleep(args.delay)

            record = {
                "source_url": source_url,
                "pil_url": urls.get("pil_url"),
                "spc_url": urls.get("spc_url"),
                "pil_status": None,
                "spc_status": None,
                "pil_html": None,
                "spc_html": None,
                "metadata": {
                    "source": "adc.sk",
                    "scraped_at": datetime.now(timezone.utc).isoformat(),
                },
            }

            for kind in ("pil", "spc"):
                url = urls.get(f"{kind}_url")
                if not url:
                    continue
                try:
                    status, html = fetch(session, url, args.timeout)
                except Exception as exc:
                    # Deliberately broad: any failure is recorded in the
                    # output instead of stopping the crawl.
                    record[f"{kind}_status"] = f"error: {exc}"
                else:
                    record[f"{kind}_status"] = status
                    if status == 200:
                        record[f"{kind}_html"] = html
                time.sleep(args.delay)

            out.write(json.dumps(record, ensure_ascii=False) + "\n")

    print(f"Saved raw leaflets to {args.out}")
|
|
|
|
|
|
# Run the downloader only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|