120 lines
3.6 KiB
Python
120 lines
3.6 KiB
Python
"""Collect ADC product/PIL/SPC links from index or search pages.

The script is intentionally conservative: it only stores discovered ADC product
URLs and does not try to parse clinical content. The next pipeline step
downloads the actual leaflet pages.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import time
|
|
from collections import deque
|
|
from pathlib import Path
|
|
from urllib.parse import urljoin, urlparse
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
from tqdm import tqdm
|
|
|
|
|
|
# HTTP headers for outgoing requests; the User-Agent identifies this crawler
# and its educational purpose to the remote server.
DEFAULT_HEADERS: dict[str, str] = {
    "User-Agent": "DiplomaResearchBot/0.1 (+educational use; ADC leaflet KG)",
}
|
|
|
|
|
|
def is_adc_url(url: str) -> bool:
    """Return True when *url* is hosted on adc.sk or one of its subdomains.

    The check is made on the parsed hostname, so a port or userinfo in the
    URL does not affect the result, and the match is anchored on a label
    boundary so look-alike domains such as ``notadc.sk`` are rejected.

    Args:
        url: Absolute URL to classify. URLs without a network location
            (relative paths, ``mailto:`` links, ...) are never ADC URLs.

    Returns:
        True for ``adc.sk`` and ``*.adc.sk`` hosts, False otherwise.
    """
    # ``hostname`` is lower-cased by urllib and excludes port/credentials;
    # it is None when the URL has no network location at all.
    host = (urlparse(url).hostname or "").rstrip(".")
    return host == "adc.sk" or host.endswith(".adc.sk")
|
|
|
|
|
|
def is_product_like_url(url: str) -> bool:
    """Return True when the URL path looks like an ADC product document.

    A path qualifies when it contains ``/databazy/produkty/`` together with
    at least one of the ``/pil/``, ``/spc/`` or ``/detail/`` segments. Only
    the path component is inspected, case-insensitively.
    """
    lowered_path = urlparse(url).path.lower()
    if "/databazy/produkty/" not in lowered_path:
        return False
    return any(
        marker in lowered_path for marker in ("/pil/", "/spc/", "/detail/")
    )
|
|
|
|
|
|
def extract_links(html: str, base_url: str) -> tuple[set[str], set[str]]:
    """Collect ADC product links and further crawl candidates from a page.

    Every ``<a href>`` is resolved against *base_url* and stripped of its
    fragment. Links that are not on ADC are ignored. Hrefs that cannot be
    parsed as URLs are skipped instead of aborting the whole page.

    Args:
        html: Raw HTML of the fetched page.
        base_url: URL the page was fetched from, used to resolve relative
            hrefs.

    Returns:
        A ``(product_links, crawl_links)`` pair. ``product_links`` holds the
        URLs accepted by ``is_product_like_url``; ``crawl_links`` holds every
        ADC URL whose path contains ``/databazy/produkty/`` (a superset of
        the product links).
    """
    soup = BeautifulSoup(html, "lxml")
    product_links: set[str] = set()
    crawl_links: set[str] = set()

    for tag in soup.find_all("a", href=True):
        try:
            # Drop the fragment so "page#a" and "page#b" collapse to one URL.
            url = urljoin(base_url, tag["href"].strip()).split("#", 1)[0]
            if not is_adc_url(url):
                continue
            path = urlparse(url).path.lower()
        except ValueError:
            # urllib raises ValueError for malformed hrefs (for example
            # unmatched "[" in the host); one bad link must not stop the crawl.
            continue

        if is_product_like_url(url):
            product_links.add(url)
        if "/databazy/produkty/" in path:
            crawl_links.add(url)

    return product_links, crawl_links
|
|
|
|
|
|
def fetch(session: requests.Session, url: str, timeout: int) -> str:
    """Download *url* through *session* and return the decoded body text.

    The request is sent with ``DEFAULT_HEADERS``. A non-success HTTP status
    raises via ``raise_for_status``. The body is decoded with the encoding
    reported by ``apparent_encoding``, falling back to UTF-8 when that value
    is empty.

    Args:
        session: Session used to issue the GET request.
        url: Address of the page to download.
        timeout: Timeout passed to the request, in seconds.

    Returns:
        The response body as text.
    """
    reply = session.get(url, timeout=timeout, headers=DEFAULT_HEADERS)
    reply.raise_for_status()
    detected = reply.apparent_encoding
    reply.encoding = detected if detected else "utf-8"
    return reply.text
|
|
|
|
|
|
def main() -> None:
    """Crawl ADC index/search pages breadth-first and save product links.

    Starting from the ``--start-url`` values, pages are fetched one at a time
    until ``--max-pages`` pages have been attempted or the queue runs dry.
    Product-like links found on each page are accumulated, and links under
    the product database are queued for further crawling. The result is
    written to ``--out`` as JSON with the keys ``source``, ``visited_pages``
    and ``links``.

    Pages that fail to download are reported and skipped; they still count
    towards ``--max-pages`` and appear in ``visited_pages``.
    """
    parser = argparse.ArgumentParser(description="Collect ADC product/PIL/SPC links.")
    parser.add_argument(
        "--start-url",
        action="append",
        required=True,
        help="ADC index/search URL. Can be supplied multiple times.",
    )
    parser.add_argument("--out", type=Path, required=True, help="Output JSON file.")
    parser.add_argument("--max-pages", type=int, default=20)
    parser.add_argument("--delay", type=float, default=0.5)
    parser.add_argument("--timeout", type=int, default=30)
    args = parser.parse_args()

    # dict.fromkeys drops repeated start URLs while keeping their order.
    queue: deque[str] = deque(dict.fromkeys(args.start_url))
    # Every URL ever enqueued. Guarantees a URL enters the queue at most once,
    # so duplicates cannot eat up the frontier budget below.
    queued: set[str] = set(queue)
    visited: set[str] = set()
    product_links: set[str] = set()
    # Bound the frontier so a link-heavy site cannot grow the queue unbounded.
    frontier_cap = args.max_pages * 4

    # The session is closed on exit so pooled connections are released.
    with requests.Session() as session:
        with tqdm(total=args.max_pages, desc="ADC pages") as progress:
            while queue and len(visited) < args.max_pages:
                url = queue.popleft()
                visited.add(url)

                try:
                    html = fetch(session, url, args.timeout)
                except Exception as exc:
                    # Deliberate best-effort boundary: report and move on.
                    tqdm.write(f"Skip {url}: {exc}")
                else:
                    found_products, found_crawl = extract_links(html, url)
                    product_links.update(found_products)

                    # Sorted for a deterministic crawl order between runs.
                    for link in sorted(found_crawl):
                        if link in queued:
                            continue
                        if len(visited) + len(queue) < frontier_cap:
                            queue.append(link)
                            queued.add(link)

                progress.update(1)
                # Rate-limit every request, including failed ones, so a
                # struggling server is not hit in a tight loop.
                time.sleep(args.delay)

    args.out.parent.mkdir(parents=True, exist_ok=True)
    payload = {
        "source": "adc.sk",
        "visited_pages": sorted(visited),
        "links": sorted(product_links),
    }
    args.out.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"Saved {len(product_links)} links to {args.out}")
|
|
|
|
|
|
# Run the crawler only when this file is executed as a script, so importing
# the module has no side effects.
if __name__ == "__main__":
    main()
|