"""Scrape product detail links from ADC product listing pages. Example: python scripts/adc_scraper/scrape_adc_product_links.py --out data_adc_databaza/adc_scrape_2026_05_04/adc_product_links.json """ from __future__ import annotations import argparse import json import time from pathlib import Path from urllib.parse import urljoin import requests from bs4 import BeautifulSoup from tqdm import tqdm BASE_URL = "https://www.adc.sk" LISTING_URL = "https://www.adc.sk/databazy/produkty?page={page}&ord=a1" DEFAULT_PAGES = 711 HEADERS = { "User-Agent": ( "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " "AppleWebKit/537.36 (KHTML, like Gecko) " "Chrome/124.0.0.0 Safari/537.36" ), "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "Accept-Language": "sk-SK,sk;q=0.9,cs;q=0.8,en;q=0.7", "Cache-Control": "no-cache", "Pragma": "no-cache", "Referer": "https://www.adc.sk/databazy/produkty", } def fetch_page(session: requests.Session, page: int, timeout: int, retries: int) -> str: url = LISTING_URL.format(page=page) last_error: Exception | None = None for attempt in range(1, retries + 1): try: response = session.get(url, headers=HEADERS, timeout=timeout) response.raise_for_status() response.encoding = response.apparent_encoding or "utf-8" return response.text except Exception as exc: last_error = exc if attempt < retries: time.sleep(1.5 * attempt) raise RuntimeError(f"Failed to fetch page {page}: {last_error}") def extract_product_links(html: str) -> list[str]: soup = BeautifulSoup(html, "lxml") links: list[str] = [] for tag in soup.select('a.product[href^="/databazy/produkty/detail/"]'): href = tag.get("href") if not href: continue links.append(urljoin(BASE_URL, href)) return links def scrape_with_requests( start_page: int, pages: int, delay: float, timeout: int, retries: int, ) -> tuple[list[str], list[int]]: session = requests.Session() seen: set[str] = set() all_links: list[str] = [] failed_pages: list[int] = [] end_page = start_page + pages - 1 for page in tqdm(range(start_page, end_page + 1), desc="ADC pages"): try: html = fetch_page(session, page, timeout, retries) page_links = extract_product_links(html) except Exception as exc: tqdm.write(str(exc)) failed_pages.append(page) continue for link in page_links: if link not in seen: seen.add(link) all_links.append(link) time.sleep(delay) return all_links, failed_pages def scrape_with_browser(start_page: int, pages: int, delay: float) -> tuple[list[str], list[int]]: try: from playwright.sync_api import sync_playwright except ImportError as exc: raise SystemExit( "Playwright is not installed. Run: pip install playwright; python -m playwright install chromium" ) from exc seen: set[str] = set() all_links: list[str] = [] failed_pages: list[int] = [] end_page = start_page + pages - 1 with sync_playwright() as playwright: browser = playwright.chromium.launch(headless=True) page_obj = browser.new_page( user_agent=HEADERS["User-Agent"], locale="sk-SK", viewport={"width": 1366, "height": 900}, ) for page in tqdm(range(start_page, end_page + 1), desc="ADC pages"): url = LISTING_URL.format(page=page) try: response = page_obj.goto(url, wait_until="domcontentloaded", timeout=60000) if response is None or response.status >= 400: status = response.status if response else "no-response" raise RuntimeError(f"HTTP {status}") html = page_obj.content() page_links = extract_product_links(html) except Exception as exc: tqdm.write(f"Failed page {page}: {exc}") failed_pages.append(page) continue for link in page_links: if link not in seen: seen.add(link) all_links.append(link) time.sleep(delay) browser.close() return all_links, failed_pages def main() -> None: parser = argparse.ArgumentParser(description="Scrape ADC product detail links.") parser.add_argument("--out", type=Path, required=True, help="Output JSON file.") parser.add_argument("--pages", type=int, default=DEFAULT_PAGES, help="Number of ADC listing pages.") parser.add_argument("--start-page", type=int, default=1, help="First page number.") parser.add_argument("--delay", type=float, default=0.25, help="Delay between requests in seconds.") parser.add_argument("--timeout", type=int, default=30, help="HTTP timeout in seconds.") parser.add_argument("--retries", type=int, default=3, help="Retries per page.") parser.add_argument( "--browser", action="store_true", help="Use Playwright Chromium instead of requests. Useful when ADC returns HTTP 403.", ) args = parser.parse_args() if args.browser: all_links, failed_pages = scrape_with_browser(args.start_page, args.pages, args.delay) else: all_links, failed_pages = scrape_with_requests( args.start_page, args.pages, args.delay, args.timeout, args.retries, ) args.out.parent.mkdir(parents=True, exist_ok=True) args.out.write_text(json.dumps(all_links, ensure_ascii=False, indent=2), encoding="utf-8") print(f"Saved {len(all_links)} unique product links to {args.out}") if failed_pages: print(f"Failed pages: {failed_pages}") if __name__ == "__main__": main()