# DiplomovaPraca/scripts/adc_scraper/scrape_adc_product_links.py

"""Scrape product detail links from ADC product listing pages.

Example:
    python scripts/adc_scraper/scrape_adc_product_links.py --out data_adc_databaza/adc_scrape_2026_05_04/adc_product_links.json
"""

from __future__ import annotations

import argparse
import json
import time
from pathlib import Path
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm


# Site root; relative product hrefs are resolved against it.
BASE_URL = "https://www.adc.sk"

# Listing URL template; the ``{page}`` placeholder is filled via ``str.format``.
LISTING_URL = f"{BASE_URL}/databazy/produkty?page={{page}}&ord=a1"

# Default value of the ``--pages`` command-line option.
DEFAULT_PAGES = 711

# Request headers sent with every listing request; the User-Agent is also
# reused when a Playwright browser page is created.
HEADERS = {
    "User-Agent": " ".join(
        (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
            "AppleWebKit/537.36 (KHTML, like Gecko)",
            "Chrome/124.0.0.0 Safari/537.36",
        )
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "sk-SK,sk;q=0.9,cs;q=0.8,en;q=0.7",
    "Cache-Control": "no-cache",
    "Pragma": "no-cache",
    "Referer": f"{BASE_URL}/databazy/produkty",
}


def fetch_page(session: requests.Session, page: int, timeout: int, retries: int) -> str:
    """Download one ADC listing page and return its HTML text.

    The request is attempted up to ``retries`` times (but always at least
    once), sleeping ``1.5 * attempt`` seconds after each failed attempt
    except the last.

    Args:
        session: Session used to issue the GET request.
        page: Listing page number substituted into ``LISTING_URL``.
        timeout: Timeout in seconds passed to ``session.get``.
        retries: Maximum number of attempts; values below 1 are treated as 1.

    Returns:
        The decoded response body.

    Raises:
        RuntimeError: If every attempt fails. The last request error is
            attached as ``__cause__``.
    """
    url = LISTING_URL.format(page=page)
    # Guarantee at least one real request even for ``retries <= 0``; otherwise
    # the loop would be skipped and the error would report "None".
    attempts = max(1, retries)
    last_error: Exception | None = None

    for attempt in range(1, attempts + 1):
        try:
            response = session.get(url, headers=HEADERS, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as exc:
            last_error = exc
            if attempt < attempts:
                # Linearly growing back-off between attempts.
                time.sleep(1.5 * attempt)
        else:
            # Decode using the encoding detected from the body, falling back
            # to UTF-8 when detection yields nothing.
            response.encoding = response.apparent_encoding or "utf-8"
            return response.text

    raise RuntimeError(f"Failed to fetch page {page}: {last_error}") from last_error


def extract_product_links(html: str) -> list[str]:
    """Return absolute product-detail URLs found in a listing page's HTML.

    Anchors with class ``product`` whose ``href`` starts with
    ``/databazy/produkty/detail/`` are collected in the order ``select``
    yields them and resolved against ``BASE_URL``. Duplicates are kept.
    """
    document = BeautifulSoup(html, "lxml")
    anchors = document.select('a.product[href^="/databazy/produkty/detail/"]')
    # Drop anchors whose href is missing or empty before resolving.
    hrefs = (anchor.get("href") for anchor in anchors)
    return [urljoin(BASE_URL, href) for href in hrefs if href]


def scrape_with_requests(
    start_page: int,
    pages: int,
    delay: float,
    timeout: int,
    retries: int,
) -> tuple[list[str], list[int]]:
    """Collect unique product links from consecutive listing pages via requests.

    Pages ``start_page`` through ``start_page + pages - 1`` are fetched in
    order. A page whose download or parsing raises is reported through
    ``tqdm.write``, recorded as failed, and skipped without the inter-page
    delay.

    Args:
        start_page: First listing page number to fetch.
        pages: Number of consecutive pages to fetch.
        delay: Seconds to sleep after each successfully processed page.
        timeout: Timeout in seconds forwarded to ``fetch_page``.
        retries: Attempts per page forwarded to ``fetch_page``.

    Returns:
        A tuple ``(links, failed_pages)``: the unique links in first-seen
        order and the page numbers that could not be processed.
    """
    seen: set[str] = set()
    all_links: list[str] = []
    failed_pages: list[int] = []
    end_page = start_page + pages - 1

    # The context manager closes the session (and its pooled connections)
    # even when the run is interrupted part-way through.
    with requests.Session() as session:
        for page in tqdm(range(start_page, end_page + 1), desc="ADC pages"):
            try:
                html = fetch_page(session, page, timeout, retries)
                page_links = extract_product_links(html)
            except Exception as exc:
                # Best effort: one bad page must not abort the whole run.
                tqdm.write(str(exc))
                failed_pages.append(page)
                continue

            for link in page_links:
                if link not in seen:
                    seen.add(link)
                    all_links.append(link)

            time.sleep(delay)

    return all_links, failed_pages


def scrape_with_browser(start_page: int, pages: int, delay: float) -> tuple[list[str], list[int]]:
    """Collect unique product links from listing pages using Playwright Chromium.

    Pages ``start_page`` through ``start_page + pages - 1`` are opened in a
    single headless browser page. A page that fails to load, returns no
    response, or returns an HTTP status of 400 or above is reported through
    ``tqdm.write``, recorded as failed, and skipped without the inter-page
    delay.

    Args:
        start_page: First listing page number to open.
        pages: Number of consecutive pages to open.
        delay: Seconds to sleep after each successfully processed page.

    Returns:
        A tuple ``(links, failed_pages)``: the unique links in first-seen
        order and the page numbers that could not be processed.

    Raises:
        SystemExit: If the ``playwright`` package cannot be imported.
    """
    # Imported lazily so Playwright is only required when --browser is used.
    try:
        from playwright.sync_api import sync_playwright
    except ImportError as exc:
        raise SystemExit(
            "Playwright is not installed. Run: pip install playwright; python -m playwright install chromium"
        ) from exc

    seen: set[str] = set()
    all_links: list[str] = []
    failed_pages: list[int] = []
    end_page = start_page + pages - 1

    with sync_playwright() as playwright:
        browser = playwright.chromium.launch(headless=True)
        # try/finally guarantees the browser is closed even when something
        # escapes the loop (e.g. Ctrl+C during a long run or new_page failing).
        try:
            page_obj = browser.new_page(
                user_agent=HEADERS["User-Agent"],
                locale="sk-SK",
                viewport={"width": 1366, "height": 900},
            )

            for page in tqdm(range(start_page, end_page + 1), desc="ADC pages"):
                url = LISTING_URL.format(page=page)
                try:
                    response = page_obj.goto(url, wait_until="domcontentloaded", timeout=60000)
                    if response is None or response.status >= 400:
                        status = response.status if response else "no-response"
                        raise RuntimeError(f"HTTP {status}")
                    html = page_obj.content()
                    page_links = extract_product_links(html)
                except Exception as exc:
                    # Best effort: one bad page must not abort the whole run.
                    tqdm.write(f"Failed page {page}: {exc}")
                    failed_pages.append(page)
                    continue

                for link in page_links:
                    if link not in seen:
                        seen.add(link)
                        all_links.append(link)

                time.sleep(delay)
        finally:
            browser.close()

    return all_links, failed_pages


def _build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the link scraper."""
    parser = argparse.ArgumentParser(description="Scrape ADC product detail links.")
    options = (
        ("--out", {"type": Path, "required": True, "help": "Output JSON file."}),
        ("--pages", {"type": int, "default": DEFAULT_PAGES, "help": "Number of ADC listing pages."}),
        ("--start-page", {"type": int, "default": 1, "help": "First page number."}),
        ("--delay", {"type": float, "default": 0.25, "help": "Delay between requests in seconds."}),
        ("--timeout", {"type": int, "default": 30, "help": "HTTP timeout in seconds."}),
        ("--retries", {"type": int, "default": 3, "help": "Retries per page."}),
        (
            "--browser",
            {
                "action": "store_true",
                "help": "Use Playwright Chromium instead of requests. Useful when ADC returns HTTP 403.",
            },
        ),
    )
    for flag, settings in options:
        parser.add_argument(flag, **settings)
    return parser


def main() -> None:
    """Parse CLI arguments, scrape the listing pages, and save the links as JSON.

    The scraping backend is chosen by ``--browser``. The collected links are
    written to ``--out`` (parent directories are created as needed), and a
    summary plus any failed page numbers are printed.
    """
    args = _build_parser().parse_args()

    if not args.browser:
        result = scrape_with_requests(
            args.start_page,
            args.pages,
            args.delay,
            args.timeout,
            args.retries,
        )
    else:
        result = scrape_with_browser(args.start_page, args.pages, args.delay)
    all_links, failed_pages = result

    destination = args.out
    destination.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(all_links, ensure_ascii=False, indent=2)
    destination.write_text(payload, encoding="utf-8")

    print(f"Saved {len(all_links)} unique product links to {args.out}")
    if failed_pages:
        print(f"Failed pages: {failed_pages}")


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()