DiplomovaPraca/scripts/adc_scraper/scrape_adc_product_links.py
2026-05-14 12:26:11 +02:00

183 lines
5.9 KiB
Python

"""Scrape product detail links from ADC product listing pages.
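
Pass --browser to load the listing pages with Playwright Chromium instead of
requests; this is useful when ADC returns HTTP 403.

Example:
    python scripts/adc_scraper/scrape_adc_product_links.py --out data_adc_databaza/adc_scrape_2026_05_04/adc_product_links.json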
"""
from __future__ import annotations
import argparse
import json
import time
from pathlib import Path
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
# Site root; relative product hrefs are resolved against it with urljoin.
BASE_URL = "https://www.adc.sk"

# Listing URL template; the ``{page}`` placeholder is filled via str.format.
LISTING_URL = BASE_URL + "/databazy/produkty?page={page}&ord=a1"

# Default for the --pages command-line option.
DEFAULT_PAGES = 711

# Browser-like headers sent with every listing request; the User-Agent is
# also reused for the Playwright page.
HEADERS = {
    "User-Agent": " ".join(
        (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
            "AppleWebKit/537.36 (KHTML, like Gecko)",
            "Chrome/124.0.0.0 Safari/537.36",
        )
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "sk-SK,sk;q=0.9,cs;q=0.8,en;q=0.7",
    "Cache-Control": "no-cache",
    "Pragma": "no-cache",
    "Referer": BASE_URL + "/databazy/produkty",
}
def fetch_page(session: requests.Session, page: int, timeout: int, retries: int) -> str:
    """Download one listing page and return its HTML as text.

    The request is attempted up to ``retries`` times, and always at least
    once. After a failed attempt that is not the last one, the function
    sleeps ``1.5 * attempt`` seconds before trying again.

    Args:
        session: Session used to issue the GET request.
        page: Listing page number substituted into ``LISTING_URL``.
        timeout: Request timeout in seconds, passed to ``session.get``.
        retries: Maximum number of attempts; values below 1 are treated as 1.

    Returns:
        The response body decoded to text.

    Raises:
        RuntimeError: If every attempt fails. The last request error is
            attached as the exception's ``__cause__``.
    """
    url = LISTING_URL.format(page=page)
    # Guarantee one real attempt so a non-positive ``retries`` cannot skip the
    # request entirely and report a meaningless "None" error.
    attempts = max(1, retries)
    last_error: Exception | None = None
    for attempt in range(1, attempts + 1):
        try:
            response = session.get(url, headers=HEADERS, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as exc:
            last_error = exc
            if attempt < attempts:
                # Linear back-off: wait longer after each failed attempt.
                time.sleep(1.5 * attempt)
            continue
        # Prefer the encoding detected from the body over the header value.
        response.encoding = response.apparent_encoding or "utf-8"
        return response.text
    raise RuntimeError(f"Failed to fetch page {page}: {last_error}") from last_error
def extract_product_links(html: str) -> list[str]:
    """Return absolute product-detail URLs found in a listing page.

    Anchors are matched with the CSS selector
    ``a.product[href^="/databazy/produkty/detail/"]`` and each ``href`` is
    resolved against ``BASE_URL``. Links are returned in document order and
    are not de-duplicated here.

    Args:
        html: Markup of one listing page.

    Returns:
        One absolute URL per matching anchor that has a non-empty ``href``.
    """
    anchors = BeautifulSoup(html, "lxml").select(
        'a.product[href^="/databazy/produkty/detail/"]'
    )
    hrefs = (anchor.get("href") for anchor in anchors)
    return [urljoin(BASE_URL, href) for href in hrefs if href]
def scrape_with_requests(
    start_page: int,
    pages: int,
    delay: float,
    timeout: int,
    retries: int,
) -> tuple[list[str], list[int]]:
    """Collect unique product links from listing pages using ``requests``.

    Pages ``start_page`` through ``start_page + pages - 1`` are fetched in
    order over one shared session. A page that cannot be fetched or parsed is
    reported through ``tqdm.write`` and recorded, and scraping continues with
    the next page.

    Args:
        start_page: First listing page number to fetch.
        pages: Number of consecutive pages to fetch.
        delay: Seconds to sleep after each successfully processed page.
        timeout: Request timeout in seconds, forwarded to ``fetch_page``.
        retries: Attempts per page, forwarded to ``fetch_page``.

    Returns:
        A tuple ``(links, failed_pages)``: the unique links in first-seen
        order, and the page numbers that failed.
    """
    seen: set[str] = set()
    all_links: list[str] = []
    failed_pages: list[int] = []
    end_page = start_page + pages - 1
    # Use the session as a context manager so it is closed when scraping
    # ends, including when an unexpected error aborts the run.
    with requests.Session() as session:
        for page in tqdm(range(start_page, end_page + 1), desc="ADC pages"):
            try:
                html = fetch_page(session, page, timeout, retries)
                page_links = extract_product_links(html)
            except Exception as exc:
                # Best-effort per page: note the failure and move on instead
                # of discarding the links gathered so far.
                tqdm.write(str(exc))
                failed_pages.append(page)
                continue
            for link in page_links:
                if link not in seen:
                    seen.add(link)
                    all_links.append(link)
            time.sleep(delay)
    return all_links, failed_pages
def scrape_with_browser(start_page: int, pages: int, delay: float) -> tuple[list[str], list[int]]:
    """Collect unique product links by loading pages in headless Chromium.

    Playwright is imported lazily so the dependency is only needed when this
    mode is used. Pages ``start_page`` through ``start_page + pages - 1`` are
    opened one after another in a single browser page. A page that fails to
    load, returns no response or an HTTP status of 400 or above, or cannot be
    parsed is reported through ``tqdm.write`` and recorded, and scraping
    continues with the next page.

    Args:
        start_page: First listing page number to load.
        pages: Number of consecutive pages to load.
        delay: Seconds to sleep after each successfully processed page.

    Returns:
        A tuple ``(links, failed_pages)``: the unique links in first-seen
        order, and the page numbers that failed.

    Raises:
        SystemExit: If Playwright is not installed.
    """
    try:
        from playwright.sync_api import sync_playwright
    except ImportError as exc:
        raise SystemExit(
            "Playwright is not installed. Run: pip install playwright; python -m playwright install chromium"
        ) from exc

    seen: set[str] = set()
    all_links: list[str] = []
    failed_pages: list[int] = []
    end_page = start_page + pages - 1
    with sync_playwright() as playwright:
        browser = playwright.chromium.launch(headless=True)
        # try/finally ensures the browser is closed even if page creation
        # fails or the run is interrupted (e.g. Ctrl-C) part-way through.
        try:
            page_obj = browser.new_page(
                user_agent=HEADERS["User-Agent"],
                locale="sk-SK",
                viewport={"width": 1366, "height": 900},
            )
            for page in tqdm(range(start_page, end_page + 1), desc="ADC pages"):
                url = LISTING_URL.format(page=page)
                try:
                    # Playwright timeouts are given in milliseconds.
                    response = page_obj.goto(url, wait_until="domcontentloaded", timeout=60000)
                    if response is None or response.status >= 400:
                        status = response.status if response else "no-response"
                        raise RuntimeError(f"HTTP {status}")
                    html = page_obj.content()
                    page_links = extract_product_links(html)
                except Exception as exc:
                    # Best-effort per page: note the failure and move on.
                    tqdm.write(f"Failed page {page}: {exc}")
                    failed_pages.append(page)
                    continue
                for link in page_links:
                    if link not in seen:
                        seen.add(link)
                        all_links.append(link)
                time.sleep(delay)
        finally:
            browser.close()
    return all_links, failed_pages
def main() -> None:
    """Run the command-line interface.

    Parses the options, scrapes the listing pages with either ``requests`` or
    Playwright (``--browser``), writes the unique links as a JSON array to
    ``--out`` (creating parent directories as needed), and prints a summary
    followed by any failed page numbers.
    """
    parser = argparse.ArgumentParser(description="Scrape ADC product detail links.")
    # Option table, registered in the order it appears in --help.
    option_specs = (
        ("--out", {"type": Path, "required": True, "help": "Output JSON file."}),
        ("--pages", {"type": int, "default": DEFAULT_PAGES, "help": "Number of ADC listing pages."}),
        ("--start-page", {"type": int, "default": 1, "help": "First page number."}),
        ("--delay", {"type": float, "default": 0.25, "help": "Delay between requests in seconds."}),
        ("--timeout", {"type": int, "default": 30, "help": "HTTP timeout in seconds."}),
        ("--retries", {"type": int, "default": 3, "help": "Retries per page."}),
        (
            "--browser",
            {
                "action": "store_true",
                "help": "Use Playwright Chromium instead of requests. Useful when ADC returns HTTP 403.",
            },
        ),
    )
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)
    args = parser.parse_args()

    if not args.browser:
        all_links, failed_pages = scrape_with_requests(
            start_page=args.start_page,
            pages=args.pages,
            delay=args.delay,
            timeout=args.timeout,
            retries=args.retries,
        )
    else:
        all_links, failed_pages = scrape_with_browser(
            start_page=args.start_page,
            pages=args.pages,
            delay=args.delay,
        )

    out_path = args.out
    out_path.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(all_links, ensure_ascii=False, indent=2)
    out_path.write_text(payload, encoding="utf-8")

    print(f"Saved {len(all_links)} unique product links to {out_path}")
    if failed_pages:
        print(f"Failed pages: {failed_pages}")
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()