183 lines
5.9 KiB
Python
183 lines
5.9 KiB
Python
"""Scrape product detail links from ADC product listing pages.
|
|
|
|
Example:
|
|
python scripts/adc_scraper/scrape_adc_product_links.py --out data_adc_databaza/adc_scrape_2026_05_04/adc_product_links.json
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import time
|
|
from pathlib import Path
|
|
from urllib.parse import urljoin
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
from tqdm import tqdm
|
|
|
|
|
|
# Site root; relative product hrefs are resolved against it.
BASE_URL: str = "https://www.adc.sk"

# Listing URL template; ``{page}`` is filled in with the listing page number.
LISTING_URL: str = "https://www.adc.sk/databazy/produkty?page={page}&ord=a1"

# Default value of the ``--pages`` command-line option.
DEFAULT_PAGES: int = 711

# Desktop Chrome User-Agent string, shared by the HTTP and browser scrapers.
_USER_AGENT: str = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/124.0.0.0 Safari/537.36"
)

# Request headers sent with every listing-page GET.
HEADERS: dict[str, str] = {
    "User-Agent": _USER_AGENT,
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "sk-SK,sk;q=0.9,cs;q=0.8,en;q=0.7",
    "Cache-Control": "no-cache",
    "Pragma": "no-cache",
    "Referer": "https://www.adc.sk/databazy/produkty",
}
|
|
|
|
|
|
def fetch_page(session: requests.Session, page: int, timeout: int, retries: int) -> str:
    """Download one ADC listing page and return its HTML as text.

    Args:
        session: Session used to issue the GET request.
        page: Listing page number substituted into ``LISTING_URL``.
        timeout: Timeout in seconds passed to ``session.get``.
        retries: Maximum number of attempts. Values below 1 are treated as 1
            so that at least one request is always made.

    Returns:
        The response body decoded with the detected encoding (falling back
        to UTF-8 when none is detected).

    Raises:
        RuntimeError: If every attempt failed with a ``requests`` error; the
            last error is attached as ``__cause__``.
    """
    url = LISTING_URL.format(page=page)
    attempts = max(1, retries)
    last_error: Exception | None = None

    for attempt in range(1, attempts + 1):
        try:
            response = session.get(url, headers=HEADERS, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as exc:
            # Only transport/HTTP failures are retried; anything else is a
            # programming error and should surface unchanged.
            last_error = exc
            if attempt < attempts:
                # Linear back-off: 1.5 s, 3 s, 4.5 s, ...
                time.sleep(1.5 * attempt)
            continue

        # Prefer the encoding detected from the body over the header value.
        response.encoding = response.apparent_encoding or "utf-8"
        return response.text

    raise RuntimeError(f"Failed to fetch page {page}: {last_error}") from last_error
|
|
|
|
|
|
def extract_product_links(html: str) -> list[str]:
    """Return absolute product-detail URLs found in a listing page.

    Selects every ``<a class="product">`` whose ``href`` starts with
    ``/databazy/produkty/detail/`` and resolves it against ``BASE_URL``.
    Links are returned in document order; duplicates are kept.

    Args:
        html: Markup of one listing page.

    Returns:
        The absolute URLs, possibly empty.
    """
    document = BeautifulSoup(html, "lxml")
    anchors = document.select('a.product[href^="/databazy/produkty/detail/"]')
    hrefs = (anchor.get("href") for anchor in anchors)
    # Skip anchors whose href is missing or empty.
    return [urljoin(BASE_URL, href) for href in hrefs if href]
|
|
|
|
|
|
def scrape_with_requests(
    start_page: int,
    pages: int,
    delay: float,
    timeout: int,
    retries: int,
) -> tuple[list[str], list[int]]:
    """Collect unique product links from listing pages using plain HTTP.

    Args:
        start_page: First listing page number to fetch.
        pages: Number of consecutive pages to fetch, starting at
            ``start_page``.
        delay: Seconds to sleep after each successfully processed page.
        timeout: Per-request timeout in seconds, forwarded to ``fetch_page``.
        retries: Attempts per page, forwarded to ``fetch_page``.

    Returns:
        A pair ``(links, failed_pages)``: the de-duplicated links in
        first-seen order, and the numbers of pages that could not be fetched
        or parsed.
    """
    seen: set[str] = set()
    all_links: list[str] = []
    failed_pages: list[int] = []
    end_page = start_page + pages - 1

    # The context manager releases the session's pooled connections even
    # when the loop is interrupted (e.g. by Ctrl+C).
    with requests.Session() as session:
        for page in tqdm(range(start_page, end_page + 1), desc="ADC pages"):
            try:
                html = fetch_page(session, page, timeout, retries)
                page_links = extract_product_links(html)
            except Exception as exc:
                # Best-effort scrape: record the page and carry on so one
                # bad page does not abort the whole run.
                tqdm.write(str(exc))
                failed_pages.append(page)
                continue

            for link in page_links:
                if link not in seen:
                    seen.add(link)
                    all_links.append(link)

            time.sleep(delay)

    return all_links, failed_pages
|
|
|
|
|
|
def scrape_with_browser(
    start_page: int,
    pages: int,
    delay: float,
    timeout_ms: int = 60000,
) -> tuple[list[str], list[int]]:
    """Collect unique product links by loading listing pages in Chromium.

    Playwright is imported lazily so it is only required when this function
    is actually called.

    Args:
        start_page: First listing page number to load.
        pages: Number of consecutive pages to load, starting at
            ``start_page``.
        delay: Seconds to sleep after each successfully processed page.
        timeout_ms: Navigation timeout in milliseconds passed to
            ``page.goto``.

    Returns:
        A pair ``(links, failed_pages)``: the de-duplicated links in
        first-seen order, and the numbers of pages that failed to load or
        parse.

    Raises:
        SystemExit: If Playwright is not installed.
    """
    try:
        from playwright.sync_api import sync_playwright
    except ImportError as exc:
        raise SystemExit(
            "Playwright is not installed. Run: pip install playwright; python -m playwright install chromium"
        ) from exc

    seen: set[str] = set()
    all_links: list[str] = []
    failed_pages: list[int] = []
    end_page = start_page + pages - 1

    with sync_playwright() as playwright:
        browser = playwright.chromium.launch(headless=True)
        try:
            page_obj = browser.new_page(
                user_agent=HEADERS["User-Agent"],
                locale="sk-SK",
                viewport={"width": 1366, "height": 900},
            )

            for page in tqdm(range(start_page, end_page + 1), desc="ADC pages"):
                url = LISTING_URL.format(page=page)
                try:
                    response = page_obj.goto(
                        url, wait_until="domcontentloaded", timeout=timeout_ms
                    )
                    # A missing response or an HTTP error status counts as a
                    # failed page.
                    if response is None or response.status >= 400:
                        status = response.status if response else "no-response"
                        raise RuntimeError(f"HTTP {status}")
                    html = page_obj.content()
                    page_links = extract_product_links(html)
                except Exception as exc:
                    # Best-effort scrape: record the page and carry on.
                    tqdm.write(f"Failed page {page}: {exc}")
                    failed_pages.append(page)
                    continue

                for link in page_links:
                    if link not in seen:
                        seen.add(link)
                        all_links.append(link)

                time.sleep(delay)
        finally:
            # Close the browser even if setup fails or the run is interrupted.
            browser.close()

    return all_links, failed_pages
|
|
|
|
|
|
def main() -> None:
    """Command-line entry point.

    Parses the options, scrapes the listing pages with either plain HTTP or
    Playwright (``--browser``), writes the unique links to ``--out`` as a
    JSON array, and prints a summary including any failed pages.
    """
    cli = argparse.ArgumentParser(description="Scrape ADC product detail links.")
    cli.add_argument("--out", type=Path, required=True, help="Output JSON file.")
    cli.add_argument("--pages", type=int, default=DEFAULT_PAGES, help="Number of ADC listing pages.")
    cli.add_argument("--start-page", type=int, default=1, help="First page number.")
    cli.add_argument("--delay", type=float, default=0.25, help="Delay between requests in seconds.")
    cli.add_argument("--timeout", type=int, default=30, help="HTTP timeout in seconds.")
    cli.add_argument("--retries", type=int, default=3, help="Retries per page.")
    cli.add_argument(
        "--browser",
        action="store_true",
        help="Use Playwright Chromium instead of requests. Useful when ADC returns HTTP 403.",
    )
    options = cli.parse_args()

    if not options.browser:
        links, failures = scrape_with_requests(
            options.start_page,
            options.pages,
            options.delay,
            options.timeout,
            options.retries,
        )
    else:
        links, failures = scrape_with_browser(options.start_page, options.pages, options.delay)

    # Create the destination directory on demand before writing the result.
    destination = options.out
    destination.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(links, ensure_ascii=False, indent=2)
    destination.write_text(payload, encoding="utf-8")

    print(f"Saved {len(links)} unique product links to {destination}")
    if failures:
        print(f"Failed pages: {failures}")


if __name__ == "__main__":
    main()
|