commit 5f25004d059a5cf62867c3602ed41cdb6bcc1e0e Author: Oleh Poiasnik Date: Thu May 14 12:26:11 2026 +0200 Initial ADC scraper project setup diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..90ec8ea --- /dev/null +++ b/.gitignore @@ -0,0 +1,32 @@ +# Large local datasets +data_adc_databaza/ + +# External LightRAG checkout and generated RAG storage +lightrag/ + +# Scraped/debug HTML snapshots +pil.html +detail-product.html + +# Generated graph artifacts +*.graphml + +# Logs +*.log + +# Python cache and local environments +__pycache__/ +*.py[cod] +.venv/ +venv/ +env/ + +# Tool/local workspace metadata +.claude/ +.tmp/ + +# OS/editor files +.DS_Store +Thumbs.db +.idea/ +.vscode/ diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md new file mode 100644 index 0000000..c036ef4 --- /dev/null +++ b/ARCHITECTURE.md @@ -0,0 +1,127 @@ +# LightRAG ADC System Architecture + +This document describes the full architecture of the LightRAG-based Adverse Drug Condition (ADC) system for processing and querying Slovak pharmaceutical leaflets. + +The system consists of three main components running locally: +- **Embedding Server** (port 8010) — wraps a sentence-transformers model for vector generation +- **LightRAG Server** (port 9621) — core RAG engine managing the knowledge graph and vector DB +- **OpenWebUI LLM** (remote) — hosts the Qwen3.5-122B model used for entity extraction and answer generation + +Both local servers are launched via `start_servers.py`. Source data is 6929 Slovak pharmaceutical leaflets stored in `cleaned_general_info_additional.json`. 
+ +--- + +## Flow 1: Ingestion (Loading Leaflets) + +```mermaid +flowchart TD + A([👤 User runs load_leaflets.py]) --> B + + B[("📄 cleaned_general_info_additional.json\n6929 Slovak leaflets")] + B --> C{Filter:\nclinical leaflets only\ninteractions +\ncontraindications} + + C -->|Filtered leaflets| D["🔁 For each leaflet\n(loop)"] + + D --> E["POST http://localhost:9621\n/documents/text\n\nBody: { text, metadata }"] + + subgraph LightRAG_Server ["⚙️ LightRAG Server — port 9621"] + E --> F["Text chunker\n600 tokens per chunk"] + F --> G["🔁 For each chunk\n(loop)"] + + G --> H["POST https://ui.tukekemt.xyz\n/api/v1/chat/completions\n\nModel: model2 (Qwen3.5-122B)\nTask: extract entities & relations"] + + H --> I["Extracted:\n• Entities (drugs, conditions, etc.)\n• Relations between entities"] + + I --> J["🔁 For each entity / chunk\n(loop)"] + + J --> K["POST http://localhost:8010\n/embeddings\n\nBody: { input: text }"] + end + + subgraph Embedding_Server ["🧠 Embedding Server — port 8010"] + K --> L["paraphrase-multilingual\n-MiniLM-L12-v2\n(sentence-transformers)"] + L --> M["Float vector\n(384 dimensions)"] + end + + subgraph OpenWebUI ["☁️ OpenWebUI — ui.tukekemt.xyz"] + H + end + + M --> N + + subgraph RAG_Storage ["💾 rag_storage/"] + N["graph_chunk_entity_relation.graphml\n— knowledge graph (NetworkX)"] + O["vdb_entities.json\n— entity vectors (NanoVectorDB)"] + P["vdb_relationships.json\n— relation vectors (NanoVectorDB)"] + Q["kv_store_*.json\n— chunk text cache & metadata"] + end + + I --> N + I --> P + M --> O + F --> Q +``` + +--- + +## Flow 2: Query (Answering Questions) + +```mermaid +flowchart TD + A([👤 User sends query]) --> B + + B["POST http://localhost:9621/query\n\nBody:\n{ query: string,\n mode: hybrid | local | global | naive }"] + + subgraph LightRAG_Server ["⚙️ LightRAG Server — port 9621"] + B --> C["Parse query\n& select retrieval mode"] + + C --> D["POST http://localhost:8010\n/embeddings\n\nEmbed the query text"] + + subgraph Retrieval ["🔍 
Retrieval (parallel)"] + E["Vector search\nNanoVectorDB\n(vdb_entities.json,\nvdb_relationships.json)"] + F["Graph traversal\nNetworkX\n(graph_chunk_entity_relation.graphml)"] + end + + D --> Retrieval + Retrieval --> G["Merge & rank\nrelevant entities,\nrelations & text chunks"] + + G --> H["Build context prompt\nfrom top-K results\n+ retrieved chunk texts\n(kv_store_*.json)"] + + H --> I["POST https://ui.tukekemt.xyz\n/api/v1/chat/completions\n\nModel: model2 (Qwen3.5-122B)\nTask: generate answer\nfrom context"] + end + + subgraph Embedding_Server ["🧠 Embedding Server — port 8010"] + D2["paraphrase-multilingual\n-MiniLM-L12-v2"] + D --> D2 + D2 --> E + end + + subgraph OpenWebUI ["☁️ OpenWebUI — ui.tukekemt.xyz"] + I + end + + subgraph RAG_Storage ["💾 rag_storage/"] + VDB["vdb_entities.json\nvdb_relationships.json"] + GRAPH["graph_chunk_entity_relation.graphml"] + KV["kv_store_*.json"] + end + + E --- VDB + F --- GRAPH + H --- KV + + I --> J["Generated answer\n+ source references"] + J --> K([👤 User receives response]) +``` + +--- + +## Component Summary + +| Component | Type | Address | Key Endpoints | +|---|---|---|---| +| `embedding_server.py` | FastAPI (local) | `http://localhost:8010` | `GET /health`, `POST /embeddings`, `POST /v1/embeddings` | +| LightRAG Server | FastAPI (local) | `http://localhost:9621` | `GET /health`, `POST /documents/text`, `POST /documents/scan`, `GET /documents/pipeline_status`, `POST /query` | +| OpenWebUI (model2) | Remote LLM API | `https://ui.tukekemt.xyz` | `POST /api/v1/chat/completions` | +| `rag_storage/` | File system | Local disk | `.graphml`, `.json` files | +| `cleaned_general_info_additional.json` | Source data | Local disk | 6929 Slovak pharmaceutical leaflets | +| `start_servers.py` | Launcher script | — | Starts embedding server + LightRAG server | diff --git a/RUN_INSTRUCTION.md b/RUN_INSTRUCTION.md new file mode 100644 index 0000000..57dce8f --- /dev/null +++ b/RUN_INSTRUCTION.md @@ -0,0 +1,214 @@ +# Run 
Instructions - LightRAG ADC Knowledge Graph + +This project prepares ADC pharmaceutical leaflet data for a knowledge graph and +LightRAG-based question answering about drug interactions, contraindications, +warnings, indications, dosage, and side effects. + +## Current Data + +The current ADC scrape is stored in: + +```powershell +data_adc_databaza/adc_scrape_2026_05_04/ +``` + +Main files: + +- `adc_product_links.json` - 35k+ ADC product detail URLs. +- `adc_products_structured.json` - main structured dataset for the next pipeline stage. +- `adc_products_structured.failed.json` - products that failed during scraping. +- `adc_products_structured_10.json` - small parser test sample. + +Use `adc_products_structured.json` as the main source for new graph and +LightRAG ingestion work. + +## Requirements + +- Python 3.10+ +- ADC scraper dependencies: + +```powershell +pip install -r scripts/adc_scraper/requirements.txt +python -m playwright install chromium +``` + +- Local embedding dependencies: + +```powershell +pip install sentence-transformers fastapi uvicorn +``` + +- LightRAG package from `lightrag/` +- OpenWebUI-compatible LLM API access configured in `lightrag/.env` + +## Scraping Pipeline + +Collect ADC product detail links: + +```powershell +python scripts/adc_scraper/scrape_adc_product_links.py ` + --out data_adc_databaza/adc_scrape_2026_05_04/adc_product_links.json ` + --browser +``` + +Scrape product detail pages and PIL pages into structured JSON: + +```powershell +python scripts/adc_scraper/scrape_adc_product_data.py --browser +``` + +The default output is: + +```powershell +data_adc_databaza/adc_scrape_2026_05_04/adc_products_structured.json +``` + +For a small test run: + +```powershell +python scripts/adc_scraper/scrape_adc_product_data.py ` + --browser ` + --limit 10 ` + --out data_adc_databaza/adc_scrape_2026_05_04/adc_products_structured_10.json +``` + +## Start Servers + +Start the local embedding server and LightRAG server: + +```powershell +cd 
"c:\Users\Oleh\Desktop\Diplomova praca" +python start_servers.py +``` + +Keep this terminal open. Stop with `Ctrl+C`. + +Health checks: + +```text +http://localhost:8010/health - embedding server +http://localhost:9621/health - LightRAG server +``` + +## Old Ingestion Pipeline + +The folder `checkpoint_02_ingest/` contains an older ingestion pipeline that +loads data from: + +```powershell +data_adc_databaza/cleaned_general_info_additional.json +``` + +It is kept as a reference because it already contains working LightRAG upload +logic and progress tracking: + +```powershell +python checkpoint_02_ingest/load_leaflets.py --count 50 +python checkpoint_02_ingest/load_leaflets.py --status +``` + +Do not treat this as the final ingestion path for the new dataset. The next +step is to create a new ingestion script that reads: + +```powershell +data_adc_databaza/adc_scrape_2026_05_04/adc_products_structured.json +``` + +and sends each record's `lightrag_text` to LightRAG. + +## Query LightRAG + +After documents are ingested and LightRAG has finished processing them: + +```powershell +python -c " +import urllib.request, json +payload = json.dumps({'query': 'Ake su kontraindikacie Abirateronu?', 'mode': 'hybrid'}).encode() +req = urllib.request.Request('http://localhost:9621/query', data=payload, headers={'Content-Type': 'application/json'}) +r = urllib.request.urlopen(req, timeout=120) +print(json.loads(r.read())['response']) +" +``` + +Available query modes: + +- `hybrid` - recommended combined retrieval mode. +- `local` - entity-centered retrieval. +- `global` - broader graph-level retrieval. +- `naive` - vector-only retrieval. + +Avoid querying while the document pipeline is still busy. Entity extraction can +take several minutes per batch depending on the LLM API and concurrency limits. 
+ +## Reset LightRAG Storage + +Stop the servers first, then clear generated graph/vector data: + +```powershell +Remove-Item -Path "c:\Users\Oleh\Desktop\Diplomova praca\lightrag\rag_storage\*" -Recurse -Force +python checkpoint_02_ingest/load_leaflets.py --reset +``` + +Use this only when you intentionally want to rebuild the graph. + +## Recommended Next Steps + +1. Update `validate_adc_json.py` for the new `adc_products_structured.json` schema. +2. Build an explicit knowledge graph from `graph_hints` and PIL subsections. +3. Create a new LightRAG ingestion script for the new dataset. +4. Retry failed scrape URLs from `adc_products_structured.failed.json`. +5. Prepare a small RAGAS evaluation set for contraindication and interaction questions. + +## Project Layout + +```text +Diplomova praca/ + start_servers.py + embedding_server.py + scripts/adc_scraper/ + scrape_adc_product_links.py + scrape_adc_product_data.py + validate_adc_json.py + data_adc_databaza/ + adc_scrape_2026_05_04/ + adc_product_links.json + adc_products_structured.json + adc_products_structured.failed.json + adc_products_structured_10.json + checkpoint_02_ingest/ + load_leaflets.py + batch_ingest.py + progress.json + lightrag/ + .env + rag_storage/ +``` + +## Troubleshooting + +If the embedding server does not start: + +```powershell +pip install sentence-transformers fastapi uvicorn +``` + +If LightRAG has encoding issues: + +```powershell +$env:PYTHONUTF8 = "1" +python -m lightrag.api.lightrag_server +``` + +If LLM extraction times out, reduce concurrency in `lightrag/.env`: + +```text +MAX_ASYNC=3 +MAX_PARALLEL_INSERT=1 +``` + +If the graph looks empty after ingestion, wait for background processing and +check: + +```powershell +python checkpoint_02_ingest/load_leaflets.py --status +``` diff --git a/embedding_server.py b/embedding_server.py new file mode 100644 index 0000000..70d07d1 --- /dev/null +++ b/embedding_server.py @@ -0,0 +1,69 @@ +""" +Локальный OpenAI-compatible embedding сервер на
async def _handle_embeddings(request: Request):
    """Embed the request's ``input`` and answer in the OpenAI embeddings shape.

    Shared implementation behind ``POST /embeddings`` and
    ``POST /v1/embeddings``.

    Args:
        request: Incoming request whose JSON body is an object. ``input`` may
            be a single string or a list of strings; when the key is missing
            an empty string is embedded.

    Returns:
        A ``JSONResponse`` holding one ``{"object", "index", "embedding"}``
        entry per input text, or a 400 response with an ``error`` object when
        the body is not valid JSON or ``input`` has an unsupported type.
    """

    def bad_request(message: str) -> JSONResponse:
        return JSONResponse(
            {"error": {"message": message, "type": "invalid_request_error"}},
            status_code=400,
        )

    try:
        body = await request.json()
    except ValueError:
        # json.JSONDecodeError is a ValueError; report it to the client
        # instead of letting it bubble up as an unexplained 500.
        return bad_request("Request body must be valid JSON.")

    if not isinstance(body, dict):
        return bad_request("Request body must be a JSON object.")

    inp = body.get("input", "")
    texts = [inp] if isinstance(inp, str) else inp
    if not isinstance(texts, list) or not all(isinstance(t, str) for t in texts):
        return bad_request("'input' must be a string or a list of strings.")

    # Skip the model call for an empty batch; there is nothing to encode.
    vecs = model.encode(texts, normalize_embeddings=True).tolist() if texts else []

    data = [
        {"object": "embedding", "index": i, "embedding": vec}
        for i, vec in enumerate(vecs)
    ]
    # Whitespace-separated word count, used as a stand-in for a token count.
    word_count = sum(len(t.split()) for t in texts)
    return JSONResponse({
        "object": "list",
        "data": data,
        "model": MODEL_NAME,
        "usage": {"prompt_tokens": word_count, "total_tokens": word_count},
    })
def normalize_text(text: str) -> str:
    """Collapse every whitespace run (non-breaking spaces included) and trim."""
    text = text.replace("\xa0", " ")
    return re.sub(r"\s+", " ", text).strip()


def infer_name(source_url: str, text: str) -> str:
    """Return a product name taken from the leaflet text or, failing that, the URL.

    The leaflet heading ("Písomná informácia pre používateľa <name>") is
    preferred. Otherwise the last path segment of ``source_url`` is turned
    into a title: query string and fragment are dropped, a trailing ``.html``
    and a trailing ``-<digits>`` id are removed, and dashes become spaces.

    Args:
        source_url: URL the record was scraped from; may be empty.
        text: Normalized leaflet text; may be empty.

    Returns:
        The inferred name, or an empty string when neither source yields one.
    """
    match = re.search(r"Písomná informácia pre používateľa\s+(.{3,160}?)(?:\s+Pozorne|\s+V tejto|\s+1\.)", text)
    if match:
        return normalize_text(match.group(1))

    # Cut off "?query" and "#fragment" first; otherwise they end up in the
    # slug and also hide the numeric id from the "-<digits>$" cleanup below.
    path = re.split(r"[?#]", source_url, maxsplit=1)[0]
    slug = path.rstrip("/").rsplit("/", 1)[-1]
    # Only a real extension is stripped, not ".html" in the middle of a slug.
    slug = slug.removesuffix(".html")
    slug = re.sub(r"-\d+$", "", slug)
    return slug.replace("-", " ").title()
def iter_raw_records(path: Path) -> list[dict[str, Any]]:
    """Load raw scrape records from a ``.jsonl`` or ``.json`` file.

    Accepted layouts:
      * ``.jsonl`` - one JSON document per non-blank line;
      * a JSON list - returned as is;
      * a JSON object with a ``records`` list - that list is returned;
      * any other JSON object - treated as a single record.

    Args:
        path: File to read (UTF-8).

    Returns:
        The records as a list (despite the name, not a lazy iterator).

    Raises:
        ValueError: If the top-level JSON value is neither a list nor an
            object, or if ``records`` is present but is not a list.
        json.JSONDecodeError: If the file (or a JSONL line) is not valid JSON.
    """
    if path.suffix.lower() == ".jsonl":
        with path.open(encoding="utf-8") as f:
            # json.loads tolerates surrounding whitespace, so only blank
            # lines need to be filtered out.
            return [json.loads(line) for line in f if line.strip()]

    data = json.loads(path.read_text(encoding="utf-8"))
    if isinstance(data, list):
        return data
    if not isinstance(data, dict):
        # A bare string/number used to reach the `"records" in data` test and
        # fail with a confusing TypeError (or a substring match on strings).
        raise ValueError(
            f"Expected a JSON list or object in {path}, got {type(data).__name__}"
        )
    if "records" in data:
        records = data["records"]
        if not isinstance(records, list):
            raise ValueError(f"'records' in {path} must be a JSON list")
        return records
    return [data]
action="store_true", + help="Keep records where both PIL and SPC text are empty.", + ) + args = parser.parse_args() + + raw_records = iter_raw_records(args.input) + + parsed = [] + for record in raw_records: + item = parse_record(record) + if not args.keep_empty and not item["pil_text"] and not item["spc_text"]: + continue + parsed.append(item) + if args.limit and len(parsed) >= args.limit: + break + + args.out.parent.mkdir(parents=True, exist_ok=True) + args.out.write_text(json.dumps(parsed, ensure_ascii=False, indent=2), encoding="utf-8") + print(f"Saved {len(parsed)} structured records to {args.out}") + + +if __name__ == "__main__": + main() diff --git a/scripts/adc_scraper/requirements.txt b/scripts/adc_scraper/requirements.txt new file mode 100644 index 0000000..36f0ee0 --- /dev/null +++ b/scripts/adc_scraper/requirements.txt @@ -0,0 +1,5 @@ +requests>=2.31.0 +beautifulsoup4>=4.12.0 +lxml>=5.0.0 +tqdm>=4.66.0 +playwright>=1.45.0 diff --git a/scripts/adc_scraper/scrape_adc_index.py b/scripts/adc_scraper/scrape_adc_index.py new file mode 100644 index 0000000..7c29a06 --- /dev/null +++ b/scripts/adc_scraper/scrape_adc_index.py @@ -0,0 +1,119 @@ +"""Collect ADC product/PIL/SPC links from index or search pages. + +The script is intentionally conservative: it only stores discovered ADC product +URLs and does not try to parse clinical content. The next pipeline step downloads +the actual leaflet pages. 
def is_adc_url(url: str) -> bool:
    """Return True when ``url`` points at adc.sk or one of its subdomains.

    The host is compared on a label boundary, so look-alike domains such as
    ``evil-adc.sk`` are rejected, and an explicit port or user-info part in
    the URL does not hide a genuine adc.sk host.
    """
    # `.hostname` is already lower-cased and excludes port and credentials;
    # it is None for relative URLs.
    host = (urlparse(url).hostname or "").rstrip(".")
    return host == "adc.sk" or host.endswith(".adc.sk")


def is_product_like_url(url: str) -> bool:
    """Return True for product-database URLs of a PIL, SPC or detail page."""
    path = urlparse(url).path.lower()
    if "/databazy/produkty/" not in path:
        return False
    return any(marker in path for marker in ("/pil/", "/spc/", "/detail/"))
Can be supplied multiple times.", + ) + parser.add_argument("--out", type=Path, required=True, help="Output JSON file.") + parser.add_argument("--max-pages", type=int, default=20) + parser.add_argument("--delay", type=float, default=0.5) + parser.add_argument("--timeout", type=int, default=30) + args = parser.parse_args() + + queue: deque[str] = deque(args.start_url) + visited: set[str] = set() + product_links: set[str] = set() + session = requests.Session() + + with tqdm(total=args.max_pages, desc="ADC pages") as progress: + while queue and len(visited) < args.max_pages: + url = queue.popleft() + if url in visited: + continue + visited.add(url) + + try: + html = fetch(session, url, args.timeout) + except Exception as exc: + tqdm.write(f"Skip {url}: {exc}") + progress.update(1) + continue + + found_products, found_crawl = extract_links(html, url) + product_links.update(found_products) + + for link in sorted(found_crawl): + if link not in visited and len(visited) + len(queue) < args.max_pages * 4: + queue.append(link) + + progress.update(1) + time.sleep(args.delay) + + args.out.parent.mkdir(parents=True, exist_ok=True) + payload = { + "source": "adc.sk", + "visited_pages": sorted(visited), + "links": sorted(product_links), + } + args.out.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8") + print(f"Saved {len(product_links)} links to {args.out}") + + +if __name__ == "__main__": + main() diff --git a/scripts/adc_scraper/scrape_adc_leaflets.py b/scripts/adc_scraper/scrape_adc_leaflets.py new file mode 100644 index 0000000..71d78d5 --- /dev/null +++ b/scripts/adc_scraper/scrape_adc_leaflets.py @@ -0,0 +1,124 @@ +"""Download ADC PIL/SPC pages into a raw JSONL file.""" + +from __future__ import annotations + +import argparse +import json +import time +from datetime import datetime, timezone +from pathlib import Path +from urllib.parse import urlparse + +import requests +from bs4 import BeautifulSoup +from tqdm import tqdm + + +HEADERS = { + 
def paired_leaflet_urls(url: str) -> dict[str, str]:
    """Return best-effort PIL/SPC URLs for an ADC product URL.

    A URL whose path contains ``/pil/`` (checked first) or ``/spc/`` is
    returned unchanged under its own key, together with a sibling URL in
    which that first path marker is swapped for the other kind. Any other
    URL is returned as ``{"detail_url": url}``.

    The marker is matched case-insensitively and replaced only inside the
    path, so the query string and fragment are carried over untouched.
    """
    import re  # local import: this module has no file-level `re` import

    parsed = urlparse(url)
    for kind, other in (("pil", "spc"), ("spc", "pil")):
        # re.ASCII keeps IGNORECASE from matching non-ASCII look-alikes of "i".
        marker = re.search(f"/{kind}/", parsed.path, flags=re.IGNORECASE | re.ASCII)
        if marker is None:
            continue
        head = parsed.path[: marker.start()]
        tail = parsed.path[marker.end():]
        sibling = parsed._replace(path=f"{head}/{other}/{tail}").geturl()
        return {f"{kind}_url": url, f"{other}_url": sibling}
    return {"detail_url": url}
args.out.parent.mkdir(parents=True, exist_ok=True) + session = requests.Session() + + with args.out.open("w", encoding="utf-8") as out: + for source_url in tqdm(links, desc="ADC leaflets"): + urls = paired_leaflet_urls(source_url) + + if "detail_url" in urls: + status, html = fetch(session, urls["detail_url"], args.timeout) + if status == 200: + urls.update(discover_leaflet_urls_from_detail(html, urls["detail_url"])) + time.sleep(args.delay) + + record = { + "source_url": source_url, + "pil_url": urls.get("pil_url"), + "spc_url": urls.get("spc_url"), + "pil_status": None, + "spc_status": None, + "pil_html": None, + "spc_html": None, + "metadata": { + "source": "adc.sk", + "scraped_at": datetime.now(timezone.utc).isoformat(), + }, + } + + for kind in ("pil", "spc"): + url = urls.get(f"{kind}_url") + if not url: + continue + try: + status, html = fetch(session, url, args.timeout) + record[f"{kind}_status"] = status + if status == 200: + record[f"{kind}_html"] = html + except Exception as exc: + record[f"{kind}_status"] = f"error: {exc}" + time.sleep(args.delay) + + out.write(json.dumps(record, ensure_ascii=False) + "\n") + + print(f"Saved raw leaflets to {args.out}") + + +if __name__ == "__main__": + main() diff --git a/scripts/adc_scraper/scrape_adc_product_data.py b/scripts/adc_scraper/scrape_adc_product_data.py new file mode 100644 index 0000000..fc37578 --- /dev/null +++ b/scripts/adc_scraper/scrape_adc_product_data.py @@ -0,0 +1,580 @@ +"""Scrape structured ADC product data from detail and PIL pages. 
+ +Example: + python scripts/adc_scraper/scrape_adc_product_data.py --browser --limit 10 +""" + +from __future__ import annotations + +import argparse +import json +import re +import time +from dataclasses import dataclass +from datetime import datetime, timezone +from pathlib import Path +from typing import Callable, Iterable +from urllib.parse import urljoin, urlparse + +import requests +from bs4 import BeautifulSoup, Tag +from tqdm import tqdm + + +BASE_URL = "https://www.adc.sk" +DEFAULT_DATA_DIR = Path("data_adc_databaza/adc_scrape_2026_05_04") +DEFAULT_LINKS = DEFAULT_DATA_DIR / "adc_product_links.json" +DEFAULT_OUT = DEFAULT_DATA_DIR / "adc_products_structured.json" +HEADERS = { + "User-Agent": ( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/124.0.0.0 Safari/537.36" + ), + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + "Accept-Language": "sk-SK,sk;q=0.9,cs;q=0.8,en;q=0.7", + "Cache-Control": "no-cache", + "Pragma": "no-cache", + "Referer": "https://www.adc.sk/databazy/produkty", +} + +DETAIL_SECTION_ALIASES = { + "Popis a určenie": "description_and_indications", + "Použitie": "use_and_dosage", + "Nežiaduce účinky": "side_effects", + "Účinné látky": "active_substances", + "Indikačná skupina": "indication_group", + "ADC Klasifikácia produktu": "adc_classification", + "Všeobecné informácie vzťahujúce sa k produktu": "general_product_info", +} + +PIL_SECTION_PATTERNS = { + "what_is_it": r"^1\.\s+Čo je .+", + "before_use": r"^2\.\s+Čo potrebujete vedieť .+", + "how_to_use": r"^3\.\s+Ako .+", + "side_effects": r"^4\.\s+Možné .+účinky", + "storage": r"^5\.\s+Ako uchovávať .+", + "package_info": r"^6\.\s+Obsah balenia .+", +} + +PIL_SUBSECTION_ALIASES = { + "contraindications": [ + r"^Neužívajte .+", + r"^Nepoužívajte .+", + r"^Nesmiete .+", + ], + "warnings": [ + r"^Upozornenia a opatrenia", + r"^Buďte zvlášť opatrný .+", + ], + "interactions": [ + r"^Iné lieky a .+", + 
# Slovak diacritics folded to ASCII by normalize_key (single C-level pass).
_DIACRITIC_TABLE = str.maketrans("áäčďéíľĺňóôŕšťúýž", "aacdeillnoorstuyz")


def clean_text(value: str) -> str:
    """Normalize scraped text: tidy spacing, drop ad markers, squeeze blank lines.

    Steps, in order: non-breaking spaces become spaces; runs of horizontal
    whitespace (including ``\\r``) collapse to one space; lines consisting
    only of the word "reklama" (any case) are emptied; three or more
    consecutive newlines collapse to two; the result is stripped.
    """
    value = value.replace("\xa0", " ")
    value = re.sub(r"[ \t\r\f\v]+", " ", value)
    # Ad markers go first: removing them after the newline squeeze left runs
    # of blank lines behind. The optional spaces cover "reklama\r\n" lines,
    # whose "\r" has just been turned into a trailing space.
    value = re.sub(r"(?im)^ ?reklama ?$", "", value)
    value = re.sub(r"\n{3,}", "\n\n", value)
    return value.strip()


def normalize_key(value: str) -> str:
    """Turn a human-readable label into a lowercase ASCII ``snake_case`` key.

    The label is cleaned with ``clean_text``, lower-cased, stripped of Slovak
    diacritics, and every run of characters outside ``a-z0-9`` becomes a
    single underscore; leading and trailing underscores are removed.
    """
    value = clean_text(value).lower().translate(_DIACRITIC_TABLE)
    value = re.sub(r"[^a-z0-9]+", "_", value)
    return value.strip("_")
def _json_ld_nodes(data: object) -> Iterable[dict]:
    """Yield every JSON-LD object in ``data``, including ``@graph`` members."""
    for item in data if isinstance(data, list) else [data]:
        if not isinstance(item, dict):
            continue
        yield item
        graph = item.get("@graph")
        if isinstance(graph, list):
            yield from (node for node in graph if isinstance(node, dict))


def _json_ld_image_url(image: object) -> str | None:
    """Reduce a JSON-LD ``image`` value (string, list or object) to one URL."""
    if isinstance(image, list):
        image = image[0] if image else None
    if isinstance(image, dict):
        image = image.get("url") or image.get("contentUrl")
    return image if isinstance(image, str) else None


def parse_json_ld_product(soup: BeautifulSoup) -> dict[str, str | None]:
    """Extract name, description and image URL from the page's JSON-LD Product.

    Every ``<script type="application/ld+json">`` is scanned in document
    order; empty or unparsable scripts are skipped. The first object typed
    ``Product`` wins. ``@type`` may be a string or a list of types, and the
    object may sit at the top level, in a top-level list, or in ``@graph``.

    Returns:
        ``{"name", "description", "image_url"}`` for the first Product found,
        or an empty dict when the page carries none. ``image_url`` is always
        a string or ``None``, even if the source gives a list or an object.
    """
    for script in soup.find_all("script", {"type": "application/ld+json"}):
        raw = script.string or script.get_text()
        if not raw.strip():
            continue
        try:
            data = json.loads(raw)
        except json.JSONDecodeError:
            continue
        for node in _json_ld_nodes(data):
            kind = node.get("@type")
            if kind == "Product" or (isinstance(kind, list) and "Product" in kind):
                return {
                    "name": node.get("name"),
                    "description": node.get("description"),
                    "image_url": _json_ld_image_url(node.get("image")),
                }
    return {}
def split_list_field(value: str) -> list[str]:
    """Split a newline/comma/semicolon separated field into unique clean items.

    Each fragment is passed through ``clean_text``; fragments that come out
    empty are dropped and duplicates are removed while keeping the order of
    first appearance. An empty ``value`` gives an empty list.
    """
    unique: dict[str, None] = {}
    if value:
        for fragment in re.split(r"\n|,|;", value):
            cleaned = clean_text(fragment)
            if cleaned:
                # dict keys preserve insertion order, giving ordered de-dup.
                unique.setdefault(cleaned, None)
    return list(unique)
def split_pil_subsections(before_use_text: str) -> dict[str, str]:
    """Split the "before use" PIL section into its named subsections.

    A line starts a subsection when it matches (case-insensitively, anchored
    at the line start) any pattern registered for a key in
    ``PIL_SUBSECTION_ALIASES``. The subsection runs until the next matching
    line or the end of the text.

    When the same key matches more than once, the chunks are concatenated in
    document order rather than the last chunk replacing the earlier ones, so
    no text is silently dropped.

    Args:
        before_use_text: Text of the "before use" section; may be empty.

    Returns:
        Mapping of subsection key to its cleaned text, ordered by first
        appearance. Empty when the input is empty or nothing matches.
    """
    if not before_use_text:
        return {}

    lines = [line.strip() for line in before_use_text.splitlines() if line.strip()]
    starts: list[tuple[str, int]] = []
    for idx, line in enumerate(lines):
        for key, patterns in PIL_SUBSECTION_ALIASES.items():
            if any(re.match(pattern, line, flags=re.IGNORECASE) for pattern in patterns):
                starts.append((key, idx))
                break  # first alias group that matches claims the line

    collected: dict[str, list[str]] = {}
    for pos, (key, idx) in enumerate(starts):
        end = starts[pos + 1][1] if pos + 1 < len(starts) else len(lines)
        # Accumulate instead of assigning so repeated keys keep all their text.
        collected.setdefault(key, []).extend(lines[idx:end])
    return {key: clean_text("\n".join(body)) for key, body in collected.items()}
{} + important_fields = [ + "registracne_cislo_produktu", + "kod_statnej_autority_sukl", + "nazov_produktu_podla_sukl", + "aplikacna_forma", + "vyrobca", + "drzitel_rozhodnutia", + "dodavatelia", + "vydaj", + "typ_produktu", + "legislativne_zatriedenie", + ] + for key in important_fields: + if fields.get(key): + chunks.append(f"{key}: {fields[key]}") + + for section_key, title in [ + ("description_and_indications", "Popis a indikácie"), + ("use_and_dosage", "Použitie a dávkovanie"), + ("side_effects", "Nežiaduce účinky"), + ("active_substances", "Účinné látky"), + ("indication_group", "Indikačná skupina"), + ("general_product_info", "Všeobecné informácie"), + ]: + text = (detail.get("sections") or {}).get(section_key) + if text: + chunks.append(f"\n{title}\n{text}") + + if pil: + subsections = pil.get("subsections") or {} + for key, title in [ + ("contraindications", "Kontraindikácie z PIL"), + ("warnings", "Upozornenia z PIL"), + ("interactions", "Interakcie z PIL"), + ("pregnancy_breastfeeding", "Tehotenstvo a dojčenie z PIL"), + ("driving", "Vedenie vozidiel z PIL"), + ]: + if subsections.get(key): + chunks.append(f"\n{title}\n{subsections[key]}") + + for key, title in [ + ("what_is_it", "Na čo sa používa z PIL"), + ("how_to_use", "Ako užívať z PIL"), + ("side_effects", "Vedľajšie účinky z PIL"), + ]: + section_text = (pil.get("sections") or {}).get(key) + if section_text: + chunks.append(f"\n{title}\n{section_text}") + + return clean_text("\n\n".join(chunks)) + + +def build_graph_hints(detail: dict, pil: dict | None) -> dict: + fields = detail.get("detail_fields") or {} + sections = detail.get("sections") or {} + pil_subsections = (pil or {}).get("subsections") or {} + pil_sections = (pil or {}).get("sections") or {} + + return { + "drug": detail.get("name"), + "active_substances": detail.get("active_substances") or [], + "dosage_form": fields.get("aplikacna_forma"), + "manufacturer": fields.get("vyrobca"), + "marketing_authorization_holder": 
def fetch_requests(session: requests.Session, url: str, timeout: int, retries: int) -> str:
    """Download *url* through *session*, retrying with a linear back-off.

    Args:
        session: Session used for the GET request.
        url: Address to download.
        timeout: Per-request timeout in seconds.
        retries: Total number of attempts. Values below 1 are treated as 1,
            so at least one request is always made.

    Returns:
        The response body decoded with the detected encoding (UTF-8 when
        detection yields nothing).

    Raises:
        RuntimeError: When every attempt failed. The last underlying error
            is attached as ``__cause__``.
    """
    attempts = max(1, retries)
    last_error: Exception | None = None
    for attempt in range(1, attempts + 1):
        try:
            response = session.get(url, headers=HEADERS, timeout=timeout)
            response.raise_for_status()
            # Prefer the sniffed encoding over the header-declared one.
            response.encoding = response.apparent_encoding or "utf-8"
            return response.text
        except Exception as exc:  # retry boundary: re-raised below once exhausted
            last_error = exc
            if attempt < attempts:
                time.sleep(1.5 * attempt)
    raise RuntimeError(f"Failed to fetch {url}: {last_error}") from last_error
def write_records_json(
    out_path: Path,
    links: list[str],
    fetch: Callable[[str], str],
    limit: int | None,
    delay: float,
    skip_failed: bool,
) -> list[dict[str, str]]:
    """Scrape every selected product and stream the records into a JSON array.

    Records are written one by one and flushed, so progress survives a
    crash. The closing bracket is written in a ``finally`` clause, so the
    file stays a valid JSON array even when the run is aborted by a
    re-raised failure or by Ctrl+C.

    Args:
        out_path: Destination file; parent directories are created.
        links: Product detail URLs.
        fetch: Callable that downloads a URL and returns its HTML.
        limit: Process only the first ``limit`` links; ``None`` means all.
        delay: Pause in seconds after each page load.
        skip_failed: When True a failing product is recorded and skipped;
            when False the error is re-raised after being recorded.

    Returns:
        One ``{"url": ..., "error": ...}`` entry per failed product.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    failures: list[dict[str, str]] = []
    selected_links = list(iter_links(links, limit))

    with out_path.open("w", encoding="utf-8") as out:
        out.write("[\n")
        wrote_any = False
        try:
            for detail_url in tqdm(selected_links, desc="ADC products"):
                try:
                    # URL derivation sits inside the try so a malformed link
                    # is handled like any other per-product failure.
                    urls = product_urls(detail_url)
                    detail_html = fetch(urls.detail_url)
                    time.sleep(delay)
                    pil_html = fetch(urls.pil_url)
                    record = build_record(detail_html, pil_html, urls)
                except Exception as exc:
                    failures.append({"url": detail_url, "error": str(exc)})
                    tqdm.write(f"Failed product {detail_url}: {exc}")
                    if not skip_failed:
                        raise
                    # Keep the pause between requests after a failure too.
                    time.sleep(delay)
                    continue

                if wrote_any:
                    out.write(",\n")
                json.dump(record, out, ensure_ascii=False, indent=2)
                wrote_any = True
                out.flush()
                time.sleep(delay)
        finally:
            # Always terminate the array so the output remains parseable.
            out.write("\n]\n")

    return failures
def fetch_page(session: requests.Session, page: int, timeout: int, retries: int) -> str:
    """Download one ADC listing page, retrying with a linear back-off.

    Args:
        session: Session used for the GET request.
        page: Listing page number substituted into ``LISTING_URL``.
        timeout: Per-request timeout in seconds.
        retries: Total number of attempts. Values below 1 are treated as 1,
            so at least one request is always made.

    Returns:
        The page HTML decoded with the detected encoding (UTF-8 when
        detection yields nothing).

    Raises:
        RuntimeError: When every attempt failed. The last underlying error
            is attached as ``__cause__``.
    """
    url = LISTING_URL.format(page=page)
    attempts = max(1, retries)
    last_error: Exception | None = None

    for attempt in range(1, attempts + 1):
        try:
            response = session.get(url, headers=HEADERS, timeout=timeout)
            response.raise_for_status()
            # Prefer the sniffed encoding over the header-declared one.
            response.encoding = response.apparent_encoding or "utf-8"
            return response.text
        except Exception as exc:  # retry boundary: re-raised below once exhausted
            last_error = exc
            if attempt < attempts:
                time.sleep(1.5 * attempt)

    raise RuntimeError(f"Failed to fetch page {page}: {last_error}") from last_error
def scrape_with_requests(
    start_page: int,
    pages: int,
    delay: float,
    timeout: int,
    retries: int,
) -> tuple[list[str], list[int]]:
    """Collect product links from consecutive listing pages over plain HTTP.

    Args:
        start_page: Number of the first listing page.
        pages: How many consecutive pages to visit.
        delay: Pause in seconds after each successfully processed page.
        timeout: Per-request timeout in seconds.
        retries: Attempts per page, forwarded to ``fetch_page``.

    Returns:
        ``(links, failed_pages)``: the unique product links in first-seen
        order, and the numbers of the pages that could not be scraped.
    """
    seen: set[str] = set()
    all_links: list[str] = []
    failed_pages: list[int] = []

    end_page = start_page + pages - 1
    # The context manager closes the session's pooled connections on exit.
    with requests.Session() as session:
        for page in tqdm(range(start_page, end_page + 1), desc="ADC pages"):
            try:
                html = fetch_page(session, page, timeout, retries)
                page_links = extract_product_links(html)
            except Exception as exc:
                # One bad page must not abort the crawl; remember it instead.
                tqdm.write(str(exc))
                failed_pages.append(page)
                continue

            for link in page_links:
                if link not in seen:
                    seen.add(link)
                    all_links.append(link)

            time.sleep(delay)

    return all_links, failed_pages
def main() -> None:
    """Command-line entry point: scrape ADC listing pages into a JSON link list.

    Parses the arguments, rejects values that cannot produce a meaningful
    run, scrapes with either requests or Playwright, and writes the unique
    links to ``--out``. Page numbers that failed are printed at the end.
    """
    parser = argparse.ArgumentParser(description="Scrape ADC product detail links.")
    parser.add_argument("--out", type=Path, required=True, help="Output JSON file.")
    parser.add_argument("--pages", type=int, default=DEFAULT_PAGES, help="Number of ADC listing pages.")
    parser.add_argument("--start-page", type=int, default=1, help="First page number.")
    parser.add_argument("--delay", type=float, default=0.25, help="Delay between requests in seconds.")
    parser.add_argument("--timeout", type=int, default=30, help="HTTP timeout in seconds.")
    parser.add_argument("--retries", type=int, default=3, help="Retries per page.")
    parser.add_argument(
        "--browser",
        action="store_true",
        help="Use Playwright Chromium instead of requests. Useful when ADC returns HTTP 403.",
    )
    args = parser.parse_args()

    # A non-positive page count scrapes nothing and would then overwrite the
    # output file with an empty list; refuse it up front.
    if args.pages < 1:
        parser.error("--pages must be at least 1")
    # time.sleep() raises ValueError for negative values.
    if args.delay < 0:
        parser.error("--delay must not be negative")

    if args.browser:
        all_links, failed_pages = scrape_with_browser(args.start_page, args.pages, args.delay)
    else:
        all_links, failed_pages = scrape_with_requests(
            args.start_page,
            args.pages,
            args.delay,
            args.timeout,
            args.retries,
        )

    args.out.parent.mkdir(parents=True, exist_ok=True)
    args.out.write_text(json.dumps(all_links, ensure_ascii=False, indent=2), encoding="utf-8")

    print(f"Saved {len(all_links)} unique product links to {args.out}")
    if failed_pages:
        print(f"Failed pages: {failed_pages}")
def wait_for(url, name, timeout=60):
    """Poll *url* until it answers or *timeout* seconds have elapsed.

    Progress is printed on a single line: one dot per failed attempt,
    followed by the final status.

    Only network and HTTP errors count as "not ready yet". Ctrl+C and
    programming errors such as a malformed URL propagate to the caller
    instead of being swallowed.

    Args:
        url: Health-check address to request.
        name: Human-readable server name used in the progress message.
        timeout: Maximum total time to wait, in seconds.

    Returns:
        True as soon as a request succeeds, False when the deadline passes.
    """
    import http.client  # local import: the module does not import it at file level

    print(f" Ожидаю {name}...", end="", flush=True)
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            # Context manager closes the response instead of leaking it.
            with urllib.request.urlopen(url, timeout=2):
                pass
        except (OSError, http.client.HTTPException):
            # URLError/HTTPError and socket timeouts are OSError subclasses.
            print(".", end="", flush=True)
            time.sleep(1)
        else:
            print(" OK")
            return True
    print(" ТАЙМАУТ")
    return False
LightRAG server + print("\n[2/2] Запуск LightRAG server (порт 9621)...") + lightrag_proc = subprocess.Popen( + [sys.executable, "-m", "lightrag.api.lightrag_server"], + env=env, + cwd=str(LIGHTRAG_DIR), + ) + + if not wait_for("http://localhost:9621/health", "LightRAG server", timeout=30): + print("ОШИБКА: LightRAG server не запустился") + embed_proc.terminate() + lightrag_proc.terminate() + sys.exit(1) + + print("\n" + "=" * 50) + print("Все серверы запущены!") + print(" Embedding: http://localhost:8010/health") + print(" LightRAG: http://localhost:9621/health") + print(" WebUI: http://localhost:9621/webui (если собран)") + print("=" * 50) + print("\nCtrl+C для остановки\n") + + try: + embed_proc.wait() + lightrag_proc.wait() + except KeyboardInterrupt: + print("\nОстанавливаю серверы...") + embed_proc.terminate() + lightrag_proc.terminate() + print("Готово.") + + +if __name__ == "__main__": + main()