Initial ADC scraper project setup
This commit is contained in:
commit
5f25004d05
32
.gitignore
vendored
Normal file
32
.gitignore
vendored
Normal file
@ -0,0 +1,32 @@
|
|||||||
|
# Large local datasets
|
||||||
|
data_adc_databaza/
|
||||||
|
|
||||||
|
# External LightRAG checkout and generated RAG storage
|
||||||
|
lightrag/
|
||||||
|
|
||||||
|
# Scraped/debug HTML snapshots
|
||||||
|
pil.html
|
||||||
|
detail-product.html
|
||||||
|
|
||||||
|
# Generated graph artifacts
|
||||||
|
*.graphml
|
||||||
|
|
||||||
|
# Logs
|
||||||
|
*.log
|
||||||
|
|
||||||
|
# Python cache and local environments
|
||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
||||||
|
.venv/
|
||||||
|
venv/
|
||||||
|
env/
|
||||||
|
|
||||||
|
# Tool/local workspace metadata
|
||||||
|
.claude/
|
||||||
|
.tmp/
|
||||||
|
|
||||||
|
# OS/editor files
|
||||||
|
.DS_Store
|
||||||
|
Thumbs.db
|
||||||
|
.idea/
|
||||||
|
.vscode/
|
||||||
127
ARCHITECTURE.md
Normal file
127
ARCHITECTURE.md
Normal file
@ -0,0 +1,127 @@
|
|||||||
|
# LightRAG ADC System Architecture
|
||||||
|
|
||||||
|
This document describes the full architecture of the LightRAG-based Adverse Drug Condition (ADC) system for processing and querying Slovak pharmaceutical leaflets.
|
||||||
|
|
||||||
|
The system consists of three main components — two running locally and one remote:
|
||||||
|
- **Embedding Server** (port 8010) — wraps a sentence-transformers model for vector generation
|
||||||
|
- **LightRAG Server** (port 9621) — core RAG engine managing the knowledge graph and vector DB
|
||||||
|
- **OpenWebUI LLM** (remote) — hosts the Qwen3.5-122B model used for entity extraction and answer generation
|
||||||
|
|
||||||
|
Both local servers are launched via `start_servers.py`. Source data is 6929 Slovak pharmaceutical leaflets stored in `cleaned_general_info_additional.json`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Flow 1: Ingestion (Loading Leaflets)
|
||||||
|
|
||||||
|
```mermaid
|
||||||
|
flowchart TD
|
||||||
|
A([👤 User runs load_leaflets.py]) --> B
|
||||||
|
|
||||||
|
B[("📄 cleaned_general_info_additional.json\n6929 Slovak leaflets")]
|
||||||
|
B --> C{Filter:\nclinical leaflets only\ninteractions +\ncontraindications}
|
||||||
|
|
||||||
|
C -->|Filtered leaflets| D["🔁 For each leaflet\n(loop)"]
|
||||||
|
|
||||||
|
D --> E["POST http://localhost:9621\n/documents/text\n\nBody: { text, metadata }"]
|
||||||
|
|
||||||
|
subgraph LightRAG_Server ["⚙️ LightRAG Server — port 9621"]
|
||||||
|
E --> F["Text chunker\n600 tokens per chunk"]
|
||||||
|
F --> G["🔁 For each chunk\n(loop)"]
|
||||||
|
|
||||||
|
G --> H["POST https://ui.tukekemt.xyz\n/api/v1/chat/completions\n\nModel: model2 (Qwen3.5-122B)\nTask: extract entities & relations"]
|
||||||
|
|
||||||
|
H --> I["Extracted:\n• Entities (drugs, conditions, etc.)\n• Relations between entities"]
|
||||||
|
|
||||||
|
I --> J["🔁 For each entity / chunk\n(loop)"]
|
||||||
|
|
||||||
|
J --> K["POST http://localhost:8010\n/embeddings\n\nBody: { input: text }"]
|
||||||
|
end
|
||||||
|
|
||||||
|
subgraph Embedding_Server ["🧠 Embedding Server — port 8010"]
|
||||||
|
K --> L["paraphrase-multilingual\n-MiniLM-L12-v2\n(sentence-transformers)"]
|
||||||
|
L --> M["Float vector\n(384 dimensions)"]
|
||||||
|
end
|
||||||
|
|
||||||
|
subgraph OpenWebUI ["☁️ OpenWebUI — ui.tukekemt.xyz"]
|
||||||
|
H
|
||||||
|
end
|
||||||
|
|
||||||
|
M --> N
|
||||||
|
|
||||||
|
subgraph RAG_Storage ["💾 rag_storage/"]
|
||||||
|
N["graph_chunk_entity_relation.graphml\n— knowledge graph (NetworkX)"]
|
||||||
|
O["vdb_entities.json\n— entity vectors (NanoVectorDB)"]
|
||||||
|
P["vdb_relationships.json\n— relation vectors (NanoVectorDB)"]
|
||||||
|
Q["kv_store_*.json\n— chunk text cache & metadata"]
|
||||||
|
end
|
||||||
|
|
||||||
|
I --> N
|
||||||
|
I --> P
|
||||||
|
M --> O
|
||||||
|
F --> Q
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Flow 2: Query (Answering Questions)
|
||||||
|
|
||||||
|
```mermaid
|
||||||
|
flowchart TD
|
||||||
|
A([👤 User sends query]) --> B
|
||||||
|
|
||||||
|
B["POST http://localhost:9621/query\n\nBody:\n{ query: string,\n mode: hybrid | local | global | naive }"]
|
||||||
|
|
||||||
|
subgraph LightRAG_Server ["⚙️ LightRAG Server — port 9621"]
|
||||||
|
B --> C["Parse query\n& select retrieval mode"]
|
||||||
|
|
||||||
|
C --> D["POST http://localhost:8010\n/embeddings\n\nEmbed the query text"]
|
||||||
|
|
||||||
|
subgraph Retrieval ["🔍 Retrieval (parallel)"]
|
||||||
|
E["Vector search\nNanoVectorDB\n(vdb_entities.json,\nvdb_relationships.json)"]
|
||||||
|
F["Graph traversal\nNetworkX\n(graph_chunk_entity_relation.graphml)"]
|
||||||
|
end
|
||||||
|
|
||||||
|
D --> Retrieval
|
||||||
|
Retrieval --> G["Merge & rank\nrelevant entities,\nrelations & text chunks"]
|
||||||
|
|
||||||
|
G --> H["Build context prompt\nfrom top-K results\n+ retrieved chunk texts\n(kv_store_*.json)"]
|
||||||
|
|
||||||
|
H --> I["POST https://ui.tukekemt.xyz\n/api/v1/chat/completions\n\nModel: model2 (Qwen3.5-122B)\nTask: generate answer\nfrom context"]
|
||||||
|
end
|
||||||
|
|
||||||
|
subgraph Embedding_Server ["🧠 Embedding Server — port 8010"]
|
||||||
|
D2["paraphrase-multilingual\n-MiniLM-L12-v2"]
|
||||||
|
D --> D2
|
||||||
|
D2 --> E
|
||||||
|
end
|
||||||
|
|
||||||
|
subgraph OpenWebUI ["☁️ OpenWebUI — ui.tukekemt.xyz"]
|
||||||
|
I
|
||||||
|
end
|
||||||
|
|
||||||
|
subgraph RAG_Storage ["💾 rag_storage/"]
|
||||||
|
VDB["vdb_entities.json\nvdb_relationships.json"]
|
||||||
|
GRAPH["graph_chunk_entity_relation.graphml"]
|
||||||
|
KV["kv_store_*.json"]
|
||||||
|
end
|
||||||
|
|
||||||
|
E --- VDB
|
||||||
|
F --- GRAPH
|
||||||
|
H --- KV
|
||||||
|
|
||||||
|
I --> J["Generated answer\n+ source references"]
|
||||||
|
J --> K([👤 User receives response])
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Component Summary
|
||||||
|
|
||||||
|
| Component | Type | Address | Key Endpoints |
|
||||||
|
|---|---|---|---|
|
||||||
|
| `embedding_server.py` | FastAPI (local) | `http://localhost:8010` | `GET /health`, `POST /embeddings`, `POST /v1/embeddings` |
|
||||||
|
| LightRAG Server | FastAPI (local) | `http://localhost:9621` | `GET /health`, `POST /documents/text`, `POST /documents/scan`, `GET /documents/pipeline_status`, `POST /query` |
|
||||||
|
| OpenWebUI (model2) | Remote LLM API | `https://ui.tukekemt.xyz` | `POST /api/v1/chat/completions` |
|
||||||
|
| `rag_storage/` | File system | Local disk | `.graphml`, `.json` files |
|
||||||
|
| `cleaned_general_info_additional.json` | Source data | Local disk | 6929 Slovak pharmaceutical leaflets |
|
||||||
|
| `start_servers.py` | Launcher script | — | Starts embedding server + LightRAG server |
|
||||||
214
RUN_INSTRUCTION.md
Normal file
214
RUN_INSTRUCTION.md
Normal file
@ -0,0 +1,214 @@
|
|||||||
|
# Run Instructions - LightRAG ADC Knowledge Graph
|
||||||
|
|
||||||
|
This project prepares ADC pharmaceutical leaflet data for a knowledge graph and
|
||||||
|
LightRAG-based question answering about drug interactions, contraindications,
|
||||||
|
warnings, indications, dosage, and side effects.
|
||||||
|
|
||||||
|
## Current Data
|
||||||
|
|
||||||
|
The current ADC scrape is stored in:
|
||||||
|
|
||||||
|
```powershell
|
||||||
|
data_adc_databaza/adc_scrape_2026_05_04/
|
||||||
|
```
|
||||||
|
|
||||||
|
Main files:
|
||||||
|
|
||||||
|
- `adc_product_links.json` - 35k+ ADC product detail URLs.
|
||||||
|
- `adc_products_structured.json` - main structured dataset for the next pipeline stage.
|
||||||
|
- `adc_products_structured.failed.json` - products that failed during scraping.
|
||||||
|
- `adc_products_structured_10.json` - small parser test sample.
|
||||||
|
|
||||||
|
Use `adc_products_structured.json` as the main source for new graph and
|
||||||
|
LightRAG ingestion work.
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
- Python 3.10+
|
||||||
|
- ADC scraper dependencies:
|
||||||
|
|
||||||
|
```powershell
|
||||||
|
pip install -r scripts/adc_scraper/requirements.txt
|
||||||
|
python -m playwright install chromium
|
||||||
|
```
|
||||||
|
|
||||||
|
- Local embedding dependencies:
|
||||||
|
|
||||||
|
```powershell
|
||||||
|
pip install sentence-transformers fastapi uvicorn
|
||||||
|
```
|
||||||
|
|
||||||
|
- LightRAG package from `lightrag/`
|
||||||
|
- OpenWebUI-compatible LLM API access configured in `lightrag/.env`
|
||||||
|
|
||||||
|
## Scraping Pipeline
|
||||||
|
|
||||||
|
Collect ADC product detail links:
|
||||||
|
|
||||||
|
```powershell
|
||||||
|
python scripts/adc_scraper/scrape_adc_product_links.py `
|
||||||
|
--out data_adc_databaza/adc_scrape_2026_05_04/adc_product_links.json `
|
||||||
|
--browser
|
||||||
|
```
|
||||||
|
|
||||||
|
Scrape product detail pages and PIL pages into structured JSON:
|
||||||
|
|
||||||
|
```powershell
|
||||||
|
python scripts/adc_scraper/scrape_adc_product_data.py --browser
|
||||||
|
```
|
||||||
|
|
||||||
|
The default output is:
|
||||||
|
|
||||||
|
```powershell
|
||||||
|
data_adc_databaza/adc_scrape_2026_05_04/adc_products_structured.json
|
||||||
|
```
|
||||||
|
|
||||||
|
For a small test run:
|
||||||
|
|
||||||
|
```powershell
|
||||||
|
python scripts/adc_scraper/scrape_adc_product_data.py `
|
||||||
|
--browser `
|
||||||
|
--limit 10 `
|
||||||
|
--out data_adc_databaza/adc_scrape_2026_05_04/adc_products_structured_10.json
|
||||||
|
```
|
||||||
|
|
||||||
|
## Start Servers
|
||||||
|
|
||||||
|
Start the local embedding server and LightRAG server:
|
||||||
|
|
||||||
|
```powershell
|
||||||
|
cd "c:\Users\Oleh\Desktop\Diplomova praca"
|
||||||
|
python start_servers.py
|
||||||
|
```
|
||||||
|
|
||||||
|
Keep this terminal open. Stop with `Ctrl+C`.
|
||||||
|
|
||||||
|
Health checks:
|
||||||
|
|
||||||
|
```text
|
||||||
|
http://localhost:8010/health - embedding server
|
||||||
|
http://localhost:9621/health - LightRAG server
|
||||||
|
```
|
||||||
|
|
||||||
|
## Old Ingestion Pipeline
|
||||||
|
|
||||||
|
The folder `checkpoint_02_ingest/` contains an older ingestion pipeline that
|
||||||
|
loads data from:
|
||||||
|
|
||||||
|
```powershell
|
||||||
|
data_adc_databaza/cleaned_general_info_additional.json
|
||||||
|
```
|
||||||
|
|
||||||
|
It is kept as a reference because it already contains working LightRAG upload
|
||||||
|
logic and progress tracking:
|
||||||
|
|
||||||
|
```powershell
|
||||||
|
python checkpoint_02_ingest/load_leaflets.py --count 50
|
||||||
|
python checkpoint_02_ingest/load_leaflets.py --status
|
||||||
|
```
|
||||||
|
|
||||||
|
Do not treat this as the final ingestion path for the new dataset. The next
|
||||||
|
step is to create a new ingestion script that reads:
|
||||||
|
|
||||||
|
```powershell
|
||||||
|
data_adc_databaza/adc_scrape_2026_05_04/adc_products_structured.json
|
||||||
|
```
|
||||||
|
|
||||||
|
and sends each record's `lightrag_text` to LightRAG.
|
||||||
|
|
||||||
|
## Query LightRAG
|
||||||
|
|
||||||
|
After documents are ingested and LightRAG has finished processing them:
|
||||||
|
|
||||||
|
```powershell
|
||||||
|
python -c "
|
||||||
|
import urllib.request, json
|
||||||
|
payload = json.dumps({'query': 'Ake su kontraindikacie Abirateronu?', 'mode': 'hybrid'}).encode()
|
||||||
|
req = urllib.request.Request('http://localhost:9621/query', data=payload, headers={'Content-Type': 'application/json'})
|
||||||
|
r = urllib.request.urlopen(req, timeout=120)
|
||||||
|
print(json.loads(r.read())['response'])
|
||||||
|
"
|
||||||
|
```
|
||||||
|
|
||||||
|
Available query modes:
|
||||||
|
|
||||||
|
- `hybrid` - recommended combined retrieval mode.
|
||||||
|
- `local` - entity-centered retrieval.
|
||||||
|
- `global` - broader graph-level retrieval.
|
||||||
|
- `naive` - vector-only retrieval.
|
||||||
|
|
||||||
|
Avoid querying while the document pipeline is still busy. Entity extraction can
|
||||||
|
take several minutes per batch depending on the LLM API and concurrency limits.
|
||||||
|
|
||||||
|
## Reset LightRAG Storage
|
||||||
|
|
||||||
|
Stop the servers first, then clear generated graph/vector data:
|
||||||
|
|
||||||
|
```powershell
|
||||||
|
Remove-Item -Path "c:\Users\Oleh\Desktop\Diplomova praca\lightrag\rag_storage\*" -Recurse -Force
|
||||||
|
python checkpoint_02_ingest/load_leaflets.py --reset
|
||||||
|
```
|
||||||
|
|
||||||
|
Use this only when you intentionally want to rebuild the graph.
|
||||||
|
|
||||||
|
## Recommended Next Steps
|
||||||
|
|
||||||
|
1. Update `validate_adc_json.py` for the new `adc_products_structured.json` schema.
|
||||||
|
2. Build an explicit knowledge graph from `graph_hints` and PIL subsections.
|
||||||
|
3. Create a new LightRAG ingestion script for the new dataset.
|
||||||
|
4. Retry failed scrape URLs from `adc_products_structured.failed.json`.
|
||||||
|
5. Prepare a small RAGAS evaluation set for contraindication and interaction questions.
|
||||||
|
|
||||||
|
## Project Layout
|
||||||
|
|
||||||
|
```text
|
||||||
|
Diplomova praca/
|
||||||
|
start_servers.py
|
||||||
|
embedding_server.py
|
||||||
|
scripts/adc_scraper/
|
||||||
|
scrape_adc_product_links.py
|
||||||
|
scrape_adc_product_data.py
|
||||||
|
validate_adc_json.py
|
||||||
|
data_adc_databaza/
|
||||||
|
adc_scrape_2026_05_04/
|
||||||
|
adc_product_links.json
|
||||||
|
adc_products_structured.json
|
||||||
|
adc_products_structured.failed.json
|
||||||
|
adc_products_structured_10.json
|
||||||
|
checkpoint_02_ingest/
|
||||||
|
load_leaflets.py
|
||||||
|
batch_ingest.py
|
||||||
|
progress.json
|
||||||
|
lightrag/
|
||||||
|
.env
|
||||||
|
rag_storage/
|
||||||
|
```
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
If the embedding server does not start:
|
||||||
|
|
||||||
|
```powershell
|
||||||
|
pip install sentence-transformers fastapi uvicorn
|
||||||
|
```
|
||||||
|
|
||||||
|
If LightRAG has encoding issues:
|
||||||
|
|
||||||
|
```powershell
|
||||||
|
$env:PYTHONUTF8 = "1"
|
||||||
|
python -m lightrag.api.lightrag_server
|
||||||
|
```
|
||||||
|
|
||||||
|
If LLM extraction times out, reduce concurrency in `lightrag/.env`:
|
||||||
|
|
||||||
|
```text
|
||||||
|
MAX_ASYNC=3
|
||||||
|
MAX_PARALLEL_INSERT=1
|
||||||
|
```
|
||||||
|
|
||||||
|
If the graph looks empty after ingestion, wait for background processing and
|
||||||
|
check:
|
||||||
|
|
||||||
|
```powershell
|
||||||
|
python checkpoint_02_ingest/load_leaflets.py --status
|
||||||
|
```
|
||||||
69
embedding_server.py
Normal file
69
embedding_server.py
Normal file
@ -0,0 +1,69 @@
|
|||||||
|
"""
|
||||||
|
Local OpenAI-compatible embedding server built on sentence-transformers.
Model: paraphrase-multilingual-MiniLM-L12-v2 (supports the Slovak language!)

Run:
    python embedding_server.py

Test:
    curl http://localhost:8010/v1/embeddings -H "Content-Type: application/json" \
         -d '{"model": "local-embed", "input": "test"}'
|
||||||
|
"""
|
||||||
|
|
||||||
|
import time
|
||||||
|
import json
|
||||||
|
from fastapi import FastAPI, Request
|
||||||
|
from fastapi.responses import JSONResponse
|
||||||
|
import uvicorn
|
||||||
|
from sentence_transformers import SentenceTransformer
|
||||||
|
|
||||||
|
# Name of the sentence-transformers model to load and the TCP port to serve on.
MODEL_NAME = "paraphrase-multilingual-MiniLM-L12-v2"
PORT = 8010

# The model is loaded once at import time and shared by every request handler.
print(f"Загрузка модели {MODEL_NAME}...")
model = SentenceTransformer(MODEL_NAME)
# Embedding width as reported by the loaded model; returned by GET /health.
EMBED_DIM = model.get_sentence_embedding_dimension()
print(f"Модель загружена. Размерность: {EMBED_DIM}")

# FastAPI application object that the route decorators below register on.
app = FastAPI(title="Local Embedding Server")
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/health")
def health():
    """Report liveness together with the served model name and vector size."""
    return dict(status="ok", model=MODEL_NAME, dim=EMBED_DIM)
|
||||||
|
|
||||||
|
|
||||||
|
async def _handle_embeddings(request: Request):
    """Embed the ``input`` field of an OpenAI-style embeddings request.

    The JSON body must be an object whose ``input`` is either a single string
    or a list of strings (a missing ``input`` is treated as the empty string).
    The response mirrors the OpenAI ``/v1/embeddings`` shape: a ``data`` list
    with one L2-normalized vector per input, in input order.

    Malformed bodies are answered with HTTP 400 instead of failing later with
    an unhandled exception (HTTP 500).
    """
    body = await request.json()
    inp = body.get("input", "") if isinstance(body, dict) else None
    texts = [inp] if isinstance(inp, str) else inp

    if not isinstance(texts, list) or not all(isinstance(t, str) for t in texts):
        return JSONResponse(
            status_code=400,
            content={"error": "'input' must be a string or a list of strings"},
        )

    # Skip the model call entirely for an empty batch.
    vecs = model.encode(texts, normalize_embeddings=True).tolist() if texts else []

    data = [
        {"object": "embedding", "index": i, "embedding": vec}
        for i, vec in enumerate(vecs)
    ]
    # Whitespace-separated word count is only a rough stand-in for real
    # tokenizer counts; it fills the OpenAI-compatible "usage" field.
    token_count = sum(len(t.split()) for t in texts)
    return JSONResponse({
        "object": "list",
        "data": data,
        "model": MODEL_NAME,
        "usage": {"prompt_tokens": token_count, "total_tokens": token_count},
    })
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/v1/embeddings")
async def embeddings_v1(request: Request):
    """Serve POST /v1/embeddings by delegating to the shared handler."""
    return await _handle_embeddings(request)
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/embeddings")
async def embeddings_root(request: Request):
    """Serve POST /embeddings (un-versioned alias) via the shared handler."""
    return await _handle_embeddings(request)
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: serve the app with uvicorn on PORT.
# NOTE(review): host="0.0.0.0" listens on every network interface although the
# module is described as a local server — confirm this exposure is intended
# (use "127.0.0.1" for loopback-only access).
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=PORT, log_level="warning")
|
||||||
1
scripts/adc_scraper/__init__.py
Normal file
1
scripts/adc_scraper/__init__.py
Normal file
@ -0,0 +1 @@
|
|||||||
|
"""ADC scraper scripts for the diploma project."""
|
||||||
167
scripts/adc_scraper/parse_adc_json.py
Normal file
167
scripts/adc_scraper/parse_adc_json.py
Normal file
@ -0,0 +1,167 @@
|
|||||||
|
"""Parse raw ADC HTML/JSONL into structured JSON for LightRAG ingestion."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
|
||||||
|
# Output section key -> regex fragments that mark where that section begins.
# The fragments are written in lowercase Slovak; extract_sections() applies
# them without regard to letter case and uses the earliest hit per section.
SECTION_PATTERNS = {
    "contraindications": [
        r"nepoužívajte",
        r"kedy .* nepoužívať",
        r"kontraindik",
    ],
    "interactions": [
        r"iné lieky",
        r"vzájomné pôsobenie",
        r"interakci",
    ],
    "side_effects": [
        r"možné vedľajšie účinky",
        r"nežiaduce účinky",
        r"vedľajšie účinky",
    ],
    "dosage": [
        r"ako používať",
        r"dávkovanie",
        r"spôsob podávania",
    ],
}
|
||||||
|
|
||||||
|
|
||||||
|
def html_to_text(html: str | None) -> str:
|
||||||
|
if not html:
|
||||||
|
return ""
|
||||||
|
soup = BeautifulSoup(html, "lxml")
|
||||||
|
for tag in soup(["script", "style", "noscript"]):
|
||||||
|
tag.decompose()
|
||||||
|
text = soup.get_text(" ", strip=True)
|
||||||
|
return normalize_text(text)
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_text(text: str) -> str:
    """Collapse every whitespace run in *text* to one space and trim the ends.

    Non-breaking spaces (``\\xa0``) are converted to ordinary spaces first.
    """
    without_nbsp = text.replace("\xa0", " ")
    return re.sub(r"\s+", " ", without_nbsp).strip()
|
||||||
|
|
||||||
|
|
||||||
|
def infer_name(source_url: str, text: str) -> str:
    """Guess a product name from the leaflet text, falling back to the URL.

    The text is searched for the leaflet heading
    "Písomná informácia pre používateľa" and the 3-160 characters after it,
    up to the next "Pozorne", "V tejto" or "1." marker, are used as the name.
    Without such a heading the last URL path segment is used instead: the
    ".html" suffix and a trailing "-<digits>" id are removed, hyphens become
    spaces and the result is title-cased.
    """
    heading = re.search(
        r"Písomná informácia pre používateľa\s+(.{3,160}?)(?:\s+Pozorne|\s+V tejto|\s+1\.)",
        text,
    )
    if heading is not None:
        return normalize_text(heading.group(1))

    last_segment = source_url.rstrip("/").rsplit("/", 1)[-1]
    without_suffix = last_segment.replace(".html", "")
    without_id = re.sub(r"-\d+$", "", without_suffix)
    return without_id.replace("-", " ").title()
|
||||||
|
|
||||||
|
|
||||||
|
def extract_sections(text: str) -> dict[str, str]:
    """Split leaflet *text* into the sections named in ``SECTION_PATTERNS``.

    For every section the earliest match of any of its patterns marks the
    section start. Sections are then cut in order of appearance: each one runs
    up to the start of the next detected section, and the last one is capped
    at 8000 characters. Sections with no matching pattern are omitted.

    Patterns are matched case-insensitively against *text* itself rather than
    against ``text.lower()``: lowercasing can change the length of a string
    (for example "İ" becomes two code points), which would make the match
    offsets invalid for slicing the original text.
    """
    starts: list[tuple[int, str]] = []
    for section_name, patterns in SECTION_PATTERNS.items():
        found_positions = []
        for pattern in patterns:
            match = re.search(pattern, text, flags=re.IGNORECASE)
            if match:
                found_positions.append(match.start())
        if found_positions:
            starts.append((min(found_positions), section_name))

    starts.sort()

    sections: dict[str, str] = {}
    for idx, (start, section_name) in enumerate(starts):
        if idx + 1 < len(starts):
            end = starts[idx + 1][0]
        else:
            # No following section to bound the last one, so cap its length.
            end = min(len(text), start + 8000)
        sections[section_name] = text[start:end].strip()

    return sections
|
||||||
|
|
||||||
|
|
||||||
|
def iter_raw_records(path: Path) -> list[dict[str, Any]]:
    """Load raw scrape records from a ``.jsonl`` or JSON file.

    * ``.jsonl`` (suffix compared case-insensitively): one JSON value per
      non-blank line.
    * any other suffix: the file is parsed as one JSON document. A top-level
      list is returned as is, an object with a ``"records"`` key yields that
      value, and anything else is wrapped in a one-element list.
    """
    if path.suffix.lower() != ".jsonl":
        payload = json.loads(path.read_text(encoding="utf-8"))
        if isinstance(payload, list):
            return payload
        return payload["records"] if "records" in payload else [payload]

    with path.open(encoding="utf-8") as handle:
        stripped_lines = (raw_line.strip() for raw_line in handle)
        return [json.loads(entry) for entry in stripped_lines if entry]
|
||||||
|
|
||||||
|
|
||||||
|
def parse_record(raw: dict[str, Any]) -> dict[str, Any]:
    """Convert one raw scrape record into the structured output record.

    PIL text comes from ``raw["pribalovy_letak"]`` and SPC text from
    ``raw["spc"]``; when such a key is missing or ``None`` the text is
    extracted from ``raw["pil_html"]`` / ``raw["spc_html"]`` instead. The
    product name is ``raw["name"]`` when truthy, otherwise it is inferred from
    the combined text and the source URL.
    """
    source_url = raw.get("source_url") or raw.get("link") or raw.get("pil_url") or ""

    # (output kind, raw key holding already-extracted text); PIL first, then SPC.
    texts: dict[str, str] = {}
    for kind, text_key in (("pil", "pribalovy_letak"), ("spc", "spc")):
        ready_text = raw.get(text_key)
        if ready_text is None:
            texts[kind] = html_to_text(raw.get(f"{kind}_html"))
        else:
            texts[kind] = normalize_text(str(ready_text))

    pil_text = texts["pil"]
    spc_text = texts["spc"]
    combined_text = f"{pil_text} {spc_text}".strip()
    name = raw.get("name") or infer_name(source_url, combined_text)
    scrape_meta = raw.get("metadata") or {}

    return {
        "source_url": source_url,
        "name": name,
        "pil_url": raw.get("pil_url"),
        "spc_url": raw.get("spc_url"),
        "pil_text": pil_text,
        "spc_text": spc_text,
        "sections": extract_sections(combined_text),
        "metadata": {
            "source": "adc.sk",
            "scraped_at": scrape_meta.get("scraped_at"),
            "parser": "scripts/adc_scraper/parse_adc_json.py",
        },
    }
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Command-line entry point: parse raw ADC records and write structured JSON.

    Records whose PIL and SPC texts are both empty are dropped unless
    ``--keep-empty`` is given. ``--limit`` stops after that many records have
    been kept. The output directory is created when missing.
    """
    parser = argparse.ArgumentParser(description="Parse ADC raw data into structured JSON.")
    parser.add_argument("--input", type=Path, required=True)
    parser.add_argument("--out", type=Path, required=True)
    parser.add_argument("--limit", type=int, default=None)
    parser.add_argument(
        "--keep-empty",
        action="store_true",
        help="Keep records where both PIL and SPC text are empty.",
    )
    args = parser.parse_args()

    kept = []
    for raw_record in iter_raw_records(args.input):
        item = parse_record(raw_record)
        has_text = bool(item["pil_text"] or item["spc_text"])
        if not (has_text or args.keep_empty):
            continue
        kept.append(item)
        if args.limit and len(kept) >= args.limit:
            break

    args.out.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(kept, ensure_ascii=False, indent=2)
    args.out.write_text(serialized, encoding="utf-8")
    print(f"Saved {len(kept)} structured records to {args.out}")
|
||||||
|
|
||||||
|
|
||||||
|
# Run the parser CLI only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
||||||
5
scripts/adc_scraper/requirements.txt
Normal file
5
scripts/adc_scraper/requirements.txt
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
requests>=2.31.0
|
||||||
|
beautifulsoup4>=4.12.0
|
||||||
|
lxml>=5.0.0
|
||||||
|
tqdm>=4.66.0
|
||||||
|
playwright>=1.45.0
|
||||||
119
scripts/adc_scraper/scrape_adc_index.py
Normal file
119
scripts/adc_scraper/scrape_adc_index.py
Normal file
@ -0,0 +1,119 @@
|
|||||||
|
"""Collect ADC product/PIL/SPC links from index or search pages.
|
||||||
|
|
||||||
|
The script is intentionally conservative: it only stores discovered ADC product
|
||||||
|
URLs and does not try to parse clinical content. The next pipeline step downloads
|
||||||
|
the actual leaflet pages.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
from collections import deque
|
||||||
|
from pathlib import Path
|
||||||
|
from urllib.parse import urljoin, urlparse
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
|
||||||
|
# HTTP headers sent with every request made by fetch(); the User-Agent
# identifies the crawler and its purpose to the site operator.
DEFAULT_HEADERS = {
    "User-Agent": "DiplomaResearchBot/0.1 (+educational use; ADC leaflet KG)",
}
|
||||||
|
|
||||||
|
|
||||||
|
def is_adc_url(url: str) -> bool:
    """Return True when *url* points at ``adc.sk`` or one of its subdomains.

    The parsed hostname is compared instead of the raw ``netloc`` so that a
    port or user-info part does not break the check, and the match is anchored
    on a label boundary so that look-alike domains such as ``evil-adc.sk`` are
    rejected. URLs without a host (for example relative paths) return False.
    """
    host = (urlparse(url).hostname or "").lower()
    return host == "adc.sk" or host.endswith(".adc.sk")
|
||||||
|
|
||||||
|
|
||||||
|
def is_product_like_url(url: str) -> bool:
    """Return True for product-database URLs of kind PIL, SPC or detail.

    The lower-cased URL path must contain ``/databazy/produkty/`` together
    with at least one of ``/pil/``, ``/spc/`` or ``/detail/``.
    """
    lowered_path = urlparse(url).path.lower()
    if "/databazy/produkty/" not in lowered_path:
        return False
    return any(marker in lowered_path for marker in ("/pil/", "/spc/", "/detail/"))
|
||||||
|
|
||||||
|
|
||||||
|
def extract_links(html: str, base_url: str) -> tuple[set[str], set[str]]:
    """Collect ADC links from the ``<a href>`` elements of one page.

    Every href is resolved against *base_url* and stripped of its fragment.
    Non-ADC URLs are ignored. Returns ``(product_links, crawl_links)``:
    product links satisfy ``is_product_like_url``; crawl links are all ADC
    URLs whose path contains ``/databazy/produkty/``.
    """
    products: set[str] = set()
    crawlable: set[str] = set()

    for anchor in BeautifulSoup(html, "lxml").find_all("a", href=True):
        absolute = urljoin(base_url, anchor["href"]).partition("#")[0]
        if is_adc_url(absolute):
            if is_product_like_url(absolute):
                products.add(absolute)
            if "/databazy/produkty/" in urlparse(absolute).path.lower():
                crawlable.add(absolute)

    return products, crawlable
|
||||||
|
|
||||||
|
|
||||||
|
def fetch(session: requests.Session, url: str, timeout: int) -> str:
    """GET *url* with DEFAULT_HEADERS and return the decoded response body.

    Raises whatever ``response.raise_for_status()`` raises for an HTTP error
    status.
    """
    response = session.get(url, headers=DEFAULT_HEADERS, timeout=timeout)
    response.raise_for_status()
    # Decode with the encoding detected from the body, falling back to UTF-8.
    response.encoding = response.apparent_encoding or "utf-8"
    return response.text
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Command-line entry point: breadth-first crawl that collects product links.

    Starting from the ``--start-url`` values, at most ``--max-pages`` pages
    are fetched. Product-like links found on them are written to ``--out`` as
    JSON together with the list of visited pages.

    Each URL is queued at most once, so repeated links cannot use up the
    frontier budget (``max_pages * 4`` visited plus queued URLs). Pages that
    fail to download are reported and skipped, and the delay is applied after
    failures as well as successes. The HTTP session is closed on exit.
    """
    parser = argparse.ArgumentParser(description="Collect ADC product/PIL/SPC links.")
    parser.add_argument(
        "--start-url",
        action="append",
        required=True,
        help="ADC index/search URL. Can be supplied multiple times.",
    )
    parser.add_argument("--out", type=Path, required=True, help="Output JSON file.")
    parser.add_argument("--max-pages", type=int, default=20)
    parser.add_argument("--delay", type=float, default=0.5)
    parser.add_argument("--timeout", type=int, default=30)
    args = parser.parse_args()

    queue: deque[str] = deque(args.start_url)
    enqueued: set[str] = set(args.start_url)
    visited: set[str] = set()
    product_links: set[str] = set()
    frontier_cap = args.max_pages * 4

    with requests.Session() as session, tqdm(total=args.max_pages, desc="ADC pages") as progress:
        while queue and len(visited) < args.max_pages:
            url = queue.popleft()
            if url in visited:
                continue
            visited.add(url)

            try:
                html = fetch(session, url, args.timeout)
            except Exception as exc:
                # Deliberately best-effort: one bad page must not abort the crawl.
                tqdm.write(f"Skip {url}: {exc}")
                progress.update(1)
                time.sleep(args.delay)
                continue

            found_products, found_crawl = extract_links(html, url)
            product_links.update(found_products)

            for link in sorted(found_crawl):
                if link in enqueued:
                    continue
                if len(visited) + len(queue) >= frontier_cap:
                    # The frontier cannot shrink inside this loop; stop early.
                    break
                enqueued.add(link)
                queue.append(link)

            progress.update(1)
            time.sleep(args.delay)

    args.out.parent.mkdir(parents=True, exist_ok=True)
    payload = {
        "source": "adc.sk",
        "visited_pages": sorted(visited),
        "links": sorted(product_links),
    }
    args.out.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"Saved {len(product_links)} links to {args.out}")
|
||||||
|
|
||||||
|
|
||||||
|
# Run the crawler CLI only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
||||||
124
scripts/adc_scraper/scrape_adc_leaflets.py
Normal file
124
scripts/adc_scraper/scrape_adc_leaflets.py
Normal file
@ -0,0 +1,124 @@
|
|||||||
|
"""Download ADC PIL/SPC pages into a raw JSONL file."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from pathlib import Path
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
|
||||||
|
# HTTP headers sent with every request made by fetch(); the User-Agent
# identifies the downloader and its purpose to the site operator.
HEADERS = {
    "User-Agent": "DiplomaResearchBot/0.1 (+educational use; ADC leaflet KG)",
}
|
||||||
|
|
||||||
|
|
||||||
|
def load_links(path: Path) -> list[str]:
    """Read the URL list from a JSON file.

    The file may contain a bare JSON list, or an object whose ``"links"`` key
    holds the list (a missing key yields an empty list). Every entry is
    converted to ``str``.
    """
    payload = json.loads(path.read_text(encoding="utf-8"))
    entries = payload if isinstance(payload, list) else payload.get("links", [])
    return list(map(str, entries))
|
||||||
|
|
||||||
|
|
||||||
|
def paired_leaflet_urls(url: str) -> dict[str, str]:
    """Return best-effort PIL/SPC URLs for an ADC product URL.

    A URL whose path contains ``/pil/`` or ``/spc/`` (in any letter case) is
    returned under its own key together with the sibling URL obtained by
    swapping that segment. Any other URL is returned as ``detail_url`` only.

    The swap is case-insensitive to match the case-insensitive detection;
    a case-sensitive replace would leave e.g. ``/PIL/`` untouched and report
    the PIL page itself as the SPC URL.
    """
    # Local import: this module does not import ``re`` at file level.
    import re

    path = urlparse(url).path.lower()
    if "/pil/" in path:
        return {
            "pil_url": url,
            "spc_url": re.sub(r"/pil/", "/spc/", url, flags=re.IGNORECASE),
        }
    if "/spc/" in path:
        return {
            "spc_url": url,
            "pil_url": re.sub(r"/spc/", "/pil/", url, flags=re.IGNORECASE),
        }
    return {"detail_url": url}
|
||||||
|
|
||||||
|
|
||||||
|
def discover_leaflet_urls_from_detail(html: str, base_url: str) -> dict[str, str]:
    """Find PIL and SPC links among the anchors of a product detail page.

    Each href is resolved against *base_url*. A link whose lower-cased path
    contains ``/databazy/produkty/pil/`` is stored as ``pil_url``, otherwise
    one containing ``/databazy/produkty/spc/`` is stored as ``spc_url``. When
    several links of one kind exist, the last one on the page wins.
    """
    from urllib.parse import urljoin

    # Checked in this order; the first marker that matches decides the key.
    markers = (
        ("/databazy/produkty/pil/", "pil_url"),
        ("/databazy/produkty/spc/", "spc_url"),
    )

    found: dict[str, str] = {}
    for anchor in BeautifulSoup(html, "lxml").find_all("a", href=True):
        absolute = urljoin(base_url, anchor["href"])
        lowered_path = urlparse(absolute).path.lower()
        for marker, key in markers:
            if marker in lowered_path:
                found[key] = absolute
                break
    return found
|
||||||
|
|
||||||
|
|
||||||
|
def fetch(session: requests.Session, url: str, timeout: int) -> tuple[int, str]:
    """GET *url* with HEADERS and return ``(status_code, decoded_body)``.

    HTTP error statuses are not raised here; the status code is returned so
    the caller can decide what to do with it.
    """
    response = session.get(url, headers=HEADERS, timeout=timeout)
    # Decode with the encoding detected from the body, falling back to UTF-8.
    response.encoding = response.apparent_encoding or "utf-8"
    return response.status_code, response.text
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Download PIL/SPC HTML for every input link and write JSON Lines output.

    Options: ``--links`` (input JSON), ``--out`` (output file, one JSON record
    per line), ``--limit`` (only the first N links), ``--delay`` (seconds
    slept after each request) and ``--timeout`` (per-request timeout).
    """
    parser = argparse.ArgumentParser(description="Download ADC PIL/SPC HTML pages.")
    parser.add_argument("--links", type=Path, required=True)
    parser.add_argument("--out", type=Path, required=True)
    parser.add_argument("--limit", type=int, default=None)
    parser.add_argument("--delay", type=float, default=0.5)
    parser.add_argument("--timeout", type=int, default=30)
    args = parser.parse_args()

    links = load_links(args.links)
    # Compare against None so an explicit ``--limit 0`` is honoured instead of
    # silently meaning "no limit".
    if args.limit is not None:
        links = links[: args.limit]

    args.out.parent.mkdir(parents=True, exist_ok=True)

    # The session is a context manager, so its connections are released on exit.
    with requests.Session() as session, args.out.open("w", encoding="utf-8") as out:
        for source_url in tqdm(links, desc="ADC leaflets"):
            urls = paired_leaflet_urls(source_url)

            if "detail_url" in urls:
                # A failing detail page must not abort the whole run: the
                # record is still written, just without leaflet URLs.
                try:
                    status, html = fetch(session, urls["detail_url"], args.timeout)
                except Exception as exc:
                    tqdm.write(f"Failed detail page {urls['detail_url']}: {exc}")
                else:
                    if status == 200:
                        urls.update(discover_leaflet_urls_from_detail(html, urls["detail_url"]))
                time.sleep(args.delay)

            record = {
                "source_url": source_url,
                "pil_url": urls.get("pil_url"),
                "spc_url": urls.get("spc_url"),
                "pil_status": None,
                "spc_status": None,
                "pil_html": None,
                "spc_html": None,
                "metadata": {
                    "source": "adc.sk",
                    "scraped_at": datetime.now(timezone.utc).isoformat(),
                },
            }

            for kind in ("pil", "spc"):
                url = urls.get(f"{kind}_url")
                if not url:
                    continue
                try:
                    status, html = fetch(session, url, args.timeout)
                    record[f"{kind}_status"] = status
                    if status == 200:
                        record[f"{kind}_html"] = html
                except Exception as exc:
                    # Best effort: keep the error text in place of the status.
                    record[f"{kind}_status"] = f"error: {exc}"
                time.sleep(args.delay)

            out.write(json.dumps(record, ensure_ascii=False) + "\n")

    print(f"Saved raw leaflets to {args.out}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
580
scripts/adc_scraper/scrape_adc_product_data.py
Normal file
580
scripts/adc_scraper/scrape_adc_product_data.py
Normal file
@ -0,0 +1,580 @@
|
|||||||
|
"""Scrape structured ADC product data from detail and PIL pages.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
python scripts/adc_scraper/scrape_adc_product_data.py --browser --limit 10
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
import time
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Callable, Iterable
|
||||||
|
from urllib.parse import urljoin, urlparse
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup, Tag
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
|
||||||
|
# Root of the ADC site.
BASE_URL = "https://www.adc.sk"
# Default locations used for the --links and --out command-line options.
DEFAULT_DATA_DIR = Path("data_adc_databaza/adc_scrape_2026_05_04")
DEFAULT_LINKS = DEFAULT_DATA_DIR / "adc_product_links.json"
DEFAULT_OUT = DEFAULT_DATA_DIR / "adc_products_structured.json"
# Browser-like headers sent in requests mode; the User-Agent value is also
# passed to the Playwright page in browser mode.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/124.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "sk-SK,sk;q=0.9,cs;q=0.8,en;q=0.7",
    "Cache-Control": "no-cache",
    "Pragma": "no-cache",
    "Referer": "https://www.adc.sk/databazy/produkty",
}

# Exact heading text of a detail-page section -> key used in the output.
# Headings not listed here fall back to normalize_key(title).
DETAIL_SECTION_ALIASES = {
    "Popis a určenie": "description_and_indications",
    "Použitie": "use_and_dosage",
    "Nežiaduce účinky": "side_effects",
    "Účinné látky": "active_substances",
    "Indikačná skupina": "indication_group",
    "ADC Klasifikácia produktu": "adc_classification",
    "Všeobecné informácie vzťahujúce sa k produktu": "general_product_info",
}

# Output key -> regex matched (case-insensitively, from the start of a line)
# against PIL text lines to find the start of each numbered section.
PIL_SECTION_PATTERNS = {
    "what_is_it": r"^1\.\s+Čo je .+",
    "before_use": r"^2\.\s+Čo potrebujete vedieť .+",
    "how_to_use": r"^3\.\s+Ako .+",
    "side_effects": r"^4\.\s+Možné .+účinky",
    "storage": r"^5\.\s+Ako uchovávať .+",
    "package_info": r"^6\.\s+Obsah balenia .+",
}

# Output key -> alternative regexes marking the start of a subsection inside
# the "before_use" section; the first key with a matching pattern wins.
PIL_SUBSECTION_ALIASES = {
    "contraindications": [
        r"^Neužívajte .+",
        r"^Nepoužívajte .+",
        r"^Nesmiete .+",
    ],
    "warnings": [
        r"^Upozornenia a opatrenia",
        r"^Buďte zvlášť opatrný .+",
    ],
    "interactions": [
        r"^Iné lieky a .+",
        r"^Užívanie .+ s inými liekmi",
    ],
    "pregnancy_breastfeeding": [
        r"^Tehotenstvo.*dojčenie.*",
        r"^Tehotenstvo.*",
    ],
    "driving": [
        r"^Vedenie vozidiel .+",
    ],
}
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class ProductUrls:
    """Immutable bundle of the three page URLs belonging to one product."""

    # Product detail page.
    detail_url: str
    # Patient information leaflet (PIL) page.
    pil_url: str
    # SPC page.
    spc_url: str
|
||||||
|
|
||||||
|
|
||||||
|
def clean_text(value: str) -> str:
    """Normalize whitespace in scraped text and drop advertisement markers.

    Non-breaking spaces become spaces, runs of horizontal whitespace collapse
    to one space, lines consisting solely of "reklama" (any case) are emptied,
    runs of three or more newlines collapse to one blank line, and the result
    is stripped.
    """
    value = value.replace("\xa0", " ")
    value = re.sub(r"[ \t\r\f\v]+", " ", value)
    # Remove ad markers BEFORE collapsing newlines: emptying a line afterwards
    # would leave the oversized blank-line runs the next step is meant to fix.
    value = re.sub(r"(?im)^reklama$", "", value)
    value = re.sub(r"\n{3,}", "\n\n", value)
    return value.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_key(value: str) -> str:
    """Turn a human-readable label into an ASCII ``snake_case`` key.

    The label is cleaned and lower-cased, the Slovak accented letters listed
    below are replaced by their base letters, every run of other characters
    becomes a single underscore, and edge underscores are removed.
    """
    replacements = {
        "á": "a", "ä": "a", "č": "c", "ď": "d", "é": "e", "í": "i",
        "ľ": "l", "ĺ": "l", "ň": "n", "ó": "o", "ô": "o", "ŕ": "r",
        "š": "s", "ť": "t", "ú": "u", "ý": "y", "ž": "z",
    }
    lowered = clean_text(value).lower()
    # One translate pass replaces the chain of per-character replace() calls.
    ascii_only = lowered.translate(str.maketrans(replacements))
    return re.sub(r"[^a-z0-9]+", "_", ascii_only).strip("_")
|
||||||
|
|
||||||
|
|
||||||
|
def product_urls(detail_url: str) -> ProductUrls:
    """Derive the PIL and SPC URLs from a detail URL.

    Each derived URL is the detail URL with every ``/detail/`` replaced by
    ``/pil/`` or ``/spc/``.
    """
    derived = {kind: detail_url.replace("/detail/", f"/{kind}/") for kind in ("pil", "spc")}
    return ProductUrls(
        detail_url=detail_url,
        pil_url=derived["pil"],
        spc_url=derived["spc"],
    )
|
||||||
|
|
||||||
|
|
||||||
|
def product_id_from_url(url: str) -> str | None:
    """Return the numeric id that precedes ``.html`` in the URL path.

    Returns ``None`` when the path does not end in ``-<digits>.html``.
    """
    url_path = urlparse(url).path
    found = re.search(r"-(\d+)\.html(?:$|\?)", url_path)
    if found is None:
        return None
    return found.group(1)
|
||||||
|
|
||||||
|
|
||||||
|
def slug_from_url(url: str) -> str:
    """Return the last path component with a trailing ``-<digits>.html`` removed."""
    filename = Path(urlparse(url).path).name
    id_suffix = re.compile(r"-\d+\.html$")
    return id_suffix.sub("", filename)
|
||||||
|
|
||||||
|
|
||||||
|
def load_links(path: Path) -> list[str]:
    """Load the list of detail URLs from a JSON file.

    Entries are converted to ``str`` and stripped of surrounding whitespace;
    ``null`` and blank entries are dropped.

    Raises:
        ValueError: if the file does not contain a JSON list.
    """
    data = json.loads(path.read_text(encoding="utf-8"))
    if not isinstance(data, list):
        raise ValueError(f"Expected a JSON list in {path}")

    links: list[str] = []
    for item in data:
        # JSON null would otherwise become the bogus link "None".
        if item is None:
            continue
        # Keep the stripped value, not just test it: padded URLs would
        # otherwise be handed to the fetcher with their whitespace intact.
        link = str(item).strip()
        if link:
            links.append(link)
    return links
|
||||||
|
|
||||||
|
|
||||||
|
def soup_from_html(html: str) -> BeautifulSoup:
    """Parse an HTML string with the ``lxml`` parser."""
    parsed = BeautifulSoup(html, "lxml")
    return parsed
|
||||||
|
|
||||||
|
|
||||||
|
def remove_noise(root: Tag) -> None:
    """Delete non-content elements from ``root`` in place.

    Removes scripts, styles, navigation/header/footer chrome, iframes, forms
    and the site-specific modal, ad, breadcrumb and pictogram containers.
    """
    noise_selectors = (
        "script", "style", "noscript", "nav", "header", "footer", "iframe", "form",
        ".modal", ".adbl", ".ad-video-fake", ".breadcrumb", ".piktograms",
    )
    for unwanted in root.select(", ".join(noise_selectors)):
        unwanted.decompose()
|
||||||
|
|
||||||
|
|
||||||
|
def node_text(node: Tag) -> str:
    """Return the cleaned, newline-separated text of ``node``.

    Note: noise elements are removed from ``node`` itself, so the argument is
    modified.
    """
    remove_noise(node)
    raw_text = node.get_text("\n", strip=True)
    return clean_text(raw_text)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_json_ld_product(soup: BeautifulSoup) -> dict[str, str | None]:
    """Extract name, description and image URL from the first JSON-LD Product.

    Scans every ``<script type="application/ld+json">`` block, skipping empty
    or malformed ones. A node counts as a product when its ``@type`` is
    ``"Product"`` or a list containing it; nodes nested in a top-level
    ``@graph`` list are considered too. Returns ``{}`` when none is found.
    """
    for script in soup.find_all("script", {"type": "application/ld+json"}):
        raw = script.string or script.get_text()
        if not raw.strip():
            continue
        try:
            data = json.loads(raw)
        except json.JSONDecodeError:
            continue

        # Flatten top-level items plus any nodes grouped under "@graph".
        candidates: list[dict] = []
        for item in data if isinstance(data, list) else [data]:
            if not isinstance(item, dict):
                continue
            candidates.append(item)
            graph = item.get("@graph")
            if isinstance(graph, list):
                candidates.extend(node for node in graph if isinstance(node, dict))

        for item in candidates:
            declared = item.get("@type")
            # JSON-LD allows "@type" to be a single string or a list of them.
            type_names = declared if isinstance(declared, list) else [declared]
            if "Product" not in type_names:
                continue

            # "image" may be a URL string, a list of images, or an image
            # object; reduce it to one URL string where possible.
            image = item.get("image")
            if isinstance(image, list):
                image = image[0] if image else None
            if isinstance(image, dict):
                image = image.get("url") or image.get("contentUrl")

            return {
                "name": item.get("name"),
                "description": item.get("description"),
                "image_url": image,
            }
    return {}
|
||||||
|
|
||||||
|
|
||||||
|
def parse_info_rows(soup: BeautifulSoup) -> dict[str, str]:
    """Collect label/value pairs from the detail page.

    First reads ``.pmi-info-row`` elements (label = first child, value = the
    remaining children; rows with fewer than two child tags fall back to
    splitting their text). Then reads two-cell table rows whose label is at
    most 80 characters; table rows never override a key already collected.
    Keys are normalized with ``normalize_key``.
    """
    collected: dict[str, str] = {}

    for row in soup.select(".pmi-info-row"):
        cells = [cell for cell in row.find_all(recursive=False) if isinstance(cell, Tag)]
        if len(cells) < 2:
            # No usable child structure: split the row's text nodes instead.
            pieces = [piece.strip() for piece in row.get_text("|", strip=True).split("|") if piece.strip()]
            if len(pieces) < 2:
                continue
            label = pieces[0]
            content = " ".join(pieces[1:])
        else:
            label = clean_text(cells[0].get_text(" ", strip=True))
            content = clean_text(" ".join(cell.get_text(" ", strip=True) for cell in cells[1:]))
        if label and content:
            collected[normalize_key(label)] = content

    for table in soup.find_all("table"):
        for table_row in table.find_all("tr"):
            texts = [clean_text(cell.get_text(" ", strip=True)) for cell in table_row.find_all(["th", "td"])]
            if len(texts) != 2:
                continue
            label, content = texts
            if len(label) <= 80 and content:
                collected.setdefault(normalize_key(label), content)

    return collected
|
||||||
|
|
||||||
|
|
||||||
|
def collect_until_next_section(header: Tag) -> str:
    """Gather the text of the siblings that follow a section header.

    Walks ``header``'s following siblings until the next
    ``<h4 class="section-product">``. Non-tag siblings are ignored; each tag
    is re-parsed into a copy before noise removal so the original document is
    left untouched. Text equal to the header's own text is skipped.
    """
    header_text = clean_text(header.get_text(" ", strip=True))
    collected: list[str] = []
    for node in header.next_siblings:
        if not isinstance(node, Tag):
            continue
        if node.name == "h4" and "section-product" in node.get("class", []):
            break
        copy = BeautifulSoup(str(node), "lxml")
        body = node_text(copy)
        if body and body != header_text:
            collected.append(body)
    return clean_text("\n".join(collected))
|
||||||
|
|
||||||
|
|
||||||
|
def parse_detail_sections(soup: BeautifulSoup) -> dict[str, str]:
    """Map each ``h4.section-product`` heading to the text that follows it.

    Known headings use the key from ``DETAIL_SECTION_ALIASES``; others use the
    normalized heading text. Sections without text are omitted.
    """
    result: dict[str, str] = {}
    for heading in soup.select("h4.section-product"):
        heading_text = clean_text(heading.get_text(" ", strip=True))
        section_key = DETAIL_SECTION_ALIASES.get(heading_text, normalize_key(heading_text))
        body = collect_until_next_section(heading)
        if not body:
            continue
        result[section_key] = body
    return result
|
||||||
|
|
||||||
|
|
||||||
|
def parse_classification(soup: BeautifulSoup) -> list[dict[str, str]]:
    """Read ``code``/``name`` pairs from the ``.classification-levels`` table.

    Uses the first two ``<td>`` cells of each row; rows with fewer than two
    cells are skipped. Returns an empty list when the container is absent.
    """
    container = soup.select_one(".classification-levels")
    if not container:
        return []
    rows = (
        [clean_text(cell.get_text(" ", strip=True)) for cell in row.find_all("td")]
        for row in container.find_all("tr")
    )
    return [{"code": cells[0], "name": cells[1]} for cells in rows if len(cells) >= 2]
|
||||||
|
|
||||||
|
|
||||||
|
def parse_detail_page(html: str, detail_url: str) -> dict:
    """Parse a product detail page into a structured dictionary.

    The product name comes from JSON-LD when present, otherwise from the
    first ``<h1>``. The id and slug are derived from ``detail_url``.
    """
    soup = soup_from_html(html)
    structured = parse_json_ld_product(soup)
    heading = soup.find("h1")
    info_fields = parse_info_rows(soup)
    page_sections = parse_detail_sections(soup)

    name = structured.get("name")
    if not name:
        name = clean_text(heading.get_text(" ", strip=True)) if heading else None

    short_description = clean_text(str(structured.get("description") or "")) or None

    return {
        "product_id": product_id_from_url(detail_url),
        "slug": slug_from_url(detail_url),
        "name": name,
        "short_description": short_description,
        "image_url": structured.get("image_url"),
        "detail_fields": info_fields,
        "sections": page_sections,
        "classification": parse_classification(soup),
        "active_substances": split_list_field(page_sections.get("active_substances") or ""),
        "indication_group": page_sections.get("indication_group"),
    }
|
||||||
|
|
||||||
|
|
||||||
|
def split_list_field(value: str) -> list[str]:
    """Split a text field on newlines, commas and semicolons.

    Items are cleaned, empty ones dropped, and duplicates removed while
    keeping first-seen order.
    """
    if not value:
        return []
    cleaned = (clean_text(piece) for piece in re.split(r"\n|,|;", value))
    # dict.fromkeys de-duplicates while preserving insertion order.
    return list(dict.fromkeys(item for item in cleaned if item))
|
||||||
|
|
||||||
|
|
||||||
|
def extract_article_text(html: str) -> str:
    """Return the cleaned main text of a page.

    Prefers the ``<article>`` element; otherwise falls back to
    ``<div id="product">``, then ``<body>``, then the whole document.
    """
    soup = soup_from_html(html)
    container = soup.find("article")
    if not container:
        container = soup.find("div", id="product") or soup.body or soup
    return node_text(container)
|
||||||
|
|
||||||
|
|
||||||
|
def split_by_numbered_pil_sections(text: str) -> dict[str, str]:
    """Split PIL text into its numbered sections.

    Each non-empty line is tested against ``PIL_SECTION_PATTERNS``; a match
    starts a section that runs up to the next match (or the end). If a key
    matches more than once, the last occurrence is the one kept.
    """
    lines = [stripped for stripped in (raw.strip() for raw in text.splitlines()) if stripped]

    starts: list[tuple[str, int]] = []
    for index, line in enumerate(lines):
        matched_key = next(
            (
                key
                for key, pattern in PIL_SECTION_PATTERNS.items()
                if re.match(pattern, line, flags=re.IGNORECASE)
            ),
            None,
        )
        if matched_key is not None:
            starts.append((matched_key, index))

    # Each section ends where the next one begins; the last runs to the end.
    ends = [index for _, index in starts[1:]] + [len(lines)]
    sections: dict[str, str] = {}
    for (key, begin), end in zip(starts, ends):
        sections[key] = clean_text("\n".join(lines[begin:end]))
    return sections
|
||||||
|
|
||||||
|
|
||||||
|
def split_pil_subsections(before_use_text: str) -> dict[str, str]:
    """Split the "before use" PIL section into named subsections.

    Each non-empty line is tested against ``PIL_SUBSECTION_ALIASES``; a match
    starts a segment that runs up to the next match (or the end). Segments
    that map to the same key are joined with a newline, in document order.
    Returns ``{}`` for empty input.
    """
    if not before_use_text:
        return {}

    lines = [line.strip() for line in before_use_text.splitlines() if line.strip()]
    starts: list[tuple[str, int]] = []
    for idx, line in enumerate(lines):
        for key, patterns in PIL_SUBSECTION_ALIASES.items():
            if any(re.match(pattern, line, flags=re.IGNORECASE) for pattern in patterns):
                starts.append((key, idx))
                break

    result: dict[str, str] = {}
    for pos, (key, idx) in enumerate(starts):
        end = starts[pos + 1][1] if pos + 1 < len(starts) else len(lines)
        segment = clean_text("\n".join(lines[idx:end]))
        # The patterns also match ordinary sentences (e.g. a later line that
        # starts with "Neužívajte ..."). Append instead of overwriting so such
        # a line cannot discard the subsection text collected earlier.
        result[key] = f"{result[key]}\n{segment}" if key in result else segment
    return result
|
||||||
|
|
||||||
|
|
||||||
|
def parse_pil_page(html: str) -> dict:
    """Parse a PIL page into full text, numbered sections and subsections.

    Subsections are extracted from the ``before_use`` section only.
    """
    full_text = extract_article_text(html)
    numbered = split_by_numbered_pil_sections(full_text)
    before_use = numbered.get("before_use", "")
    return {
        "full_text": full_text,
        "sections": numbered,
        "subsections": split_pil_subsections(before_use),
    }
|
||||||
|
|
||||||
|
|
||||||
|
def build_lightrag_text(detail: dict, pil: dict | None, urls: ProductUrls) -> str:
    """Compose the plain-text document describing one product.

    The text starts with the product name and URLs, followed by selected
    detail fields, the detail-page sections and, when ``pil`` is given,
    selected PIL subsections and sections. Empty parts are left out.
    """
    label = detail.get("name") or detail.get("slug") or urls.detail_url
    parts: list[str] = [
        f"Liek: {label}",
        f"ADC detail URL: {urls.detail_url}",
        f"ADC PIL URL: {urls.pil_url}",
    ]

    detail_fields = detail.get("detail_fields") or {}
    field_order = (
        "registracne_cislo_produktu",
        "kod_statnej_autority_sukl",
        "nazov_produktu_podla_sukl",
        "aplikacna_forma",
        "vyrobca",
        "drzitel_rozhodnutia",
        "dodavatelia",
        "vydaj",
        "typ_produktu",
        "legislativne_zatriedenie",
    )
    parts.extend(f"{name}: {detail_fields[name]}" for name in field_order if detail_fields.get(name))

    def add_titled(source: dict, layout: tuple[tuple[str, str], ...]) -> None:
        # Append "heading + body" for every key of `layout` with non-empty text.
        for key, heading in layout:
            body = source.get(key)
            if body:
                parts.append(f"\n{heading}\n{body}")

    add_titled(
        detail.get("sections") or {},
        (
            ("description_and_indications", "Popis a indikácie"),
            ("use_and_dosage", "Použitie a dávkovanie"),
            ("side_effects", "Nežiaduce účinky"),
            ("active_substances", "Účinné látky"),
            ("indication_group", "Indikačná skupina"),
            ("general_product_info", "Všeobecné informácie"),
        ),
    )

    if pil:
        add_titled(
            pil.get("subsections") or {},
            (
                ("contraindications", "Kontraindikácie z PIL"),
                ("warnings", "Upozornenia z PIL"),
                ("interactions", "Interakcie z PIL"),
                ("pregnancy_breastfeeding", "Tehotenstvo a dojčenie z PIL"),
                ("driving", "Vedenie vozidiel z PIL"),
            ),
        )
        add_titled(
            pil.get("sections") or {},
            (
                ("what_is_it", "Na čo sa používa z PIL"),
                ("how_to_use", "Ako užívať z PIL"),
                ("side_effects", "Vedľajšie účinky z PIL"),
            ),
        )

    return clean_text("\n\n".join(parts))
|
||||||
|
|
||||||
|
|
||||||
|
def build_graph_hints(detail: dict, pil: dict | None) -> dict:
    """Flatten the parsed product data into one dictionary of graph hints.

    Values come from the detail page; indications, dosage and side-effect
    texts fall back to the matching PIL section when the detail section is
    missing or empty. Contraindications, warnings and interactions come from
    the PIL subsections only.
    """
    info = detail.get("detail_fields") or {}
    page_sections = detail.get("sections") or {}
    leaflet = pil or {}
    leaflet_subsections = leaflet.get("subsections") or {}
    leaflet_sections = leaflet.get("sections") or {}

    def detail_or_pil(detail_key: str, pil_key: str):
        # Prefer the detail-page text; use the PIL text only as a fallback.
        return page_sections.get(detail_key) or leaflet_sections.get(pil_key)

    hints = {
        "drug": detail.get("name"),
        "active_substances": detail.get("active_substances") or [],
        "dosage_form": info.get("aplikacna_forma"),
        "manufacturer": info.get("vyrobca"),
        "marketing_authorization_holder": info.get("drzitel_rozhodnutia"),
        "supplier": info.get("dodavatelia"),
        "sukl_code": info.get("kod_statnej_autority_sukl"),
        "registration_number": info.get("registracne_cislo_produktu"),
        "classification_codes": detail.get("classification") or [],
        "indications_text": detail_or_pil("description_and_indications", "what_is_it"),
        "dosage_text": detail_or_pil("use_and_dosage", "how_to_use"),
    }
    for name in ("contraindications", "warnings", "interactions"):
        hints[f"{name}_text"] = leaflet_subsections.get(name)
    hints["side_effects_text"] = detail_or_pil("side_effects", "side_effects")
    return hints
|
||||||
|
|
||||||
|
|
||||||
|
def build_record(detail_html: str, pil_html: str | None, urls: ProductUrls) -> dict:
    """Assemble the output record for one product.

    ``pil_html`` may be ``None`` (or empty), in which case the record's
    ``pil`` entry is ``None``. The timestamp is UTC with second precision.
    """
    product = parse_detail_page(detail_html, urls.detail_url)
    leaflet = None if not pil_html else parse_pil_page(pil_html)
    timestamp = datetime.now(timezone.utc).isoformat(timespec="seconds")
    page_urls = {"detail": urls.detail_url, "pil": urls.pil_url, "spc": urls.spc_url}

    return {
        "source": "adc.sk",
        "scraped_at": timestamp,
        "urls": page_urls,
        "product": product,
        "pil": leaflet,
        "graph_hints": build_graph_hints(product, leaflet),
        "lightrag_text": build_lightrag_text(product, leaflet, urls),
    }
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_requests(session: requests.Session, url: str, timeout: int, retries: int) -> str:
    """GET ``url`` and return its decoded body, retrying on failure.

    At least one attempt is always made, even when ``retries`` is zero or
    negative. Between attempts the function sleeps ``1.5 * attempt`` seconds.

    Raises:
        RuntimeError: when every attempt failed; the last underlying error is
            attached as ``__cause__``.
    """
    last_error: Exception | None = None
    # Guarantee one attempt; otherwise the loop is skipped entirely and the
    # caller gets a misleading "Failed to fetch ...: None".
    attempts = max(1, retries)
    for attempt in range(1, attempts + 1):
        try:
            response = session.get(url, headers=HEADERS, timeout=timeout)
            response.raise_for_status()
            response.encoding = response.apparent_encoding or "utf-8"
            return response.text
        except Exception as exc:  # retry boundary: any failure counts as an attempt
            last_error = exc
            if attempt < attempts:
                time.sleep(1.5 * attempt)
    # Chain the cause so the original traceback is not lost.
    raise RuntimeError(f"Failed to fetch {url}: {last_error}") from last_error
|
||||||
|
|
||||||
|
|
||||||
|
def make_requests_fetcher(timeout: int, retries: int) -> Callable[[str], str]:
    """Return a ``fetch(url)`` callable backed by one shared requests session."""
    shared_session = requests.Session()

    def fetch(url: str) -> str:
        return fetch_requests(shared_session, url, timeout, retries)

    return fetch
|
||||||
|
|
||||||
|
|
||||||
|
def make_browser_fetcher() -> tuple[Callable[[str], str], Callable[[], None]]:
    """Start headless Chromium and return ``(fetch, close)`` callables.

    ``fetch(url)`` navigates the single shared page and returns its HTML,
    raising ``RuntimeError`` when there is no response or the status is 400+.
    ``close()`` shuts down the browser and Playwright; the caller must call it.

    Raises:
        SystemExit: if Playwright is not installed.
    """
    try:
        from playwright.sync_api import sync_playwright
    except ImportError as exc:
        raise SystemExit(
            "Playwright is not installed. Run: pip install playwright; python -m playwright install chromium"
        ) from exc

    playwright = sync_playwright().start()
    browser = None
    try:
        browser = playwright.chromium.launch(headless=True)
        page = browser.new_page(
            user_agent=HEADERS["User-Agent"],
            locale="sk-SK",
            viewport={"width": 1366, "height": 900},
        )
    except BaseException:
        # The caller never receives close() if setup fails, so release what
        # was already started here instead of leaking the driver process.
        if browser is not None:
            browser.close()
        playwright.stop()
        raise

    def fetch(url: str) -> str:
        response = page.goto(url, wait_until="domcontentloaded", timeout=60000)
        if response is None or response.status >= 400:
            status = response.status if response else "no-response"
            raise RuntimeError(f"HTTP {status} for {url}")
        return page.content()

    def close() -> None:
        browser.close()
        playwright.stop()

    return fetch, close
|
||||||
|
|
||||||
|
|
||||||
|
def iter_links(links: Iterable[str], limit: int | None) -> Iterable[str]:
    """Yield links in order, stopping after ``limit`` items.

    ``limit=None`` yields everything; a limit of zero or less yields nothing.
    """
    unlimited = limit is None
    for position, link in enumerate(links):
        if not unlimited and position >= limit:
            break
        yield link
|
||||||
|
|
||||||
|
|
||||||
|
def write_records_json(
    out_path: Path,
    links: list[str],
    fetch: Callable[[str], str],
    limit: int | None,
    delay: float,
    skip_failed: bool,
) -> list[dict[str, str]]:
    """Scrape each product and stream the records into a JSON array file.

    Args:
        out_path: Output file; parent directories are created.
        links: Product detail URLs.
        fetch: Callable returning the HTML of a URL, raising on failure.
        limit: Process only the first ``limit`` links (``None`` = all).
        delay: Seconds to sleep after each page load.
        skip_failed: When true, failures are recorded and scraping continues;
            when false, the first failure is re-raised.

    Returns:
        A list of ``{"url", "error"}`` entries, one per recorded failure.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    failures: list[dict[str, str]] = []
    selected_links = list(iter_links(links, limit))

    with out_path.open("w", encoding="utf-8") as out:
        out.write("[\n")
        wrote_any = False
        try:
            for detail_url in tqdm(selected_links, desc="ADC products"):
                urls = product_urls(detail_url)
                try:
                    detail_html = fetch(urls.detail_url)
                    time.sleep(delay)
                    try:
                        pil_html = fetch(urls.pil_url)
                    except Exception as pil_exc:
                        if not skip_failed:
                            raise
                        # build_record accepts a missing PIL, so keep the
                        # product (with pil=None) instead of dropping it.
                        pil_html = None
                        failures.append(
                            {"url": detail_url, "error": f"PIL unavailable ({urls.pil_url}): {pil_exc}"}
                        )
                        tqdm.write(f"Missing PIL for {detail_url}: {pil_exc}")
                    record = build_record(detail_html, pil_html, urls)
                except Exception as exc:
                    failures.append({"url": detail_url, "error": str(exc)})
                    tqdm.write(f"Failed product {detail_url}: {exc}")
                    if not skip_failed:
                        raise
                    continue

                if wrote_any:
                    out.write(",\n")
                json.dump(record, out, ensure_ascii=False, indent=2)
                wrote_any = True
                out.flush()
                time.sleep(delay)
        finally:
            # Always terminate the array so an aborted run (stop-on-fail or
            # Ctrl-C) still leaves a parseable JSON file with the records
            # written so far.
            out.write("\n]\n")

    return failures
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Command-line entry point: scrape products and report failures.

    Uses Playwright when ``--browser`` is given, otherwise ``requests``. Any
    failures returned by the scrape are saved next to the output file with a
    ``.failed.json`` suffix.
    """
    cli = argparse.ArgumentParser(description="Scrape ADC product detail + PIL data into structured JSON.")
    cli.add_argument("--links", type=Path, default=DEFAULT_LINKS, help="Input JSON list with detail URLs.")
    cli.add_argument("--out", type=Path, default=DEFAULT_OUT, help="Output structured JSON file.")
    cli.add_argument("--limit", type=int, default=None, help="Scrape only the first N products.")
    cli.add_argument("--delay", type=float, default=0.25, help="Delay between page loads in seconds.")
    cli.add_argument("--timeout", type=int, default=30, help="HTTP timeout in seconds for requests mode.")
    cli.add_argument("--retries", type=int, default=3, help="Retries per URL in requests mode.")
    cli.add_argument("--browser", action="store_true", help="Use Playwright Chromium. Use this if ADC returns 403.")
    cli.add_argument("--stop-on-fail", action="store_true", help="Stop on first failed product.")
    options = cli.parse_args()

    product_links = load_links(options.links)

    shutdown: Callable[[], None] | None = None
    if not options.browser:
        fetcher = make_requests_fetcher(options.timeout, options.retries)
    else:
        fetcher, shutdown = make_browser_fetcher()

    try:
        errors = write_records_json(
            out_path=options.out,
            links=product_links,
            fetch=fetcher,
            limit=options.limit,
            delay=options.delay,
            skip_failed=not options.stop_on_fail,
        )
    finally:
        # Release the browser even when scraping raised.
        if shutdown:
            shutdown()

    print(f"Saved structured product data to {options.out}")
    if not errors:
        return
    failed_path = options.out.with_suffix(".failed.json")
    failed_path.write_text(json.dumps(errors, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"Failed products: {len(errors)}. Saved errors to {failed_path}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
182
scripts/adc_scraper/scrape_adc_product_links.py
Normal file
182
scripts/adc_scraper/scrape_adc_product_links.py
Normal file
@ -0,0 +1,182 @@
|
|||||||
|
"""Scrape product detail links from ADC product listing pages.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
python scripts/adc_scraper/scrape_adc_product_links.py --out data_adc_databaza/adc_scrape_2026_05_04/adc_product_links.json
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
from urllib.parse import urljoin
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
|
||||||
|
# Site root; relative product links are resolved against it.
BASE_URL = "https://www.adc.sk"
# Listing page URL template; "{page}" is filled with the page number.
LISTING_URL = "https://www.adc.sk/databazy/produkty?page={page}&ord=a1"
# Default value of the --pages option.
DEFAULT_PAGES = 711
# Browser-like headers sent in requests mode; the User-Agent value is also
# passed to the Playwright page in browser mode.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/124.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "sk-SK,sk;q=0.9,cs;q=0.8,en;q=0.7",
    "Cache-Control": "no-cache",
    "Pragma": "no-cache",
    "Referer": "https://www.adc.sk/databazy/produkty",
}
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_page(session: requests.Session, page: int, timeout: int, retries: int) -> str:
    """Download one listing page and return its decoded HTML, with retries.

    At least one attempt is always made, even when ``retries`` is zero or
    negative. Between attempts the function sleeps ``1.5 * attempt`` seconds.

    Raises:
        RuntimeError: when every attempt failed; the last underlying error is
            attached as ``__cause__``.
    """
    url = LISTING_URL.format(page=page)
    last_error: Exception | None = None

    # Guarantee one attempt; otherwise the loop is skipped entirely and the
    # caller gets a misleading "Failed to fetch page N: None".
    attempts = max(1, retries)
    for attempt in range(1, attempts + 1):
        try:
            response = session.get(url, headers=HEADERS, timeout=timeout)
            response.raise_for_status()
            response.encoding = response.apparent_encoding or "utf-8"
            return response.text
        except Exception as exc:  # retry boundary: any failure counts as an attempt
            last_error = exc
            if attempt < attempts:
                time.sleep(1.5 * attempt)

    # Chain the cause so the original traceback is not lost.
    raise RuntimeError(f"Failed to fetch page {page}: {last_error}") from last_error
|
||||||
|
|
||||||
|
|
||||||
|
def extract_product_links(html: str) -> list[str]:
    """Return absolute detail URLs of the products on a listing page.

    Only ``<a class="product">`` anchors whose ``href`` starts with
    ``/databazy/produkty/detail/`` are considered; order and duplicates are
    preserved.
    """
    anchors = BeautifulSoup(html, "lxml").select('a.product[href^="/databazy/produkty/detail/"]')
    hrefs = (anchor.get("href") for anchor in anchors)
    return [urljoin(BASE_URL, href) for href in hrefs if href]
|
||||||
|
|
||||||
|
|
||||||
|
def scrape_with_requests(
    start_page: int,
    pages: int,
    delay: float,
    timeout: int,
    retries: int,
) -> tuple[list[str], list[int]]:
    """Collect unique product links from listing pages using ``requests``.

    Visits ``pages`` consecutive listing pages starting at ``start_page``,
    sleeping ``delay`` seconds after each successful page. Pages that fail to
    download or parse are reported and skipped.

    Returns:
        ``(links, failed_pages)``: the links in first-seen order and the
        numbers of the pages that failed.
    """
    seen: set[str] = set()
    all_links: list[str] = []
    failed_pages: list[int] = []

    end_page = start_page + pages - 1
    # Use the session as a context manager so its pooled connections are
    # closed when scraping ends, including on an unexpected exception.
    with requests.Session() as session:
        for page in tqdm(range(start_page, end_page + 1), desc="ADC pages"):
            try:
                html = fetch_page(session, page, timeout, retries)
                page_links = extract_product_links(html)
            except Exception as exc:
                tqdm.write(str(exc))
                failed_pages.append(page)
                continue

            for link in page_links:
                if link not in seen:
                    seen.add(link)
                    all_links.append(link)

            time.sleep(delay)

    return all_links, failed_pages
|
||||||
|
|
||||||
|
|
||||||
|
def scrape_with_browser(start_page: int, pages: int, delay: float) -> tuple[list[str], list[int]]:
    """Collect unique product links from listing pages using headless Chromium.

    Visits ``pages`` consecutive listing pages starting at ``start_page``,
    sleeping ``delay`` seconds after each successful page. Pages with no
    response, a status of 400+ or any other error are reported and skipped.

    Returns:
        ``(links, failed_pages)``: the links in first-seen order and the
        numbers of the pages that failed.

    Raises:
        SystemExit: if Playwright is not installed.
    """
    try:
        from playwright.sync_api import sync_playwright
    except ImportError as exc:
        raise SystemExit(
            "Playwright is not installed. Run: pip install playwright; python -m playwright install chromium"
        ) from exc

    # A dict keeps insertion order, giving ordered de-duplication in one step.
    unique_links: dict[str, None] = {}
    failed_pages: list[int] = []
    page_numbers = range(start_page, start_page + pages)

    with sync_playwright() as playwright:
        browser = playwright.chromium.launch(headless=True)
        tab = browser.new_page(
            user_agent=HEADERS["User-Agent"],
            locale="sk-SK",
            viewport={"width": 1366, "height": 900},
        )

        for page in tqdm(page_numbers, desc="ADC pages"):
            url = LISTING_URL.format(page=page)
            try:
                response = tab.goto(url, wait_until="domcontentloaded", timeout=60000)
                if response is None or response.status >= 400:
                    status = response.status if response else "no-response"
                    raise RuntimeError(f"HTTP {status}")
                page_links = extract_product_links(tab.content())
            except Exception as exc:
                tqdm.write(f"Failed page {page}: {exc}")
                failed_pages.append(page)
                continue

            for link in page_links:
                unique_links.setdefault(link)

            time.sleep(delay)

        browser.close()

    return list(unique_links), failed_pages
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_cli() -> argparse.Namespace:
    """Declare the scraper's command-line options and parse ``sys.argv``."""
    cli = argparse.ArgumentParser(description="Scrape ADC product detail links.")
    cli.add_argument("--out", type=Path, required=True, help="Output JSON file.")
    cli.add_argument("--pages", type=int, default=DEFAULT_PAGES, help="Number of ADC listing pages.")
    cli.add_argument("--start-page", type=int, default=1, help="First page number.")
    cli.add_argument("--delay", type=float, default=0.25, help="Delay between requests in seconds.")
    cli.add_argument("--timeout", type=int, default=30, help="HTTP timeout in seconds.")
    cli.add_argument("--retries", type=int, default=3, help="Retries per page.")
    cli.add_argument(
        "--browser",
        action="store_true",
        help="Use Playwright Chromium instead of requests. Useful when ADC returns HTTP 403.",
    )
    return cli.parse_args()


def main() -> None:
    """Scrape product links and write them to the ``--out`` file as a JSON list.

    The ``--browser`` flag selects the Playwright scraper; otherwise the
    requests-based scraper is used. Parent directories of the output file are
    created when missing. A one-line summary is printed, followed by the list
    of failed page numbers if there are any.
    """
    opts = _parse_cli()

    if not opts.browser:
        links, failures = scrape_with_requests(
            opts.start_page,
            opts.pages,
            opts.delay,
            opts.timeout,
            opts.retries,
        )
    else:
        links, failures = scrape_with_browser(opts.start_page, opts.pages, opts.delay)

    destination = opts.out
    destination.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(links, ensure_ascii=False, indent=2)
    destination.write_text(payload, encoding="utf-8")

    print(f"Saved {len(links)} unique product links to {destination}")
    if failures:
        print(f"Failed pages: {failures}")


if __name__ == "__main__":
    main()
|
||||||
47
scripts/adc_scraper/validate_adc_json.py
Normal file
47
scripts/adc_scraper/validate_adc_json.py
Normal file
@ -0,0 +1,47 @@
|
|||||||
|
"""Validate basic quality of structured ADC JSON."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
from collections import Counter
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
|
||||||
|
parser = argparse.ArgumentParser(description="Validate structured ADC JSON.")
|
||||||
|
parser.add_argument("--input", type=Path, required=True)
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
data = json.loads(args.input.read_text(encoding="utf-8"))
|
||||||
|
if not isinstance(data, list):
|
||||||
|
raise SystemExit("Input must be a JSON list.")
|
||||||
|
|
||||||
|
missing = Counter()
|
||||||
|
section_counter = Counter()
|
||||||
|
total_pil_chars = 0
|
||||||
|
total_spc_chars = 0
|
||||||
|
|
||||||
|
for record in data:
|
||||||
|
for key in ("source_url", "name", "pil_text", "spc_text", "sections"):
|
||||||
|
if not record.get(key):
|
||||||
|
missing[key] += 1
|
||||||
|
total_pil_chars += len(record.get("pil_text") or "")
|
||||||
|
total_spc_chars += len(record.get("spc_text") or "")
|
||||||
|
for section_name, section_text in (record.get("sections") or {}).items():
|
||||||
|
if section_text:
|
||||||
|
section_counter[section_name] += 1
|
||||||
|
|
||||||
|
print(f"Records: {len(data)}")
|
||||||
|
print(f"Average PIL chars: {total_pil_chars // max(len(data), 1)}")
|
||||||
|
print(f"Average SPC chars: {total_spc_chars // max(len(data), 1)}")
|
||||||
|
print("Missing fields:")
|
||||||
|
for key in ("source_url", "name", "pil_text", "spc_text", "sections"):
|
||||||
|
print(f" {key}: {missing[key]}")
|
||||||
|
print("Detected sections:")
|
||||||
|
for key, value in section_counter.most_common():
|
||||||
|
print(f" {key}: {value}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
94
start_servers.py
Normal file
94
start_servers.py
Normal file
@ -0,0 +1,94 @@
|
|||||||
|
"""
|
||||||
|
Запуск всех серверов для работы с графом знаний ADC.
|
||||||
|
|
||||||
|
Запускает:
|
||||||
|
1. Embedding server — localhost:8010 (локальная модель, словацкий язык)
|
||||||
|
2. LightRAG server — localhost:9621 (граф + API + WebUI)
|
||||||
|
|
||||||
|
Использование:
|
||||||
|
python start_servers.py
|
||||||
|
|
||||||
|
Остановка: Ctrl+C
|
||||||
|
"""
|
||||||
|
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
import urllib.request
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
ROOT = Path(__file__).parent
|
||||||
|
LIGHTRAG_DIR = ROOT / "lightrag"
|
||||||
|
EMBEDDING_SCRIPT = ROOT / "embedding_server.py"
|
||||||
|
|
||||||
|
|
||||||
|
def wait_for(url, name, timeout=60):
    """Poll *url* until it answers, printing progress to stdout.

    Args:
        url: Address probed with ``urllib.request.urlopen``.
        name: Service name shown in the progress message.
        timeout: Maximum number of probe attempts. Each attempt may block for
            up to 2 seconds and a failed attempt is followed by a 1-second
            pause, so this is an attempt budget rather than exact seconds.

    Returns:
        True as soon as one probe succeeds, False when every attempt failed.
    """
    print(f" Ожидаю {name}...", end="", flush=True)
    for _ in range(timeout):
        try:
            # The context manager closes the response, so repeated probes do
            # not leak sockets.
            with urllib.request.urlopen(url, timeout=2):
                pass
        except Exception:
            # Any probe failure just means "not up yet". Catching Exception
            # rather than everything lets Ctrl+C and SystemExit propagate.
            print(".", end="", flush=True)
            time.sleep(1)
        else:
            print(" OK")
            return True
    print(" ТАЙМАУТ")
    return False
|
||||||
|
|
||||||
|
|
||||||
|
def _stop_all(procs, grace=10):
    """Terminate every still-running process in *procs* and wait for it to exit.

    A process that ignores the terminate request for *grace* seconds is killed.
    """
    for proc in procs:
        if proc.poll() is None:
            proc.terminate()
    for proc in procs:
        try:
            proc.wait(timeout=grace)
        except subprocess.TimeoutExpired:
            proc.kill()
            proc.wait()


def main():
    """Start the embedding and LightRAG servers and block until they exit.

    The embedding server is started first and must answer on its health URL
    before the LightRAG server is launched. If either server fails its health
    check, the script exits with status 1. Ctrl+C stops both servers.

    Every child that was started is terminated and reaped on all exit paths,
    including a failed launch of the second server or an interrupt during
    startup, so no server process is left behind.

    Raises:
        SystemExit: With status 1 if a server does not become healthy in time.
    """
    print("=" * 50)
    print("Запуск серверов LightRAG ADC")
    print("=" * 50)

    env = os.environ.copy()
    env["PYTHONUTF8"] = "1"  # run the child interpreters in UTF-8 mode

    procs = []
    interrupted = False
    try:
        # 1. Embedding server
        print("\n[1/2] Запуск Embedding server (порт 8010)...")
        embed_proc = subprocess.Popen(
            [sys.executable, str(EMBEDDING_SCRIPT)],
            env=env,
            cwd=str(ROOT),
        )
        procs.append(embed_proc)

        if not wait_for("http://localhost:8010/health", "embedding server"):
            print("ОШИБКА: embedding server не запустился")
            sys.exit(1)

        # 2. LightRAG server
        print("\n[2/2] Запуск LightRAG server (порт 9621)...")
        lightrag_proc = subprocess.Popen(
            [sys.executable, "-m", "lightrag.api.lightrag_server"],
            env=env,
            cwd=str(LIGHTRAG_DIR),
        )
        procs.append(lightrag_proc)

        if not wait_for("http://localhost:9621/health", "LightRAG server", timeout=30):
            print("ОШИБКА: LightRAG server не запустился")
            sys.exit(1)

        print("\n" + "=" * 50)
        print("Все серверы запущены!")
        print(" Embedding: http://localhost:8010/health")
        print(" LightRAG: http://localhost:9621/health")
        print(" WebUI: http://localhost:9621/webui (если собран)")
        print("=" * 50)
        print("\nCtrl+C для остановки\n")

        embed_proc.wait()
        lightrag_proc.wait()
    except KeyboardInterrupt:
        interrupted = True
        print("\nОстанавливаю серверы...")
    finally:
        # Runs for normal exit, sys.exit(1), Ctrl+C and unexpected errors alike.
        _stop_all(procs)

    if interrupted:
        print("Готово.")


if __name__ == "__main__":
    main()
|
||||||
Loading…
Reference in New Issue
Block a user