Add LightRAG ingestion for structured ADC data

This commit is contained in:
Oleh Poiasnik 2026-05-14 12:41:59 +02:00
parent 141437e968
commit 1eb36310d7
3 changed files with 334 additions and 8 deletions

View File

@ -90,17 +90,94 @@ http://localhost:8010/health - embedding server
http://localhost:9621/health - LightRAG server
```
## Build Explicit Knowledge Graph
This step does not require LightRAG servers. It builds a deterministic graph
directly from the structured ADC JSON.
Test on the small sample:
```powershell
python scripts/kg/build_adc_knowledge_graph.py `
--input data_adc_databaza/adc_scrape_2026_05_04/adc_products_structured_10.json `
--out-dir outputs/knowledge_graph_sample
```
Build the full graph:
```powershell
python scripts/kg/build_adc_knowledge_graph.py `
--input data_adc_databaza/adc_scrape_2026_05_04/adc_products_structured.json `
--out-dir outputs/knowledge_graph_full
```
Generated files:
```text
outputs/knowledge_graph_full/adc_knowledge_graph.graphml
outputs/knowledge_graph_full/adc_knowledge_triples.jsonl
outputs/knowledge_graph_full/adc_graph_stats.json
```
## Ingest New ADC Data Into LightRAG
First start the servers:
```powershell
python start_servers.py
```
Keep that terminal open. In a second terminal, run a dry-run:
```powershell
python scripts/lightrag_ingest/ingest_adc_structured.py `
--input data_adc_databaza/adc_scrape_2026_05_04/adc_products_structured_10.json `
--dry-run `
--limit 5
```
Upload a small clinical batch:
```powershell
python scripts/lightrag_ingest/ingest_adc_structured.py `
--input data_adc_databaza/adc_scrape_2026_05_04/adc_products_structured.json `
--limit 50 `
--resume
```
Check progress and LightRAG pipeline status:
```powershell
python scripts/lightrag_ingest/ingest_adc_structured.py --status
```
Continue with a larger batch:
```powershell
python scripts/lightrag_ingest/ingest_adc_structured.py `
--input data_adc_databaza/adc_scrape_2026_05_04/adc_products_structured.json `
--limit 200 `
--resume
```
By default the script uploads only clinically useful records that contain at
least one of: contraindications, interactions, warnings, or side effects. To
upload every record, add:
```powershell
--all-records
```
## Old Ingestion Pipeline
The folder `checkpoint_02_ingest/` contains an older ingestion pipeline that
loads data from:
Older local experiments used `checkpoint_02_ingest/` to load data from:
```powershell
data_adc_databaza/cleaned_general_info_additional.json
```
It is kept as a reference because it already contains working LightRAG upload
logic and progress tracking:
If that folder is present in your local workspace, treat it only as a reference
for older LightRAG upload logic:
```powershell
python checkpoint_02_ingest/load_leaflets.py --count 50
@ -108,13 +185,17 @@ python checkpoint_02_ingest/load_leaflets.py --status
```
Do not treat this as the final ingestion path for the new dataset. The next
step is to create a new ingestion script that reads:
current ingestion script for the new dataset reads:
```powershell
data_adc_databaza/adc_scrape_2026_05_04/adc_products_structured.json
```
and sends each record's `lightrag_text` to LightRAG.
and sends each record's `lightrag_text` to LightRAG:
```powershell
python scripts/lightrag_ingest/ingest_adc_structured.py --limit 50 --resume
```
## Query LightRAG
@ -140,13 +221,13 @@ Available query modes:
Avoid querying while the document pipeline is still busy. Entity extraction can
take several minutes per batch depending on the LLM API and concurrency limits.
## Reset LightRAG Storage
## Reset LightRAG Storage For a Clean Rebuild
Stop the servers first, then clear generated graph/vector data:
```powershell
Remove-Item -LiteralPath "c:\Users\Oleh\Desktop\Diplomova praca\lightrag\rag_storage\*" -Force
python checkpoint_02_ingest/load_leaflets.py --reset
Remove-Item -LiteralPath "c:\Users\Oleh\Desktop\Diplomova praca\outputs\lightrag_ingest\adc_structured_progress.json" -Force
```
Use this only when you intentionally want to rebuild the graph.

View File

@ -0,0 +1 @@
"""LightRAG ingestion scripts for ADC structured data."""

View File

@ -0,0 +1,244 @@
"""Ingest the new structured ADC scrape into a running LightRAG server.
Input records are produced by scripts/adc_scraper/scrape_adc_product_data.py.
The script sends each record's `lightrag_text` to `/documents/text` and uses
the ADC detail URL as `file_source`, so repeated runs can detect duplicates.
"""
from __future__ import annotations
import argparse
import json
import sys
import time
import urllib.error
import urllib.request
from collections import Counter
from pathlib import Path
from typing import Any
DEFAULT_INPUT = Path("data_adc_databaza/adc_scrape_2026_05_04/adc_products_structured.json")
DEFAULT_PROGRESS = Path("outputs/lightrag_ingest/adc_structured_progress.json")
DEFAULT_LIGHTRAG_URL = "http://localhost:9621"
if hasattr(sys.stdout, "reconfigure"):
sys.stdout.reconfigure(encoding="utf-8")
def load_json(path: Path) -> Any:
return json.loads(path.read_text(encoding="utf-8"))
def write_json(path: Path, data: Any) -> None:
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
def load_progress(path: Path, resume: bool) -> dict[str, Any]:
if resume and path.exists():
return load_json(path)
return {"done": [], "failed": [], "skipped": Counter(), "last_track_ids": []}
def save_progress(path: Path, progress: dict[str, Any]) -> None:
payload = {
"done": sorted(set(progress.get("done", []))),
"failed": sorted(set(progress.get("failed", []))),
"skipped": dict(progress.get("skipped", {})),
"last_track_ids": progress.get("last_track_ids", [])[-50:],
}
write_json(path, payload)
def healthcheck(base_url: str) -> dict[str, Any]:
with urllib.request.urlopen(f"{base_url}/health", timeout=10) as response:
return json.loads(response.read().decode("utf-8"))
def pipeline_status(base_url: str) -> dict[str, Any]:
with urllib.request.urlopen(f"{base_url}/documents/pipeline_status", timeout=10) as response:
return json.loads(response.read().decode("utf-8"))
def post_text(base_url: str, text: str, file_source: str, timeout: int) -> dict[str, Any]:
payload = json.dumps({"text": text, "file_source": file_source}, ensure_ascii=False).encode("utf-8")
request = urllib.request.Request(
f"{base_url}/documents/text",
data=payload,
headers={"Content-Type": "application/json"},
method="POST",
)
with urllib.request.urlopen(request, timeout=timeout) as response:
return json.loads(response.read().decode("utf-8"))
def product_name(record: dict[str, Any]) -> str:
product = record.get("product") or {}
graph_hints = record.get("graph_hints") or {}
return str(graph_hints.get("drug") or product.get("name") or product.get("slug") or "unknown")
def source_id(record: dict[str, Any]) -> str:
urls = record.get("urls") or {}
product = record.get("product") or {}
return str(urls.get("detail") or product.get("product_id") or product_name(record))
def is_clinical(record: dict[str, Any]) -> bool:
hints = record.get("graph_hints") or {}
return any(
hints.get(key)
for key in (
"contraindications_text",
"interactions_text",
"warnings_text",
"side_effects_text",
)
)
def iter_selected_records(
records: list[dict[str, Any]],
progress: dict[str, Any],
clinical_only: bool,
) -> tuple[list[dict[str, Any]], Counter]:
done = set(progress.get("done", []))
failed = set(progress.get("failed", []))
skipped: Counter = Counter()
selected: list[dict[str, Any]] = []
for record in records:
source = source_id(record)
if source in done or source in failed:
skipped["already_processed"] += 1
continue
if clinical_only and not is_clinical(record):
skipped["not_clinical"] += 1
continue
if not str(record.get("lightrag_text") or "").strip():
skipped["missing_lightrag_text"] += 1
continue
selected.append(record)
return selected, skipped
def show_status(base_url: str, progress_path: Path) -> None:
if progress_path.exists():
progress = load_json(progress_path)
print(f"Progress file: {progress_path}")
print(f"Done: {len(progress.get('done', []))}")
print(f"Failed: {len(progress.get('failed', []))}")
print(f"Skipped: {progress.get('skipped', {})}")
else:
print(f"No progress file found: {progress_path}")
try:
status = pipeline_status(base_url)
except Exception as exc:
print(f"LightRAG pipeline status unavailable: {exc}")
return
print(f"Pipeline busy: {status.get('busy')}")
print(f"Latest message: {status.get('latest_message', '')}")
def ingest(args: argparse.Namespace) -> None:
records = load_json(args.input)
if not isinstance(records, list):
raise SystemExit(f"Input must be a JSON list: {args.input}")
progress = load_progress(args.progress, resume=args.resume)
selected, skipped = iter_selected_records(records, progress, clinical_only=not args.all_records)
progress["skipped"] = Counter(progress.get("skipped", {})) + skipped
if args.limit is not None:
selected = selected[: args.limit]
print(f"Input records: {len(records)}")
print(f"Selected for upload: {len(selected)}")
print(f"Skipped now: {dict(skipped)}")
print(f"Progress: done={len(progress.get('done', []))}, failed={len(progress.get('failed', []))}")
if args.dry_run:
for record in selected[:10]:
print(f"- {product_name(record)[:90]} | {source_id(record)}")
save_progress(args.progress, progress)
return
health = healthcheck(args.url)
configuration = health.get("configuration") or {}
print(f"LightRAG OK: {args.url}")
if configuration:
print(f"LLM: {configuration.get('llm_model')} | Embedding: {configuration.get('embedding_model')}")
done = set(progress.get("done", []))
failed = set(progress.get("failed", []))
last_track_ids = list(progress.get("last_track_ids", []))
for index, record in enumerate(selected, start=1):
source = source_id(record)
text = str(record.get("lightrag_text") or "").strip()
name = product_name(record)
try:
result = post_text(args.url, text=text, file_source=source, timeout=args.timeout)
status = result.get("status", "unknown")
track_id = result.get("track_id")
if track_id:
last_track_ids.append(track_id)
if status in {"success", "duplicated"}:
done.add(source)
mark = "OK"
else:
failed.add(source)
mark = "!!"
print(f"[{index:5d}/{len(selected):5d}] {mark} {status:<10} {name[:80]}")
except urllib.error.HTTPError as exc:
failed.add(source)
detail = exc.read().decode("utf-8", errors="replace")[:300]
print(f"[{index:5d}/{len(selected):5d}] HTTP {exc.code} {name[:80]} | {detail}")
except Exception as exc:
failed.add(source)
print(f"[{index:5d}/{len(selected):5d}] ERROR {name[:80]} | {exc}")
progress["done"] = sorted(done)
progress["failed"] = sorted(failed)
progress["last_track_ids"] = last_track_ids[-50:]
if index % args.save_every == 0:
save_progress(args.progress, progress)
print(f"Saved progress: done={len(done)}, failed={len(failed)}")
time.sleep(args.delay)
save_progress(args.progress, progress)
print(f"Finished upload loop. Done={len(done)}, failed={len(failed)}")
print("LightRAG will continue extracting graph entities and relations in the background.")
def main() -> None:
parser = argparse.ArgumentParser(description="Ingest structured ADC data into LightRAG.")
parser.add_argument("--input", type=Path, default=DEFAULT_INPUT)
parser.add_argument("--progress", type=Path, default=DEFAULT_PROGRESS)
parser.add_argument("--url", default=DEFAULT_LIGHTRAG_URL)
parser.add_argument("--limit", type=int, default=50)
parser.add_argument("--delay", type=float, default=0.3)
parser.add_argument("--timeout", type=int, default=60)
parser.add_argument("--save-every", type=int, default=10)
parser.add_argument("--resume", action="store_true", help="Resume from the progress file.")
parser.add_argument("--all-records", action="store_true", help="Upload all records, not only clinical ones.")
parser.add_argument("--dry-run", action="store_true", help="Show selected records without uploading.")
parser.add_argument("--status", action="store_true", help="Show local progress and LightRAG pipeline status.")
args = parser.parse_args()
if args.status:
show_status(args.url, args.progress)
return
ingest(args)
if __name__ == "__main__":
main()