Add LightRAG ingestion for structured ADC data

2026-05-14 12:41:59 +02:00 · 2026-05-14 12:41:59 +02:00 · 1eb36310d7
commit 1eb36310d7
parent 141437e968
3 changed files with 334 additions and 8 deletions
--- a/RUN_INSTRUCTION.md
+++ b/RUN_INSTRUCTION.md
@ -90,17 +90,94 @@ http://localhost:8010/health   - embedding server
 http://localhost:9621/health   - LightRAG server
 ```

+## Build Explicit Knowledge Graph
+
+This step does not require LightRAG servers. It builds a deterministic graph
+directly from the structured ADC JSON.
+
+Test on the small sample:
+
+```powershell
+python scripts/kg/build_adc_knowledge_graph.py `
+  --input data_adc_databaza/adc_scrape_2026_05_04/adc_products_structured_10.json `
+  --out-dir outputs/knowledge_graph_sample
+```
+
+Build the full graph:
+
+```powershell
+python scripts/kg/build_adc_knowledge_graph.py `
+  --input data_adc_databaza/adc_scrape_2026_05_04/adc_products_structured.json `
+  --out-dir outputs/knowledge_graph_full
+```
+
+Generated files:
+
+```text
+outputs/knowledge_graph_full/adc_knowledge_graph.graphml
+outputs/knowledge_graph_full/adc_knowledge_triples.jsonl
+outputs/knowledge_graph_full/adc_graph_stats.json
+```
+
+## Ingest New ADC Data Into LightRAG
+
+First start the servers:
+
+```powershell
+python start_servers.py
+```
+
+Keep that terminal open. In a second terminal, run a dry-run:
+
+```powershell
+python scripts/lightrag_ingest/ingest_adc_structured.py `
+  --input data_adc_databaza/adc_scrape_2026_05_04/adc_products_structured_10.json `
+  --dry-run `
+  --limit 5
+```
+
+Upload a small clinical batch:
+
+```powershell
+python scripts/lightrag_ingest/ingest_adc_structured.py `
+  --input data_adc_databaza/adc_scrape_2026_05_04/adc_products_structured.json `
+  --limit 50 `
+  --resume
+```
+
+Check progress and LightRAG pipeline status:
+
+```powershell
+python scripts/lightrag_ingest/ingest_adc_structured.py --status
+```
+
+Continue with a larger batch:
+
+```powershell
+python scripts/lightrag_ingest/ingest_adc_structured.py `
+  --input data_adc_databaza/adc_scrape_2026_05_04/adc_products_structured.json `
+  --limit 200 `
+  --resume
+```
+
+By default the script uploads only clinically useful records that contain at
+least one of: contraindications, interactions, warnings, or side effects. To
+upload every record, add:
+
+```powershell
+--all-records
+```
+
 ## Old Ingestion Pipeline

-The folder `checkpoint_02_ingest/` contains an older ingestion pipeline that
-loads data from:
+Older local experiments used `checkpoint_02_ingest/` to load data from:

 ```powershell
 data_adc_databaza/cleaned_general_info_additional.json
 ```

-It is kept as a reference because it already contains working LightRAG upload
-logic and progress tracking:
+If that folder is present in your local workspace, treat it only as a reference
+for older LightRAG upload logic:

 ```powershell
 python checkpoint_02_ingest/load_leaflets.py --count 50
@ -108,13 +185,17 @@ python checkpoint_02_ingest/load_leaflets.py --status
 ```

 Do not treat this as the final ingestion path for the new dataset. The next
-step is to create a new ingestion script that reads:
+current ingestion script for the new dataset reads:

 ```powershell
 data_adc_databaza/adc_scrape_2026_05_04/adc_products_structured.json
 ```

-and sends each record's `lightrag_text` to LightRAG.
+and sends each record's `lightrag_text` to LightRAG:
+
+```powershell
+python scripts/lightrag_ingest/ingest_adc_structured.py --limit 50 --resume
+```

 ## Query LightRAG

@ -140,13 +221,13 @@ Available query modes:
 Avoid querying while the document pipeline is still busy. Entity extraction can
 take several minutes per batch depending on the LLM API and concurrency limits.

-## Reset LightRAG Storage
+## Reset LightRAG Storage For a Clean Rebuild

 Stop the servers first, then clear generated graph/vector data:

 ```powershell
 Remove-Item -LiteralPath "c:\Users\Oleh\Desktop\Diplomova praca\lightrag\rag_storage\*" -Force
-python checkpoint_02_ingest/load_leaflets.py --reset
+Remove-Item -LiteralPath "c:\Users\Oleh\Desktop\Diplomova praca\outputs\lightrag_ingest\adc_structured_progress.json" -Force
 ```

 Use this only when you intentionally want to rebuild the graph.
--- a/scripts/lightrag_ingest/init.py
+++ b/scripts/lightrag_ingest/init.py
@ -0,0 +1 @@
+"""LightRAG ingestion scripts for ADC structured data."""
--- a/scripts/lightrag_ingest/ingest_adc_structured.py
+++ b/scripts/lightrag_ingest/ingest_adc_structured.py
@ -0,0 +1,244 @@
+"""Ingest the new structured ADC scrape into a running LightRAG server.
+
+Input records are produced by scripts/adc_scraper/scrape_adc_product_data.py.
+The script sends each record's `lightrag_text` to `/documents/text` and uses
+the ADC detail URL as `file_source`, so repeated runs can detect duplicates.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+import time
+import urllib.error
+import urllib.request
+from collections import Counter
+from pathlib import Path
+from typing import Any
+
+
+DEFAULT_INPUT = Path("data_adc_databaza/adc_scrape_2026_05_04/adc_products_structured.json")
+DEFAULT_PROGRESS = Path("outputs/lightrag_ingest/adc_structured_progress.json")
+DEFAULT_LIGHTRAG_URL = "http://localhost:9621"
+
+if hasattr(sys.stdout, "reconfigure"):
+    sys.stdout.reconfigure(encoding="utf-8")
+
+
+def load_json(path: Path) -> Any:
+    return json.loads(path.read_text(encoding="utf-8"))
+
+
+def write_json(path: Path, data: Any) -> None:
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
+
+
+def load_progress(path: Path, resume: bool) -> dict[str, Any]:
+    if resume and path.exists():
+        return load_json(path)
+    return {"done": [], "failed": [], "skipped": Counter(), "last_track_ids": []}
+
+
+def save_progress(path: Path, progress: dict[str, Any]) -> None:
+    payload = {
+        "done": sorted(set(progress.get("done", []))),
+        "failed": sorted(set(progress.get("failed", []))),
+        "skipped": dict(progress.get("skipped", {})),
+        "last_track_ids": progress.get("last_track_ids", [])[-50:],
+    }
+    write_json(path, payload)
+
+
+def healthcheck(base_url: str) -> dict[str, Any]:
+    with urllib.request.urlopen(f"{base_url}/health", timeout=10) as response:
+        return json.loads(response.read().decode("utf-8"))
+
+
+def pipeline_status(base_url: str) -> dict[str, Any]:
+    with urllib.request.urlopen(f"{base_url}/documents/pipeline_status", timeout=10) as response:
+        return json.loads(response.read().decode("utf-8"))
+
+
+def post_text(base_url: str, text: str, file_source: str, timeout: int) -> dict[str, Any]:
+    payload = json.dumps({"text": text, "file_source": file_source}, ensure_ascii=False).encode("utf-8")
+    request = urllib.request.Request(
+        f"{base_url}/documents/text",
+        data=payload,
+        headers={"Content-Type": "application/json"},
+        method="POST",
+    )
+    with urllib.request.urlopen(request, timeout=timeout) as response:
+        return json.loads(response.read().decode("utf-8"))
+
+
+def product_name(record: dict[str, Any]) -> str:
+    product = record.get("product") or {}
+    graph_hints = record.get("graph_hints") or {}
+    return str(graph_hints.get("drug") or product.get("name") or product.get("slug") or "unknown")
+
+
+def source_id(record: dict[str, Any]) -> str:
+    urls = record.get("urls") or {}
+    product = record.get("product") or {}
+    return str(urls.get("detail") or product.get("product_id") or product_name(record))
+
+
+def is_clinical(record: dict[str, Any]) -> bool:
+    hints = record.get("graph_hints") or {}
+    return any(
+        hints.get(key)
+        for key in (
+            "contraindications_text",
+            "interactions_text",
+            "warnings_text",
+            "side_effects_text",
+        )
+    )
+
+
+def iter_selected_records(
+    records: list[dict[str, Any]],
+    progress: dict[str, Any],
+    clinical_only: bool,
+) -> tuple[list[dict[str, Any]], Counter]:
+    done = set(progress.get("done", []))
+    failed = set(progress.get("failed", []))
+    skipped: Counter = Counter()
+    selected: list[dict[str, Any]] = []
+
+    for record in records:
+        source = source_id(record)
+        if source in done or source in failed:
+            skipped["already_processed"] += 1
+            continue
+        if clinical_only and not is_clinical(record):
+            skipped["not_clinical"] += 1
+            continue
+        if not str(record.get("lightrag_text") or "").strip():
+            skipped["missing_lightrag_text"] += 1
+            continue
+        selected.append(record)
+
+    return selected, skipped
+
+
+def show_status(base_url: str, progress_path: Path) -> None:
+    if progress_path.exists():
+        progress = load_json(progress_path)
+        print(f"Progress file: {progress_path}")
+        print(f"Done: {len(progress.get('done', []))}")
+        print(f"Failed: {len(progress.get('failed', []))}")
+        print(f"Skipped: {progress.get('skipped', {})}")
+    else:
+        print(f"No progress file found: {progress_path}")
+
+    try:
+        status = pipeline_status(base_url)
+    except Exception as exc:
+        print(f"LightRAG pipeline status unavailable: {exc}")
+        return
+
+    print(f"Pipeline busy: {status.get('busy')}")
+    print(f"Latest message: {status.get('latest_message', '')}")
+
+
+def ingest(args: argparse.Namespace) -> None:
+    records = load_json(args.input)
+    if not isinstance(records, list):
+        raise SystemExit(f"Input must be a JSON list: {args.input}")
+
+    progress = load_progress(args.progress, resume=args.resume)
+    selected, skipped = iter_selected_records(records, progress, clinical_only=not args.all_records)
+    progress["skipped"] = Counter(progress.get("skipped", {})) + skipped
+
+    if args.limit is not None:
+        selected = selected[: args.limit]
+
+    print(f"Input records: {len(records)}")
+    print(f"Selected for upload: {len(selected)}")
+    print(f"Skipped now: {dict(skipped)}")
+    print(f"Progress: done={len(progress.get('done', []))}, failed={len(progress.get('failed', []))}")
+
+    if args.dry_run:
+        for record in selected[:10]:
+            print(f"- {product_name(record)[:90]} | {source_id(record)}")
+        save_progress(args.progress, progress)
+        return
+
+    health = healthcheck(args.url)
+    configuration = health.get("configuration") or {}
+    print(f"LightRAG OK: {args.url}")
+    if configuration:
+        print(f"LLM: {configuration.get('llm_model')} | Embedding: {configuration.get('embedding_model')}")
+
+    done = set(progress.get("done", []))
+    failed = set(progress.get("failed", []))
+    last_track_ids = list(progress.get("last_track_ids", []))
+
+    for index, record in enumerate(selected, start=1):
+        source = source_id(record)
+        text = str(record.get("lightrag_text") or "").strip()
+        name = product_name(record)
+
+        try:
+            result = post_text(args.url, text=text, file_source=source, timeout=args.timeout)
+            status = result.get("status", "unknown")
+            track_id = result.get("track_id")
+            if track_id:
+                last_track_ids.append(track_id)
+            if status in {"success", "duplicated"}:
+                done.add(source)
+                mark = "OK"
+            else:
+                failed.add(source)
+                mark = "!!"
+            print(f"[{index:5d}/{len(selected):5d}] {mark} {status:<10} {name[:80]}")
+        except urllib.error.HTTPError as exc:
+            failed.add(source)
+            detail = exc.read().decode("utf-8", errors="replace")[:300]
+            print(f"[{index:5d}/{len(selected):5d}] HTTP {exc.code} {name[:80]} | {detail}")
+        except Exception as exc:
+            failed.add(source)
+            print(f"[{index:5d}/{len(selected):5d}] ERROR {name[:80]} | {exc}")
+
+        progress["done"] = sorted(done)
+        progress["failed"] = sorted(failed)
+        progress["last_track_ids"] = last_track_ids[-50:]
+
+        if index % args.save_every == 0:
+            save_progress(args.progress, progress)
+            print(f"Saved progress: done={len(done)}, failed={len(failed)}")
+
+        time.sleep(args.delay)
+
+    save_progress(args.progress, progress)
+    print(f"Finished upload loop. Done={len(done)}, failed={len(failed)}")
+    print("LightRAG will continue extracting graph entities and relations in the background.")
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Ingest structured ADC data into LightRAG.")
+    parser.add_argument("--input", type=Path, default=DEFAULT_INPUT)
+    parser.add_argument("--progress", type=Path, default=DEFAULT_PROGRESS)
+    parser.add_argument("--url", default=DEFAULT_LIGHTRAG_URL)
+    parser.add_argument("--limit", type=int, default=50)
+    parser.add_argument("--delay", type=float, default=0.3)
+    parser.add_argument("--timeout", type=int, default=60)
+    parser.add_argument("--save-every", type=int, default=10)
+    parser.add_argument("--resume", action="store_true", help="Resume from the progress file.")
+    parser.add_argument("--all-records", action="store_true", help="Upload all records, not only clinical ones.")
+    parser.add_argument("--dry-run", action="store_true", help="Show selected records without uploading.")
+    parser.add_argument("--status", action="store_true", help="Show local progress and LightRAG pipeline status.")
+    args = parser.parse_args()
+
+    if args.status:
+        show_status(args.url, args.progress)
+        return
+
+    ingest(args)
+
+
+if __name__ == "__main__":
+    main()
				`@ -0,0 +1 @@`
				`"""LightRAG ingestion scripts for ADC structured data."""`