# DiplomovaPraca/scripts/lightrag_ingest/ingest_adc_structured.py
# 2026-05-14 12:41:59 +02:00
#
# 245 lines
# 8.8 KiB
# Python

"""Ingest the new structured ADC scrape into a running LightRAG server.
Input records are produced by scripts/adc_scraper/scrape_adc_product_data.py.
The script sends each record's `lightrag_text` to `/documents/text` and uses
the ADC detail URL as `file_source`, so repeated runs can detect duplicates.
"""
from __future__ import annotations
import argparse
import json
import sys
import time
import urllib.error
import urllib.request
from collections import Counter
from pathlib import Path
from typing import Any
# Base URL of the LightRAG server used when --url is not supplied.
DEFAULT_LIGHTRAG_URL = "http://localhost:9621"

# Structured scrape that is ingested when --input is not supplied.
DEFAULT_INPUT = Path(
    "data_adc_databaza/adc_scrape_2026_05_04/adc_products_structured.json"
)

# Progress bookkeeping file used when --progress is not supplied.
DEFAULT_PROGRESS = Path(
    "outputs/lightrag_ingest/adc_structured_progress.json"
)

# Switch stdout to UTF-8 when the stream supports reconfiguration
# (presumably so non-ASCII text prints on consoles with another default
# encoding -- confirm against the target environment).
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")
def load_json(path: Path) -> Any:
    """Read ``path`` as UTF-8 text and return the decoded JSON value."""
    with path.open("r", encoding="utf-8") as handle:
        return json.load(handle)
def write_json(path: Path, data: Any) -> None:
    """Serialize ``data`` as indented UTF-8 JSON and store it at ``path``.

    Missing parent directories are created. The text is first written to a
    sibling ``<name>.tmp`` file and then moved over ``path`` in a single
    rename, so an interrupted write cannot leave a truncated file behind
    in place of a previously valid one.

    Args:
        path: Destination file.
        data: Any JSON-serializable value.

    Raises:
        TypeError: If ``data`` is not JSON-serializable (``path`` is left
            untouched in that case).
        OSError: If the directory or file cannot be written.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    # Serialize before touching the filesystem so a bad payload changes nothing.
    serialized = json.dumps(data, ensure_ascii=False, indent=2)
    scratch = path.with_name(f"{path.name}.tmp")
    try:
        scratch.write_text(serialized, encoding="utf-8")
        scratch.replace(path)
    except BaseException:
        # Do not leave a partial scratch file behind; the old target is intact.
        if scratch.exists():
            scratch.unlink()
        raise
def load_progress(path: Path, resume: bool) -> dict[str, Any]:
    """Return the saved progress state, or a fresh empty one.

    The file at ``path`` is only read when ``resume`` is true and the file
    exists; in every other case a new, empty progress mapping is returned.
    """
    if not resume or not path.exists():
        return {
            "done": [],
            "failed": [],
            "skipped": Counter(),
            "last_track_ids": [],
        }
    return load_json(path)
def save_progress(path: Path, progress: dict[str, Any]) -> None:
    """Write a normalized snapshot of ``progress`` to ``path``.

    ``done`` and ``failed`` are de-duplicated and sorted, ``skipped`` is
    stored as a plain dict, and only the 50 most recent track ids are kept.
    Missing keys are treated as empty.
    """
    done_ids = sorted({*progress.get("done", [])})
    failed_ids = sorted({*progress.get("failed", [])})
    skip_counts = dict(progress.get("skipped", {}))
    recent_tracks = progress.get("last_track_ids", [])[-50:]
    write_json(
        path,
        {
            "done": done_ids,
            "failed": failed_ids,
            "skipped": skip_counts,
            "last_track_ids": recent_tracks,
        },
    )
def healthcheck(base_url: str) -> dict[str, Any]:
    """Fetch ``{base_url}/health`` and return the decoded JSON body.

    The request uses a 10 second timeout; errors raised by ``urlopen`` or by
    JSON decoding propagate to the caller.
    """
    endpoint = f"{base_url}/health"
    with urllib.request.urlopen(endpoint, timeout=10) as reply:
        raw_body = reply.read()
    return json.loads(raw_body.decode("utf-8"))
def pipeline_status(base_url: str) -> dict[str, Any]:
    """Fetch ``{base_url}/documents/pipeline_status`` and return the decoded JSON body.

    The request uses a 10 second timeout; errors raised by ``urlopen`` or by
    JSON decoding propagate to the caller.
    """
    endpoint = f"{base_url}/documents/pipeline_status"
    with urllib.request.urlopen(endpoint, timeout=10) as reply:
        raw_body = reply.read()
    return json.loads(raw_body.decode("utf-8"))
def post_text(base_url: str, text: str, file_source: str, timeout: int) -> dict[str, Any]:
    """POST one document to ``{base_url}/documents/text`` and return the JSON reply.

    The request body is a JSON object with the keys ``text`` and
    ``file_source``, encoded as UTF-8. ``timeout`` is passed to ``urlopen``.
    Errors raised by ``urlopen`` or by JSON decoding propagate to the caller.
    """
    document = {"text": text, "file_source": file_source}
    encoded = json.dumps(document, ensure_ascii=False).encode("utf-8")
    upload = urllib.request.Request(
        url=f"{base_url}/documents/text",
        data=encoded,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(upload, timeout=timeout) as reply:
        raw_body = reply.read()
    return json.loads(raw_body.decode("utf-8"))
def product_name(record: dict[str, Any]) -> str:
    """Return a display name for ``record``.

    The first truthy value among ``graph_hints.drug``, ``product.name`` and
    ``product.slug`` is used (converted to ``str``); ``"unknown"`` is the
    fallback when none of them is set.
    """
    details = record.get("product") or {}
    hints = record.get("graph_hints") or {}
    # Looked up lazily, in priority order.
    for holder, field in ((hints, "drug"), (details, "name"), (details, "slug")):
        candidate = holder.get(field)
        if candidate:
            return str(candidate)
    return "unknown"
def source_id(record: dict[str, Any]) -> str:
    """Return the identifier used to track ``record`` across runs.

    Preference order: ``urls.detail``, then ``product.product_id``, then the
    value of ``product_name(record)``. The result is always a ``str``.
    """
    links = record.get("urls") or {}
    details = record.get("product") or {}
    identifier = links.get("detail")
    if not identifier:
        identifier = details.get("product_id")
    if not identifier:
        identifier = product_name(record)
    return str(identifier)
def is_clinical(record: dict[str, Any]) -> bool:
    """Tell whether ``record`` carries any clinical text in its graph hints.

    Returns ``True`` as soon as one of the contraindication, interaction,
    warning or side-effect fields of ``graph_hints`` is truthy.
    """
    clinical_fields = (
        "contraindications_text",
        "interactions_text",
        "warnings_text",
        "side_effects_text",
    )
    hints = record.get("graph_hints") or {}
    for field in clinical_fields:
        if hints.get(field):
            return True
    return False
def iter_selected_records(
    records: list[dict[str, Any]],
    progress: dict[str, Any],
    clinical_only: bool,
) -> tuple[list[dict[str, Any]], Counter]:
    """Split ``records`` into those to upload and a tally of skip reasons.

    A record is skipped (first matching reason wins) when:

    * its ``source_id`` is already listed under ``done`` or ``failed`` in
      ``progress`` (``already_processed``),
    * ``clinical_only`` is set and ``is_clinical`` rejects it
      (``not_clinical``),
    * its ``lightrag_text`` is missing or blank (``missing_lightrag_text``).

    Returns:
        ``(selected, skipped)`` -- the records to upload, in input order, and
        a ``Counter`` keyed by skip reason.
    """
    handled = set(progress.get("done", [])) | set(progress.get("failed", []))
    chosen: list[dict[str, Any]] = []
    tally: Counter = Counter()
    for entry in records:
        if source_id(entry) in handled:
            tally["already_processed"] += 1
        elif clinical_only and not is_clinical(entry):
            tally["not_clinical"] += 1
        elif not str(entry.get("lightrag_text") or "").strip():
            tally["missing_lightrag_text"] += 1
        else:
            chosen.append(entry)
    return chosen, tally
def show_status(base_url: str, progress_path: Path) -> None:
    """Print the local progress summary followed by the LightRAG pipeline status.

    When the progress file is missing, that fact is reported instead of the
    counts. Any error while querying the pipeline status is reported on
    stdout rather than raised.
    """
    if not progress_path.exists():
        print(f"No progress file found: {progress_path}")
    else:
        saved = load_json(progress_path)
        print(f"Progress file: {progress_path}")
        print(f"Done: {len(saved.get('done', []))}")
        print(f"Failed: {len(saved.get('failed', []))}")
        print(f"Skipped: {saved.get('skipped', {})}")

    try:
        remote = pipeline_status(base_url)
    except Exception as exc:
        # Best effort: the local summary is still useful without the server.
        print(f"LightRAG pipeline status unavailable: {exc}")
        return

    print(f"Pipeline busy: {remote.get('busy')}")
    print(f"Latest message: {remote.get('latest_message', '')}")
def ingest(args: argparse.Namespace) -> None:
    """Upload the selected records to LightRAG and persist the progress.

    Reads the JSON list at ``args.input``, filters it with
    ``iter_selected_records`` (clinical records only unless
    ``args.all_records`` is set), caps the selection at ``args.limit`` and
    posts each record's ``lightrag_text`` with its ``source_id`` as
    ``file_source``. Replies with status ``success`` or ``duplicated`` are
    recorded under ``done``; any other status or any error is recorded under
    ``failed``.

    Progress is written every ``args.save_every`` records and once more when
    the loop ends -- including when it ends through an exception such as
    ``KeyboardInterrupt`` -- so an interrupted run loses no bookkeeping.

    Args:
        args: Parsed CLI namespace. Attributes read: ``input``, ``progress``,
            ``resume``, ``all_records``, ``limit``, ``dry_run``, ``url``,
            ``timeout``, ``save_every`` and ``delay``.

    Raises:
        SystemExit: If the input file does not contain a JSON list.
    """
    records = load_json(args.input)
    if not isinstance(records, list):
        raise SystemExit(f"Input must be a JSON list: {args.input}")

    progress = load_progress(args.progress, resume=args.resume)
    selected, skipped = iter_selected_records(records, progress, clinical_only=not args.all_records)
    progress["skipped"] = Counter(progress.get("skipped", {})) + skipped
    if args.limit is not None:
        # Clamp at zero: a negative bound would slice records off the END of
        # the list instead of limiting how many are uploaded.
        selected = selected[: max(0, args.limit)]

    print(f"Input records: {len(records)}")
    print(f"Selected for upload: {len(selected)}")
    print(f"Skipped now: {dict(skipped)}")
    print(f"Progress: done={len(progress.get('done', []))}, failed={len(progress.get('failed', []))}")

    if args.dry_run:
        for record in selected[:10]:
            print(f"- {product_name(record)[:90]} | {source_id(record)}")
        # NOTE(review): a dry run still rewrites the progress file (and starts
        # it from scratch when --resume is absent) -- confirm this is intended.
        save_progress(args.progress, progress)
        return

    health = healthcheck(args.url)
    configuration = health.get("configuration") or {}
    print(f"LightRAG OK: {args.url}")
    if configuration:
        print(f"LLM: {configuration.get('llm_model')} | Embedding: {configuration.get('embedding_model')}")

    done = set(progress.get("done", []))
    failed = set(progress.get("failed", []))
    last_track_ids = list(progress.get("last_track_ids", []))

    # Guard the loop parameters: a zero interval would raise ZeroDivisionError
    # in the modulo below and a negative delay would raise in time.sleep().
    save_every = max(1, args.save_every)
    delay = max(0.0, args.delay)
    total = len(selected)

    def _flush() -> None:
        """Copy the in-memory bookkeeping into ``progress`` and write it out."""
        progress["done"] = sorted(done)
        progress["failed"] = sorted(failed)
        progress["last_track_ids"] = last_track_ids[-50:]
        save_progress(args.progress, progress)

    try:
        for index, record in enumerate(selected, start=1):
            source = source_id(record)
            text = str(record.get("lightrag_text") or "").strip()
            name = product_name(record)
            try:
                result = post_text(args.url, text=text, file_source=source, timeout=args.timeout)
                status = result.get("status", "unknown")
                track_id = result.get("track_id")
                if track_id:
                    last_track_ids.append(track_id)
                if status in {"success", "duplicated"}:
                    done.add(source)
                    # A duplicate source may have failed earlier in this run;
                    # keep the two sets disjoint once it has been accepted.
                    failed.discard(source)
                    mark = "OK"
                else:
                    failed.add(source)
                    mark = "!!"
                print(f"[{index:5d}/{total:5d}] {mark} {status:<10} {name[:80]}")
            except urllib.error.HTTPError as exc:
                failed.add(source)
                detail = exc.read().decode("utf-8", errors="replace")[:300]
                print(f"[{index:5d}/{total:5d}] HTTP {exc.code} {name[:80]} | {detail}")
            except Exception as exc:
                # Per-record boundary: one bad record must not stop the batch.
                failed.add(source)
                print(f"[{index:5d}/{total:5d}] ERROR {name[:80]} | {exc}")

            if index % save_every == 0:
                _flush()
                print(f"Saved progress: done={len(done)}, failed={len(failed)}")
            if index < total:
                # Throttle between uploads; no need to wait after the last one.
                time.sleep(delay)
    finally:
        # Runs on normal completion and on interruption alike.
        _flush()

    print(f"Finished upload loop. Done={len(done)}, failed={len(failed)}")
    print("LightRAG will continue extracting graph entities and relations in the background.")
def main() -> None:
    """Parse the command line and run either the status report or the ingest."""
    parser = argparse.ArgumentParser(description="Ingest structured ADC data into LightRAG.")

    # Options that take a value, registered in the order shown by --help.
    valued_options = (
        ("--input", dict(type=Path, default=DEFAULT_INPUT)),
        ("--progress", dict(type=Path, default=DEFAULT_PROGRESS)),
        ("--url", dict(default=DEFAULT_LIGHTRAG_URL)),
        ("--limit", dict(type=int, default=50)),
        ("--delay", dict(type=float, default=0.3)),
        ("--timeout", dict(type=int, default=60)),
        ("--save-every", dict(type=int, default=10)),
    )
    # Boolean switches with their help text.
    switches = (
        ("--resume", "Resume from the progress file."),
        ("--all-records", "Upload all records, not only clinical ones."),
        ("--dry-run", "Show selected records without uploading."),
        ("--status", "Show local progress and LightRAG pipeline status."),
    )
    for flag, settings in valued_options:
        parser.add_argument(flag, **settings)
    for flag, description in switches:
        parser.add_argument(flag, action="store_true", help=description)

    args = parser.parse_args()
    if not args.status:
        ingest(args)
        return
    show_status(args.url, args.progress)


if __name__ == "__main__":
    main()