"""Build an explicit ADC knowledge graph from structured scraper output.

The script reads records produced by
scripts/adc_scraper/scrape_adc_product_data.py and creates:

- GraphML for graph tools / NetworkX / Gephi
- JSONL triples for inspection and downstream processing
- JSON statistics for quick validation
"""

from __future__ import annotations

import argparse
import hashlib
import json
import re
from collections import Counter
from pathlib import Path
from typing import Any

import networkx as nx

DEFAULT_INPUT = Path("data_adc_databaza/adc_scrape_2026_05_04/adc_products_structured.json")
DEFAULT_OUT_DIR = Path("outputs/knowledge_graph")

# Truncation limits for free text stored inside the graph itself. The JSONL
# triples always keep the full (cleaned) evidence text.
_MAX_EDGE_EVIDENCE_CHARS = 1000
_MAX_TEXT_NODE_CHARS = 4000

# Characters that are not allowed in XML 1.0 documents (C0 controls other than
# tab/newline/carriage return, lone surrogates, U+FFFE/U+FFFF). They would make
# the GraphML export fail or produce an unreadable file, and lone surrogates
# cannot be UTF-8 encoded for hashing.
_XML_ILLEGAL_CHARS_RE = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f\ud800-\udfff\ufffe\uffff]")
_WHITESPACE_RE = re.compile(r"\s+")
_NON_SLUG_CHARS_RE = re.compile(r"[^a-z0-9]+")
_SUBSTANCE_SPLIT_RE = re.compile(r"[,;\n]")

# (graph_hints key, edge relation, node kind) for organisation-like entities.
_HOLDER_RELATIONS: tuple[tuple[str, str, str], ...] = (
    ("marketing_authorization_holder", "HAS_MARKETING_AUTHORIZATION_HOLDER", "MAHolder"),
    ("manufacturer", "HAS_MANUFACTURER", "Manufacturer"),
    ("supplier", "HAS_SUPPLIER", "Supplier"),
)

# (graph_hints key, edge relation, node kind, node label) for free-text sections.
_TEXT_SECTIONS: tuple[tuple[str, str, str, str], ...] = (
    ("indications_text", "HAS_INDICATION_TEXT", "IndicationText", "Indications"),
    ("dosage_text", "HAS_DOSAGE_TEXT", "DosageText", "Dosage"),
    (
        "contraindications_text",
        "HAS_CONTRAINDICATION_TEXT",
        "ContraindicationText",
        "Contraindications",
    ),
    ("warnings_text", "HAS_WARNING_TEXT", "WarningText", "Warnings"),
    ("interactions_text", "HAS_INTERACTION_TEXT", "InteractionText", "Interactions"),
    ("side_effects_text", "HAS_SIDE_EFFECT_TEXT", "SideEffectText", "Side effects"),
)


def clean_text(value: Any) -> str:
    """Return *value* as a single-line, whitespace-normalised string.

    ``None`` becomes ``""``. Any other value is passed through ``str``.
    Non-breaking spaces and characters that are illegal in XML 1.0 are
    replaced by a space, runs of whitespace collapse to one space, and the
    result is stripped.
    """
    if value is None:
        return ""
    text = str(value).replace("\xa0", " ")
    # Replace (rather than delete) illegal characters so that words separated
    # only by such a character are not glued together.
    text = _XML_ILLEGAL_CHARS_RE.sub(" ", text)
    return _WHITESPACE_RE.sub(" ", text).strip()


def slugify(value: str, fallback: str = "unknown") -> str:
    """Return a lowercase ``[a-z0-9_]`` slug of *value*, or *fallback* if empty."""
    slug = _NON_SLUG_CHARS_RE.sub("_", clean_text(value).lower()).strip("_")
    return slug or fallback


def short_hash(value: str, length: int = 12) -> str:
    """Return the first *length* hex digits of the SHA-1 of *value* (UTF-8)."""
    return hashlib.sha1(value.encode("utf-8")).hexdigest()[:length]


def node_id(kind: str, value: str) -> str:
    """Build a node id of the form ``kind:slug:hash`` from *value*.

    The slug (capped at 80 characters) keeps the id readable; the hash of the
    cleaned value keeps it unique when slugs collide or get truncated.
    """
    cleaned = clean_text(value)
    return f"{kind}:{slugify(cleaned)[:80]}:{short_hash(cleaned)}"


def text_node_id(kind: str, drug_id: str, text: str) -> str:
    """Build an id for a free-text node, scoped to the drug it belongs to."""
    return f"{kind}:{short_hash(drug_id + '|' + clean_text(text), 16)}"


def add_node(graph: nx.MultiDiGraph, node: str, label: str, kind: str, **attrs: Any) -> None:
    """Create node *node* or merge new attribute values into an existing one.

    Args:
        graph: Graph to modify in place.
        node: Node identifier.
        label: Human-readable label (cleaned before storing).
        kind: Node type stored in the ``kind`` attribute.
        **attrs: Extra attributes. ``None`` values are dropped, ``int``/``float``
            values are stored as-is and everything else is stored as cleaned
            text.
    """
    payload: dict[str, Any] = {"label": clean_text(label), "kind": kind}
    for key, value in attrs.items():
        if value is None:
            continue
        payload[key] = value if isinstance(value, (int, float)) else clean_text(value)

    if graph.has_node(node):
        # Never overwrite a known value with a blank one. Only empty strings
        # count as blank; a numeric 0 is a real value and must be kept.
        graph.nodes[node].update({k: v for k, v in payload.items() if v != ""})
    else:
        graph.add_node(node, **payload)


def add_edge(
    graph: nx.MultiDiGraph,
    triples: list[dict[str, str]],
    source: str,
    relation: str,
    target: str,
    source_label: str,
    target_label: str,
    evidence: str | None = None,
) -> None:
    """Add a ``source -[relation]-> target`` edge and record the matching triple.

    Args:
        graph: Graph to modify in place.
        triples: List that receives one triple dict per edge (appended in place).
        source: Source node id.
        relation: Relation name, stored on the edge and as the triple predicate.
        target: Target node id.
        source_label: Label of the source node, used as the triple subject.
        target_label: Label of the target node, used as the triple object.
        evidence: Optional supporting text. It is truncated on the edge but
            kept in full in the triple.
    """
    edge_attrs = {"relation": relation}
    if evidence:
        edge_attrs["evidence"] = clean_text(evidence)[:_MAX_EDGE_EVIDENCE_CHARS]
    graph.add_edge(source, target, **edge_attrs)
    triples.append(
        {
            "subject_id": source,
            "subject": clean_text(source_label),
            "predicate": relation,
            "object_id": target,
            "object": clean_text(target_label),
            "evidence": clean_text(evidence or ""),
        }
    )


def split_active_substances(values: Any) -> list[str]:
    """Return the distinct, cleaned substance names contained in *values*.

    A list (or tuple) is used item by item; any other truthy value is
    stringified and split on commas, semicolons and newlines. Empty items are
    dropped and first-seen order is preserved.
    """
    if not values:
        return []
    if isinstance(values, (list, tuple)):
        items = values
    else:
        items = _SUBSTANCE_SPLIT_RE.split(str(values))
    cleaned = (clean_text(item) for item in items)
    # dict.fromkeys de-duplicates while keeping insertion order.
    return list(dict.fromkeys(name for name in cleaned if name))


def iter_records(path: Path, limit: int | None = None) -> list[dict[str, Any]]:
    """Load the scraper output from *path* and return its records.

    Args:
        path: JSON file whose top-level value must be a list.
        limit: If given, return only the first *limit* records.

    Raises:
        ValueError: If *limit* is negative or the file does not hold a list.
    """
    # A negative limit would make ``data[:limit]`` silently drop records from
    # the end instead of limiting from the start.
    if limit is not None and limit < 0:
        raise ValueError(f"limit must be non-negative, got {limit}")
    data = json.loads(path.read_text(encoding="utf-8"))
    if not isinstance(data, list):
        raise ValueError(f"Expected a JSON list in {path}")
    return data if limit is None else data[:limit]


def add_text_evidence(
    graph: nx.MultiDiGraph,
    triples: list[dict[str, str]],
    drug_id: str,
    drug_label: str,
    relation: str,
    kind: str,
    label: str,
    text: str | None,
) -> None:
    """Attach a free-text section to a drug as its own node; no-op if empty.

    The node stores the text (truncated) and its full character count; the
    edge and the triple carry the text as evidence.
    """
    text = clean_text(text)
    if not text:
        return
    evidence_id = text_node_id(kind, drug_id, text)
    add_node(
        graph,
        evidence_id,
        label,
        kind,
        text=text[:_MAX_TEXT_NODE_CHARS],
        char_count=len(text),
    )
    add_edge(graph, triples, drug_id, relation, evidence_id, drug_label, label, evidence=text)


def _as_dict(value: Any) -> dict[str, Any]:
    """Return *value* if it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _add_entity_relations(
    graph: nx.MultiDiGraph,
    triples: list[dict[str, str]],
    counters: Counter[str],
    drug_id: str,
    drug_label: str,
    graph_hints: dict[str, Any],
) -> None:
    """Link a drug to its substances, dosage form, holders and SUKL code."""
    for substance in split_active_substances(graph_hints.get("active_substances")):
        substance_id = node_id("substance", substance)
        add_node(graph, substance_id, substance, "ActiveSubstance")
        add_edge(
            graph,
            triples,
            drug_id,
            "HAS_ACTIVE_SUBSTANCE",
            substance_id,
            drug_label,
            substance,
        )
        counters["active_substance_edges"] += 1

    dosage_form = clean_text(graph_hints.get("dosage_form"))
    if dosage_form:
        dosage_id = node_id("dosage_form", dosage_form)
        add_node(graph, dosage_id, dosage_form, "DosageForm")
        add_edge(graph, triples, drug_id, "HAS_DOSAGE_FORM", dosage_id, drug_label, dosage_form)
        counters["dosage_form_edges"] += 1

    for holder_key, relation, kind in _HOLDER_RELATIONS:
        value = clean_text(graph_hints.get(holder_key))
        if not value:
            continue
        target_id = node_id(kind.lower(), value)
        add_node(graph, target_id, value, kind)
        add_edge(graph, triples, drug_id, relation, target_id, drug_label, value)
        counters[f"{relation.lower()}_edges"] += 1

    sukl_code = clean_text(graph_hints.get("sukl_code"))
    if sukl_code:
        sukl_id = node_id("sukl_code", sukl_code)
        add_node(graph, sukl_id, sukl_code, "SuklCode")
        add_edge(graph, triples, drug_id, "HAS_SUKL_CODE", sukl_id, drug_label, sukl_code)
        counters["sukl_code_edges"] += 1


def _add_classifications(
    graph: nx.MultiDiGraph,
    triples: list[dict[str, str]],
    counters: Counter[str],
    drug_id: str,
    drug_label: str,
    graph_hints: dict[str, Any],
) -> None:
    """Link a drug to each well-formed entry of ``classification_codes``."""
    items = graph_hints.get("classification_codes")
    if not isinstance(items, list):
        return
    for item in items:
        if not isinstance(item, dict):
            continue
        code = clean_text(item.get("code"))
        name = clean_text(item.get("name"))
        if not code and not name:
            continue
        # Join only the parts that exist, so a hyphen that belongs to the code
        # or the name itself is never stripped from the label.
        label = " - ".join(part for part in (code, name) if part)
        class_id = node_id("classification", label)
        add_node(graph, class_id, label, "Classification", code=code, name=name)
        add_edge(graph, triples, drug_id, "BELONGS_TO_CLASS", class_id, drug_label, label)
        counters["classification_edges"] += 1


def build_graph(
    records: list[dict[str, Any]],
) -> tuple[nx.MultiDiGraph, list[dict[str, str]], dict[str, Any]]:
    """Build the knowledge graph for *records*.

    Each record contributes one ``Drug`` node plus nodes and edges for its
    active substances, dosage form, holders, SUKL code, classification codes
    and free-text sections, all read from the record's ``graph_hints``.

    Records that are not dicts, or that have no usable drug name, are skipped
    and counted under ``skipped_invalid_record`` and
    ``skipped_missing_drug_name`` respectively.

    Returns:
        A ``(graph, triples, stats)`` tuple: the populated graph, one triple
        dict per edge in insertion order, and summary statistics.
    """
    graph = nx.MultiDiGraph()
    triples: list[dict[str, str]] = []
    counters: Counter[str] = Counter()

    for record in records:
        if not isinstance(record, dict):
            counters["skipped_invalid_record"] += 1
            continue

        product = _as_dict(record.get("product"))
        graph_hints = _as_dict(record.get("graph_hints"))
        fields = _as_dict(product.get("detail_fields"))
        urls = _as_dict(record.get("urls"))

        drug_label = clean_text(
            graph_hints.get("drug") or product.get("name") or product.get("slug")
        )
        if not drug_label:
            counters["skipped_missing_drug_name"] += 1
            continue

        # Use the first identifier that is present as the drug's identity.
        drug_key = (
            product.get("product_id")
            or fields.get("kod_statnej_autority_sukl")
            or urls.get("detail")
            or drug_label
        )
        drug_id = node_id("drug", str(drug_key))
        add_node(
            graph,
            drug_id,
            drug_label,
            "Drug",
            product_id=product.get("product_id"),
            sukl_code=fields.get("kod_statnej_autority_sukl"),
            detail_url=urls.get("detail"),
            pil_url=urls.get("pil"),
            dosage_form=graph_hints.get("dosage_form"),
        )
        counters["drugs"] += 1

        _add_entity_relations(graph, triples, counters, drug_id, drug_label, graph_hints)
        _add_classifications(graph, triples, counters, drug_id, drug_label, graph_hints)
        for hint_key, relation, kind, label in _TEXT_SECTIONS:
            add_text_evidence(
                graph,
                triples,
                drug_id,
                drug_label,
                relation,
                kind,
                label,
                graph_hints.get(hint_key),
            )

    kind_counts = Counter(data.get("kind", "Unknown") for _, data in graph.nodes(data=True))
    relation_counts = Counter(
        data.get("relation", "UNKNOWN") for _, _, data in graph.edges(data=True)
    )
    stats = {
        "input_records": len(records),
        "nodes": graph.number_of_nodes(),
        "edges": graph.number_of_edges(),
        "node_kinds": dict(kind_counts.most_common()),
        "relations": dict(relation_counts.most_common()),
        "counters": dict(counters.most_common()),
    }
    return graph, triples, stats


def write_triples(path: Path, triples: list[dict[str, str]]) -> None:
    """Write *triples* to *path* as UTF-8 JSON Lines (one object per line)."""
    with path.open("w", encoding="utf-8") as out:
        for triple in triples:
            out.write(json.dumps(triple, ensure_ascii=False) + "\n")


def main() -> None:
    """Parse the command line, build the graph and write all three outputs."""
    parser = argparse.ArgumentParser(description="Build ADC knowledge graph from structured JSON.")
    parser.add_argument("--input", type=Path, default=DEFAULT_INPUT)
    parser.add_argument("--out-dir", type=Path, default=DEFAULT_OUT_DIR)
    parser.add_argument("--limit", type=int, default=None)
    args = parser.parse_args()
    if args.limit is not None and args.limit < 0:
        parser.error("--limit must be a non-negative integer")

    records = iter_records(args.input, limit=args.limit)
    graph, triples, stats = build_graph(records)

    args.out_dir.mkdir(parents=True, exist_ok=True)
    graphml_path = args.out_dir / "adc_knowledge_graph.graphml"
    triples_path = args.out_dir / "adc_knowledge_triples.jsonl"
    stats_path = args.out_dir / "adc_graph_stats.json"

    nx.write_graphml(graph, graphml_path)
    write_triples(triples_path, triples)
    stats_path.write_text(json.dumps(stats, ensure_ascii=False, indent=2), encoding="utf-8")

    print(f"Input records: {stats['input_records']}")
    print(f"Nodes: {stats['nodes']}")
    print(f"Edges: {stats['edges']}")
    print(f"GraphML: {graphml_path}")
    print(f"Triples: {triples_path}")
    print(f"Stats: {stats_path}")


if __name__ == "__main__":
    main()