Nahrát soubory do „/“

2026-05-16 06:22:02 +00:00 · 2026-05-16 06:22:02 +00:00 · 22f8a3144a
commit 22f8a3144a
9 changed files with 807 additions and 0 deletions
--- a/graph_builder.py
+++ b/graph_builder.py
@ -0,0 +1,411 @@
 import json
 import os
 import networkx as nx
 import re
 import hashlib
 from ingest import load_documents
 from llm import llm_call
 # Cleans response to avoid errors
 def clean_llm_response(text: str) -> str:
    """
    Removes markdown, extra text, and extracts JSON safely.
    """
    if not text:
        raise ValueError("LLM returned empty response")
    text = text.strip()
    # Remove ```json or ``` wrappers
    text = re.sub(r"^```json", "", text)
    text = re.sub(r"^```", "", text)
    text = re.sub(r"```$", "", text)
    text = text.strip()
    # Try to extract JSON object if model adds extra text
    match = re.search(r"\{.*\}", text, re.DOTALL)
    if match:
        return match.group(0)
    return text
 # Gets cached extraction for a chunk, or extracts and caches it
 def get_cached_extraction(chunk, cache_dir="cache"):
    os.makedirs(cache_dir, exist_ok=True)
    # Create unique hash for chunk
    chunk_hash = hashlib.md5(
        chunk.encode("utf-8")
    ).hexdigest()
    cache_path = os.path.join(
        cache_dir,
        f"{chunk_hash}.json"
    )
    # Cache Load (if exists) 
    if os.path.exists(cache_path):
        print(f"[CACHE HIT] {chunk_hash}")
        with open(cache_path, "r", encoding="utf-8") as f:
            return json.load(f)
    # Extract json and store in cache
    print(f"[CACHE MISS] {chunk_hash}")
    extraction = extract_entities_and_relations(chunk)
    # Save cache
    with open(cache_path, "w", encoding="utf-8") as f:
        json.dump(
            extraction,
            f,
            indent=2,
            ensure_ascii=False
        )
    return extraction
 # Calls LLM to get entities and realtions (.json) and saves them to cache
 def extract_entities_and_relations(text):
    text = text[:8000]  # prevent context overflow
    prompt = f"""
 You are an educational knowledge graph extraction system.
 Your task:
 Extract educational entities and relations.
 Rules:
 - Return ONLY EXACT JSON
 - Use ONLY meaningful educational concepts
 - NO markdown
 - NO explanations
 - NO extra text
 - Use these relations ONLY:
 [  "is_a",
  "part_of",
  "uses",
  "teaches",
  "requires",
  "depends_on",
  "implements",
  "applies_to",
  "subfield_of",
  "contains",
  "defined_by"]
 - Entity names must be normalized:
    GOOD:
    - "Machine Learning"
    - "Artificial Intelligence"
    BAD:
    - "machine learning"
    - "ML"
    - "AI systems"
 - Prefer fewer HIGH QUALITY relations over many weak relations
 - Every entity should participate in at least one relation
 OUTPUT FORMAT EXACTLY:
 {{
  "entities": [
    {{"name": "...", "type": "..."}}
  ],
  "relations": [
    {{
      "source": "...",
      "target": "...",
      "relation": "..."
    }}
  ]
 }}
 TEXT:
 {text}
 """
    response = llm_call([
        {"role": "user", "content": prompt}
    ])
    cleaned = clean_llm_response(response)
    try:
        data = json.loads(cleaned)
    except json.JSONDecodeError:
        print("\nINVALID JSON")
        print(cleaned)
        raise
    # Checks if llm output is valid
    entities = data.get("entities", [])
    relations = data.get("relations", [])
    # Normalize entity names
    normalized_entities = {}
    for ent in entities:
        name = ent["name"].strip()
        # title case normalization
        name = " ".join(word.capitalize() for word in name.split())
        normalized_entities[name] = {
            "name": name,
            "type": ent.get("type", "concept")
        }
    entity_names = set(normalized_entities.keys())
    valid_relations = []
    for rel in relations:
        source = rel["source"].strip()
        target = rel["target"].strip()
        relation = rel["relation"].strip()
        source = " ".join(word.capitalize() for word in source.split())
        target = " ".join(word.capitalize() for word in target.split())
        # Skip self-loops
        if source == target:
            continue
        # Ensure both entities exist
        if source not in entity_names:
            continue
        if target not in entity_names:
            continue
        valid_relations.append({
            "source": source,
            "target": target,
            "relation": relation
        })
    # Remove isolated entities
    connected = set()
    for rel in valid_relations:
        connected.add(rel["source"])
        connected.add(rel["target"])
    final_entities = [
        ent for ent in normalized_entities.values()
        if ent["name"] in connected
    ]
    return {
        "entities": final_entities,
        "relations": valid_relations
    }
 # Chunks document into chunks (with overlap)
 def chunk_text(text, chunk_size=8000, overlap=300):
    text = text.replace("\r", " ")
    paragraphs = text.split("\n")
    chunks = []
    current = ""
    for para in paragraphs:
        para = para.strip()
        # Skip useless tiny paragraphs
        if len(para) < 40:
            continue
        if len(current) + len(para) < chunk_size:
            current += para + "\n"
        else:
            chunks.append(current)
            # overlap keeps context continuity
            current = current[-overlap:] + "\n" + para + "\n"
    if current:
        chunks.append(current)
    return chunks
 # Builds graph from documents (must be in documents folder [.pdf, .txt, .docx])
 def build_graph_from_documents(folder="documents"):
    G = nx.DiGraph()
    docs = load_documents(folder)
    for doc in docs:
        print(f"\n[INFO] Processing document: {doc['filename']}")
        # Document is split into chunks
        chunks = chunk_text(doc["content"])
        print(f"[INFO] Total chunks: {len(chunks)}")
        # Process chunk by chunk
        for idx, chunk in enumerate(chunks):
            print(f"[INFO] Processing chunk {idx + 1}/{len(chunks)}")
            try:
                extraction = get_cached_extraction(chunk)
            except Exception as e:
                print(f"[WARN] Extraction failed on chunk {idx + 1}")
                print(e)
                continue
            # Add entities (nodes) to graph
            for ent in extraction["entities"]:
                G.add_node(
                    ent["name"],
                    type=ent["type"],
                    source_doc=doc["filename"]
                )
            # Adds entity realtions (links nodes)
            for rel in extraction["relations"]:
                G.add_edge(
                    rel["source"],
                    rel["target"],
                    relation=rel["relation"],
                    source_doc=doc["filename"]
                )
    return G
 # Saves graph to file 
 def save_graph(G, path="graph/kg.graphml"):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    nx.write_graphml(G, path)
    print(f"Graph saved to {path}")
 # Loads graph from file
 def load_graph(path="graph/kg.graphml"):
    return nx.read_graphml(path)
 # Adds document to graph
 def add_document(filepath, G):
    docs = load_documents(folder=filepath)
    if not docs:
        print(f"[WARN] No documents found in: {filepath}")
        return
    doc = docs[0]
    print(f"\n[INFO] Adding document: {doc['filename']}")
    # Document is split into chunks 
    chunks = chunk_text(doc["content"])
    print(f"[INFO] Total chunks: {len(chunks)}")
    # Process chunk by chunk
    for idx, chunk in enumerate(chunks):
        print(f"[INFO] Processing chunk {idx + 1}/{len(chunks)}")
        try:
            extraction = get_cached_extraction(chunk)
        except Exception as e:
            print(f"[WARN] Extraction failed on chunk {idx + 1}")
            print(e)
            continue
        # Add entities (nodes) to graph
        for ent in extraction["entities"]:
            G.add_node(
                ent["name"],
                type=ent["type"],
                source_doc=doc["filename"]
            )
        # Adds entity realtions (links nodes)
        for rel in extraction["relations"]:
            G.add_edge(
                rel["source"],
                rel["target"],
                relation=rel["relation"],
                source_doc=doc["filename"]
            )
    print(f"[INFO] Finished adding {doc['filename']}")
 # Removes document from graph
 def remove_document(filename, G):
    edges_to_remove = []
    for u, v, data in G.edges(data=True):
        if data.get("source_doc") == filename:
            edges_to_remove.append((u, v))
    G.remove_edges_from(edges_to_remove)
    # Remove orphan nodes
    isolated_nodes = list(nx.isolates(G))
    G.remove_nodes_from(isolated_nodes)
    print(f"[INFO] Removed document: {filename}")
 '''
 #test
 if __name__ == "__main__":
    docs = load_documents()
    extraction = extract_entities_and_relations(
        docs[0]["content"]
    )
    print(extraction)
 '''
 #test
 if __name__ == "__main__":
    #G2 = load_graph()
    #print(G2.nodes(data=True))
    G = build_graph_from_documents()
    save_graph(G)
    #print(G.nodes(data=True))
    #print(G.edges(data=True))
--- a/ingest.py
+++ b/ingest.py
@ -0,0 +1,43 @@
 from pathlib import Path
 from pypdf import PdfReader
 from docx import Document
 def load_documents(folder="documents"):
    docs = []
    for file in Path(folder).glob("*"):
        if file.suffix == ".txt":
            text = file.read_text(encoding="utf-8")
            docs.append({
                "filename": file.name,
                "content": text
            })
        elif file.suffix == ".pdf":
            reader = PdfReader(file)
            text = ""
            for page in reader.pages:
                text += page.extract_text()
            docs.append({
                "filename": file.name,
                "content": text
            })
        elif file.suffix == ".docx":
            doc = Document(file)
            text = "\n".join([paragraph.text for paragraph in doc.paragraphs])
            docs.append({
                "filename": file.name,
                "content": text
            })
    return docs
 #test
 if __name__ == "__main__":
    docs = load_documents()
    print(docs)
--- a/llm.py
+++ b/llm.py
@ -0,0 +1,35 @@
 from __future__ import annotations
 from dataclasses import dataclass
 from typing import Dict, List
 import requests
 # TUKE LLM Connection
 UNIVERSITY_BASE_URL = "https://ui.tukekemt.xyz/api/v1/chat/completions"
 UNIVERSITY_API_KEY  = "sk-06098ff9afb946c2b9d197cb400cd752"
 UNIVERSITY_MODEL    = "model2"
 # LLM call
 def llm_call(messages: List[Dict[str, str]]) -> str:
    """Send a list of {role, content} dicts to the university LLM and return the reply text."""
    resp = requests.post(
        UNIVERSITY_BASE_URL,
        json={"model": UNIVERSITY_MODEL, "messages": messages, "stream": False},
        headers={
            "Authorization": f"Bearer {UNIVERSITY_API_KEY}",
            "Content-Type": "application/json",
        },
        timeout=600,
    )
    resp.raise_for_status()
    return resp.json()["choices"][0]["message"]["content"]
 #test
 if __name__ == "__main__":
    response = llm_call([
        {
            "role": "user",
            "content": "Say hello"
        }
    ])
    print(response)
--- a/main.py
+++ b/main.py
@ -0,0 +1,88 @@
 from graph_builder import (
    build_graph_from_documents,
    save_graph,
    load_graph
 )
 from react_agent import react_agent  
 from visualization import visualize_graph
 GRAPH_PATH = "graph/kg.graphml"
 # Create graph or load existing one
 def get_or_build_graph(force_rebuild=False):
    if not force_rebuild:
        try:
            print("\n[INFO] Loading existing graph...")
            G = load_graph(GRAPH_PATH)
            print(f"[INFO] Loaded graph: {len(G.nodes())} nodes, {len(G.edges())} edges")
            return G
        except Exception as e:
            print(f"[WARN] Could not load graph: {e}")
            print("[INFO] Rebuilding graph...")
    print("\n[INFO] Building graph from documents...")
    G = build_graph_from_documents()
    print(f"[INFO] Built graph: {len(G.nodes())} nodes, {len(G.edges())} edges")
    print("\n[INFO] Saving graph...")
    save_graph(G, GRAPH_PATH)
    return G
 # Testing questions (query)
 def ask_question(G, question):
    print("\n" + "=" * 60)
    print(f"QUESTION: {question}")
    print("=" * 60)
    history = []
    result = react_agent(
        user_message=question,
        history=history,
        G=G
    )
    print("\n--- FINAL ANSWER ---")
    print(result.answer)
    print("\n--- EVIDENCE (GRAPH) ---")
    print(result.evidence)
    return result
 # Visualisation
 def show_graph(G):
    print("\n[INFO] Generating visualization...")
    visualize_graph(G)
    print("[INFO] Graph saved as kg.html")
 # Run main
 if __name__ == "__main__":
    # Create graph or load existing one
    G = get_or_build_graph(force_rebuild=False)
    # Run visualisation.py (optional and for testing)
    #show_graph(G)
    # Testing questions (query)
    test_questions = [
        #"What is taught in Informatics?",
        #"What is used in machine learning?",
        #"What Literary Texts are taught in Slovak Language And Slovak Literature?",
        "Čo sa učí na predmete Slovenský jazyk a literatúra?"
    ]
    for q in test_questions:
        ask_question(G, q)
    print("\n[INFO] PIPELINE COMPLETED")
--- a/models.py
+++ b/models.py
@ -0,0 +1,30 @@
 from dataclasses import dataclass
 from typing import Optional, List
@dataclass
 class GraphRAGResponse:
    question: str
    answer: str
    evidence: Optional[str] = None
    entities_used: Optional[List[str]] = None
    relations_used: Optional[List[str]] = None
    source_documents: Optional[List[str]] = None
    def __str__(self):
        return (
            "\nGraphRAGResponse(\n"
            f"  question          = {self.question!r}\n"
            f"  answer            = {self.answer!r}\n"
            f"  evidence          = {self.evidence!r}\n"
            f"  entities_used     = {self.entities_used!r}\n"
            f"  relations_used    = {self.relations_used!r}\n"
            f"  source_documents  = {self.source_documents!r}\n"
            ")"
        )
--- a/react_agent.py
+++ b/react_agent.py
@ -0,0 +1,82 @@
 from tools import TOOL_MAP
 from llm import llm_call
 import re
 from typing import List, Dict
 from models import GraphRAGResponse
 SYSTEM_PROMPT = """
 You are an educational GraphRAG assistant for schools.
 You MUST use tools when answering knowledge questions.
 Available tool:
 - query_knowledge_graph
 Format:
 Action: query_knowledge_graph
 Action Input: <question>
 OR:
 Final Answer: <answer>
 """
 def parse_action(text: str):
    action = re.search(r"Action:\s*(\w+)", text)
    input_ = re.search(r"Action Input:\s*(.+)", text)
    if action and input_:
        return action.group(1), input_.group(1)
    return None
 def parse_final(text: str):
    m = re.search(r"Final Answer:\s*(.+)", text, re.DOTALL)
    return m.group(1).strip() if m else None
 def react_agent(user_message: str, history: List[Dict], G=None, max_steps: int = 5):
    messages = (
        [{"role": "system", "content": SYSTEM_PROMPT}]
        + history
        + [{"role": "user", "content": user_message}]
    )
    last = ""
    for _ in range(max_steps):
        reply = llm_call(messages)
        last = reply
        final = parse_final(reply)
        if final:
            return GraphRAGResponse(
                question=user_message,
                answer=final,
                evidence=""
            )
        action = parse_action(reply)
        if action:
            tool_name, tool_input = action
            if tool_name == "query_knowledge_graph":
                result = TOOL_MAP[tool_name](tool_input, G)
                messages.append({"role": "assistant", "content": reply})
                messages.append({"role": "user", "content": f"Observation: {result}"})
                return GraphRAGResponse(
                    question=user_message,
                    answer=result["answer"],
                    evidence=result["evidence"]
                )
    return GraphRAGResponse(
        question=user_message,
        answer=last,
        evidence=""
 )
--- a/retrieval.py
+++ b/retrieval.py
@ -0,0 +1,37 @@
 def retrieve_subgraph(G, query):
    query = query.lower()
    context = []
    for node in G.nodes(data=True):
        node_name = node[0].lower()
        if (
            query in node_name
            or node_name in query
        ):
            for neighbor in G.neighbors(node[0]):
                edge = G[node[0]][neighbor]
                context.append(
                    f"{node[0]} --{edge.get('relation','related_to')}--> {neighbor}"
                )
    return "\n".join(context)
 #test
 from graph_builder import load_graph
 if __name__ == "__main__":
    G = load_graph()
    result = retrieve_subgraph(
        G,
        "Python"
    )
    print(result)
--- a/tools.py
+++ b/tools.py
@ -0,0 +1,50 @@
 from retrieval import retrieve_subgraph
 from llm import llm_call
 def query_knowledge_graph(question: str, G):
    graph_context = retrieve_subgraph(G, question)
    prompt = f"""
 You are an educational assistant.
 Use ONLY the graph knowledge below to answer.
 Graph Knowledge:
 {graph_context}
 Question:
 {question}
 Rules:
 - If graph is empty, say "No information in knowledge graph"
 - Be concise and educational
 """
    response = llm_call([
        {"role": "user", "content": prompt}
    ])
    return {
        "answer": response,
        "evidence": graph_context
    }
 TOOL_MAP = {
    "query_knowledge_graph": query_knowledge_graph,
 }
 #test
 from graph_builder import load_graph
 if __name__ == "__main__":
    G = load_graph()
    result = query_knowledge_graph(
        "What is used in machine learning?",
        G
    )
    print(result)
--- a/visualization.py
+++ b/visualization.py
@ -0,0 +1,31 @@
 from pyvis.network import Network
 from graph_builder import load_graph
 def visualize_graph(G):
    net = Network(
        directed=True,
        notebook=False   
    )
    for node in G.nodes():
        net.add_node(node)
    for u, v, data in G.edges(data=True):
        net.add_edge(
            u,
            v,
            label=data.get("relation", "")
        )
    output_path = "kg.html"
    net.write_html(output_path)
    print(f"Graph saved to {output_path}")
 #test
 if __name__ == "__main__":
    G = load_graph()
    visualize_graph(G)