Add sync and reindex endpoint

2026-06-04 17:19:18 +02:00 · 2026-06-04 17:19:18 +02:00 · b6f4857ba6
commit b6f4857ba6
parent 10c45de1d7
6 changed files with 379 additions and 74 deletions
--- a/4
+++ b/4
@ -5,6 +5,10 @@ WORKDIR /app
 ENV PYTHONDONTWRITEBYTECODE=1
 ENV PYTHONUNBUFFERED=1
 RUN apt-get update \
    && apt-get install -y --no-install-recommends git \
    && rm -rf /var/lib/apt/lists/*
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
--- a/app/main.py
+++ b/app/main.py
@ -1,14 +1,19 @@
 from pathlib import Path
-import sqlite3
+import os
 import re
 import sqlite3
 import subprocess
 import sys
 import time
 import unicodedata
 from collections import Counter
-from fastapi import FastAPI
+from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel, Field
 DB_FILE = Path("data/zp_index.sqlite")
 ZPWIKI_ROOT = Path(os.getenv("ZPWIKI_ROOT", "../zpwiki"))
 TECHNICAL_TERMS = {
@ -46,7 +51,7 @@ TECHNICAL_TERMS = {
 app = FastAPI(
    title="ZP Agent API",
    description="API pre vyhľadávanie v repozitári záverečných prác zpwiki.",
-    version="0.1.0",
+    version="0.2.0",
 )
@ -55,18 +60,11 @@ class SearchRequest(BaseModel):
    limit: int = Field(default=10, ge=1, le=50)
-class SearchResult(BaseModel):
+class SyncRequest(BaseModel):
-    score: int
+    pull_git: bool = Field(
-    chunk_id: str
+        default=False,
-    document_path: str
+        description="Ak je true, pred reindexovaním sa vykoná git pull v repozitári zpwiki.",
-    source_url: str
+    )
    title: str | None
    author: str | None
    chunk_index: int
    categories: list[str]
    tags: list[str]
    text: str
    text_length: int
 def normalize_text(text: str) -> str:
@ -268,18 +266,95 @@ def search_database(query: str, limit: int) -> tuple[str, list[dict]]:
    return mode, results[:limit]
 def run_command(command: list[str], cwd: Path | None = None) -> str:
    result = subprocess.run(
        command,
        cwd=cwd,
        text=True,
        capture_output=True,
    )
    output = ""
    if result.stdout:
        output += result.stdout
    if result.stderr:
        output += result.stderr
    if result.returncode != 0:
        raise RuntimeError(output.strip())
    return output.strip()
 def get_index_counts() -> dict:
    """Return row counts for the tables of the SQLite index.

    Returns:
        A dict with the keys ``documents``, ``chunks``, ``tags`` and
        ``categories``. Every count is 0 when the database file is missing.
    """
    if not DB_FILE.exists():
        return {
            "documents": 0,
            "chunks": 0,
            "tags": 0,
            "categories": 0,
        }
    conn = sqlite3.connect(DB_FILE)
    try:
        cursor = conn.cursor()
        return {
            "documents": cursor.execute("SELECT COUNT(*) FROM documents").fetchone()[0],
            "chunks": cursor.execute("SELECT COUNT(*) FROM chunks").fetchone()[0],
            "tags": cursor.execute("SELECT COUNT(*) FROM chunk_tags").fetchone()[0],
            "categories": cursor.execute("SELECT COUNT(*) FROM chunk_categories").fetchone()[0],
        }
    finally:
        # Release the connection even when one of the COUNT queries raises.
        conn.close()
 def rebuild_index(pull_git: bool = False) -> dict:
    """Rebuild the search index, optionally running ``git pull`` first.

    Runs scripts/scan_zpwiki.py, scripts/build_chunks.py and
    scripts/build_sqlite_index.py in that order with the current interpreter.
    The script paths are relative, so they resolve against the process
    working directory.

    Args:
        pull_git: When true, run ``git pull`` inside ZPWIKI_ROOT before
            rebuilding.

    Returns:
        A dict with ``duration_seconds`` (rounded to 2 decimals), ``counts``
        (result of ``get_index_counts``) and ``logs`` (output of each command).

    Raises:
        RuntimeError: If ``pull_git`` is set and ZPWIKI_ROOT does not exist or
            has no ``.git`` entry, or if ``run_command`` reports a failure.
    """
    # Monotonic clock: the measured duration cannot be skewed by wall-clock
    # adjustments that happen while the rebuild is running.
    start = time.monotonic()
    logs = []
    if pull_git:
        if not ZPWIKI_ROOT.exists():
            raise RuntimeError(f"ZPWIKI_ROOT neexistuje: {ZPWIKI_ROOT}")
        if not (ZPWIKI_ROOT / ".git").exists():
            raise RuntimeError(f"Nie je to git repozitár: {ZPWIKI_ROOT}")
        logs.append(run_command(["git", "pull"], cwd=ZPWIKI_ROOT))
    logs.append(run_command([sys.executable, "scripts/scan_zpwiki.py"]))
    logs.append(run_command([sys.executable, "scripts/build_chunks.py"]))
    logs.append(run_command([sys.executable, "scripts/build_sqlite_index.py"]))
    counts = get_index_counts()
    duration = round(time.monotonic() - start, 2)
    return {
        "duration_seconds": duration,
        "counts": counts,
        "logs": logs,
    }
@app.get("/health")
 def health():
    return {
        "status": "ok",
        "database_exists": DB_FILE.exists(),
        "database_path": str(DB_FILE),
        "zpwiki_root": str(ZPWIKI_ROOT),
        "zpwiki_exists": ZPWIKI_ROOT.exists(),
    }
@app.post("/search")
 def search(request: SearchRequest):
-    mode, results = search_database(request.query, request.limit)
+    try:
        mode, results = search_database(request.query, request.limit)
    except FileNotFoundError as error:
        raise HTTPException(status_code=500, detail=str(error)) from error
    return {
        "query": request.query,
@ -287,3 +362,18 @@ def search(request: SearchRequest):
        "count": len(results),
        "results": results,
    }
@app.post("/sync")
 def sync(request: SyncRequest):
    try:
        result = rebuild_index(pull_git=request.pull_git)
    except RuntimeError as error:
        raise HTTPException(status_code=500, detail=str(error)) from error
    return {
        "status": "ok",
        "pull_git": request.pull_git,
        "duration_seconds": result["duration_seconds"],
        "counts": result["counts"],
    }
--- a/docker-compose.yml
+++ b/docker-compose.yml
@ -4,6 +4,9 @@ services:
    container_name: zp-agent-api
    ports:
      - "8000:8000"
    environment:
      - ZPWIKI_ROOT=/zpwiki
    volumes:
      - ./data:/app/data
      - ../zpwiki:/zpwiki
    restart: unless-stopped
--- a/scripts/rebuild_index.py
+++ b/scripts/rebuild_index.py
@ -0,0 +1,107 @@
 from pathlib import Path
 import argparse
 import os
 import sqlite3
 import subprocess
 import sys
 import time
 from rich import print
 ZPWIKI_ROOT = Path(os.getenv("ZPWIKI_ROOT", "../zpwiki"))
 DB_FILE = Path("data/zp_index.sqlite")
 def run_command(command: list[str], cwd: Path | None = None) -> None:
    print(f"[cyan]Spúšťam:[/cyan] {' '.join(command)}")
    result = subprocess.run(
        command,
        cwd=cwd,
        text=True,
        capture_output=True,
    )
    if result.stdout:
        print(result.stdout.strip())
    if result.stderr:
        print(result.stderr.strip())
    if result.returncode != 0:
        raise RuntimeError(
            f"Príkaz zlyhal: {' '.join(command)}"
        )
 def git_pull() -> None:
    """Run ``git pull`` inside ZPWIKI_ROOT.

    Raises:
        RuntimeError: If ZPWIKI_ROOT does not exist or has no ``.git`` entry.
    """
    root_exists = ZPWIKI_ROOT.exists()
    if not root_exists:
        raise RuntimeError(f"ZPWIKI_ROOT neexistuje: {ZPWIKI_ROOT}")
    git_entry = ZPWIKI_ROOT / ".git"
    if not git_entry.exists():
        raise RuntimeError(f"Nie je to git repozitár: {ZPWIKI_ROOT}")
    run_command(["git", "pull"], cwd=ZPWIKI_ROOT)
 def rebuild_index() -> None:
    """Run the scan, chunk and SQLite index scripts in order with the current interpreter."""
    pipeline = (
        "scripts/scan_zpwiki.py",
        "scripts/build_chunks.py",
        "scripts/build_sqlite_index.py",
    )
    for script in pipeline:
        run_command([sys.executable, script])
 def get_counts() -> dict:
    """Return row counts for the tables of the SQLite index.

    Returns:
        A dict with the keys ``documents``, ``chunks``, ``tags`` and
        ``categories``. Every count is 0 when the database file is missing.
    """
    if not DB_FILE.exists():
        return {
            "documents": 0,
            "chunks": 0,
            "tags": 0,
            "categories": 0,
        }
    conn = sqlite3.connect(DB_FILE)
    try:
        cursor = conn.cursor()
        return {
            "documents": cursor.execute("SELECT COUNT(*) FROM documents").fetchone()[0],
            "chunks": cursor.execute("SELECT COUNT(*) FROM chunks").fetchone()[0],
            "tags": cursor.execute("SELECT COUNT(*) FROM chunk_tags").fetchone()[0],
            "categories": cursor.execute("SELECT COUNT(*) FROM chunk_categories").fetchone()[0],
        }
    finally:
        # Release the connection even when one of the COUNT queries raises.
        conn.close()
 def main():
    """Command-line entry point: optionally pull the wiki, rebuild the index, print a summary.

    The ``--pull`` flag runs ``git_pull`` before the rebuild.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--pull",
        action="store_true",
        help="Pred reindexovaním spustí git pull v zpwiki repozitári.",
    )
    args = parser.parse_args()
    # Monotonic clock: the reported duration cannot be skewed by wall-clock
    # adjustments that happen while the rebuild is running.
    start = time.monotonic()
    print(f"[green]ZPWIKI_ROOT:[/green] {ZPWIKI_ROOT}")
    if args.pull:
        git_pull()
    rebuild_index()
    counts = get_counts()
    duration = round(time.monotonic() - start, 2)
    print("[green]Reindex hotový.[/green]")
    print(f"Trvanie: {duration} s")
    print(f"Dokumentov: {counts['documents']}")
    print(f"Chunkov: {counts['chunks']}")
    print(f"Tag záznamov: {counts['tags']}")
    print(f"Kategória záznamov: {counts['categories']}")
 if __name__ == "__main__":
    main()
--- a/scripts/scan_zpwiki.py
+++ b/scripts/scan_zpwiki.py
@ -1,11 +1,12 @@
 from pathlib import Path
 from collections import Counter
 import json
 import os
 import frontmatter
 from rich import print
-ZPWIKI_ROOT = Path("../zpwiki")
+ZPWIKI_ROOT = Path(os.getenv("ZPWIKI_ROOT", "../zpwiki"))
 PAGES_ROOT = ZPWIKI_ROOT / "pages"
 OUTPUT_FILE = Path("data/documents.json")
@ -111,6 +112,7 @@ def main():
    with OUTPUT_FILE.open("w", encoding="utf-8") as file:
        json.dump(documents, file, ensure_ascii=False, indent=2)
    print(f"[green]ZPWIKI_ROOT:[/green] {ZPWIKI_ROOT}")
    print(f"[green]Našiel som README.md súborov:[/green] {len(markdown_files)}")
    print(f"[green]Výstup uložený do:[/green] {OUTPUT_FILE}")
--- a/scripts/search_chunks.py
+++ b/scripts/search_chunks.py
@ -1,89 +1,188 @@
 from pathlib import Path
 import json
 import re
-import sys
+import os
-from collections import Counter
+import frontmatter
 from rich import print
-CHUNKS_FILE = Path("data/chunks.json")
+ZPWIKI_ROOT = Path(os.getenv("ZPWIKI_ROOT", "../zpwiki"))
 PAGES_ROOT = ZPWIKI_ROOT / "pages"
 OUTPUT_FILE = Path("data/chunks.json")
 MAX_CHARS = 1200
 OVERLAP_CHARS = 200
-def tokenize(text: str) -> list[str]:
+def json_safe(value):
-    text = text.lower()
+    if value is None:
-    text = re.sub(r"[^a-záäčďéíĺľňóôŕšťúýž0-9]+", " ", text)
+        return None
-    return [word for word in text.split() if len(word) >= 2]
+
    if isinstance(value, (str, int, float, bool)):
        return value
    if isinstance(value, list):
        return [json_safe(item) for item in value]
    if isinstance(value, dict):
        return {str(key): json_safe(val) for key, val in value.items()}
    return str(value)
-def score_chunk(query_tokens: list[str], chunk: dict) -> int:
+def normalize_list(value):
-    text = " ".join([
+    if value is None:
-        chunk.get("title") or "",
+        return []
        " ".join(chunk.get("tags") or []),
        " ".join(chunk.get("categories") or []),
        chunk.get("author") or "",
        chunk.get("text") or "",
    ])
-    tokens = tokenize(text)
+    if isinstance(value, list):
-    token_counts = Counter(tokens)
+        return [str(item).strip() for item in value if str(item).strip()]
-    score = 0
+    if isinstance(value, str):
        return [item.strip() for item in value.split(",") if item.strip()]
-    for query_token in query_tokens:
+    return [str(value)]
        score += token_counts.get(query_token, 0) * 3
        if query_token in [tag.lower() for tag in chunk.get("tags", [])]:
            score += 10
-        if query_token in [category.lower() for category in chunk.get("categories", [])]:
+def clean_markdown(text: str) -> str:
-            score += 6
+    text = text.replace("\r\n", "\n")
    text = re.sub(r"\n{3,}", "\n\n", text)
    text = text.strip()
    return text
        title = (chunk.get("title") or "").lower()
        if query_token in title:
            score += 5
-    return score
+def split_by_headings(text: str) -> list[str]:
    parts = re.split(r"(?m)(?=^#{1,6}\s+)", text)
    return [part.strip() for part in parts if part.strip()]
 def split_long_text(
    text: str,
    max_chars: int = MAX_CHARS,
    overlap: int = OVERLAP_CHARS,
 ) -> list[str]:
    """Split text into windows of at most ``max_chars`` characters.

    Consecutive windows share ``overlap`` characters. Each window is stripped
    and empty windows are dropped.

    Args:
        text: Text to split.
        max_chars: Maximum window length in characters.
        overlap: Number of characters repeated at the start of the next window.

    Returns:
        ``[text]`` unchanged when it already fits into one window, otherwise
        the list of stripped windows.

    Raises:
        ValueError: If the text needs splitting and ``overlap`` is not smaller
            than ``max_chars``.
    """
    if len(text) <= max_chars:
        return [text]
    if overlap >= max_chars:
        # The window start would never advance and the loop below would spin forever.
        raise ValueError("overlap must be smaller than max_chars")
    chunks = []
    start = 0
    while start < len(text):
        end = start + max_chars
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        if end >= len(text):
            break
        start = max(0, end - overlap)
    return chunks
 def chunk_markdown(text: str) -> list[str]:
    """Split markdown into chunks: first by headings, then by length.

    Returns an empty list when the cleaned text is empty. Heading sections
    longer than MAX_CHARS are passed to ``split_long_text``.
    """
    cleaned = clean_markdown(text)
    if not cleaned:
        return []
    pieces = []
    for section in split_by_headings(cleaned):
        fits = len(section) <= MAX_CHARS
        pieces.extend([section] if fits else split_long_text(section))
    return pieces
 def extract_document(file_path: Path) -> dict:
    """Load a markdown file with front matter and return a document record.

    Categories, tags and author are read from the top-level metadata first
    and fall back to the ``taxonomy`` mapping. The returned ``path`` is
    relative to ZPWIKI_ROOT.
    """
    post = frontmatter.load(file_path)
    metadata = {}
    for key, value in post.metadata.items():
        metadata[key] = json_safe(value)
    taxonomy = metadata.get("taxonomy") or {}
    raw_categories = metadata.get("category") or taxonomy.get("category")
    categories = normalize_list(raw_categories)
    raw_tags = (
        metadata.get("tag")
        or metadata.get("tags")
        or taxonomy.get("tag")
        or taxonomy.get("tags")
    )
    tags = normalize_list(raw_tags)
    author = metadata.get("author") or taxonomy.get("author")
    document = {"path": str(file_path.relative_to(ZPWIKI_ROOT))}
    document["title"] = metadata.get("title")
    document["categories"] = categories
    document["tags"] = tags
    document["published"] = metadata.get("published")
    document["author"] = author
    document["content"] = post.content.strip()
    document["metadata"] = metadata
    return document
 def main():
-    if len(sys.argv) < 2:
+    if not PAGES_ROOT.exists():
-        print("[red]Použitie:[/red] python scripts/search_chunks.py \"rag agent\"")
+        raise SystemExit(f"Neexistuje priečinok: {PAGES_ROOT}")
        raise SystemExit(1)
-    query = " ".join(sys.argv[1:])
+    markdown_files = sorted(PAGES_ROOT.glob("**/README.md"))
    query_tokens = tokenize(query)
-    if not CHUNKS_FILE.exists():
+    all_chunks = []
-        raise SystemExit(f"Súbor neexistuje: {CHUNKS_FILE}")
+    document_count = 0
-    with CHUNKS_FILE.open("r", encoding="utf-8") as file:
+    for file_path in markdown_files:
-        chunks = json.load(file)
+        document = extract_document(file_path)
        chunks = chunk_markdown(document["content"])
-    results = []
+        document_count += 1
-    for chunk in chunks:
+        for index, chunk_text in enumerate(chunks):
-        score = score_chunk(query_tokens, chunk)
+            all_chunks.append({
                "chunk_id": f"{document['path']}::chunk-{index}",
                "document_path": document["path"],
                "title": document["title"],
                "categories": document["categories"],
                "tags": document["tags"],
                "author": document["author"],
                "published": document["published"],
                "chunk_index": index,
                "text": chunk_text,
                "text_length": len(chunk_text),
            })
-        if score > 0:
+    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
            results.append((score, chunk))
-    results.sort(key=lambda item: item[0], reverse=True)
+    with OUTPUT_FILE.open("w", encoding="utf-8") as file:
        json.dump(all_chunks, file, ensure_ascii=False, indent=2)
-    print(f"[bold]Dopyt:[/bold] {query}")
+    print(f"[green]ZPWIKI_ROOT:[/green] {ZPWIKI_ROOT}")
-    print(f"[bold]Počet výsledkov:[/bold] {len(results)}")
+    print(f"[green]Dokumentov:[/green] {document_count}")
    print(f"[green]Chunkov:[/green] {len(all_chunks)}")
    print(f"[green]Výstup uložený do:[/green] {OUTPUT_FILE}")
-    print("\n[bold]Top výsledky:[/bold]\n")
+    if all_chunks:
-
+        print("\n[bold]Ukážka prvého chunku:[/bold]")
-    for rank, (score, chunk) in enumerate(results[:10], start=1):
+        print(all_chunks[0])
        print(f"[cyan]{rank}. Skóre: {score}[/cyan]")
        print(f"[bold]Názov:[/bold] {chunk.get('title')}")
        print(f"[bold]Cesta:[/bold] {chunk.get('document_path')}")
        print(f"[bold]Kategórie:[/bold] {chunk.get('categories')}")
        print(f"[bold]Tagy:[/bold] {chunk.get('tags')}")
        print(f"[bold]Autor:[/bold] {chunk.get('author')}")
        print("[bold]Text:[/bold]")
        print((chunk.get("text") or "")[:700])
        print("-" * 80)
 if __name__ == "__main__":