Add sync and reindex endpoint

This commit is contained in:
Ján Pták 2026-06-04 17:19:18 +02:00
parent 10c45de1d7
commit b6f4857ba6
6 changed files with 379 additions and 74 deletions

View File

@ -5,6 +5,10 @@ WORKDIR /app
ENV PYTHONDONTWRITEBYTECODE=1 ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1 ENV PYTHONUNBUFFERED=1
RUN apt-get update \
&& apt-get install -y --no-install-recommends git \
&& rm -rf /var/lib/apt/lists/*
COPY requirements.txt . COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt RUN pip install --no-cache-dir -r requirements.txt

View File

@ -1,14 +1,19 @@
from pathlib import Path from pathlib import Path
import sqlite3 import os
import re import re
import sqlite3
import subprocess
import sys
import time
import unicodedata import unicodedata
from collections import Counter from collections import Counter
from fastapi import FastAPI from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
DB_FILE = Path("data/zp_index.sqlite") DB_FILE = Path("data/zp_index.sqlite")
ZPWIKI_ROOT = Path(os.getenv("ZPWIKI_ROOT", "../zpwiki"))
TECHNICAL_TERMS = { TECHNICAL_TERMS = {
@ -46,7 +51,7 @@ TECHNICAL_TERMS = {
app = FastAPI( app = FastAPI(
title="ZP Agent API", title="ZP Agent API",
description="API pre vyhľadávanie v repozitári záverečných prác zpwiki.", description="API pre vyhľadávanie v repozitári záverečných prác zpwiki.",
version="0.1.0", version="0.2.0",
) )
@ -55,18 +60,11 @@ class SearchRequest(BaseModel):
limit: int = Field(default=10, ge=1, le=50) limit: int = Field(default=10, ge=1, le=50)
class SearchResult(BaseModel): class SyncRequest(BaseModel):
score: int pull_git: bool = Field(
chunk_id: str default=False,
document_path: str description="Ak je true, pred reindexovaním sa vykoná git pull v repozitári zpwiki.",
source_url: str )
title: str | None
author: str | None
chunk_index: int
categories: list[str]
tags: list[str]
text: str
text_length: int
def normalize_text(text: str) -> str: def normalize_text(text: str) -> str:
@ -268,18 +266,95 @@ def search_database(query: str, limit: int) -> tuple[str, list[dict]]:
return mode, results[:limit] return mode, results[:limit]
def run_command(command: list[str], cwd: Path | None = None) -> str:
result = subprocess.run(
command,
cwd=cwd,
text=True,
capture_output=True,
)
output = ""
if result.stdout:
output += result.stdout
if result.stderr:
output += result.stderr
if result.returncode != 0:
raise RuntimeError(output.strip())
return output.strip()
def get_index_counts() -> dict:
    """Return row counts for the index tables.

    Returns:
        A dict with the keys ``documents``, ``chunks``, ``tags`` and
        ``categories``. Every count is 0 when the database file does not
        exist.

    Raises:
        sqlite3.Error: If the database exists but a query fails.
    """
    queries = {
        "documents": "SELECT COUNT(*) FROM documents",
        "chunks": "SELECT COUNT(*) FROM chunks",
        "tags": "SELECT COUNT(*) FROM chunk_tags",
        "categories": "SELECT COUNT(*) FROM chunk_categories",
    }

    if not DB_FILE.exists():
        return dict.fromkeys(queries, 0)

    conn = sqlite3.connect(DB_FILE)
    try:
        return {
            name: conn.execute(sql).fetchone()[0]
            for name, sql in queries.items()
        }
    finally:
        # Close the connection even when a query raises (e.g. missing table).
        conn.close()
def rebuild_index(pull_git: bool = False) -> dict:
    """Re-run the indexing scripts and report the result.

    Runs ``scan_zpwiki.py``, ``build_chunks.py`` and
    ``build_sqlite_index.py`` in that order with the current interpreter.
    The script paths are relative, so they resolve against the process
    working directory.

    Args:
        pull_git: When true, run ``git pull`` inside ``ZPWIKI_ROOT`` first.

    Returns:
        A dict with ``duration_seconds`` (rounded to two decimals),
        ``counts`` (from ``get_index_counts``) and ``logs`` (the output of
        each command that was run, in order).

    Raises:
        RuntimeError: If ``ZPWIKI_ROOT`` is missing or has no ``.git`` entry
            while ``pull_git`` is true, or if any command fails.
    """
    # perf_counter is monotonic, so the duration is immune to clock changes.
    started = time.perf_counter()
    logs = []

    if pull_git:
        if not ZPWIKI_ROOT.exists():
            raise RuntimeError(f"ZPWIKI_ROOT neexistuje: {ZPWIKI_ROOT}")

        if not (ZPWIKI_ROOT / ".git").exists():
            raise RuntimeError(f"Nie je to git repozitár: {ZPWIKI_ROOT}")

        logs.append(run_command(["git", "pull"], cwd=ZPWIKI_ROOT))

    logs.append(run_command([sys.executable, "scripts/scan_zpwiki.py"]))
    logs.append(run_command([sys.executable, "scripts/build_chunks.py"]))
    logs.append(run_command([sys.executable, "scripts/build_sqlite_index.py"]))

    counts = get_index_counts()
    duration = round(time.perf_counter() - started, 2)

    return {
        "duration_seconds": duration,
        "counts": counts,
        "logs": logs,
    }
@app.get("/health") @app.get("/health")
def health(): def health():
return { return {
"status": "ok", "status": "ok",
"database_exists": DB_FILE.exists(), "database_exists": DB_FILE.exists(),
"database_path": str(DB_FILE), "database_path": str(DB_FILE),
"zpwiki_root": str(ZPWIKI_ROOT),
"zpwiki_exists": ZPWIKI_ROOT.exists(),
} }
@app.post("/search") @app.post("/search")
def search(request: SearchRequest): def search(request: SearchRequest):
try:
mode, results = search_database(request.query, request.limit) mode, results = search_database(request.query, request.limit)
except FileNotFoundError as error:
raise HTTPException(status_code=500, detail=str(error)) from error
return { return {
"query": request.query, "query": request.query,
@ -287,3 +362,18 @@ def search(request: SearchRequest):
"count": len(results), "count": len(results),
"results": results, "results": results,
} }
@app.post("/sync")
def sync(request: SyncRequest):
    """Rebuild the search index, optionally pulling the repository first.

    A ``RuntimeError`` from ``rebuild_index`` is reported to the client as
    HTTP 500 with the error text as the detail. The collected command logs
    are not included in the response.
    """
    try:
        outcome = rebuild_index(pull_git=request.pull_git)
    except RuntimeError as error:
        raise HTTPException(status_code=500, detail=str(error)) from error

    response = {"status": "ok", "pull_git": request.pull_git}
    response["duration_seconds"] = outcome["duration_seconds"]
    response["counts"] = outcome["counts"]
    return response

View File

@ -4,6 +4,9 @@ services:
container_name: zp-agent-api container_name: zp-agent-api
ports: ports:
- "8000:8000" - "8000:8000"
environment:
- ZPWIKI_ROOT=/zpwiki
volumes: volumes:
- ./data:/app/data - ./data:/app/data
- ../zpwiki:/zpwiki
restart: unless-stopped restart: unless-stopped

107
scripts/rebuild_index.py Normal file
View File

@ -0,0 +1,107 @@
from pathlib import Path
import argparse
import os
import sqlite3
import subprocess
import sys
import time
from rich import print
ZPWIKI_ROOT = Path(os.getenv("ZPWIKI_ROOT", "../zpwiki"))
DB_FILE = Path("data/zp_index.sqlite")
def run_command(command: list[str], cwd: Path | None = None) -> None:
print(f"[cyan]Spúšťam:[/cyan] {' '.join(command)}")
result = subprocess.run(
command,
cwd=cwd,
text=True,
capture_output=True,
)
if result.stdout:
print(result.stdout.strip())
if result.stderr:
print(result.stderr.strip())
if result.returncode != 0:
raise RuntimeError(
f"Príkaz zlyhal: {' '.join(command)}"
)
def git_pull() -> None:
    """Run ``git pull`` inside ``ZPWIKI_ROOT``.

    Raises:
        RuntimeError: If ``ZPWIKI_ROOT`` does not exist, has no ``.git``
            entry, or the pull itself fails.
    """
    git_dir = ZPWIKI_ROOT / ".git"

    if not ZPWIKI_ROOT.exists():
        raise RuntimeError(f"ZPWIKI_ROOT neexistuje: {ZPWIKI_ROOT}")
    if not git_dir.exists():
        raise RuntimeError(f"Nie je to git repozitár: {ZPWIKI_ROOT}")

    run_command(["git", "pull"], cwd=ZPWIKI_ROOT)
def rebuild_index() -> None:
    """Run the three indexing scripts in order with the current interpreter.

    The first failing script stops the sequence, because ``run_command``
    raises on a non-zero exit.
    """
    pipeline = (
        "scripts/scan_zpwiki.py",
        "scripts/build_chunks.py",
        "scripts/build_sqlite_index.py",
    )
    for script in pipeline:
        run_command([sys.executable, script])
def get_counts() -> dict:
    """Return row counts for the index tables.

    Returns:
        A dict with the keys ``documents``, ``chunks``, ``tags`` and
        ``categories``. Every count is 0 when the database file does not
        exist.

    Raises:
        sqlite3.Error: If the database exists but a query fails.
    """
    queries = {
        "documents": "SELECT COUNT(*) FROM documents",
        "chunks": "SELECT COUNT(*) FROM chunks",
        "tags": "SELECT COUNT(*) FROM chunk_tags",
        "categories": "SELECT COUNT(*) FROM chunk_categories",
    }

    if not DB_FILE.exists():
        return dict.fromkeys(queries, 0)

    conn = sqlite3.connect(DB_FILE)
    try:
        return {
            name: conn.execute(sql).fetchone()[0]
            for name, sql in queries.items()
        }
    finally:
        # Close the connection even when a query raises (e.g. missing table).
        conn.close()
def main():
    """Command-line entry point: optionally pull the repository, then reindex.

    Accepts a single ``--pull`` flag. After the rebuild it prints the elapsed
    time and the row counts read from the index database.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--pull",
        action="store_true",
        help="Pred reindexovaním spustí git pull v zpwiki repozitári.",
    )
    args = parser.parse_args()

    # perf_counter is monotonic, so the duration is immune to clock changes.
    started = time.perf_counter()

    print(f"[green]ZPWIKI_ROOT:[/green] {ZPWIKI_ROOT}")

    if args.pull:
        git_pull()

    rebuild_index()

    counts = get_counts()
    duration = round(time.perf_counter() - started, 2)

    print("[green]Reindex hotový.[/green]")
    print(f"Trvanie: {duration} s")
    print(f"Dokumentov: {counts['documents']}")
    print(f"Chunkov: {counts['chunks']}")
    print(f"Tag záznamov: {counts['tags']}")
    print(f"Kategória záznamov: {counts['categories']}")
if __name__ == "__main__":
main()

View File

@ -1,11 +1,12 @@
from pathlib import Path from pathlib import Path
from collections import Counter from collections import Counter
import json import json
import os
import frontmatter import frontmatter
from rich import print from rich import print
ZPWIKI_ROOT = Path("../zpwiki") ZPWIKI_ROOT = Path(os.getenv("ZPWIKI_ROOT", "../zpwiki"))
PAGES_ROOT = ZPWIKI_ROOT / "pages" PAGES_ROOT = ZPWIKI_ROOT / "pages"
OUTPUT_FILE = Path("data/documents.json") OUTPUT_FILE = Path("data/documents.json")
@ -111,6 +112,7 @@ def main():
with OUTPUT_FILE.open("w", encoding="utf-8") as file: with OUTPUT_FILE.open("w", encoding="utf-8") as file:
json.dump(documents, file, ensure_ascii=False, indent=2) json.dump(documents, file, ensure_ascii=False, indent=2)
print(f"[green]ZPWIKI_ROOT:[/green] {ZPWIKI_ROOT}")
print(f"[green]Našiel som README.md súborov:[/green] {len(markdown_files)}") print(f"[green]Našiel som README.md súborov:[/green] {len(markdown_files)}")
print(f"[green]Výstup uložený do:[/green] {OUTPUT_FILE}") print(f"[green]Výstup uložený do:[/green] {OUTPUT_FILE}")

View File

@ -1,89 +1,188 @@
from pathlib import Path from pathlib import Path
import json import json
import re import re
import sys import os
from collections import Counter import frontmatter
from rich import print from rich import print
CHUNKS_FILE = Path("data/chunks.json") ZPWIKI_ROOT = Path(os.getenv("ZPWIKI_ROOT", "../zpwiki"))
PAGES_ROOT = ZPWIKI_ROOT / "pages"
OUTPUT_FILE = Path("data/chunks.json")
MAX_CHARS = 1200
OVERLAP_CHARS = 200
def tokenize(text: str) -> list[str]: def json_safe(value):
text = text.lower() if value is None:
text = re.sub(r"[^a-záäčďéíĺľňóôŕšťúýž0-9]+", " ", text) return None
return [word for word in text.split() if len(word) >= 2]
if isinstance(value, (str, int, float, bool)):
return value
if isinstance(value, list):
return [json_safe(item) for item in value]
if isinstance(value, dict):
return {str(key): json_safe(val) for key, val in value.items()}
return str(value)
def score_chunk(query_tokens: list[str], chunk: dict) -> int: def normalize_list(value):
text = " ".join([ if value is None:
chunk.get("title") or "", return []
" ".join(chunk.get("tags") or []),
" ".join(chunk.get("categories") or []),
chunk.get("author") or "",
chunk.get("text") or "",
])
tokens = tokenize(text) if isinstance(value, list):
token_counts = Counter(tokens) return [str(item).strip() for item in value if str(item).strip()]
score = 0 if isinstance(value, str):
return [item.strip() for item in value.split(",") if item.strip()]
for query_token in query_tokens: return [str(value)]
score += token_counts.get(query_token, 0) * 3
if query_token in [tag.lower() for tag in chunk.get("tags", [])]:
score += 10
if query_token in [category.lower() for category in chunk.get("categories", [])]: def clean_markdown(text: str) -> str:
score += 6 text = text.replace("\r\n", "\n")
text = re.sub(r"\n{3,}", "\n\n", text)
text = text.strip()
return text
title = (chunk.get("title") or "").lower()
if query_token in title:
score += 5
return score def split_by_headings(text: str) -> list[str]:
parts = re.split(r"(?m)(?=^#{1,6}\s+)", text)
return [part.strip() for part in parts if part.strip()]
def split_long_text(
    text: str,
    max_chars: int = MAX_CHARS,
    overlap: int = OVERLAP_CHARS,
) -> list[str]:
    """Split text into overlapping windows of at most ``max_chars`` characters.

    Args:
        text: The text to split.
        max_chars: Maximum length of a window before stripping.
        overlap: Number of characters the start of each window shares with
            the end of the previous one.

    Returns:
        ``[text]`` unchanged when it already fits; otherwise the stripped,
        non-empty windows in order.

    Raises:
        ValueError: If the text needs splitting but ``max_chars`` is not
            positive or ``overlap`` is not smaller than ``max_chars``; the
            window would never advance.
    """
    if len(text) <= max_chars:
        return [text]

    # Checked after the early return so calls that never needed to split keep
    # working exactly as before.
    if max_chars <= 0 or overlap >= max_chars:
        raise ValueError(
            "max_chars must be positive and overlap must be smaller than "
            f"max_chars (got max_chars={max_chars}, overlap={overlap})"
        )

    step = max_chars - overlap
    chunks = []

    for start in range(0, len(text), step):
        window = text[start:start + max_chars].strip()
        if window:
            chunks.append(window)

        # This window reached the end of the text; a further one would only
        # repeat the tail.
        if start + max_chars >= len(text):
            break

    return chunks
def chunk_markdown(text: str) -> list[str]:
    """Turn a markdown document into a list of chunk strings.

    The text is cleaned, split at headings, and any section longer than
    ``MAX_CHARS`` is further split by ``split_long_text``. Returns an empty
    list when the cleaned text is empty.
    """
    cleaned = clean_markdown(text)
    if not cleaned:
        return []

    pieces = []
    for section in split_by_headings(cleaned):
        if len(section) > MAX_CHARS:
            pieces.extend(split_long_text(section))
        else:
            pieces.append(section)

    return pieces
def extract_document(file_path: Path) -> dict:
    """Load one markdown file and return its metadata and content.

    Categories, tags and author are read from the top-level front matter
    first and fall back to the ``taxonomy`` mapping.

    Args:
        file_path: Path of the markdown file; it must lie under
            ``ZPWIKI_ROOT``, because the stored path is relative to it.

    Returns:
        A dict with the keys ``path``, ``title``, ``categories``, ``tags``,
        ``published``, ``author``, ``content`` and ``metadata``.
    """
    post = frontmatter.load(file_path)

    metadata = {key: json_safe(value) for key, value in post.metadata.items()}

    taxonomy = metadata.get("taxonomy")
    if not isinstance(taxonomy, dict):
        # A missing or malformed taxonomy (e.g. a plain string) must not
        # abort the run with AttributeError on .get().
        taxonomy = {}

    categories = normalize_list(
        metadata.get("category")
        or taxonomy.get("category")
    )

    tags = normalize_list(
        metadata.get("tag")
        or metadata.get("tags")
        or taxonomy.get("tag")
        or taxonomy.get("tags")
    )

    author = (
        metadata.get("author")
        or taxonomy.get("author")
    )

    relative_path = file_path.relative_to(ZPWIKI_ROOT)

    return {
        "path": str(relative_path),
        "title": metadata.get("title"),
        "categories": categories,
        "tags": tags,
        "published": metadata.get("published"),
        "author": author,
        "content": post.content.strip(),
        "metadata": metadata,
    }
def main(): def main():
if len(sys.argv) < 2: if not PAGES_ROOT.exists():
print("[red]Použitie:[/red] python scripts/search_chunks.py \"rag agent\"") raise SystemExit(f"Neexistuje priečinok: {PAGES_ROOT}")
raise SystemExit(1)
query = " ".join(sys.argv[1:]) markdown_files = sorted(PAGES_ROOT.glob("**/README.md"))
query_tokens = tokenize(query)
if not CHUNKS_FILE.exists(): all_chunks = []
raise SystemExit(f"Súbor neexistuje: {CHUNKS_FILE}") document_count = 0
with CHUNKS_FILE.open("r", encoding="utf-8") as file: for file_path in markdown_files:
chunks = json.load(file) document = extract_document(file_path)
chunks = chunk_markdown(document["content"])
results = [] document_count += 1
for chunk in chunks: for index, chunk_text in enumerate(chunks):
score = score_chunk(query_tokens, chunk) all_chunks.append({
"chunk_id": f"{document['path']}::chunk-{index}",
"document_path": document["path"],
"title": document["title"],
"categories": document["categories"],
"tags": document["tags"],
"author": document["author"],
"published": document["published"],
"chunk_index": index,
"text": chunk_text,
"text_length": len(chunk_text),
})
if score > 0: OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
results.append((score, chunk))
results.sort(key=lambda item: item[0], reverse=True) with OUTPUT_FILE.open("w", encoding="utf-8") as file:
json.dump(all_chunks, file, ensure_ascii=False, indent=2)
print(f"[bold]Dopyt:[/bold] {query}") print(f"[green]ZPWIKI_ROOT:[/green] {ZPWIKI_ROOT}")
print(f"[bold]Počet výsledkov:[/bold] {len(results)}") print(f"[green]Dokumentov:[/green] {document_count}")
print(f"[green]Chunkov:[/green] {len(all_chunks)}")
print(f"[green]Výstup uložený do:[/green] {OUTPUT_FILE}")
print("\n[bold]Top výsledky:[/bold]\n") if all_chunks:
print("\n[bold]Ukážka prvého chunku:[/bold]")
for rank, (score, chunk) in enumerate(results[:10], start=1): print(all_chunks[0])
print(f"[cyan]{rank}. Skóre: {score}[/cyan]")
print(f"[bold]Názov:[/bold] {chunk.get('title')}")
print(f"[bold]Cesta:[/bold] {chunk.get('document_path')}")
print(f"[bold]Kategórie:[/bold] {chunk.get('categories')}")
print(f"[bold]Tagy:[/bold] {chunk.get('tags')}")
print(f"[bold]Autor:[/bold] {chunk.get('author')}")
print("[bold]Text:[/bold]")
print((chunk.get("text") or "")[:700])
print("-" * 80)
if __name__ == "__main__": if __name__ == "__main__":