Parser a Fast API

2026-06-03 21:04:03 +02:00 · 2026-06-03 21:04:03 +02:00 · fe79c9c2ed
commit fe79c9c2ed
11 changed files with 21410 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,5 @@
+.venv/
+__pycache__/
+*.pyc
+data/*.sqlite
+data/*.db
--- a/app/init.py
+++ b/app/init.py
--- a/app/main.py
+++ b/app/main.py
@ -0,0 +1,289 @@
+from pathlib import Path
+import sqlite3
+import re
+import unicodedata
+from collections import Counter
+
+from fastapi import FastAPI
+from pydantic import BaseModel, Field
+
+
+DB_FILE = Path("data/zp_index.sqlite")
+
+
+# Lowercase keywords that mark a query as topical. detect_search_mode() checks
+# query tokens against this set so that a two-word query containing one of
+# them is not treated as a person's name.
+TECHNICAL_TERMS = {
+    "rag",
+    "agent",
+    "graph",
+    "knowledge",
+    "chatbot",
+    "nlp",
+    "llm",
+    "lm",
+    "openwebui",
+    "docker",
+    "webhook",
+    "database",
+    "db",
+    "neo4j",
+    "python",
+    "search",
+    "retrieval",
+    "generation",
+    "embedding",
+    "vector",
+    "vectors",
+    "langchain",
+    "graphrag",
+    "qa",
+    "question",
+    "answer",
+    "cloud",
+    "api",
+}
+
+
+# FastAPI application object; the route decorators further down register
+# their handlers on it. The description is Slovak for "API for searching the
+# zpwiki repository of final theses".
+app = FastAPI(
+    title="ZP Agent API",
+    description="API pre vyhľadávanie v repozitári záverečných prác zpwiki.",
+    version="0.1.0",
+)
+
+
+# Request body accepted by POST /search.
+class SearchRequest(BaseModel):
+    # Free-text query; must contain at least one character.
+    query: str = Field(..., min_length=1)
+    # Maximum number of hits to return; between 1 and 50, 10 when omitted.
+    limit: int = Field(default=10, ge=1, le=50)
+
+
+# Shape of a single search hit; the field names match the keys of the result
+# dictionaries built in search_database().
+# NOTE(review): no endpoint visible in this file references this model (no
+# response_model is set) — confirm whether POST /search should use it.
+class SearchResult(BaseModel):
+    score: int  # relevance score computed by score_item()
+    chunk_id: str
+    document_path: str
+    source_url: str  # public URL derived from document_path
+    title: str | None
+    author: str | None
+    chunk_index: int
+    categories: list[str]
+    tags: list[str]
+    text: str
+    text_length: int
+
+
+def normalize_text(text: str) -> str:
+    text = text.lower()
+    text = text.replace("_", " ")
+    text = text.replace("/", " ")
+    text = text.replace("-", " ")
+
+    text = unicodedata.normalize("NFKD", text)
+    text = "".join(ch for ch in text if not unicodedata.combining(ch))
+
+    text = re.sub(r"[^a-z0-9]+", " ", text)
+    return text.strip()
+
+
+def tokenize(text: str) -> list[str]:
+    text = normalize_text(text)
+    return [word for word in text.split() if len(word) >= 2]
+
+
+def detect_search_mode(query_tokens: list[str]) -> str:
+    if not query_tokens:
+        return "topic"
+
+    has_technical_term = any(token in TECHNICAL_TERMS for token in query_tokens)
+
+    if len(query_tokens) == 2 and not has_technical_term:
+        return "person"
+
+    return "topic"
+
+
+def score_tokens(query_tokens: list[str], field_tokens: list[str], weight: int) -> int:
+    counts = Counter(field_tokens)
+    score = 0
+
+    for token in query_tokens:
+        score += counts.get(token, 0) * weight
+
+    return score
+
+
+def contains_all_tokens(query_tokens: list[str], field_tokens: list[str]) -> bool:
+    return all(token in field_tokens for token in query_tokens)
+
+
+def get_tags(conn: sqlite3.Connection, chunk_id: str) -> list[str]:
+    rows = conn.execute(
+        "SELECT tag FROM chunk_tags WHERE chunk_id = ?",
+        (chunk_id,),
+    ).fetchall()
+
+    return [row[0] for row in rows]
+
+
+def get_categories(conn: sqlite3.Connection, chunk_id: str) -> list[str]:
+    rows = conn.execute(
+        "SELECT category FROM chunk_categories WHERE chunk_id = ?",
+        (chunk_id,),
+    ).fetchall()
+
+    return [row[0] for row in rows]
+
+
+def person_match(query_tokens: list[str], item: dict) -> bool:
+    title_tokens = tokenize(item.get("title") or "")
+    path_tokens = tokenize(item.get("document_path") or "")
+    author_tokens = tokenize(item.get("author") or "")
+    text_tokens = tokenize(item.get("text") or "")
+
+    if contains_all_tokens(query_tokens, title_tokens):
+        return True
+
+    if contains_all_tokens(query_tokens, path_tokens):
+        return True
+
+    if contains_all_tokens(query_tokens, author_tokens):
+        return True
+
+    if contains_all_tokens(query_tokens, text_tokens):
+        return True
+
+    return False
+
+
+def score_item(query: str, query_tokens: list[str], item: dict, mode: str) -> int:
+    title = item.get("title") or ""
+    path = item.get("document_path") or ""
+    author = item.get("author") or ""
+    text = item.get("text") or ""
+    tags = item.get("tags") or []
+    categories = item.get("categories") or []
+
+    title_tokens = tokenize(title)
+    path_tokens = tokenize(path)
+    author_tokens = tokenize(author)
+    text_tokens = tokenize(text)
+    tag_tokens = tokenize(" ".join(tags))
+    category_tokens = tokenize(" ".join(categories))
+
+    score = 0
+
+    if mode == "person":
+        score += score_tokens(query_tokens, title_tokens, 30)
+        score += score_tokens(query_tokens, path_tokens, 30)
+        score += score_tokens(query_tokens, author_tokens, 15)
+        score += score_tokens(query_tokens, text_tokens, 2)
+
+        if contains_all_tokens(query_tokens, title_tokens):
+            score += 100
+
+        if contains_all_tokens(query_tokens, path_tokens):
+            score += 100
+
+        if contains_all_tokens(query_tokens, author_tokens):
+            score += 60
+
+        return score
+
+    score += score_tokens(query_tokens, title_tokens, 12)
+    score += score_tokens(query_tokens, path_tokens, 12)
+    score += score_tokens(query_tokens, tag_tokens, 10)
+    score += score_tokens(query_tokens, category_tokens, 6)
+    score += score_tokens(query_tokens, author_tokens, 3)
+    score += score_tokens(query_tokens, text_tokens, 2)
+
+    normalized_query = normalize_text(query)
+    normalized_title = normalize_text(title)
+    normalized_path = normalize_text(path)
+
+    if normalized_query and normalized_query in normalized_title:
+        score += 30
+
+    if normalized_query and normalized_query in normalized_path:
+        score += 30
+
+    matched_title_tokens = sum(1 for token in query_tokens if token in title_tokens)
+    matched_path_tokens = sum(1 for token in query_tokens if token in path_tokens)
+
+    if query_tokens and matched_title_tokens == len(query_tokens):
+        score += 25
+
+    if query_tokens and matched_path_tokens == len(query_tokens):
+        score += 25
+
+    return score
+
+
+def make_source_url(document_path: str) -> str:
+    clean_path = document_path.replace("pages/", "").replace("/README.md", "")
+    return f"https://zp.kemt.fei.tuke.sk/{clean_path}"
+
+
+def search_database(query: str, limit: int) -> tuple[str, list[dict]]:
+    if not DB_FILE.exists():
+        raise FileNotFoundError(f"Databáza neexistuje: {DB_FILE}")
+
+    query_tokens = tokenize(query)
+    mode = detect_search_mode(query_tokens)
+
+    conn = sqlite3.connect(DB_FILE)
+
+    rows = conn.execute("""
+        SELECT chunk_id, document_path, title, author, chunk_index, text, text_length
+        FROM chunks
+    """).fetchall()
+
+    results = []
+
+    for row in rows:
+        chunk_id, document_path, title, author, chunk_index, text, text_length = row
+
+        item = {
+            "chunk_id": chunk_id,
+            "document_path": document_path,
+            "title": title,
+            "author": author,
+            "chunk_index": chunk_index,
+            "text": text,
+            "text_length": text_length,
+            "tags": get_tags(conn, chunk_id),
+            "categories": get_categories(conn, chunk_id),
+        }
+
+        if mode == "person" and not person_match(query_tokens, item):
+            continue
+
+        score = score_item(query, query_tokens, item, mode)
+
+        if score > 0:
+            item["score"] = score
+            item["source_url"] = make_source_url(document_path)
+            results.append(item)
+
+    conn.close()
+
+    results.sort(key=lambda item: item["score"], reverse=True)
+
+    return mode, results[:limit]
+
+
+@app.get("/health")
+def health():
+    return {
+        "status": "ok",
+        "database_exists": DB_FILE.exists(),
+        "database_path": str(DB_FILE),
+    }
+
+
+@app.post("/search")
+def search(request: SearchRequest):
+    mode, results = search_database(request.query, request.limit)
+
+    return {
+        "query": request.query,
+        "mode": mode,
+        "count": len(results),
+        "results": results,
+    }
--- a/data/chunks.json
+++ b/data/chunks.json
--- a/data/documents.json
+++ b/data/documents.json
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,23 @@
+annotated-doc==0.0.4
+annotated-types==0.7.0
+anyio==4.13.0
+click==8.4.1
+exceptiongroup==1.3.1
+fastapi==0.136.3
+gitdb==4.0.12
+GitPython==3.1.50
+h11==0.16.0
+idna==3.18
+markdown-it-py==4.2.0
+mdurl==0.1.2
+pydantic==2.13.4
+pydantic_core==2.46.4
+Pygments==2.20.0
+python-frontmatter==1.3.0
+PyYAML==6.0.3
+rich==15.0.0
+smmap==5.0.3
+starlette==1.2.1
+typing-inspection==0.4.2
+typing_extensions==4.15.0
+uvicorn==0.48.0
--- a/scripts/build_chunks.py
+++ b/scripts/build_chunks.py
@ -0,0 +1,183 @@
+from pathlib import Path
+import json
+import re
+import frontmatter
+from rich import print
+
+
+ZPWIKI_ROOT = Path("../zpwiki")
+PAGES_ROOT = ZPWIKI_ROOT / "pages"
+OUTPUT_FILE = Path("data/chunks.json")
+
+MAX_CHARS = 1200
+OVERLAP_CHARS = 200
+
+
+def json_safe(value):
+    if value is None:
+        return None
+
+    if isinstance(value, (str, int, float, bool)):
+        return value
+
+    if isinstance(value, list):
+        return [json_safe(item) for item in value]
+
+    if isinstance(value, dict):
+        return {str(key): json_safe(val) for key, val in value.items()}
+
+    return str(value)
+
+
+def normalize_list(value):
+    if value is None:
+        return []
+
+    if isinstance(value, list):
+        return [str(item).strip() for item in value if str(item).strip()]
+
+    if isinstance(value, str):
+        return [item.strip() for item in value.split(",") if item.strip()]
+
+    return [str(value)]
+
+
+def clean_markdown(text: str) -> str:
+    text = text.replace("\r\n", "\n")
+    text = re.sub(r"\n{3,}", "\n\n", text)
+    text = text.strip()
+    return text
+
+
+def split_by_headings(text: str) -> list[str]:
+    parts = re.split(r"(?m)(?=^#{1,6}\s+)", text)
+    return [part.strip() for part in parts if part.strip()]
+
+
+def split_long_text(text: str, max_chars: int = MAX_CHARS, overlap: int = OVERLAP_CHARS) -> list[str]:
+    if len(text) <= max_chars:
+        return [text]
+
+    chunks = []
+    start = 0
+
+    while start < len(text):
+        end = start + max_chars
+        chunk = text[start:end].strip()
+
+        if chunk:
+            chunks.append(chunk)
+
+        if end >= len(text):
+            break
+
+        start = max(0, end - overlap)
+
+    return chunks
+
+
+def chunk_markdown(text: str) -> list[str]:
+    text = clean_markdown(text)
+
+    if not text:
+        return []
+
+    heading_parts = split_by_headings(text)
+
+    chunks = []
+
+    for part in heading_parts:
+        if len(part) <= MAX_CHARS:
+            chunks.append(part)
+        else:
+            chunks.extend(split_long_text(part))
+
+    return chunks
+
+
+def extract_document(file_path: Path) -> dict:
+    post = frontmatter.load(file_path)
+
+    metadata = {
+        key: json_safe(value)
+        for key, value in post.metadata.items()
+    }
+
+    taxonomy = metadata.get("taxonomy") or {}
+
+    categories = normalize_list(
+        metadata.get("category")
+        or taxonomy.get("category")
+    )
+
+    tags = normalize_list(
+        metadata.get("tag")
+        or metadata.get("tags")
+        or taxonomy.get("tag")
+        or taxonomy.get("tags")
+    )
+
+    author = (
+        metadata.get("author")
+        or taxonomy.get("author")
+    )
+
+    relative_path = file_path.relative_to(ZPWIKI_ROOT)
+
+    return {
+        "path": str(relative_path),
+        "title": metadata.get("title"),
+        "categories": categories,
+        "tags": tags,
+        "published": metadata.get("published"),
+        "author": author,
+        "content": post.content.strip(),
+        "metadata": metadata,
+    }
+
+
+def main():
+    if not PAGES_ROOT.exists():
+        raise SystemExit(f"Neexistuje priečinok: {PAGES_ROOT}")
+
+    markdown_files = sorted(PAGES_ROOT.glob("**/README.md"))
+
+    all_chunks = []
+    document_count = 0
+
+    for file_path in markdown_files:
+        document = extract_document(file_path)
+        chunks = chunk_markdown(document["content"])
+
+        document_count += 1
+
+        for index, chunk_text in enumerate(chunks):
+            all_chunks.append({
+                "chunk_id": f"{document['path']}::chunk-{index}",
+                "document_path": document["path"],
+                "title": document["title"],
+                "categories": document["categories"],
+                "tags": document["tags"],
+                "author": document["author"],
+                "published": document["published"],
+                "chunk_index": index,
+                "text": chunk_text,
+                "text_length": len(chunk_text),
+            })
+
+    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
+
+    with OUTPUT_FILE.open("w", encoding="utf-8") as file:
+        json.dump(all_chunks, file, ensure_ascii=False, indent=2)
+
+    print(f"[green]Dokumentov:[/green] {document_count}")
+    print(f"[green]Chunkov:[/green] {len(all_chunks)}")
+    print(f"[green]Výstup uložený do:[/green] {OUTPUT_FILE}")
+
+    if all_chunks:
+        print("\n[bold]Ukážka prvého chunku:[/bold]")
+        print(all_chunks[0])
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/build_sqlite_index.py
+++ b/scripts/build_sqlite_index.py
@ -0,0 +1,167 @@
+from pathlib import Path
+import json
+import sqlite3
+from rich import print
+
+
+DOCUMENTS_FILE = Path("data/documents.json")
+CHUNKS_FILE = Path("data/chunks.json")
+DB_FILE = Path("data/zp_index.sqlite")
+
+
+def create_tables(conn: sqlite3.Connection):
+    cursor = conn.cursor()
+
+    cursor.execute("DROP TABLE IF EXISTS chunk_tags")
+    cursor.execute("DROP TABLE IF EXISTS chunk_categories")
+    cursor.execute("DROP TABLE IF EXISTS chunks")
+    cursor.execute("DROP TABLE IF EXISTS documents")
+
+    cursor.execute("""
+        CREATE TABLE documents (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            path TEXT UNIQUE NOT NULL,
+            title TEXT,
+            author TEXT,
+            published INTEGER,
+            content_length INTEGER,
+            metadata_json TEXT
+        )
+    """)
+
+    cursor.execute("""
+        CREATE TABLE chunks (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            chunk_id TEXT UNIQUE NOT NULL,
+            document_path TEXT NOT NULL,
+            title TEXT,
+            author TEXT,
+            chunk_index INTEGER,
+            text TEXT NOT NULL,
+            text_length INTEGER,
+            FOREIGN KEY(document_path) REFERENCES documents(path)
+        )
+    """)
+
+    cursor.execute("""
+        CREATE TABLE chunk_tags (
+            chunk_id TEXT NOT NULL,
+            tag TEXT NOT NULL
+        )
+    """)
+
+    cursor.execute("""
+        CREATE TABLE chunk_categories (
+            chunk_id TEXT NOT NULL,
+            category TEXT NOT NULL
+        )
+    """)
+
+    cursor.execute("CREATE INDEX idx_documents_path ON documents(path)")
+    cursor.execute("CREATE INDEX idx_chunks_document_path ON chunks(document_path)")
+    cursor.execute("CREATE INDEX idx_chunks_title ON chunks(title)")
+    cursor.execute("CREATE INDEX idx_chunk_tags_tag ON chunk_tags(tag)")
+    cursor.execute("CREATE INDEX idx_chunk_categories_category ON chunk_categories(category)")
+
+    conn.commit()
+
+
+def load_json(path: Path):
+    if not path.exists():
+        raise SystemExit(f"Súbor neexistuje: {path}")
+
+    with path.open("r", encoding="utf-8") as file:
+        return json.load(file)
+
+
+def insert_documents(conn: sqlite3.Connection, documents: list[dict]):
+    cursor = conn.cursor()
+
+    for doc in documents:
+        cursor.execute("""
+            INSERT INTO documents (
+                path, title, author, published, content_length, metadata_json
+            )
+            VALUES (?, ?, ?, ?, ?, ?)
+        """, (
+            doc.get("path"),
+            doc.get("title"),
+            doc.get("author"),
+            1 if doc.get("published") else 0,
+            doc.get("content_length"),
+            json.dumps(doc.get("metadata") or {}, ensure_ascii=False),
+        ))
+
+    conn.commit()
+
+
+def insert_chunks(conn: sqlite3.Connection, chunks: list[dict]):
+    cursor = conn.cursor()
+
+    for chunk in chunks:
+        cursor.execute("""
+            INSERT INTO chunks (
+                chunk_id, document_path, title, author, chunk_index, text, text_length
+            )
+            VALUES (?, ?, ?, ?, ?, ?, ?)
+        """, (
+            chunk.get("chunk_id"),
+            chunk.get("document_path"),
+            chunk.get("title"),
+            chunk.get("author"),
+            chunk.get("chunk_index"),
+            chunk.get("text"),
+            chunk.get("text_length"),
+        ))
+
+        for tag in chunk.get("tags") or []:
+            cursor.execute("""
+                INSERT INTO chunk_tags (chunk_id, tag)
+                VALUES (?, ?)
+            """, (
+                chunk.get("chunk_id"),
+                tag,
+            ))
+
+        for category in chunk.get("categories") or []:
+            cursor.execute("""
+                INSERT INTO chunk_categories (chunk_id, category)
+                VALUES (?, ?)
+            """, (
+                chunk.get("chunk_id"),
+                category,
+            ))
+
+    conn.commit()
+
+
+def main():
+    documents = load_json(DOCUMENTS_FILE)
+    chunks = load_json(CHUNKS_FILE)
+
+    DB_FILE.parent.mkdir(parents=True, exist_ok=True)
+
+    conn = sqlite3.connect(DB_FILE)
+
+    create_tables(conn)
+    insert_documents(conn, documents)
+    insert_chunks(conn, chunks)
+
+    cursor = conn.cursor()
+
+    document_count = cursor.execute("SELECT COUNT(*) FROM documents").fetchone()[0]
+    chunk_count = cursor.execute("SELECT COUNT(*) FROM chunks").fetchone()[0]
+    tag_count = cursor.execute("SELECT COUNT(*) FROM chunk_tags").fetchone()[0]
+    category_count = cursor.execute("SELECT COUNT(*) FROM chunk_categories").fetchone()[0]
+
+    conn.close()
+
+    print(f"[green]SQLite index vytvorený:[/green] {DB_FILE}")
+    print(f"Dokumentov: {document_count}")
+    print(f"Chunkov: {chunk_count}")
+    print(f"Tag záznamov: {tag_count}")
+    print(f"Kategória záznamov: {category_count}")
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/scan_zpwiki.py
+++ b/scripts/scan_zpwiki.py
@ -0,0 +1,139 @@
+from pathlib import Path
+from collections import Counter
+import json
+import frontmatter
+from rich import print
+
+
+ZPWIKI_ROOT = Path("../zpwiki")
+PAGES_ROOT = ZPWIKI_ROOT / "pages"
+OUTPUT_FILE = Path("data/documents.json")
+
+
+def json_safe(value):
+    if value is None:
+        return None
+
+    if isinstance(value, (str, int, float, bool)):
+        return value
+
+    if isinstance(value, list):
+        return [json_safe(item) for item in value]
+
+    if isinstance(value, dict):
+        return {str(key): json_safe(val) for key, val in value.items()}
+
+    return str(value)
+
+
+def normalize_list(value):
+    if value is None:
+        return []
+
+    if isinstance(value, list):
+        return [str(item).strip() for item in value if str(item).strip()]
+
+    if isinstance(value, str):
+        return [item.strip() for item in value.split(",") if item.strip()]
+
+    return [str(value)]
+
+
+def main():
+    """Scan every ``README.md`` under ``PAGES_ROOT`` and write a document catalogue.
+
+    One record per file is written to ``OUTPUT_FILE`` as JSON. Afterwards the
+    most common metadata keys, categories, tags and authors are printed,
+    followed by the first document record.
+    """
+    if not PAGES_ROOT.exists():
+        raise SystemExit(f"Neexistuje priečinok: {PAGES_ROOT}")
+
+    # Sorted so the output order is stable between runs.
+    markdown_files = sorted(PAGES_ROOT.glob("**/README.md"))
+
+    documents = []
+    # Frequency counters used only for the summary printed at the end.
+    metadata_keys = Counter()
+    categories_counter = Counter()
+    tags_counter = Counter()
+    authors_counter = Counter()
+
+    for file_path in markdown_files:
+        post = frontmatter.load(file_path)
+
+        # Make the front matter JSON-serializable before storing it.
+        metadata = {
+            key: json_safe(value)
+            for key, value in post.metadata.items()
+        }
+
+        taxonomy = metadata.get("taxonomy") or {}
+        content = post.content.strip()
+
+        for key in metadata.keys():
+            metadata_keys[key] += 1
+
+        # Top-level keys take precedence; the taxonomy mapping is the fallback.
+        categories = normalize_list(
+            metadata.get("category")
+            or taxonomy.get("category")
+        )
+
+        tags = normalize_list(
+            metadata.get("tag")
+            or metadata.get("tags")
+            or taxonomy.get("tag")
+            or taxonomy.get("tags")
+        )
+
+        author = (
+            metadata.get("author")
+            or taxonomy.get("author")
+        )
+
+        for category in categories:
+            categories_counter[category] += 1
+
+        for tag in tags:
+            tags_counter[tag] += 1
+
+        if author:
+            # str() because the author value is not guaranteed to be a string.
+            authors_counter[str(author)] += 1
+
+        # Paths are stored relative to the wiki root, so they begin with "pages".
+        relative_path = file_path.relative_to(ZPWIKI_ROOT)
+
+        documents.append({
+            "path": str(relative_path),
+            "title": metadata.get("title"),
+            "categories": categories,
+            "tags": tags,
+            "published": metadata.get("published"),
+            "author": author,
+            "taxonomy": taxonomy,
+            "metadata": metadata,
+            "content_preview": content[:500],
+            "content_length": len(content),
+        })
+
+    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
+
+    with OUTPUT_FILE.open("w", encoding="utf-8") as file:
+        json.dump(documents, file, ensure_ascii=False, indent=2)
+
+    print(f"[green]Našiel som README.md súborov:[/green] {len(markdown_files)}")
+    print(f"[green]Výstup uložený do:[/green] {OUTPUT_FILE}")
+
+    # Summary tables: the most frequent values of each kind.
+    print("\n[bold]Najčastejšie metadata kľúče:[/bold]")
+    for key, count in metadata_keys.most_common(30):
+        print(f"{key}: {count}")
+
+    print("\n[bold]Najčastejšie kategórie:[/bold]")
+    for key, count in categories_counter.most_common(30):
+        print(f"{key}: {count}")
+
+    print("\n[bold]Najčastejšie tagy:[/bold]")
+    for key, count in tags_counter.most_common(40):
+        print(f"{key}: {count}")
+
+    print("\n[bold]Najčastejší autori:[/bold]")
+    for key, count in authors_counter.most_common(20):
+        print(f"{key}: {count}")
+
+    print("\n[bold]Ukážka prvého dokumentu:[/bold]")
+    if documents:
+        print(documents[0])
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/search_chunks.py
+++ b/scripts/search_chunks.py
@ -0,0 +1,90 @@
+from pathlib import Path
+import json
+import re
+import sys
+from collections import Counter
+from rich import print
+
+
+CHUNKS_FILE = Path("data/chunks.json")
+
+
+def tokenize(text: str) -> list[str]:
+    text = text.lower()
+    text = re.sub(r"[^a-záäčďéíĺľňóôŕšťúýž0-9]+", " ", text)
+    return [word for word in text.split() if len(word) >= 2]
+
+
+def score_chunk(query_tokens: list[str], chunk: dict) -> int:
+    text = " ".join([
+        chunk.get("title") or "",
+        " ".join(chunk.get("tags") or []),
+        " ".join(chunk.get("categories") or []),
+        chunk.get("author") or "",
+        chunk.get("text") or "",
+    ])
+
+    tokens = tokenize(text)
+    token_counts = Counter(tokens)
+
+    score = 0
+
+    for query_token in query_tokens:
+        score += token_counts.get(query_token, 0) * 3
+
+        if query_token in [tag.lower() for tag in chunk.get("tags", [])]:
+            score += 10
+
+        if query_token in [category.lower() for category in chunk.get("categories", [])]:
+            score += 6
+
+        title = (chunk.get("title") or "").lower()
+        if query_token in title:
+            score += 5
+
+    return score
+
+
+def main():
+    if len(sys.argv) < 2:
+        print("[red]Použitie:[/red] python scripts/search_chunks.py \"rag agent\"")
+        raise SystemExit(1)
+
+    query = " ".join(sys.argv[1:])
+    query_tokens = tokenize(query)
+
+    if not CHUNKS_FILE.exists():
+        raise SystemExit(f"Súbor neexistuje: {CHUNKS_FILE}")
+
+    with CHUNKS_FILE.open("r", encoding="utf-8") as file:
+        chunks = json.load(file)
+
+    results = []
+
+    for chunk in chunks:
+        score = score_chunk(query_tokens, chunk)
+
+        if score > 0:
+            results.append((score, chunk))
+
+    results.sort(key=lambda item: item[0], reverse=True)
+
+    print(f"[bold]Dopyt:[/bold] {query}")
+    print(f"[bold]Počet výsledkov:[/bold] {len(results)}")
+
+    print("\n[bold]Top výsledky:[/bold]\n")
+
+    for rank, (score, chunk) in enumerate(results[:10], start=1):
+        print(f"[cyan]{rank}. Skóre: {score}[/cyan]")
+        print(f"[bold]Názov:[/bold] {chunk.get('title')}")
+        print(f"[bold]Cesta:[/bold] {chunk.get('document_path')}")
+        print(f"[bold]Kategórie:[/bold] {chunk.get('categories')}")
+        print(f"[bold]Tagy:[/bold] {chunk.get('tags')}")
+        print(f"[bold]Autor:[/bold] {chunk.get('author')}")
+        print("[bold]Text:[/bold]")
+        print((chunk.get("text") or "")[:700])
+        print("-" * 80)
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/search_db.py
+++ b/scripts/search_db.py
@ -0,0 +1,271 @@
+from pathlib import Path
+import sqlite3
+import re
+import sys
+import unicodedata
+from collections import Counter
+from rich import print
+
+
+DB_FILE = Path("data/zp_index.sqlite")
+
+
+# Lowercase keywords that mark a query as topical. detect_search_mode() checks
+# query tokens against this set so that a two-word query containing one of
+# them is not treated as a person's name.
+TECHNICAL_TERMS = {
+    "rag",
+    "agent",
+    "graph",
+    "knowledge",
+    "chatbot",
+    "nlp",
+    "llm",
+    "lm",
+    "openwebui",
+    "docker",
+    "webhook",
+    "database",
+    "db",
+    "neo4j",
+    "python",
+    "search",
+    "retrieval",
+    "generation",
+    "embedding",
+    "vector",
+    "vectors",
+    "langchain",
+    "graphrag",
+    "qa",
+    "question",
+    "answer",
+    "cloud",
+    "api",
+}
+
+
+def normalize_text(text: str) -> str:
+    text = text.lower()
+    text = text.replace("_", " ")
+    text = text.replace("/", " ")
+    text = text.replace("-", " ")
+
+    text = unicodedata.normalize("NFKD", text)
+    text = "".join(ch for ch in text if not unicodedata.combining(ch))
+
+    text = re.sub(r"[^a-z0-9]+", " ", text)
+    return text.strip()
+
+
+def tokenize(text: str) -> list[str]:
+    text = normalize_text(text)
+    return [word for word in text.split() if len(word) >= 2]
+
+
+def detect_search_mode(query_tokens: list[str]) -> str:
+    """
+    person režim:
+    napríklad jan ptak, jan holp, daniel hladek
+
+    topic režim:
+    napríklad rag agent, knowledge graph, nlp chatbot
+    """
+
+    if not query_tokens:
+        return "topic"
+
+    has_technical_term = any(token in TECHNICAL_TERMS for token in query_tokens)
+
+    if len(query_tokens) == 2 and not has_technical_term:
+        return "person"
+
+    return "topic"
+
+
+def score_tokens(query_tokens: list[str], field_tokens: list[str], weight: int) -> int:
+    counts = Counter(field_tokens)
+    score = 0
+
+    for token in query_tokens:
+        score += counts.get(token, 0) * weight
+
+    return score
+
+
+def get_tags(conn: sqlite3.Connection, chunk_id: str) -> list[str]:
+    rows = conn.execute(
+        "SELECT tag FROM chunk_tags WHERE chunk_id = ?",
+        (chunk_id,)
+    ).fetchall()
+
+    return [row[0] for row in rows]
+
+
+def get_categories(conn: sqlite3.Connection, chunk_id: str) -> list[str]:
+    rows = conn.execute(
+        "SELECT category FROM chunk_categories WHERE chunk_id = ?",
+        (chunk_id,)
+    ).fetchall()
+
+    return [row[0] for row in rows]
+
+
+def contains_all_tokens(query_tokens: list[str], field_tokens: list[str]) -> bool:
+    return all(token in field_tokens for token in query_tokens)
+
+
+def person_match(query_tokens: list[str], item: dict) -> bool:
+    title_tokens = tokenize(item.get("title") or "")
+    path_tokens = tokenize(item.get("document_path") or "")
+    author_tokens = tokenize(item.get("author") or "")
+    text_tokens = tokenize(item.get("text") or "")
+
+    if contains_all_tokens(query_tokens, title_tokens):
+        return True
+
+    if contains_all_tokens(query_tokens, path_tokens):
+        return True
+
+    if contains_all_tokens(query_tokens, author_tokens):
+        return True
+
+    """
+    Text berieme slabšie, ale necháme ho ako fallback.
+    Napríklad ak meno nie je v title, ale je v obsahu.
+    """
+    if contains_all_tokens(query_tokens, text_tokens):
+        return True
+
+    return False
+
+
+def score_item(query: str, query_tokens: list[str], item: dict, mode: str) -> int:
+    title = item.get("title") or ""
+    path = item.get("document_path") or ""
+    author = item.get("author") or ""
+    text = item.get("text") or ""
+    tags = item.get("tags") or []
+    categories = item.get("categories") or []
+
+    title_tokens = tokenize(title)
+    path_tokens = tokenize(path)
+    author_tokens = tokenize(author)
+    text_tokens = tokenize(text)
+    tag_tokens = tokenize(" ".join(tags))
+    category_tokens = tokenize(" ".join(categories))
+
+    score = 0
+
+    if mode == "person":
+        score += score_tokens(query_tokens, title_tokens, 30)
+        score += score_tokens(query_tokens, path_tokens, 30)
+        score += score_tokens(query_tokens, author_tokens, 15)
+        score += score_tokens(query_tokens, text_tokens, 2)
+
+        if contains_all_tokens(query_tokens, title_tokens):
+            score += 100
+
+        if contains_all_tokens(query_tokens, path_tokens):
+            score += 100
+
+        if contains_all_tokens(query_tokens, author_tokens):
+            score += 60
+
+        return score
+
+    score += score_tokens(query_tokens, title_tokens, 12)
+    score += score_tokens(query_tokens, path_tokens, 12)
+    score += score_tokens(query_tokens, tag_tokens, 10)
+    score += score_tokens(query_tokens, category_tokens, 6)
+    score += score_tokens(query_tokens, author_tokens, 3)
+    score += score_tokens(query_tokens, text_tokens, 2)
+
+    normalized_query = normalize_text(query)
+    normalized_title = normalize_text(title)
+    normalized_path = normalize_text(path)
+
+    if normalized_query and normalized_query in normalized_title:
+        score += 30
+
+    if normalized_query and normalized_query in normalized_path:
+        score += 30
+
+    matched_title_tokens = sum(1 for token in query_tokens if token in title_tokens)
+    matched_path_tokens = sum(1 for token in query_tokens if token in path_tokens)
+
+    if query_tokens and matched_title_tokens == len(query_tokens):
+        score += 25
+
+    if query_tokens and matched_path_tokens == len(query_tokens):
+        score += 25
+
+    return score
+
+
+def main():
+    if len(sys.argv) < 2:
+        print("[red]Použitie:[/red] python scripts/search_db.py \"rag agent\"")
+        raise SystemExit(1)
+
+    if not DB_FILE.exists():
+        raise SystemExit(f"Databáza neexistuje: {DB_FILE}")
+
+    query = " ".join(sys.argv[1:])
+    query_tokens = tokenize(query)
+    mode = detect_search_mode(query_tokens)
+
+    conn = sqlite3.connect(DB_FILE)
+
+    rows = conn.execute("""
+        SELECT chunk_id, document_path, title, author, chunk_index, text, text_length
+        FROM chunks
+    """).fetchall()
+
+    results = []
+
+    for row in rows:
+        chunk_id, document_path, title, author, chunk_index, text, text_length = row
+
+        item = {
+            "chunk_id": chunk_id,
+            "document_path": document_path,
+            "title": title,
+            "author": author,
+            "chunk_index": chunk_index,
+            "text": text,
+            "text_length": text_length,
+            "tags": get_tags(conn, chunk_id),
+            "categories": get_categories(conn, chunk_id),
+        }
+
+        if mode == "person" and not person_match(query_tokens, item):
+            continue
+
+        score = score_item(query, query_tokens, item, mode)
+
+        if score > 0:
+            item["score"] = score
+            results.append(item)
+
+    results.sort(key=lambda item: item["score"], reverse=True)
+
+    print(f"[bold]Dopyt:[/bold] {query}")
+    print(f"[bold]Režim:[/bold] {mode}")
+    print(f"[bold]Počet výsledkov:[/bold] {len(results)}")
+    print("\n[bold]Top výsledky:[/bold]\n")
+
+    for rank, item in enumerate(results[:10], start=1):
+        print(f"[cyan]{rank}. Skóre: {item['score']}[/cyan]")
+        print(f"[bold]Názov:[/bold] {item['title']}")
+        print(f"[bold]Cesta:[/bold] {item['document_path']}")
+        print(f"[bold]Chunk:[/bold] {item['chunk_index']}")
+        print(f"[bold]Kategórie:[/bold] {item['categories']}")
+        print(f"[bold]Tagy:[/bold] {item['tags']}")
+        print(f"[bold]Autor:[/bold] {item['author']}")
+        print("[bold]Text:[/bold]")
+        print((item["text"] or "")[:700])
+        print("-" * 80)
+
+    conn.close()
+
+
+if __name__ == "__main__":
+    main()