dp-zp-agent/scripts/search_chunks.py
2026-06-03 21:04:03 +02:00

91 lines
2.5 KiB
Python

from pathlib import Path
import json
import re
import sys
from collections import Counter
from rich import print
# Path of the JSON file that main() loads the chunks from. It is relative,
# so it resolves against the current working directory of the process.
CHUNKS_FILE = Path("data/chunks.json")
def tokenize(text: str) -> list[str]:
    """Split *text* into lowercase word tokens.

    The text is lowercased, then every run of characters that is not an
    ASCII letter, a digit, or one of the accented letters listed in the
    pattern is collapsed into a single space. The result is split on
    whitespace and tokens shorter than two characters are discarded.

    Args:
        text: Arbitrary input text.

    Returns:
        The remaining tokens, in their original order.
    """
    normalized = re.sub(r"[^a-záäčďéíĺľňóôŕšťúýž0-9]+", " ", text.lower())

    tokens = []
    for candidate in normalized.split():
        # Single-character tokens are dropped.
        if len(candidate) < 2:
            continue
        tokens.append(candidate)
    return tokens
def score_chunk(query_tokens: list[str], chunk: dict) -> int:
    """Return a keyword-relevance score of *chunk* for the query tokens.

    For every query token the score grows by:

    * 3 for each occurrence of the token among the tokens of the chunk's
      title, tags, categories, author and text combined;
    * 10 if the token equals one of the chunk's tags (case-insensitive);
    * 6 if the token equals one of the chunk's categories
      (case-insensitive);
    * 5 if the token is a substring of the lowercased title.

    Fields that are missing or hold ``None`` are treated as empty.

    Args:
        query_tokens: Tokens of the search query. They are compared against
            lowercased chunk data, so they should already be lowercase
            (tokenize() produces them that way).
        chunk: Chunk record. The keys read are ``title``, ``tags``,
            ``categories``, ``author`` and ``text``.

    Returns:
        The accumulated score; 0 when nothing matched.
    """
    # ``or`` (rather than a ``get`` default) also covers keys that are
    # present but hold ``None``, which a ``get`` default does not replace.
    title = chunk.get("title") or ""
    tags = chunk.get("tags") or []
    categories = chunk.get("categories") or []

    searchable = " ".join([
        title,
        " ".join(tags),
        " ".join(categories),
        chunk.get("author") or "",
        chunk.get("text") or "",
    ])
    token_counts = Counter(tokenize(searchable))

    # Per-chunk values that do not depend on the query token are built once
    # instead of once per token.
    tag_names = {tag.lower() for tag in tags}
    category_names = {category.lower() for category in categories}
    lowered_title = title.lower()

    score = 0
    for query_token in query_tokens:
        score += token_counts.get(query_token, 0) * 3
        if query_token in tag_names:
            score += 10
        if query_token in category_names:
            score += 6
        # Substring match on the title, not a whole-token match.
        if query_token in lowered_title:
            score += 5
    return score
def main():
    """Search the chunk file for the command-line query and print the hits.

    The query is all command-line arguments joined with spaces. Every chunk
    loaded from CHUNKS_FILE is scored with score_chunk(); chunks with a
    positive score are sorted by score in descending order and at most the
    first ten are printed, each with its metadata and the first 700
    characters of its text.

    Raises:
        SystemExit: With status 1 when no query argument is given, or with
            an error message when CHUNKS_FILE does not exist.
    """
    # ``print`` in this module is rich's print, which interprets console
    # markup. Imported locally so the file's import block stays untouched.
    from rich.markup import escape

    def literal(value) -> str:
        """Return *value* as text that rich prints verbatim."""
        # Without escaping, data such as "[red]" would be consumed as a style
        # tag and a stray "[/x]" would raise rich.errors.MarkupError.
        return escape(str(value))

    if len(sys.argv) < 2:
        print("[red]Použitie:[/red] python scripts/search_chunks.py \"rag agent\"")
        raise SystemExit(1)

    query = " ".join(sys.argv[1:])
    query_tokens = tokenize(query)

    if not CHUNKS_FILE.exists():
        raise SystemExit(f"Súbor neexistuje: {CHUNKS_FILE}")

    with CHUNKS_FILE.open("r", encoding="utf-8") as file:
        chunks = json.load(file)

    # Keep only chunks that matched at least one scoring rule.
    results = []
    for chunk in chunks:
        score = score_chunk(query_tokens, chunk)
        if score > 0:
            results.append((score, chunk))

    # Sort on the score alone so that the chunk dicts are never compared.
    results.sort(key=lambda item: item[0], reverse=True)

    print(f"[bold]Dopyt:[/bold] {literal(query)}")
    print(f"[bold]Počet výsledkov:[/bold] {len(results)}")
    print("\n[bold]Top výsledky:[/bold]\n")

    for rank, (score, chunk) in enumerate(results[:10], start=1):
        print(f"[cyan]{rank}. Skóre: {score}[/cyan]")
        print(f"[bold]Názov:[/bold] {literal(chunk.get('title'))}")
        print(f"[bold]Cesta:[/bold] {literal(chunk.get('document_path'))}")
        print(f"[bold]Kategórie:[/bold] {literal(chunk.get('categories'))}")
        print(f"[bold]Tagy:[/bold] {literal(chunk.get('tags'))}")
        print(f"[bold]Autor:[/bold] {literal(chunk.get('author'))}")
        print("[bold]Text:[/bold]")
        # Truncate first, then escape, so the 700-character limit applies to
        # the original text rather than to the escaped form.
        print(literal((chunk.get("text") or "")[:700]))
        print("-" * 80)
# Run the search only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()