"""Keyword search over pre-built document chunks.

Reads the chunk list from ``CHUNKS_FILE``, scores every chunk against the
query passed on the command line, and prints the ten best-scoring chunks.
"""

import json
import re
import sys
from collections import Counter
from pathlib import Path

from rich import print
from rich.markup import escape

CHUNKS_FILE = Path("data/chunks.json")

# Any run of characters outside this set (lowercase ASCII letters, the listed
# accented letters, digits) acts as a token separator. Compiled once because
# tokenize() runs for every chunk.
_SEPARATOR_RE = re.compile(r"[^a-záäčďéíĺľňóôŕšťúýž0-9]+")


def tokenize(text: str) -> list[str]:
    """Split *text* into lowercase tokens.

    The text is lowercased, every run of non-word characters is replaced by
    a space, and tokens shorter than two characters are dropped.
    """
    normalized = _SEPARATOR_RE.sub(" ", text.lower())
    return [word for word in normalized.split() if len(word) >= 2]


def score_chunk(query_tokens: list[str], chunk: dict) -> int:
    """Return the relevance score of *chunk* for the given query tokens.

    For each query token (duplicates count each time):

    * +3 per occurrence among the tokens of title, tags, categories,
      author and text,
    * +10 if the token equals a lowercased tag,
    * +6 if the token equals a lowercased category,
    * +5 if the token is a substring of the lowercased title.

    Fields that are missing or ``None`` are treated as empty.
    """
    # ``or`` (rather than a .get() default) also covers keys that are
    # present but hold None, which would otherwise break iteration below.
    title = chunk.get("title") or ""
    tags = chunk.get("tags") or []
    categories = chunk.get("categories") or []

    searchable = " ".join([
        title,
        " ".join(tags),
        " ".join(categories),
        chunk.get("author") or "",
        chunk.get("text") or "",
    ])
    token_counts = Counter(tokenize(searchable))

    # Query-independent lookups are built once, not once per query token.
    tag_names = {tag.lower() for tag in tags}
    category_names = {category.lower() for category in categories}
    title_lower = title.lower()

    score = 0
    for query_token in query_tokens:
        # Counter yields 0 for tokens it has never seen.
        score += token_counts[query_token] * 3

        if query_token in tag_names:
            score += 10

        if query_token in category_names:
            score += 6

        # Deliberately a substring test, so partial words in the title match.
        if query_token in title_lower:
            score += 5

    return score


def _plain(value: object) -> str:
    """Render *value* as text that rich prints literally, not as markup."""
    return escape(str(value))


def main() -> None:
    """Run the search for the query given in ``sys.argv`` and print results.

    Exits with status 1 and a usage message when no query is given, and
    with an error message when ``CHUNKS_FILE`` does not exist.
    """
    if len(sys.argv) < 2:
        print("[red]Použitie:[/red] python scripts/search_chunks.py \"rag agent\"")
        raise SystemExit(1)

    query = " ".join(sys.argv[1:])
    query_tokens = tokenize(query)

    if not CHUNKS_FILE.exists():
        raise SystemExit(f"Súbor neexistuje: {CHUNKS_FILE}")

    with CHUNKS_FILE.open("r", encoding="utf-8") as file:
        chunks = json.load(file)

    results = []
    for chunk in chunks:
        score = score_chunk(query_tokens, chunk)
        if score > 0:
            results.append((score, chunk))

    # Sort on the score only: chunks are dicts and cannot be compared.
    results.sort(key=lambda item: item[0], reverse=True)

    # Everything that originates from the user or the data file is escaped,
    # because rich would otherwise interpret "[...]" sequences as markup
    # (and raise on an unmatched closing tag).
    print(f"[bold]Dopyt:[/bold] {_plain(query)}")
    print(f"[bold]Počet výsledkov:[/bold] {len(results)}")
    print("\n[bold]Top výsledky:[/bold]\n")

    for rank, (score, chunk) in enumerate(results[:10], start=1):
        print(f"[cyan]{rank}. Skóre: {score}[/cyan]")
        print(f"[bold]Názov:[/bold] {_plain(chunk.get('title'))}")
        print(f"[bold]Cesta:[/bold] {_plain(chunk.get('document_path'))}")
        print(f"[bold]Kategórie:[/bold] {_plain(chunk.get('categories'))}")
        print(f"[bold]Tagy:[/bold] {_plain(chunk.get('tags'))}")
        print(f"[bold]Autor:[/bold] {_plain(chunk.get('author'))}")
        print("[bold]Text:[/bold]")
        # Truncate first, then escape, so the preview stays at 700 source
        # characters and no escape sequence is cut in half.
        print(escape((chunk.get("text") or "")[:700]))
        print("-" * 80)


if __name__ == "__main__":
    main()