91 lines
2.5 KiB
Python
91 lines
2.5 KiB
Python
import json
import re
import sys
import unicodedata
from collections import Counter
from pathlib import Path

from rich import print
|
|
|
|
|
|
# JSON file holding the chunks to search; the path is relative, so it is
# resolved against the current working directory at run time.
CHUNKS_FILE = Path("data") / "chunks.json"
|
|
|
|
|
|
# Any run of characters outside ASCII lowercase letters, Slovak accented
# letters and digits separates two tokens.
_SEPARATOR_RE = re.compile(r"[^a-záäčďéíĺľňóôŕšťúýž0-9]+")


def tokenize(text: str) -> list[str]:
    """Split *text* into lowercase word tokens for keyword matching.

    The text is lowercased and NFC-normalized, every run of characters that
    is not an ASCII letter, a Slovak accented letter or a digit is treated as
    a separator, and tokens shorter than two characters are dropped.

    Args:
        text: Arbitrary text (a query or chunk content).

    Returns:
        The tokens in their original order; an empty list when nothing
        qualifies.
    """
    # Decomposed input ("c" + combining caron) must be composed first;
    # otherwise the combining mark counts as a separator and splits the word.
    normalized = unicodedata.normalize("NFC", text.lower())
    words = _SEPARATOR_RE.sub(" ", normalized).split()
    return [word for word in words if len(word) >= 2]
|
|
|
|
|
|
def score_chunk(query_tokens: list[str], chunk: dict) -> int:
    """Return a keyword relevance score for *chunk* against *query_tokens*.

    For each query token (a repeated token is scored again) the score grows by:

    * 3 for every occurrence of the token among the tokens of the chunk's
      title, tags, categories, author and text;
    * 10 if the token equals one of the chunk's tags, ignoring case;
    * 6 if the token equals one of the chunk's categories, ignoring case;
    * 5 if the token occurs anywhere in the lowercased title (substring
      match, so a token can match part of a longer title word).

    Missing or ``None`` metadata fields are treated as empty.

    Args:
        query_tokens: Tokens of the search query, as produced by ``tokenize``.
        chunk: Chunk record; the keys ``title``, ``tags``, ``categories``,
            ``author`` and ``text`` are read if present.

    Returns:
        The accumulated score; 0 when nothing matches.
    """
    # Normalize once so an explicit ``null`` in the JSON behaves like a
    # missing key everywhere below.
    title = chunk.get("title") or ""
    tags = chunk.get("tags") or []
    categories = chunk.get("categories") or []

    searchable = " ".join([
        title,
        " ".join(tags),
        " ".join(categories),
        chunk.get("author") or "",
        chunk.get("text") or "",
    ])
    token_counts = Counter(tokenize(searchable))

    # These do not depend on the query token, so build them once rather than
    # on every loop iteration.
    tag_names = {tag.lower() for tag in tags}
    category_names = {category.lower() for category in categories}
    title_lower = title.lower()

    score = 0
    for query_token in query_tokens:
        # Counter yields 0 for tokens that never occur.
        score += token_counts[query_token] * 3

        if query_token in tag_names:
            score += 10

        if query_token in category_names:
            score += 6

        if query_token in title_lower:
            score += 5

    return score
|
|
|
|
|
|
def main():
    """Search the chunk file for the command-line query and print the top hits.

    The query is all command-line arguments joined by spaces. Every chunk
    loaded from ``CHUNKS_FILE`` is scored with ``score_chunk``; chunks with a
    score above zero are sorted by descending score and the first ten are
    printed together with their metadata and up to 700 characters of text.

    Raises:
        SystemExit: With status 1 (after printing a usage line) when no query
            argument is given, or with an error message when ``CHUNKS_FILE``
            does not exist.
    """
    # Imported locally so this function carries its own dependency. The query
    # and the chunk fields are arbitrary text: unescaped, rich would read
    # "[...]" inside them as console markup and either drop it from the
    # output or raise MarkupError on an unmatched closing tag.
    from rich.markup import escape

    if len(sys.argv) < 2:
        print("[red]Použitie:[/red] python scripts/search_chunks.py \"rag agent\"")
        raise SystemExit(1)

    query = " ".join(sys.argv[1:])
    query_tokens = tokenize(query)

    if not CHUNKS_FILE.exists():
        raise SystemExit(f"Súbor neexistuje: {CHUNKS_FILE}")

    with CHUNKS_FILE.open("r", encoding="utf-8") as file:
        chunks = json.load(file)

    results = []
    for chunk in chunks:
        score = score_chunk(query_tokens, chunk)
        if score > 0:
            results.append((score, chunk))

    # Stable sort: chunks with equal scores keep their order from the file.
    results.sort(key=lambda item: item[0], reverse=True)

    print(f"[bold]Dopyt:[/bold] {escape(query)}")
    print(f"[bold]Počet výsledkov:[/bold] {len(results)}")

    print("\n[bold]Top výsledky:[/bold]\n")

    for rank, (score, chunk) in enumerate(results[:10], start=1):
        print(f"[cyan]{rank}. Skóre: {score}[/cyan]")
        print(f"[bold]Názov:[/bold] {escape(str(chunk.get('title')))}")
        print(f"[bold]Cesta:[/bold] {escape(str(chunk.get('document_path')))}")
        print(f"[bold]Kategórie:[/bold] {escape(str(chunk.get('categories')))}")
        print(f"[bold]Tagy:[/bold] {escape(str(chunk.get('tags')))}")
        print(f"[bold]Autor:[/bold] {escape(str(chunk.get('author')))}")
        print("[bold]Text:[/bold]")
        # Truncate first, then escape, so the cut never lands inside an
        # escape sequence.
        print(escape((chunk.get("text") or "")[:700]))
        print("-" * 80)
|
|
|
|
|
|
# Run the search only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|