# dp-zp-agent/scripts/search_chunks.py
#
# 190 lines
# 4.4 KiB
# Python

from pathlib import Path
import json
import re
import os
import frontmatter
from rich import print
# Root of the wiki checkout; the ZPWIKI_ROOT environment variable overrides
# the relative default.
ZPWIKI_ROOT = Path(os.environ.get("ZPWIKI_ROOT", "../zpwiki"))

# Directory below the wiki root that is scanned for pages.
PAGES_ROOT = ZPWIKI_ROOT.joinpath("pages")

# JSON file that receives the generated chunk records.
OUTPUT_FILE = Path("data", "chunks.json")

# Chunking limits, both measured in characters: the maximum size of one chunk
# and how many characters consecutive chunks of a long section share.
MAX_CHARS = 1_200
OVERLAP_CHARS = 200
def json_safe(value):
    """Return a JSON-serializable version of *value*.

    ``None``, strings, numbers and booleans are returned unchanged. Lists and
    dicts are converted recursively, with dict keys turned into strings.
    Every other object is replaced by ``str(value)``.
    """
    if value is None or isinstance(value, (str, int, float, bool)):
        return value

    if isinstance(value, list):
        return list(map(json_safe, value))

    if isinstance(value, dict):
        converted = {}
        for key, item in value.items():
            converted[str(key)] = json_safe(item)
        return converted

    # Anything else (tuples, sets, bytes, custom objects, ...) is stringified.
    return str(value)
def normalize_list(value):
    """Coerce a front-matter value into a list of non-empty strings.

    * ``None`` gives an empty list.
    * A list has every item stringified and stripped; blank items are dropped.
    * A string is split on commas, with the same stripping and filtering.
    * Any other value becomes a one-element list holding ``str(value)``.
    """
    if value is None:
        return []

    if isinstance(value, list):
        candidates = map(str, value)
    elif isinstance(value, str):
        candidates = value.split(",")
    else:
        return [str(value)]

    stripped = (candidate.strip() for candidate in candidates)
    return [entry for entry in stripped if entry]
def clean_markdown(text: str) -> str:
    """Normalize line endings and blank-line runs in a markdown string.

    Windows line endings (``\\r\\n``) become ``\\n``, runs of three or more
    newlines collapse to exactly two, and leading/trailing whitespace is
    removed.
    """
    unix_text = text.replace("\r\n", "\n")
    return re.sub(r"\n{3,}", "\n\n", unix_text).strip()
def split_by_headings(text: str) -> list[str]:
    """Split markdown into sections, each starting at an ATX heading.

    A heading is a line that begins at column 0 with one to six ``#``
    characters followed by whitespace. Text before the first heading forms its
    own section. Sections are stripped and empty ones are dropped.

    Lines inside fenced code blocks (three or more backticks or tildes) are
    never treated as headings, so a shell or Python comment such as
    ``# install`` inside a code sample does not cut the sample in two. A fence
    that is never closed keeps the rest of the text in one section.

    Args:
        text: Markdown source.

    Returns:
        The non-empty sections in document order.
    """
    heading_re = re.compile(r"#{1,6}\s")
    # Optional indentation, the fence marker, then the rest of the line.
    fence_re = re.compile(r"[ \t]*(`{3,}|~{3,})(.*)", re.DOTALL)

    sections: list[str] = []
    current: list[str] = []
    open_fence = ""  # marker that opened the current code block, "" outside one

    # Walk the text line by line, keeping each trailing "\n" so that joining
    # the lines reproduces the input exactly. Only "\n" separates lines.
    for line in re.findall(r"[^\n]*\n|[^\n]+", text):
        fence = fence_re.match(line)
        if open_fence:
            # Inside a code block only a matching closing fence matters: same
            # character, at least as long as the opener, nothing after it.
            if (
                fence
                and fence.group(1)[0] == open_fence[0]
                and len(fence.group(1)) >= len(open_fence)
                and not fence.group(2).strip()
            ):
                open_fence = ""
        elif fence:
            marker, info = fence.groups()
            # A backtick run followed by more backticks on the same line is an
            # inline code span, not the start of a fenced block.
            if not (marker[0] == "`" and "`" in info):
                open_fence = marker
        elif heading_re.match(line):
            sections.append("".join(current))
            current = []
        current.append(line)

    sections.append("".join(current))
    return [section for section in (raw.strip() for raw in sections) if section]
def split_long_text(
    text: str,
    max_chars: int = MAX_CHARS,
    overlap: int = OVERLAP_CHARS,
) -> list[str]:
    """Cut *text* into overlapping windows of at most *max_chars* characters.

    Text that already fits is returned unchanged as a single-element list.
    Otherwise consecutive windows start ``max_chars - overlap`` characters
    apart, each window is stripped, and windows that are empty after stripping
    are dropped.

    Args:
        text: The text to split.
        max_chars: Maximum window size in characters; must be positive.
        overlap: Number of characters shared by consecutive windows; must
            satisfy ``0 <= overlap < max_chars``.

    Returns:
        The list of chunks in document order.

    Raises:
        ValueError: If *max_chars* or *overlap* is out of range. These values
            are checked up front, before the short-text shortcut, so a bad
            configuration fails on every call rather than only on long text.
    """
    if max_chars <= 0:
        raise ValueError(f"max_chars must be positive, got {max_chars}")
    if not 0 <= overlap < max_chars:
        # overlap >= max_chars would stop the window from advancing (endless
        # loop); a negative overlap would silently skip text between windows.
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < max_chars, "
            f"got overlap={overlap}, max_chars={max_chars}"
        )

    if len(text) <= max_chars:
        return [text]

    stride = max_chars - overlap  # always >= 1 after the validation above
    chunks = []
    for start in range(0, len(text), stride):
        end = start + max_chars
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        if end >= len(text):
            # This window reached the end of the text; a further window would
            # only repeat the tail.
            break
    return chunks
def chunk_markdown(text: str) -> list[str]:
    """Turn a markdown document into a flat list of chunk strings.

    The text is cleaned with ``clean_markdown`` and split into heading
    sections with ``split_by_headings``. A section of at most ``MAX_CHARS``
    characters becomes one chunk; a longer section is passed to
    ``split_long_text``. Text that is empty after cleaning gives an empty list.
    """
    cleaned = clean_markdown(text)
    if not cleaned:
        return []

    result = []
    for section in split_by_headings(cleaned):
        pieces = [section] if len(section) <= MAX_CHARS else split_long_text(section)
        result += pieces
    return result
def extract_document(file_path: Path) -> dict:
    """Load one markdown page and return its content and normalized metadata.

    The front matter is parsed with ``frontmatter.load`` and every value is
    made JSON-serializable with ``json_safe``. Categories, tags and the author
    are read from the top level of the front matter first and from the nested
    ``taxonomy`` mapping as a fallback.

    Args:
        file_path: Path of the markdown file; it must be located under
            ``ZPWIKI_ROOT``.

    Returns:
        A dict with the keys ``path`` (relative to ``ZPWIKI_ROOT``), ``title``,
        ``categories``, ``tags``, ``published``, ``author``, ``content``
        (stripped page body) and ``metadata`` (the full JSON-safe front
        matter).

    Raises:
        ValueError: If *file_path* is not located under ``ZPWIKI_ROOT``
            (raised by ``Path.relative_to``).
    """
    post = frontmatter.load(file_path)
    metadata = {key: json_safe(value) for key, value in post.metadata.items()}

    taxonomy = metadata.get("taxonomy")
    if not isinstance(taxonomy, dict):
        # Missing, empty or malformed taxonomy (e.g. a plain string or a list)
        # is treated as "no taxonomy" so the lookups below cannot fail with
        # AttributeError and abort the whole run.
        taxonomy = {}

    categories = normalize_list(
        metadata.get("category")
        or taxonomy.get("category")
    )
    tags = normalize_list(
        metadata.get("tag")
        or metadata.get("tags")
        or taxonomy.get("tag")
        or taxonomy.get("tags")
    )
    author = metadata.get("author") or taxonomy.get("author")

    relative_path = file_path.relative_to(ZPWIKI_ROOT)

    return {
        "path": str(relative_path),
        "title": metadata.get("title"),
        "categories": categories,
        "tags": tags,
        "published": metadata.get("published"),
        "author": author,
        "content": post.content.strip(),
        "metadata": metadata,
    }
def main():
    """Chunk every ``README.md`` under ``PAGES_ROOT`` and write the result.

    Exits with an error message when ``PAGES_ROOT`` does not exist. Otherwise
    each page is loaded with ``extract_document``, split with
    ``chunk_markdown``, and one record per chunk is written to ``OUTPUT_FILE``
    as pretty-printed UTF-8 JSON. A short summary and the first chunk are
    printed at the end.
    """
    if not PAGES_ROOT.exists():
        # User-facing message: "Folder does not exist".
        raise SystemExit(f"Neexistuje priečinok: {PAGES_ROOT}")

    # Sorted so the output order is stable between runs.
    readme_paths = sorted(PAGES_ROOT.glob("**/README.md"))

    records = []
    for readme_path in readme_paths:
        doc = extract_document(readme_path)
        records.extend(
            {
                "chunk_id": f"{doc['path']}::chunk-{position}",
                "document_path": doc["path"],
                "title": doc["title"],
                "categories": doc["categories"],
                "tags": doc["tags"],
                "author": doc["author"],
                "published": doc["published"],
                "chunk_index": position,
                "text": piece,
                "text_length": len(piece),
            }
            for position, piece in enumerate(chunk_markdown(doc["content"]))
        )

    # Every discovered file is processed, so the count equals the file list.
    document_count = len(readme_paths)

    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with OUTPUT_FILE.open("w", encoding="utf-8") as handle:
        json.dump(records, handle, ensure_ascii=False, indent=2)

    print(f"[green]ZPWIKI_ROOT:[/green] {ZPWIKI_ROOT}")
    print(f"[green]Dokumentov:[/green] {document_count}")
    print(f"[green]Chunkov:[/green] {len(records)}")
    print(f"[green]Výstup uložený do:[/green] {OUTPUT_FILE}")

    if records:
        print("\n[bold]Ukážka prvého chunku:[/bold]")
        print(records[0])


if __name__ == "__main__":
    main()