149 lines
3.6 KiB
Python
149 lines
3.6 KiB
Python
from __future__ import annotations
|
|
|
|
import re
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
from rich import print
|
|
|
|
|
|
# Directory one level above the folder that contains this file.
PROJECT_ROOT = Path(__file__).resolve().parents[1]

# Put that directory at the front of the import path (once) — presumably so the
# `scripts` package import that follows works when this file is run directly.
if not any(entry == str(PROJECT_ROOT) for entry in sys.path):
    sys.path.insert(0, str(PROJECT_ROOT))
|
|
|
|
|
|
from scripts.common import CHUNKS_FILE, PAGES_ROOT, ZPWIKI_ROOT, load_zpwiki_page, write_json
|
|
|
|
|
|
# Default upper bound on the length of one chunk, in characters.
MAX_CHARS: int = 1200

# Default number of characters that consecutive chunks of a long text share.
OVERLAP_CHARS: int = 200
|
|
|
|
|
|
def clean_markdown(text: str) -> str:
    """Normalize line endings and blank-line runs in Markdown *text*.

    Windows line endings are converted to plain newlines, every run of three
    or more newlines is collapsed to exactly two, and leading/trailing
    whitespace is removed.

    Args:
        text: Raw Markdown source.

    Returns:
        The cleaned text.
    """
    unix_text = "\n".join(text.split("\r\n"))
    # At most one empty line may separate two paragraphs.
    collapsed = re.sub(r"\n{3,}", "\n\n", unix_text)
    return collapsed.strip()
|
|
|
|
|
|
def split_by_headings(text: str) -> list[str]:
    """Split Markdown *text* into sections that each start at a heading.

    A new section begins at every line that starts with one to six ``#``
    characters followed by whitespace. Text before the first heading forms
    its own section. Lines inside fenced code blocks (backtick or tilde
    fences) are never treated as headings, so comments such as
    ``# install packages`` in a shell sample do not tear the sample apart.
    An unclosed fence extends to the end of the text.

    Args:
        text: Markdown source with newline-separated lines.

    Returns:
        The sections in document order, stripped of surrounding whitespace;
        empty sections are omitted.
    """
    heading = re.compile(r"#{1,6}\s+")
    fence = re.compile(r" {0,3}(`{3,}|~{3,})(.*)")

    cut_points = [0]
    open_fence = ""  # marker that opened the current code block; "" outside
    offset = 0  # index in `text` at which the current line starts

    for line in text.split("\n"):
        fence_match = fence.match(line)

        if open_fence:
            # Only a fence of the same character, at least as long as the
            # opener and followed by nothing but whitespace, closes the block.
            if (
                fence_match
                and fence_match.group(1)[0] == open_fence[0]
                and len(fence_match.group(1)) >= len(open_fence)
                and not fence_match.group(2).strip()
            ):
                open_fence = ""
        elif fence_match:
            marker, info = fence_match.groups()
            # Backticks later on the same line mean inline code, not a fence.
            if not (marker[0] == "`" and "`" in info):
                open_fence = marker
        elif offset and heading.match(text, offset):
            # Matching against the full text lets the whitespace after the
            # hashes be the line break itself (a line holding only "#").
            cut_points.append(offset)

        offset += len(line) + 1

    cut_points.append(len(text))
    sections = (
        text[begin:end].strip()
        for begin, end in zip(cut_points, cut_points[1:])
    )
    return [section for section in sections if section]
|
|
|
|
|
|
def find_split_position(text: str, max_chars: int) -> int:
    """Find a good place to cut *text* so a chunk does not end at random.

    Only the first *max_chars* characters are inspected. The last occurrence
    of each separator is tried in order of preference: blank line, line
    break, sentence end (". "), single space. A separator is accepted only
    if it lies at or beyond 60 % of *max_chars*, which keeps chunks from
    becoming too short.

    Args:
        text: Text that may need to be cut.
        max_chars: Maximum number of characters allowed before the cut.

    Returns:
        The index just after the chosen separator; ``len(text)`` when the
        text already fits; *max_chars* when no separator qualifies.
    """
    total = len(text)
    if total <= max_chars:
        return total

    window = text[:max_chars]
    earliest_allowed = int(max_chars * 0.6)

    for boundary in ("\n\n", "\n", ". ", " "):
        found_at = window.rfind(boundary)
        if found_at < earliest_allowed:
            # Not present (-1) or too early in the window; try a weaker one.
            continue
        return found_at + len(boundary)

    return max_chars
|
|
|
|
|
|
def split_long_text(
    text: str,
    max_chars: int = MAX_CHARS,
    overlap: int = OVERLAP_CHARS,
) -> list[str]:
    """Split *text* into chunks of at most *max_chars* characters.

    Cut points are chosen by ``find_split_position``. Each following chunk
    starts *overlap* characters before the previous cut, so neighbouring
    chunks share some text. Every chunk is stripped of surrounding
    whitespace and empty chunks are dropped, for short and long input alike.

    Args:
        text: Text to split.
        max_chars: Maximum chunk length in characters.
        overlap: Number of characters repeated at the start of the next
            chunk; must be non-negative and smaller than *max_chars*.

    Returns:
        The chunks in document order; an empty list when *text* is empty or
        contains only whitespace.

    Raises:
        ValueError: If *overlap* is negative or *max_chars* is not greater
            than *overlap*.
    """
    if overlap < 0:
        # A negative overlap would move the next window past the cut point
        # and silently drop the text in between.
        raise ValueError("overlap nesmie byť záporný")

    if max_chars <= overlap:
        raise ValueError("max_chars musí byť väčšie ako overlap")

    chunks: list[str] = []
    start = 0

    while start < len(text):
        # One character beyond the limit is enough to tell "fits" from
        # "must be cut", without copying the whole remainder on every pass.
        window = text[start : start + max_chars + 1]

        if len(window) <= max_chars:
            tail = window.strip()
            if tail:
                chunks.append(tail)
            break

        split_at = find_split_position(window, max_chars)
        chunk = window[:split_at].strip()
        if chunk:
            chunks.append(chunk)

        # Step back by `overlap` from the cut, but always advance by at
        # least one character so the loop terminates.
        start += max(1, split_at - overlap)

    return chunks
|
|
|
|
|
|
def chunk_markdown(text: str) -> list[str]:
    """Split Markdown first by headings and then by length.

    The text is cleaned with ``clean_markdown``, divided into sections by
    ``split_by_headings``, and each section is passed through
    ``split_long_text`` with its default limits.

    Args:
        text: Raw Markdown source of one document.

    Returns:
        All chunks in document order; an empty list when nothing remains
        after cleaning.
    """
    cleaned = clean_markdown(text)
    if not cleaned:
        return []

    return [
        piece
        for section in split_by_headings(cleaned)
        for piece in split_long_text(section)
    ]
|
|
|
|
|
|
def build_chunks() -> list[dict]:
    """Chunk every ``README.md`` found under ``PAGES_ROOT`` and save the result.

    Each file is loaded with ``load_zpwiki_page``; its ``content`` is split by
    ``chunk_markdown`` and every chunk is stored as a record that carries the
    document's path, title, categories, tags, author and published values
    together with the chunk index, text and text length. The full list is
    written to ``CHUNKS_FILE`` through ``write_json`` and a short summary is
    printed.

    Returns:
        The list of chunk records that was written.

    Raises:
        SystemExit: If ``PAGES_ROOT`` does not exist.
    """
    if not PAGES_ROOT.exists():
        raise SystemExit(f"Neexistuje priečinok: {PAGES_ROOT}")

    records: list[dict] = []
    pages_processed = 0

    # sorted() makes the traversal order, and thus the output, deterministic.
    for readme_path in sorted(PAGES_ROOT.glob("**/README.md")):
        page = load_zpwiki_page(readme_path)
        pages_processed += 1

        records.extend(
            {
                "chunk_id": f"{page['path']}::chunk-{position}",
                "document_path": page["path"],
                "title": page["title"],
                "categories": page["categories"],
                "tags": page["tags"],
                "author": page["author"],
                "published": page["published"],
                "chunk_index": position,
                "text": piece,
                "text_length": len(piece),
            }
            for position, piece in enumerate(chunk_markdown(page["content"]))
        )

    write_json(CHUNKS_FILE, records)

    print(f"[green]ZPWIKI_ROOT:[/green] {ZPWIKI_ROOT}")
    print(f"[green]Dokumentov:[/green] {pages_processed}")
    print(f"[green]Chunkov:[/green] {len(records)}")
    print(f"[green]Výstup uložený do:[/green] {CHUNKS_FILE}")

    # Show one record so the result can be checked at a glance.
    if records:
        print("\n[bold]Ukážka prvého chunku:[/bold]")
        print(records[0])

    return records
|
|
|
|
|
|
def main() -> None:
    """Script entry point: build the chunks and discard the returned list."""
    build_chunks()


# Run the build only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|