"""Build the SQLite index from the exported documents and chunks JSON files.

Reads ``DOCUMENTS_FILE`` and ``CHUNKS_FILE``, recreates every table in
``DB_FILE`` from scratch and fills them.  Running the script again discards
the previous index completely.
"""

import json
import sqlite3
from contextlib import closing
from pathlib import Path
from typing import Any

try:
    from rich import print
except ImportError:
    # rich is only used to colour the final status line.  Without it the
    # builtin print is used and the markup tags are shown literally.
    pass

DOCUMENTS_FILE = Path("data/documents.json")
CHUNKS_FILE = Path("data/chunks.json")
DB_FILE = Path("data/zp_index.sqlite")

# Executed in order by create_tables().  Tables that refer to other tables
# are dropped first, then everything is recreated, then the indexes.
_SCHEMA_STATEMENTS: tuple[str, ...] = (
    "DROP TABLE IF EXISTS chunk_tags",
    "DROP TABLE IF EXISTS chunk_categories",
    "DROP TABLE IF EXISTS chunks",
    "DROP TABLE IF EXISTS documents",
    """
    CREATE TABLE documents (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        path TEXT UNIQUE NOT NULL,
        title TEXT,
        author TEXT,
        published INTEGER,
        content_length INTEGER,
        metadata_json TEXT
    )
    """,
    # NOTE: SQLite enforces FOREIGN KEY clauses only when
    # ``PRAGMA foreign_keys = ON`` is set on the connection, which this
    # script never does, so the reference below is informational by default.
    """
    CREATE TABLE chunks (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        chunk_id TEXT UNIQUE NOT NULL,
        document_path TEXT NOT NULL,
        title TEXT,
        author TEXT,
        chunk_index INTEGER,
        text TEXT NOT NULL,
        text_length INTEGER,
        FOREIGN KEY(document_path) REFERENCES documents(path)
    )
    """,
    """
    CREATE TABLE chunk_tags (
        chunk_id TEXT NOT NULL,
        tag TEXT NOT NULL
    )
    """,
    """
    CREATE TABLE chunk_categories (
        chunk_id TEXT NOT NULL,
        category TEXT NOT NULL
    )
    """,
    # The UNIQUE constraint on documents.path already implies an index; the
    # explicit one is kept so the resulting schema stays unchanged.
    "CREATE INDEX idx_documents_path ON documents(path)",
    "CREATE INDEX idx_chunks_document_path ON chunks(document_path)",
    "CREATE INDEX idx_chunks_title ON chunks(title)",
    "CREATE INDEX idx_chunk_tags_tag ON chunk_tags(tag)",
    "CREATE INDEX idx_chunk_categories_category ON chunk_categories(category)",
)

_INSERT_DOCUMENT_SQL = """
    INSERT INTO documents (
        path,
        title,
        author,
        published,
        content_length,
        metadata_json
    )
    VALUES (?, ?, ?, ?, ?, ?)
"""

_INSERT_CHUNK_SQL = """
    INSERT INTO chunks (
        chunk_id,
        document_path,
        title,
        author,
        chunk_index,
        text,
        text_length
    )
    VALUES (?, ?, ?, ?, ?, ?, ?)
"""

_INSERT_TAG_SQL = """
    INSERT INTO chunk_tags (chunk_id, tag)
    VALUES (?, ?)
"""

_INSERT_CATEGORY_SQL = """
    INSERT INTO chunk_categories (chunk_id, category)
    VALUES (?, ?)
"""


def create_tables(conn: sqlite3.Connection) -> None:
    """Drop and recreate all index tables and their indexes on *conn*.

    Any data already stored in ``documents``, ``chunks``, ``chunk_tags`` or
    ``chunk_categories`` is lost.  The changes are committed before returning.
    """
    cursor = conn.cursor()
    for statement in _SCHEMA_STATEMENTS:
        cursor.execute(statement)
    conn.commit()


def load_json(path: Path) -> Any:
    """Return the parsed content of the UTF-8 JSON file at *path*.

    Raises:
        SystemExit: if the file does not exist.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    # Open first and react to the failure instead of checking exists()
    # beforehand, so the file cannot disappear between check and open.
    try:
        file = path.open("r", encoding="utf-8")
    except FileNotFoundError:
        raise SystemExit(f"Súbor neexistuje: {path}") from None

    with file:
        return json.load(file)


def _document_row(doc: dict) -> tuple:
    """Map one document dict onto the column order of ``_INSERT_DOCUMENT_SQL``."""
    return (
        doc.get("path"),
        doc.get("title"),
        doc.get("author"),
        # Stored as 0/1; a missing or falsy value counts as unpublished.
        1 if doc.get("published") else 0,
        doc.get("content_length"),
        # ensure_ascii=False keeps non-ASCII metadata readable in the database.
        json.dumps(doc.get("metadata") or {}, ensure_ascii=False),
    )


def insert_documents(conn: sqlite3.Connection, documents: list[dict]) -> None:
    """Insert every entry of *documents* into the ``documents`` table.

    Missing keys are stored as NULL, except ``published`` (stored as 0) and
    ``metadata`` (stored as ``{}``).  The batch is committed on success and
    rolled back if any row is rejected.

    Raises:
        sqlite3.IntegrityError: e.g. for a missing or duplicate ``path``.
    """
    rows = [_document_row(doc) for doc in documents]
    # ``with conn`` commits on success and rolls back on an exception.
    with conn:
        conn.cursor().executemany(_INSERT_DOCUMENT_SQL, rows)


def insert_chunks(conn: sqlite3.Connection, chunks: list[dict]) -> None:
    """Insert *chunks* together with their tags and categories.

    Each chunk yields one row in ``chunks`` plus one row per entry of its
    ``tags`` and ``categories`` lists (either may be missing or None).  All
    rows are committed together; on failure everything is rolled back.

    Raises:
        sqlite3.IntegrityError: e.g. for a duplicate ``chunk_id`` or a
            missing ``text``.
    """
    chunk_rows: list[tuple] = []
    tag_rows: list[tuple] = []
    category_rows: list[tuple] = []

    # Single pass over the input; row order per table matches input order.
    for chunk in chunks:
        chunk_id = chunk.get("chunk_id")
        chunk_rows.append((
            chunk_id,
            chunk.get("document_path"),
            chunk.get("title"),
            chunk.get("author"),
            chunk.get("chunk_index"),
            chunk.get("text"),
            chunk.get("text_length"),
        ))
        tag_rows.extend((chunk_id, tag) for tag in chunk.get("tags") or [])
        category_rows.extend(
            (chunk_id, category) for category in chunk.get("categories") or []
        )

    with conn:
        cursor = conn.cursor()
        cursor.executemany(_INSERT_CHUNK_SQL, chunk_rows)
        cursor.executemany(_INSERT_TAG_SQL, tag_rows)
        cursor.executemany(_INSERT_CATEGORY_SQL, category_rows)


def _count_rows(cursor: sqlite3.Cursor, table: str) -> int:
    """Return the number of rows in *table* (an internal, trusted name)."""
    return cursor.execute(f"SELECT COUNT(*) FROM {table}").fetchone()[0]


def main() -> None:
    """Rebuild the index database from the JSON exports and print a summary."""
    # Load both inputs before touching the database so that a missing file
    # aborts the run while the previous index is still intact.
    documents = load_json(DOCUMENTS_FILE)
    chunks = load_json(CHUNKS_FILE)

    DB_FILE.parent.mkdir(parents=True, exist_ok=True)

    # closing() guarantees the connection is released even when a step fails.
    with closing(sqlite3.connect(DB_FILE)) as conn:
        create_tables(conn)
        insert_documents(conn, documents)
        insert_chunks(conn, chunks)

        cursor = conn.cursor()
        document_count = _count_rows(cursor, "documents")
        chunk_count = _count_rows(cursor, "chunks")
        tag_count = _count_rows(cursor, "chunk_tags")
        category_count = _count_rows(cursor, "chunk_categories")

    print(f"[green]SQLite index vytvorený:[/green] {DB_FILE}")
    print(f"Dokumentov: {document_count}")
    print(f"Chunkov: {chunk_count}")
    print(f"Tag záznamov: {tag_count}")
    print(f"Kategória záznamov: {category_count}")


if __name__ == "__main__":
    main()