dp-zp-agent/scripts/build_sqlite_index.py
2026-06-03 21:04:03 +02:00

168 lines
4.7 KiB
Python

from pathlib import Path
import json
import sqlite3
from rich import print
# Input: JSON file with the document records that main() loads and passes to
# insert_documents(). Relative paths resolve against the current working directory.
DOCUMENTS_FILE = Path("data/documents.json")
# Input: JSON file with the chunk records that main() loads and passes to
# insert_chunks().
CHUNKS_FILE = Path("data/chunks.json")
# Output: SQLite database file that main() opens; its tables are dropped and
# recreated by create_tables() on every run.
DB_FILE = Path("data/zp_index.sqlite")
# One row per source document; `path` is the unique key that chunks refer to.
_CREATE_DOCUMENTS_SQL = """
CREATE TABLE documents (
id INTEGER PRIMARY KEY AUTOINCREMENT,
path TEXT UNIQUE NOT NULL,
title TEXT,
author TEXT,
published INTEGER,
content_length INTEGER,
metadata_json TEXT
)
"""

# One row per text chunk, linked to its document through `document_path`.
# NOTE(review): SQLite only enforces FOREIGN KEY clauses when
# `PRAGMA foreign_keys = ON` is set; nothing in this function enables it.
_CREATE_CHUNKS_SQL = """
CREATE TABLE chunks (
id INTEGER PRIMARY KEY AUTOINCREMENT,
chunk_id TEXT UNIQUE NOT NULL,
document_path TEXT NOT NULL,
title TEXT,
author TEXT,
chunk_index INTEGER,
text TEXT NOT NULL,
text_length INTEGER,
FOREIGN KEY(document_path) REFERENCES documents(path)
)
"""

# (chunk_id, tag) pairs.
_CREATE_CHUNK_TAGS_SQL = """
CREATE TABLE chunk_tags (
chunk_id TEXT NOT NULL,
tag TEXT NOT NULL
)
"""

# (chunk_id, category) pairs.
_CREATE_CHUNK_CATEGORIES_SQL = """
CREATE TABLE chunk_categories (
chunk_id TEXT NOT NULL,
category TEXT NOT NULL
)
"""

# Executed strictly in this order: drops first, then tables, then indexes.
_SCHEMA_STATEMENTS = (
    "DROP TABLE IF EXISTS chunk_tags",
    "DROP TABLE IF EXISTS chunk_categories",
    "DROP TABLE IF EXISTS chunks",
    "DROP TABLE IF EXISTS documents",
    _CREATE_DOCUMENTS_SQL,
    _CREATE_CHUNKS_SQL,
    _CREATE_CHUNK_TAGS_SQL,
    _CREATE_CHUNK_CATEGORIES_SQL,
    "CREATE INDEX idx_documents_path ON documents(path)",
    "CREATE INDEX idx_chunks_document_path ON chunks(document_path)",
    "CREATE INDEX idx_chunks_title ON chunks(title)",
    "CREATE INDEX idx_chunk_tags_tag ON chunk_tags(tag)",
    "CREATE INDEX idx_chunk_categories_category ON chunk_categories(category)",
)


def create_tables(conn: sqlite3.Connection):
    """Drop and recreate the index schema on *conn*, then commit.

    Any existing ``documents``, ``chunks``, ``chunk_tags`` and
    ``chunk_categories`` tables (and the rows in them) are discarded before
    the empty tables and their lookup indexes are created again.
    """
    db_cursor = conn.cursor()
    for statement in _SCHEMA_STATEMENTS:
        db_cursor.execute(statement)
    conn.commit()
def load_json(path: Path):
    """Return the parsed JSON content of the UTF-8 file at *path*.

    Raises:
        SystemExit: if *path* does not exist; the exit message names the path.
    """
    file_missing = not path.exists()
    if file_missing:
        raise SystemExit(f"Súbor neexistuje: {path}")

    raw_text = path.read_text(encoding="utf-8")
    return json.loads(raw_text)
_INSERT_DOCUMENT_SQL = """
INSERT INTO documents (
path, title, author, published, content_length, metadata_json
)
VALUES (?, ?, ?, ?, ?, ?)
"""


def _document_row(doc: dict) -> tuple:
    """Map one document record onto the column order of _INSERT_DOCUMENT_SQL."""
    return (
        doc.get("path"),
        doc.get("title"),
        doc.get("author"),
        # Stored as an integer flag: 1 for any truthy value, 0 otherwise.
        1 if doc.get("published") else 0,
        doc.get("content_length"),
        # Missing or empty metadata is stored as "{}"; non-ASCII text is kept
        # verbatim rather than \u-escaped.
        json.dumps(doc.get("metadata") or {}, ensure_ascii=False),
    )


def insert_documents(conn: sqlite3.Connection, documents: list[dict]) -> None:
    """Insert all *documents* into the ``documents`` table atomically.

    Each record is read with ``dict.get``, so absent keys become NULL
    (subject to the table's own constraints).

    Args:
        conn: Open SQLite connection whose schema has a ``documents`` table.
        documents: Document records with the keys ``path``, ``title``,
            ``author``, ``published``, ``content_length`` and ``metadata``.

    Raises:
        sqlite3.Error: if any row is rejected (e.g. a duplicate ``path``).
            The transaction is rolled back, so no row of the batch remains.
    """
    # The connection context manager commits on success and rolls back on
    # error, so a failure mid-batch never leaves a half-applied open
    # transaction on the caller's connection.
    with conn:
        conn.executemany(_INSERT_DOCUMENT_SQL, map(_document_row, documents))
_INSERT_CHUNK_SQL = """
INSERT INTO chunks (
chunk_id, document_path, title, author, chunk_index, text, text_length
)
VALUES (?, ?, ?, ?, ?, ?, ?)
"""

_INSERT_CHUNK_TAG_SQL = """
INSERT INTO chunk_tags (chunk_id, tag)
VALUES (?, ?)
"""

_INSERT_CHUNK_CATEGORY_SQL = """
INSERT INTO chunk_categories (chunk_id, category)
VALUES (?, ?)
"""


def insert_chunks(conn: sqlite3.Connection, chunks: list[dict]) -> None:
    """Insert all *chunks* with their tags and categories atomically.

    For every chunk one row goes into ``chunks``, one row per entry of its
    ``tags`` list into ``chunk_tags`` and one row per entry of its
    ``categories`` list into ``chunk_categories``. A missing, ``None`` or
    empty ``tags``/``categories`` value produces no rows.

    Args:
        conn: Open SQLite connection whose schema has the ``chunks``,
            ``chunk_tags`` and ``chunk_categories`` tables.
        chunks: Chunk records with the keys ``chunk_id``, ``document_path``,
            ``title``, ``author``, ``chunk_index``, ``text``, ``text_length``
            and optionally ``tags`` and ``categories``.

    Raises:
        sqlite3.Error: if any row is rejected (e.g. a duplicate ``chunk_id``).
            The transaction is rolled back, so nothing from the batch remains
            in any of the three tables.
    """
    # Commit on success, roll back on error: a failure on a later chunk must
    # not leave earlier chunks and their tags pending in an open transaction.
    with conn:
        cursor = conn.cursor()
        for chunk in chunks:
            chunk_id = chunk.get("chunk_id")
            cursor.execute(
                _INSERT_CHUNK_SQL,
                (
                    chunk_id,
                    chunk.get("document_path"),
                    chunk.get("title"),
                    chunk.get("author"),
                    chunk.get("chunk_index"),
                    chunk.get("text"),
                    chunk.get("text_length"),
                ),
            )
            cursor.executemany(
                _INSERT_CHUNK_TAG_SQL,
                ((chunk_id, tag) for tag in chunk.get("tags") or []),
            )
            cursor.executemany(
                _INSERT_CHUNK_CATEGORY_SQL,
                ((chunk_id, category) for category in chunk.get("categories") or []),
            )
def main() -> None:
    """Rebuild the SQLite index from the JSON inputs and print a summary.

    Loads DOCUMENTS_FILE and CHUNKS_FILE, creates DB_FILE's parent directory
    if needed, recreates the schema, inserts all records and prints the row
    count of each table.

    Raises:
        SystemExit: if either input JSON file does not exist (raised by
            load_json).
    """
    documents = load_json(DOCUMENTS_FILE)
    chunks = load_json(CHUNKS_FILE)

    DB_FILE.parent.mkdir(parents=True, exist_ok=True)
    conn = sqlite3.connect(DB_FILE)
    # Close the connection even when a build step or a count query raises,
    # so a failed run does not leak the handle to the database file.
    try:
        create_tables(conn)
        insert_documents(conn, documents)
        insert_chunks(conn, chunks)

        cursor = conn.cursor()
        document_count = cursor.execute("SELECT COUNT(*) FROM documents").fetchone()[0]
        chunk_count = cursor.execute("SELECT COUNT(*) FROM chunks").fetchone()[0]
        tag_count = cursor.execute("SELECT COUNT(*) FROM chunk_tags").fetchone()[0]
        category_count = cursor.execute("SELECT COUNT(*) FROM chunk_categories").fetchone()[0]
    finally:
        conn.close()

    print(f"[green]SQLite index vytvorený:[/green] {DB_FILE}")
    print(f"Dokumentov: {document_count}")
    print(f"Chunkov: {chunk_count}")
    print(f"Tag záznamov: {tag_count}")
    print(f"Kategória záznamov: {category_count}")
# Build the index only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()