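"""Build a SQLite index from the exported documents and chunks JSON files.

Reads ``data/documents.json`` and ``data/chunks.json`` and recreates
``data/zp_index.sqlite`` with the ``documents``, ``chunks``, ``chunk_tags``
and ``chunk_categories`` tables.
"""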
from pathlib import Path
|
|
import json
|
|
import sqlite3
|
|
from rich import print
|
|
|
|
|
|
# Input: JSON file with the document records that main() loads.
DOCUMENTS_FILE = Path("data/documents.json")
# Input: JSON file with the chunk records that main() loads.
CHUNKS_FILE = Path("data/chunks.json")
# Output: SQLite database file that main() connects to and fills.
DB_FILE = Path("data/zp_index.sqlite")
|
|
|
|
|
|
def create_tables(conn: sqlite3.Connection):
    """Drop and recreate the index schema on ``conn``.

    The ``chunk_tags``, ``chunk_categories``, ``chunks`` and ``documents``
    tables are removed if they exist and created again empty, the lookup
    indexes are added, and the connection is committed.

    Args:
        conn: Open SQLite connection that receives the schema.
    """
    # Drop order: chunks (which references documents) comes before documents.
    tables_to_drop = ("chunk_tags", "chunk_categories", "chunks", "documents")

    table_definitions = (
        """
        CREATE TABLE documents (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            path TEXT UNIQUE NOT NULL,
            title TEXT,
            author TEXT,
            published INTEGER,
            content_length INTEGER,
            metadata_json TEXT
        )
        """,
        """
        CREATE TABLE chunks (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            chunk_id TEXT UNIQUE NOT NULL,
            document_path TEXT NOT NULL,
            title TEXT,
            author TEXT,
            chunk_index INTEGER,
            text TEXT NOT NULL,
            text_length INTEGER,
            FOREIGN KEY(document_path) REFERENCES documents(path)
        )
        """,
        """
        CREATE TABLE chunk_tags (
            chunk_id TEXT NOT NULL,
            tag TEXT NOT NULL
        )
        """,
        """
        CREATE TABLE chunk_categories (
            chunk_id TEXT NOT NULL,
            category TEXT NOT NULL
        )
        """,
    )

    # (table, column) pairs; each one gets an index named idx_<table>_<column>.
    indexed_columns = (
        ("documents", "path"),
        ("chunks", "document_path"),
        ("chunks", "title"),
        ("chunk_tags", "tag"),
        ("chunk_categories", "category"),
    )

    db = conn.cursor()

    for table in tables_to_drop:
        db.execute(f"DROP TABLE IF EXISTS {table}")

    for ddl in table_definitions:
        db.execute(ddl)

    for table, column in indexed_columns:
        db.execute(f"CREATE INDEX idx_{table}_{column} ON {table}({column})")

    conn.commit()
|
|
|
|
|
|
def load_json(path: Path):
    """Return the decoded JSON content of the file at ``path``.

    Args:
        path: Location of a UTF-8 encoded JSON file.

    Returns:
        The Python object produced by decoding the file's JSON text.

    Raises:
        SystemExit: If ``path`` does not exist; the message names the path.
    """
    if path.exists():
        return json.loads(path.read_text(encoding="utf-8"))

    raise SystemExit(f"Súbor neexistuje: {path}")
|
|
|
|
|
|
def insert_documents(conn: sqlite3.Connection, documents: list[dict]):
    """Insert one ``documents`` row per entry and commit.

    Args:
        conn: Open SQLite connection that already has the ``documents`` table.
        documents: Mappings read with ``.get``; the keys used are ``path``,
            ``title``, ``author``, ``published``, ``content_length`` and
            ``metadata``. Missing keys are stored as NULL.
    """
    insert_sql = """
        INSERT INTO documents (
            path, title, author, published, content_length, metadata_json
        )
        VALUES (?, ?, ?, ?, ?, ?)
    """

    # Built lazily so rows are produced one at a time as they are inserted.
    rows = (
        (
            entry.get("path"),
            entry.get("title"),
            entry.get("author"),
            # Truthiness of "published" is stored as an integer flag.
            1 if entry.get("published") else 0,
            entry.get("content_length"),
            # Absent or empty metadata is stored as "{}"; non-ASCII is kept as-is.
            json.dumps(entry.get("metadata") or {}, ensure_ascii=False),
        )
        for entry in documents
    )

    conn.cursor().executemany(insert_sql, rows)
    conn.commit()
|
|
|
|
|
|
def insert_chunks(conn: sqlite3.Connection, chunks: list[dict]):
    """Insert every chunk with its tags and categories, then commit.

    For each entry one row goes into ``chunks``, one row per tag into
    ``chunk_tags`` and one row per category into ``chunk_categories``.

    Args:
        conn: Open SQLite connection that already has the three tables.
        chunks: Mappings read with ``.get``; the keys used are ``chunk_id``,
            ``document_path``, ``title``, ``author``, ``chunk_index``,
            ``text``, ``text_length``, ``tags`` and ``categories``.
    """
    chunk_sql = """
        INSERT INTO chunks (
            chunk_id, document_path, title, author, chunk_index, text, text_length
        )
        VALUES (?, ?, ?, ?, ?, ?, ?)
    """
    tag_sql = """
        INSERT INTO chunk_tags (chunk_id, tag)
        VALUES (?, ?)
    """
    category_sql = """
        INSERT INTO chunk_categories (chunk_id, category)
        VALUES (?, ?)
    """

    db = conn.cursor()

    for item in chunks:
        identifier = item.get("chunk_id")

        db.execute(chunk_sql, (
            identifier,
            item.get("document_path"),
            item.get("title"),
            item.get("author"),
            item.get("chunk_index"),
            item.get("text"),
            item.get("text_length"),
        ))

        # A missing or empty "tags" / "categories" value yields no rows.
        db.executemany(
            tag_sql,
            ((identifier, label) for label in item.get("tags") or []),
        )
        db.executemany(
            category_sql,
            ((identifier, group) for group in item.get("categories") or []),
        )

    conn.commit()
|
|
|
|
|
|
def main():
    """Rebuild the SQLite index from the JSON exports and print a summary.

    Loads ``DOCUMENTS_FILE`` and ``CHUNKS_FILE``, makes sure the parent
    directory of ``DB_FILE`` exists, recreates the schema, inserts all
    documents and chunks, and prints the resulting row counts.

    Raises:
        SystemExit: Propagated from ``load_json`` when an input file is missing.
    """
    documents = load_json(DOCUMENTS_FILE)
    chunks = load_json(CHUNKS_FILE)

    DB_FILE.parent.mkdir(parents=True, exist_ok=True)

    conn = sqlite3.connect(DB_FILE)
    try:
        create_tables(conn)
        insert_documents(conn, documents)
        insert_chunks(conn, chunks)

        cursor = conn.cursor()

        document_count = cursor.execute("SELECT COUNT(*) FROM documents").fetchone()[0]
        chunk_count = cursor.execute("SELECT COUNT(*) FROM chunks").fetchone()[0]
        tag_count = cursor.execute("SELECT COUNT(*) FROM chunk_tags").fetchone()[0]
        category_count = cursor.execute("SELECT COUNT(*) FROM chunk_categories").fetchone()[0]
    finally:
        # Release the database handle even when a build step above raises;
        # previously the connection stayed open on any failure.
        conn.close()

    print(f"[green]SQLite index vytvorený:[/green] {DB_FILE}")
    print(f"Dokumentov: {document_count}")
    print(f"Chunkov: {chunk_count}")
    print(f"Tag záznamov: {tag_count}")
    print(f"Kategória záznamov: {category_count}")
|
|
|
|
|
|
# Run the index build only when this file is executed as a script.
if __name__ == "__main__":
    main()
|