42 lines
1.3 KiB
Python
42 lines
1.3 KiB
Python
import json
import os
import sys

from elasticsearch import Elasticsearch
from langchain_huggingface import HuggingFaceEmbeddings
# Elasticsearch connection shared by the indexing functions below.
#
# SECURITY NOTE(review): the cloud_id and password below were hard-coded and
# committed to source control — treat them as leaked and rotate them.  They
# are kept only as fallbacks so existing deployments keep working; prefer
# supplying ES_CLOUD_ID / ES_USER / ES_PASSWORD via the environment.
es = Elasticsearch(
    cloud_id=os.environ.get(
        "ES_CLOUD_ID",
        "tt:dXMtZWFzdC0yLmF3cy5lbGFzdGljLWNsb3VkLmNvbTo0NDMkOGM3ODQ0ZWVhZTEyNGY3NmFjNjQyNDFhNjI4NmVhYzMkZTI3YjlkNTQ0ODdhNGViNmEyMTcxMjMxNmJhMWI0ZGU=",
    ),
    basic_auth=(
        os.environ.get("ES_USER", "elastic"),
        os.environ.get("ES_PASSWORD", "sSz2BEGv56JRNjGFwoQ191RJ"),
    ),
)

# Multilingual sentence-embedding model used to vectorize document text.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
)
def load_drug_data(json_path):
    """Parse the drug-data JSON file at *json_path* and return its contents."""
    with open(json_path, encoding='utf-8') as source:
        return json.load(source)
def index_documents(data):
    """Embed every drug record in *data* and index it into 'drug_docs'.

    Document ids are the 1-based positions of the records in *data*.
    Progress is reported in place on stdout after each document.
    """
    total_documents = len(data)
    for i, item in enumerate(data, start=1):
        # Concatenate the searchable fields; optional ones default to ''.
        doc_text = f"{item['link']} {item.get('pribalovy_letak', '')} {item.get('spc', '')}"
        vector = embeddings.embed_query(doc_text)

        document = {
            'text': doc_text,
            'vector': vector,
            'full_data': item,
        }
        es.index(index='drug_docs', id=i, body=document)

        # \r rewrites the same console line instead of scrolling.
        sys.stdout.write(f"\rПроиндексировано {i} из {total_documents} документов")
        sys.stdout.flush()

    print("\nИндексирование завершено.")
def main():
    """Load the cleaned drug-data JSON and index it into Elasticsearch."""
    # NOTE(review): path is relative to the working directory — confirm the
    # script is always launched from its own folder.
    data_path = "../../data_adc_databaza/cleaned_general_info_additional.json"
    drug_data = load_drug_data(data_path)
    index_documents(drug_data)


# Guard the entry point so importing this module does not immediately
# trigger a full (slow, destructive) re-index.
if __name__ == "__main__":
    main()