2024-10-12 12:08:12 +00:00
|
|
|
from elasticsearch import Elasticsearch
|
|
|
|
from langchain_huggingface import HuggingFaceEmbeddings
|
2024-10-31 14:05:21 +00:00
|
|
|
from elasticsearch.helpers import bulk
|
|
|
|
import json
|
2024-10-12 12:08:12 +00:00
|
|
|
|
|
|
|
# Shared Elasticsearch client used by every function in this module.
es = Elasticsearch(
    hosts=[
        {
            'host': 'localhost',
            'port': 9200,
            'scheme': 'http',
        },
    ],
)

# Sentence-embedding model used to vectorise document text before indexing.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
)
|
|
|
|
|
2024-10-31 14:05:21 +00:00
|
|
|
def create_index():
    """Create the ``drug_docs`` index with an explicit field mapping.

    The mapping declares three fields: ``text`` (full text, standard
    analyzer), ``vector`` (384-dimensional dense vector) and ``full_data``
    (an object whose contents are stored but not indexed).
    """
    # Full-text field analysed with the standard analyzer.
    text_field = {"type": "text", "analyzer": "standard"}
    # Embedding field; 384 is the dimensionality of the vector representation.
    vector_field = {"type": "dense_vector", "dims": 384}
    # Indexing of the nested data is switched off for this field.
    full_data_field = {"type": "object", "enabled": False}

    # Index mapping assembled from the per-field definitions above.
    mapping = {
        "mappings": {
            "properties": {
                "text": text_field,
                "vector": vector_field,
                "full_data": full_data_field,
            }
        }
    }

    # HTTP 400 responses are ignored - presumably so that re-running the
    # script against an already existing index does not fail.
    es.indices.create(index='drug_docs', body=mapping, ignore=400)
|
2024-10-12 12:08:12 +00:00
|
|
|
|
|
|
|
def load_drug_data(json_path):
    """Read the UTF-8 encoded JSON file at *json_path* and return its content.

    Args:
        json_path: Path of the JSON file to read.

    Returns:
        The deserialised JSON document, exactly as ``json.load`` produces it.
    """
    with open(json_path, mode='r', encoding='utf-8') as source:
        return json.load(source)
|
|
|
|
|
|
|
|
def index_documents(data, batch_size=100, index_name='drug_docs'):
    """Embed every record of *data* and bulk-index it into Elasticsearch.

    For each record the ``link``, ``pribalovy_letak`` and ``spc`` values are
    joined into one space-separated string. That string is embedded with the
    module-level ``embeddings`` model and queued as a bulk action together
    with the untouched record (stored under ``full_data``). Queued actions
    are sent with ``bulk`` whenever *batch_size* of them have accumulated,
    and once more after the loop for the remainder. Document ids are the
    1-based position of the record in *data*.

    Args:
        data: Sized collection of dict records. Every record must contain a
            ``link`` key; ``pribalovy_letak`` and ``spc`` are optional.
        batch_size: Number of documents per bulk request. Defaults to 100.
        index_name: Name of the target index. Defaults to ``'drug_docs'``.

    Raises:
        ValueError: If *batch_size* is smaller than 1.
        KeyError: If a record has no ``link`` key.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    total_docs = len(data)
    actions = []

    for i, item in enumerate(data, start=1):
        # ``or ''`` also covers keys that are present but null in the JSON;
        # ``.get(key, '')`` alone would return None for them and the literal
        # word "None" would leak into the indexed text and the embedding.
        leaflet = item.get('pribalovy_letak') or ''
        spc = item.get('spc') or ''
        doc_text = f"{item['link']} {leaflet} {spc}"

        vector = embeddings.embed_query(doc_text)

        actions.append({
            "_index": index_name,
            "_id": i,
            "_source": {
                'text': doc_text,
                'vector': vector,
                'full_data': item,
            },
        })

        # Progress display; '\r' keeps the counter on a single console line.
        print(f"Индексируется документ {i}/{total_docs}", end='\r')

        # Send a full batch and start collecting the next one.
        if len(actions) >= batch_size:
            bulk(es, actions)
            actions = []

    # Send whatever is left over from the last, incomplete batch.
    if actions:
        bulk(es, actions)

    print("\nИндексирование завершено.")
|
2024-10-12 12:08:12 +00:00
|
|
|
|
2024-10-31 14:05:21 +00:00
|
|
|
if __name__ == "__main__":
    # Script entry point: make sure the index exists, then load the cleaned
    # drug data from disk and index all of it.
    create_index()
    index_documents(
        load_drug_data(
            "../../data_adc_databaza/cleaned_general_info_additional.json"
        )
    )
|