Implemented cloud-based Elasticsearch querying and more via LangChain
This commit is contained in:
parent
a527489f43
commit
8b2aad77aa
Binary file not shown.
3
Backend/config.json
Normal file
3
Backend/config.json
Normal file
@ -0,0 +1,3 @@
|
||||
{
|
||||
"useCloud" : false
|
||||
}
|
41
Backend/indexCloud.py
Normal file
41
Backend/indexCloud.py
Normal file
@ -0,0 +1,41 @@
|
||||
import json
import os
import sys

from elasticsearch import Elasticsearch
from langchain_huggingface import HuggingFaceEmbeddings
|
||||
|
||||
# --- Elasticsearch client + embedding model (module-level singletons) -------
# SECURITY: the cloud id and password below were committed to version control
# in this file's history — rotate them and supply replacements exclusively via
# the environment variables; the literals remain only as a fallback so existing
# deployments keep working.
es = Elasticsearch(
    cloud_id=os.getenv(
        "ELASTIC_CLOUD_ID",
        "tt:dXMtZWFzdC0yLmF3cy5lbGFzdGljLWNsb3VkLmNvbTo0NDMkOGM3ODQ0ZWVhZTEyNGY3NmFjNjQyNDFhNjI4NmVhYzMkZTI3YjlkNTQ0ODdhNGViNmEyMTcxMjMxNmJhMWI0ZGU=",
    ),
    basic_auth=(
        os.getenv("ELASTIC_USER", "elastic"),
        os.getenv("ELASTIC_PASSWORD", "sSz2BEGv56JRNjGFwoQ191RJ"),
    ),
)

# Multilingual sentence-embedding model used to vectorize documents at index
# time; queries must be embedded with the same model to be comparable.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
)
||||
def load_drug_data(json_path):
    """Read the drug dataset from *json_path* and return the parsed JSON.

    The file is expected to be UTF-8 encoded JSON (a list of drug records).
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        return json.load(f)
|
||||
|
||||
|
||||
def index_documents(data):
    """Embed and index every drug record into the 'drug_docs' ES index.

    For each record, the link, package-leaflet and SPC texts are concatenated,
    embedded with the module-level HuggingFace model, and stored alongside the
    raw record. Progress is written as a self-overwriting line on stdout.

    Args:
        data: list of dicts; the keys 'link', 'pribalovy_letak' and 'spc'
            are used when present. Document ids are 1-based positions.
    """
    total_documents = len(data)
    for i, item in enumerate(data, start=1):
        # Use .get() for every field: the original crashed with KeyError on
        # item['link'] for records missing that key, while the sibling fields
        # were already treated as optional.
        doc_text = (
            f"{item.get('link', '')} "
            f"{item.get('pribalovy_letak', '')} "
            f"{item.get('spc', '')}"
        )

        vector = embeddings.embed_query(doc_text)

        # NOTE(review): `body=` is deprecated in elasticsearch-py 8.x in
        # favour of `document=` — confirm the installed client version before
        # switching; left as-is to avoid breaking older clients.
        es.index(index='drug_docs', id=i, body={
            'text': doc_text,
            'vector': vector,
            'full_data': item
        })

        # \r keeps the progress counter on a single console line.
        sys.stdout.write(f"\rПроиндексировано {i} из {total_documents} документов")
        sys.stdout.flush()

    print("\nИндексирование завершено.")
|
||||
|
||||
|
||||
# Guarded entry point: the original ran these statements unconditionally, so
# merely importing this module re-indexed the entire corpus. Behavior when
# executed as a script is unchanged.
if __name__ == "__main__":
    # NOTE(review): relative path assumes the script is run from Backend/ —
    # confirm the expected working directory.
    data_path = "../../data_adc_databaza/cleaned_general_info_additional.json"
    drug_data = load_drug_data(data_path)
    index_documents(drug_data)
|
@ -2,10 +2,8 @@ import json
|
||||
from elasticsearch import Elasticsearch
|
||||
from langchain_huggingface import HuggingFaceEmbeddings
|
||||
|
||||
|
||||
es = Elasticsearch([{'host': 'localhost', 'port': 9200, 'scheme': 'http'}])
|
||||
|
||||
|
||||
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
|
||||
|
||||
|
||||
|
@ -1,7 +1,11 @@
|
||||
import os
|
||||
from elasticsearch import Elasticsearch
|
||||
import json
|
||||
import requests
|
||||
from langchain.chains import SequentialChain
|
||||
from langchain.chains import LLMChain, SequentialChain
|
||||
from langchain_huggingface import HuggingFaceEmbeddings
|
||||
from langchain_elasticsearch import ElasticsearchStore
|
||||
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
||||
import logging
|
||||
|
||||
|
||||
@ -9,10 +13,11 @@ logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
|
||||
|
||||
mistral_api_key = "hXDC4RBJk1qy5pOlrgr01GtOlmyCBaNs"
|
||||
if not mistral_api_key:
|
||||
raise ValueError("API ключ не найден. Убедитесь, что переменная MISTRAL_API_KEY установлена.")
|
||||
|
||||
raise ValueError("API ключ не найден.")
|
||||
|
||||
|
||||
class CustomMistralLLM:
|
||||
@ -38,30 +43,47 @@ class CustomMistralLLM:
|
||||
return result.get("choices", [{}])[0].get("message", {}).get("content", "No response")
|
||||
|
||||
|
||||
|
||||
logger.info("Загрузка модели HuggingFaceEmbeddings...")
|
||||
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
|
||||
|
||||
|
||||
config_file_path = "config.json"
|
||||
|
||||
|
||||
with open(config_file_path, 'r') as config_file:
|
||||
config = json.load(config_file)
|
||||
|
||||
# Cloud ID
|
||||
if config.get("useCloud", False):
|
||||
logger.info("CLOUD ELASTIC")
|
||||
cloud_id = "tt:dXMtZWFzdC0yLmF3cy5lbGFzdGljLWNsb3VkLmNvbTo0NDMkOGM3ODQ0ZWVhZTEyNGY3NmFjNjQyNDFhNjI4NmVhYzMkZTI3YjlkNTQ0ODdhNGViNmEyMTcxMjMxNmJhMWI0ZGU=" # Замените на ваш Cloud ID
|
||||
vectorstore = ElasticsearchStore(
|
||||
es_cloud_id=cloud_id,
|
||||
index_name='drug_docs',
|
||||
embedding=embeddings,
|
||||
es_user = "elastic",
|
||||
es_password = "sSz2BEGv56JRNjGFwoQ191RJ",
|
||||
)
|
||||
else:
|
||||
logger.info("LOCAL ELASTIC")
|
||||
vectorstore = ElasticsearchStore(
|
||||
es_url="http://localhost:9200",
|
||||
index_name='drug_docs',
|
||||
embedding=embeddings,
|
||||
es_user='elastic',
|
||||
es_password='sSz2BEGv56JRNjGFwoQ191RJ'
|
||||
)
|
||||
|
||||
logger.info(f"Подключение установлено к {'облачному' if config.get('useCloud', False) else 'локальному'} Elasticsearch")
|
||||
|
||||
# LLM
|
||||
llm = CustomMistralLLM(
|
||||
api_key=mistral_api_key,
|
||||
endpoint_url="https://api.mistral.ai/v1/chat/completions"
|
||||
)
|
||||
|
||||
|
||||
|
||||
def process_query_with_mistral(query, k=10):
|
||||
logger.info("Обработка запроса началась.")
|
||||
try:
|
||||
# Elasticsearch LangChain
|
||||
response = vectorstore.similarity_search(query, k=k)
|
||||
if not response:
|
||||
return {"summary": "Ничего не найдено", "links": [], "status_log": ["Ничего не найдено."]}
|
||||
@ -75,8 +97,12 @@ def process_query_with_mistral(query, k=10):
|
||||
)
|
||||
|
||||
summary = llm.generate_text(prompt=structured_prompt, max_tokens=512, temperature=0.7)
|
||||
return {"summary": summary, "links": links, "status_log": ["Ответ получен от модели Mistral."]}
|
||||
|
||||
#TextSplitter
|
||||
splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=20)
|
||||
split_summary = splitter.split_text(summary)
|
||||
|
||||
return {"summary": split_summary, "links": links, "status_log": ["Ответ получен от модели Mistral."]}
|
||||
except Exception as e:
|
||||
logger.info(f"Ошибка: {str(e)}")
|
||||
return {"summary": "Произошла ошибка", "links": [], "status_log": [f"Ошибка: {str(e)}"]}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user