diff --git a/Backend/__pycache__/model.cpython-311.pyc b/Backend/__pycache__/model.cpython-311.pyc index 7e4d1d4..78faa39 100644 Binary files a/Backend/__pycache__/model.cpython-311.pyc and b/Backend/__pycache__/model.cpython-311.pyc differ diff --git a/Backend/config.json b/Backend/config.json new file mode 100644 index 0000000..54e135e --- /dev/null +++ b/Backend/config.json @@ -0,0 +1,3 @@ +{ + "useCloud" : false +} \ No newline at end of file diff --git a/Backend/indexCloud.py b/Backend/indexCloud.py new file mode 100644 index 0000000..bb897d9 --- /dev/null +++ b/Backend/indexCloud.py @@ -0,0 +1,41 @@ +from elasticsearch import Elasticsearch +from langchain_huggingface import HuggingFaceEmbeddings +import json +import sys + +es = Elasticsearch( + cloud_id="tt:dXMtZWFzdC0yLmF3cy5lbGFzdGljLWNsb3VkLmNvbTo0NDMkOGM3ODQ0ZWVhZTEyNGY3NmFjNjQyNDFhNjI4NmVhYzMkZTI3YjlkNTQ0ODdhNGViNmEyMTcxMjMxNmJhMWI0ZGU=", + basic_auth=("elastic", "sSz2BEGv56JRNjGFwoQ191RJ") +) + +embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2") + + +def load_drug_data(json_path): + with open(json_path, 'r', encoding='utf-8') as f: + data = json.load(f) + return data + + +def index_documents(data): + total_documents = len(data) + for i, item in enumerate(data, start=1): + doc_text = f"{item['link']} {item.get('pribalovy_letak', '')} {item.get('spc', '')}" + + vector = embeddings.embed_query(doc_text) + + es.index(index='drug_docs', id=i, body={ + 'text': doc_text, + 'vector': vector, + 'full_data': item + }) + + sys.stdout.write(f"\rПроиндексировано {i} из {total_documents} документов") + sys.stdout.flush() + + print("\nИндексирование завершено.") + + +data_path = "../../data_adc_databaza/cleaned_general_info_additional.json" +drug_data = load_drug_data(data_path) +index_documents(drug_data) diff --git a/Backend/index_JSON.py b/Backend/index_JSON.py index e353fcb..b5d7e1f 100644 --- a/Backend/index_JSON.py +++ b/Backend/index_JSON.py @@ -2,10 +2,8 @@ import json from elasticsearch import Elasticsearch from langchain_huggingface import HuggingFaceEmbeddings - es = Elasticsearch([{'host': 'localhost', 'port': 9200, 'scheme': 'http'}]) - embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2") diff --git a/Backend/model.py b/Backend/model.py index cfecdb3..1477e41 100644 --- a/Backend/model.py +++ b/Backend/model.py @@ -1,7 +1,11 @@ -import os +from elasticsearch import Elasticsearch +import json import requests +from langchain.chains import SequentialChain +from langchain.chains import LLMChain, SequentialChain from langchain_huggingface import HuggingFaceEmbeddings from langchain_elasticsearch import ElasticsearchStore +from langchain.text_splitter import RecursiveCharacterTextSplitter import logging @@ -9,10 +13,11 @@ logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) + + mistral_api_key = "hXDC4RBJk1qy5pOlrgr01GtOlmyCBaNs" if not mistral_api_key: - raise ValueError("API ключ не найден. Убедитесь, что переменная MISTRAL_API_KEY установлена.") - + raise ValueError("API ключ не найден.") class CustomMistralLLM: @@ -38,30 +43,47 @@ class CustomMistralLLM: return result.get("choices", [{}])[0].get("message", {}).get("content", "No response") - logger.info("Загрузка модели HuggingFaceEmbeddings...") embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2") -vectorstore = ElasticsearchStore( - es_url="http://localhost:9200", - index_name='drug_docs', - embedding=embeddings, - es_user='elastic', - es_password='sSz2BEGv56JRNjGFwoQ191RJ' -) +config_file_path = "config.json" +with open(config_file_path, 'r') as config_file: + config = json.load(config_file) + +# Cloud ID +if config.get("useCloud", False): + logger.info("CLOUD ELASTIC") + cloud_id = "tt:dXMtZWFzdC0yLmF3cy5lbGFzdGljLWNsb3VkLmNvbTo0NDMkOGM3ODQ0ZWVhZTEyNGY3NmFjNjQyNDFhNjI4NmVhYzMkZTI3YjlkNTQ0ODdhNGViNmEyMTcxMjMxNmJhMWI0ZGU=" # Замените на ваш Cloud ID + vectorstore = ElasticsearchStore( + es_cloud_id=cloud_id, + index_name='drug_docs', + embedding=embeddings, + es_user = "elastic", + es_password = "sSz2BEGv56JRNjGFwoQ191RJ", + ) +else: + logger.info("LOCAL ELASTIC") + vectorstore = ElasticsearchStore( + es_url="http://localhost:9200", + index_name='drug_docs', + embedding=embeddings, + ) + +logger.info(f"Подключение установлено к {'облачному' if config.get('useCloud', False) else 'локальному'} Elasticsearch") + +# LLM llm = CustomMistralLLM( api_key=mistral_api_key, endpoint_url="https://api.mistral.ai/v1/chat/completions" ) - - def process_query_with_mistral(query, k=10): logger.info("Обработка запроса началась.") try: + # Elasticsearch LangChain response = vectorstore.similarity_search(query, k=k) if not response: return {"summary": "Ничего не найдено", "links": [], "status_log": ["Ничего не найдено."]} @@ -75,8 +97,12 @@ def process_query_with_mistral(query, k=10): ) summary = llm.generate_text(prompt=structured_prompt, max_tokens=512, temperature=0.7) - return {"summary": summary, "links": links, "status_log": ["Ответ получен от модели Mistral."]} + + #TextSplitter + splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=20) + split_summary = splitter.split_text(summary) + + return {"summary": split_summary, "links": links, "status_log": ["Ответ получен от модели Mistral."]} except Exception as e: logger.info(f"Ошибка: {str(e)}") return {"summary": "Произошла ошибка", "links": [], "status_log": [f"Ошибка: {str(e)}"]} -