add testing models files
This commit is contained in:
parent
4e0499ff05
commit
ff01567cb4
80
Backend/index-server-es.py
Normal file
@@ -0,0 +1,80 @@
from elasticsearch import Elasticsearch
from langchain.embeddings import HuggingFaceEmbeddings
from elasticsearch.helpers import bulk
import json

# Configure the Elasticsearch connection with authentication and HTTPS
es = Elasticsearch(
    [{'host': 'localhost', 'port': 9200, 'scheme': 'https'}],
    http_auth=('elastic', 'S7DoO3ma=G=9USBPbqq3'),  # replace with your password
    verify_certs=False  # disable SSL certificate verification when using a self-signed certificate
)

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

def create_index():
    # Define the mapping for the index
    mapping = {
        "mappings": {
            "properties": {
                "text": {
                    "type": "text",
                    "analyzer": "standard"
                },
                "vector": {
                    "type": "dense_vector",
                    "dims": 384  # dimensionality of the embedding vectors
                },
                "full_data": {
                    "type": "object",
                    "enabled": False  # do not index the nested data
                }
            }
        }
    }
    es.indices.create(index='drug_docs', body=mapping, ignore=400)

def load_drug_data(json_path):
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data

def index_documents(data):
    actions = []
    total_docs = len(data)
    for i, item in enumerate(data, start=1):
        doc_text = f"{item['link']} {item.get('pribalovy_letak', '')} {item.get('spc', '')}"

        vector = embeddings.embed_query(doc_text)

        action = {
            "_index": "drug_docs",
            "_id": i,
            "_source": {
                'text': doc_text,
                'vector': vector,
                'full_data': item
            }
        }
        actions.append(action)

        # Show progress
        print(f"Indexing document {i}/{total_docs}", end='\r')

        # Optionally: index in batches of N documents
        if i % 100 == 0 or i == total_docs:
            bulk(es, actions)
            actions = []

    # Index any remaining documents
    if actions:
        bulk(es, actions)

    print("\nIndexing finished.")

if __name__ == "__main__":
    create_index()
    data_path = "/home/poiasnik/esDB/cleaned_general_info_additional.json"
    drug_data = load_drug_data(data_path)
    index_documents(drug_data)
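For context, a minimal retrieval sketch (not part of this commit) showing how the dense_vector field created above could be queried with a script_score cosine similarity. It reuses the es client and embeddings model defined in this file; the query string is only illustrative, and it assumes an Elasticsearch version that supports cosineSimilarity over dense_vector fields:

# Hypothetical retrieval example for the 'drug_docs' index built above.
query_vec = embeddings.embed_query("čo piť pri horúčke")  # illustrative query
resp = es.search(
    index="drug_docs",
    body={
        "size": 3,
        "query": {
            "script_score": {
                "query": {"match_all": {}},
                "script": {
                    "source": "cosineSimilarity(params.query_vector, 'vector') + 1.0",
                    "params": {"query_vector": query_vec},
                },
            }
        },
    },
)
for hit in resp["hits"]["hits"]:
    print(hit["_score"], hit["_source"]["text"][:80])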
77
Backend/qwen72-test.py
Normal file
@@ -0,0 +1,77 @@
import torch
import logging
from transformers import AutoModelForCausalLM, AutoTokenizer
from elasticsearch import Elasticsearch

# Logging setup
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Connect to Elasticsearch
es = Elasticsearch(
    ["https://localhost:9200"],
    basic_auth=("elastic", "S7DoO3ma=G=9USBPbqq3"),  # your password
    verify_certs=False
)
index_name = 'drug_docs'

# Load the tokenizer and the model
model_name = "Qwen/Qwen2.5-7B-Instruct"
device = "cuda:0" if torch.cuda.is_available() else "cpu"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Make sure a pad_token is set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def text_search(query, k=10, max_doc_length=300, max_docs=3):
    try:
        es_results = es.search(
            index=index_name,
            body={"size": k, "query": {"match": {"text": query}}}
        )
        text_documents = [hit['_source'].get('text', '') for hit in es_results['hits']['hits']]
        text_documents = [doc[:max_doc_length] for doc in text_documents[:max_docs]]
        return text_documents
    except Exception as e:
        logger.error(f"Search error: {str(e)}")
        return []

# Example search query
query = "čo piť pri horúčke"
text_documents = text_search(query)

# Truncate the text if it exceeds the model's token limit
max_tokens_per_input = 1024  # use a lower value for max_tokens
context_text = ' '.join(text_documents)
input_text = (
    f"Informácie o liekoch: {context_text[:max_tokens_per_input]}\n"
    "Uveďte tri konkrétne lieky alebo riešenia s veľmi krátkym vysvetlením pre každý z nich.\n"
    "Odpoveď v slovenčine:"
)

# Tokenize the input text
inputs = tokenizer(input_text, return_tensors="pt", max_length=max_tokens_per_input, truncation=True).to(device)

try:
    generated_ids = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=300,  # reduced value
        temperature=0.7,
        top_k=50,
        top_p=0.9,
        do_sample=False,  # sampling disabled for deterministic output
        pad_token_id=tokenizer.pad_token_id
    )
    response = tokenizer.decode(generated_ids[0], skip_special_tokens=True, errors='ignore')
    print("Generated text:", response)
except RuntimeError as e:
    print(f"An error occurred during generation: {e}")
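As a possible follow-up (not in this commit), the retrieval and generation steps above could be folded into one helper so other scripts can reuse them; a minimal sketch, assuming the es client, tokenizer, model, device, and text_search already defined in this file:

# Hypothetical helper that chains Elasticsearch retrieval and Qwen generation.
def generate_answer(query: str, max_context_chars: int = 1024, max_new_tokens: int = 300) -> str:
    context = ' '.join(text_search(query))[:max_context_chars]
    prompt = (
        f"Informácie o liekoch: {context}\n"
        "Uveďte tri konkrétne lieky alebo riešenia s veľmi krátkym vysvetlením pre každý z nich.\n"
        "Odpoveď v slovenčine:"
    )
    enc = tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True).to(device)
    out = model.generate(
        enc.input_ids,
        attention_mask=enc.attention_mask,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        pad_token_id=tokenizer.pad_token_id,
    )
    return tokenizer.decode(out[0], skip_special_tokens=True)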
308
Backend/qwen7b-test.py
Normal file
@@ -0,0 +1,308 @@
"""A simple command-line interactive chat demo for Qwen2.5-Instruct model with left-padding using bos_token."""

import argparse
import os
import platform
import shutil
from copy import deepcopy
from threading import Thread

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from transformers.trainer_utils import set_seed

DEFAULT_CKPT_PATH = "Qwen/Qwen2.5-7B-Instruct"

_WELCOME_MSG = """\
Welcome to use Qwen2.5-Instruct model, type text to start chat, type :h to show command help.
"""
_HELP_MSG = """\
Commands:
    :help / :h              Show this help message
    :exit / :quit / :q      Exit the demo
    :clear / :cl            Clear screen
    :clear-history / :clh   Clear history
    :history / :his         Show history
    :seed                   Show current random seed
    :seed <N>               Set random seed to <N>
    :conf                   Show current generation config
    :conf <key>=<value>     Change generation config
    :reset-conf             Reset generation config
"""

_ALL_COMMAND_NAMES = [
    "help",
    "h",
    "exit",
    "quit",
    "q",
    "clear",
    "cl",
    "clear-history",
    "clh",
    "history",
    "his",
    "seed",
    "conf",
    "reset-conf",
]


def _setup_readline():
    try:
        import readline
    except ImportError:
        return

    _matches = []

    def _completer(text, state):
        nonlocal _matches

        if state == 0:
            _matches = [
                cmd_name for cmd_name in _ALL_COMMAND_NAMES if cmd_name.startswith(text)
            ]
        if 0 <= state < len(_matches):
            return _matches[state]
        return None

    readline.set_completer(_completer)
    readline.parse_and_bind("tab: complete")


def _load_model_tokenizer(args):
    tokenizer = AutoTokenizer.from_pretrained(
        args.checkpoint_path,
        resume_download=True,
    )

    # Set bos_token for left-padding
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.bos_token

    device_map = "cpu" if args.cpu_only else "auto"

    model = AutoModelForCausalLM.from_pretrained(
        args.checkpoint_path,
        torch_dtype="auto",
        device_map=device_map,
        resume_download=True,
    ).eval()

    # Conservative generation config
    model.generation_config.max_new_tokens = 256
    model.generation_config.temperature = 0.7
    model.generation_config.top_k = 50
    model.generation_config.top_p = 0.9
    model.generation_config.pad_token_id = tokenizer.pad_token_id
    model.generation_config.eos_token_id = tokenizer.eos_token_id
    model.generation_config.do_sample = False

    return model, tokenizer


def _gc():
    import gc

    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


def _clear_screen():
    if platform.system() == "Windows":
        os.system("cls")
    else:
        os.system("clear")


def _print_history(history):
    terminal_width = shutil.get_terminal_size()[0]
    print(f"History ({len(history)})".center(terminal_width, "="))
    for index, (query, response) in enumerate(history):
        print(f"User[{index}]: {query}")
        print(f"Qwen[{index}]: {response}")
    print("=" * terminal_width)


def _get_input() -> str:
    while True:
        try:
            message = input("User> ").strip()
        except UnicodeDecodeError:
            print("[ERROR] Encoding error in input")
            continue
        except KeyboardInterrupt:
            exit(1)
        if message:
            return message
        print("[ERROR] Query is empty")


def _chat_stream(model, tokenizer, query, history):
    conversation = []
    for query_h, response_h in history:
        conversation.append({"role": "user", "content": query_h})
        conversation.append({"role": "assistant", "content": response_h})
    conversation.append({"role": "user", "content": query})
    input_text = tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        tokenize=False,
    )
    # Perform left-padding with bos_token
    inputs = tokenizer(
        [input_text],
        return_tensors="pt",
        padding="longest",
        truncation=True,
        pad_to_multiple_of=8,
        max_length=1024,
        add_special_tokens=False
    ).to(model.device)

    # Update attention_mask for left-padding compatibility
    inputs["attention_mask"] = inputs["attention_mask"].flip(dims=[1])

    streamer = TextIteratorStreamer(
        tokenizer=tokenizer, skip_prompt=True, timeout=60.0, skip_special_tokens=True
    )
    generation_kwargs = {
        **inputs,
        "streamer": streamer,
    }
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    for new_text in streamer:
        yield new_text


def main():
    parser = argparse.ArgumentParser(
        description="Qwen2.5-Instruct command-line interactive chat demo."
    )
    parser.add_argument(
        "-c",
        "--checkpoint-path",
        type=str,
        default=DEFAULT_CKPT_PATH,
        help="Checkpoint name or path, default to %(default)r",
    )
    parser.add_argument("-s", "--seed", type=int, default=1234, help="Random seed")
    parser.add_argument(
        "--cpu-only", action="store_true", help="Run demo with CPU only"
    )
    args = parser.parse_args()

    history, response = [], ""

    model, tokenizer = _load_model_tokenizer(args)
    orig_gen_config = deepcopy(model.generation_config)

    _setup_readline()

    _clear_screen()
    print(_WELCOME_MSG)

    seed = args.seed

    while True:
        query = _get_input()

        # Process commands.
        if query.startswith(":"):
            command_words = query[1:].strip().split()
            if not command_words:
                command = ""
            else:
                command = command_words[0]

            if command in ["exit", "quit", "q"]:
                break
            elif command in ["clear", "cl"]:
                _clear_screen()
                print(_WELCOME_MSG)
                _gc()
                continue
            elif command in ["clear-history", "clh"]:
                print(f"[INFO] All {len(history)} history cleared")
                history.clear()
                _gc()
                continue
            elif command in ["help", "h"]:
                print(_HELP_MSG)
                continue
            elif command in ["history", "his"]:
                _print_history(history)
                continue
            elif command in ["seed"]:
                if len(command_words) == 1:
                    print(f"[INFO] Current random seed: {seed}")
                    continue
                else:
                    new_seed_s = command_words[1]
                    try:
                        new_seed = int(new_seed_s)
                    except ValueError:
                        print(
                            f"[WARNING] Fail to change random seed: {new_seed_s!r} is not a valid number"
                        )
                    else:
                        print(f"[INFO] Random seed changed to {new_seed}")
                        seed = new_seed
                    continue
            elif command in ["conf"]:
                if len(command_words) == 1:
                    print(model.generation_config)
                else:
                    for key_value_pairs_str in command_words[1:]:
                        eq_idx = key_value_pairs_str.find("=")
                        if eq_idx == -1:
                            print("[WARNING] format: <key>=<value>")
                            continue
                        conf_key, conf_value_str = (
                            key_value_pairs_str[:eq_idx],
                            key_value_pairs_str[eq_idx + 1 :],
                        )
                        try:
                            conf_value = eval(conf_value_str)
                        except Exception as e:
                            print(e)
                            continue
                        else:
                            print(
                                f"[INFO] Change config: model.generation_config.{conf_key} = {conf_value}"
                            )
                            setattr(model.generation_config, conf_key, conf_value)
                continue
            elif command in ["reset-conf"]:
                print("[INFO] Reset generation config")
                model.generation_config = deepcopy(orig_gen_config)
                print(model.generation_config)
                continue

        # Run chat.
        set_seed(seed)
        _clear_screen()
        print(f"\nUser: {query}")
        print(f"\nQwen: ", end="")
        try:
            partial_text = ""
            for new_text in _chat_stream(model, tokenizer, query, history):
                print(new_text, end="", flush=True)
                partial_text += new_text
            response = partial_text
            print()

        except KeyboardInterrupt:
            print("[WARNING] Generation interrupted")
            continue

        history.append((query, response))


if __name__ == "__main__":
    main()
34
Backend/test_flant5.py
Normal file
@@ -0,0 +1,34 @@
import requests


API_TOKEN = "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"
API_URL = "https://api-inference.huggingface.co/models/google/flan-t5-large"


headers = {
    "Authorization": f"Bearer {API_TOKEN}",
    "Content-Type": "application/json"
}

def query_flan_t5(prompt):
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_length": 250,
            "do_sample": True,
            "temperature": 0.9,
            "top_p": 0.95,
            "top_k": 50
        }
    }
    response = requests.post(API_URL, headers=headers, json=payload)
    return response.json()


prompt = "Ako sa máš? Daj odpoved v slovencine"
result = query_flan_t5(prompt)

if isinstance(result, list) and len(result) > 0:
    print("Flan-T5 response:", result[0]['generated_text'])
else:
    print("Error getting a response:", result)
47
Backend/test_mt5_base.py
Normal file
@@ -0,0 +1,47 @@
# import requests
#
# API_TOKEN = "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"
# API_URL = "https://api-inference.huggingface.co/models/google/mt5-base"
#
# headers = {
#     "Authorization": f"Bearer {API_TOKEN}",
#     "Content-Type": "application/json"
# }
#
# def query_mT5(prompt):
#     payload = {
#         "inputs": prompt,
#         "parameters": {
#             "max_length": 100,
#             "do_sample": True,
#             "temperature": 0.7
#         }
#     }
#     response = requests.post(API_URL, headers=headers, json=payload)
#     return response.json()
#
# # Usage example
# result = query_mT5("Aké sú účinné lieky na horúčku?")
# print("mT5 response:", result)

from transformers import AutoTokenizer, MT5ForConditionalGeneration

tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small")

# training
input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids
labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids
outputs = model(input_ids=input_ids, labels=labels)
loss = outputs.loss
logits = outputs.logits

# inference

input_ids = tokenizer(
    "summarize: studies have shown that owning a dog is good for you", return_tensors="pt"
).input_ids  # Batch size 1
outputs = model.generate(input_ids, max_new_tokens=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

# studies have shown that owning a dog is good for you.
19
Backend/test_slovakbert-skquad.py
Normal file
@@ -0,0 +1,19 @@
from sentence_transformers import SentenceTransformer, util

# Load the model from Hugging Face
model = SentenceTransformer("TUKE-DeutscheTelekom/slovakbert-skquad-mnlr")  # replace with the ID of the model you need

# Example sentences in Slovak
sentences = [
    "Prvý most cez Zlatý roh nechal vybudovať cisár Justinián I. V roku 1502 vypísal sultán Bajezid II. súťaž na nový most.",
    "V ktorom roku vznikol druhý drevený most cez záliv Zlatý roh?",
    "Aká je priemerná dĺžka života v Eritrei?"
]

# Compute an embedding for each sentence
embeddings = model.encode(sentences)
print("Shape of embeddings:", embeddings.shape)  # embedding shape, e.g. (3, 768)

# Compute pairwise similarity between the sentences
similarities = util.cos_sim(embeddings, embeddings)
print("Similarity matrix:\n", similarities)
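A natural extension (not part of this commit) is to use the same model for question-to-passage retrieval; a minimal sketch with sentence_transformers' util.semantic_search, where the first sentence above stands in for a one-document corpus and the two questions act as queries:

# Hypothetical retrieval example: rank corpus passages by cosine similarity.
corpus = [sentences[0]]
queries = sentences[1:]
corpus_emb = model.encode(corpus, convert_to_tensor=True)
query_emb = model.encode(queries, convert_to_tensor=True)
hits = util.semantic_search(query_emb, corpus_emb, top_k=1)
for query, query_hits in zip(queries, hits):
    best = query_hits[0]
    print(f"{query} -> corpus[{best['corpus_id']}] (score {best['score']:.3f})")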