we do

2024-10-23 13:43:55 +02:00 · 2024-10-23 13:43:55 +02:00 · 0621eaed3e
commit 0621eaed3e
parent aaf6afd6d8
14 changed files with 8863 additions and 0 deletions
--- a/data_files/cleaned.txt
+++ b/data_files/cleaned.txt
--- a/data_files/cleaner.py
+++ b/data_files/cleaner.py
@ -0,0 +1,18 @@
 import re
 # Characters allowed to survive cleaning: ASCII letters, Slovak letters with
 # diacritics, full stop, comma, space and newline.
 slovak_alphabet = r'a-zA-ZáäčďéíĺľňóôŕšťúýžÁÄČĎÉÍĹĽŇÓÔŔŠŤÚÝŽ., \n'
 # Negated character class: matches any single character outside the allowed set.
 disallowed_char = re.compile(f'[^{slovak_alphabet} ]')
 # Undecodable bytes are replaced on read instead of raising; the replacement
 # character is not in the allowed set, so it is removed below as well.
 with open('output.txt', 'r', encoding='utf-8', errors='replace') as source:
    text_content = source.read()
 # Delete every disallowed character from the text.
 cleaned_text = disallowed_char.sub('', text_content)
 # Store the cleaned text in a separate file.
 with open('cleaned_output.txt', 'w', encoding='utf-8') as target:
    target.write(cleaned_text)
 print("Текст успешно очищен!")
--- a/data_files/convert_html.py
+++ b/data_files/convert_html.py
@ -0,0 +1,18 @@
 from bs4 import BeautifulSoup
 # Read the HTML page, decoding it as windows-1250.
 with open('file.html', 'r', encoding='windows-1250') as source:
    html_content = source.read()
 # Parse with the built-in 'html.parser' backend and keep only the text,
 # discarding all HTML tags.
 text = BeautifulSoup(html_content, 'html.parser').get_text()
 # Save the extracted text as UTF-8.
 with open('output.txt', 'w', encoding='utf-8') as target:
    target.write(text)
 print("Текст успешно извлечён!")
--- a/data_files/converter.py
+++ b/data_files/converter.py
@ -0,0 +1,27 @@
 import chardet
 from bs4 import BeautifulSoup
 # Load the raw bytes first so the encoding can be detected before decoding.
 with open('file.html', 'rb') as file:
    raw_data = file.read()
 # Ask chardet to guess the encoding of the raw bytes.
 result = chardet.detect(raw_data)
 # chardet reports 'encoding': None when it cannot make a guess (for example
 # on an empty file). Passing None to open() would silently select the
 # platform's locale encoding, so fall back to UTF-8 explicitly.
 file_encoding = result['encoding'] or 'utf-8'
 # Re-read the file as text using the chosen encoding.
 with open('file.html', 'r', encoding=file_encoding) as file:
    html_content = file.read()
 # Parse the HTML with the built-in 'html.parser' backend.
 soup = BeautifulSoup(html_content, 'html.parser')
 # Extract the text content, dropping all tags.
 text = soup.get_text()
 # Save the extracted text as UTF-8.
 with open('output.txt', 'w', encoding='utf-8') as output_file:
    output_file.write(text)
 print(f"Текст успешно извлечён! Использована кодировка: {file_encoding}")
--- a/data_files/csv_create.py
+++ b/data_files/csv_create.py
@ -0,0 +1,30 @@
 import csv
 # Load the lines for the 'correct' column and for the 'incorrect' column.
 # Undecodable bytes are replaced on read instead of raising.
 with open('slovak_no_caps.txt', 'r', encoding='utf-8', errors='replace') as correct_file:
    correct_lines = correct_file.readlines()
 with open('no_slovak_no_caps.txt', 'r', encoding='utf-8', errors='replace') as incorrect_file:
    incorrect_lines = incorrect_file.readlines()
 # Pad the shorter list with empty strings so that zip() below does not drop
 # the tail of the longer one.
 max_length = max(len(correct_lines), len(incorrect_lines))
 correct_lines.extend([''] * (max_length - len(correct_lines)))
 incorrect_lines.extend([''] * (max_length - len(incorrect_lines)))
 # newline='' leaves line-ending handling to the csv module.
 with open('dataset.csv', 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['correct', 'incorrect']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    # Header row first, then one row per line pair with surrounding
    # whitespace (including the trailing newline) stripped.
    writer.writeheader()
    writer.writerows(
        {'correct': good.strip(), 'incorrect': bad.strip()}
        for good, bad in zip(correct_lines, incorrect_lines)
    )
 print("CSV файл успешно создан!")
--- a/data_files/dataset.csv
+++ b/data_files/dataset.csv
--- a/data_files/citatelsky_dennik_14492.html
+++ b/data_files/citatelsky_dennik_14492.html
--- a/data_files/no_slovak_letters.txt
+++ b/data_files/no_slovak_letters.txt
--- a/data_files/no_slovak_no_caps.txt
+++ b/data_files/no_slovak_no_caps.txt
--- a/data_files/no_slovak_output.txt
+++ b/data_files/no_slovak_output.txt
--- a/data_files/output.txt
+++ b/data_files/output.txt
--- a/data_files/remove_caps.py
+++ b/data_files/remove_caps.py
@ -0,0 +1,33 @@
 import re
 # Load the whole input text; undecodable bytes are replaced instead of raising.
 with open('cleaned.txt', 'r', encoding='utf-8', errors='replace') as source:
    text_content = source.read()
 # Converts fully upper-case words to capitalised form.
 def normalize_caps(text):
    """Return *text* with every all-caps word turned into Capitalised form.

    The text is processed line by line and each line is split on whitespace.
    A word for which ``str.isupper()`` is true is replaced by
    ``word.capitalize()``; all other words are left as they are.

    Because the words of a line are re-joined with a single space, runs of
    whitespace inside a line collapse to one space and leading/trailing
    whitespace disappears. Lines are re-joined with ``'\\n'``, so a trailing
    newline in the input is not preserved.
    """
    def soften(token):
        # Only words written entirely in capitals are changed.
        return token.capitalize() if token.isupper() else token

    return '\n'.join(
        ' '.join(soften(token) for token in row.split())
        for row in text.splitlines()
    )
 # Run the capitalisation normaliser over the loaded text.
 normalized_text = normalize_caps(text_content)
 # Save the result to a new UTF-8 file.
 with open('slovak_no_caps.txt', 'w', encoding='utf-8') as target:
    target.write(normalized_text)
 print("Текст успешно нормализован!")
--- a/data_files/remove_slovak_letters.py
+++ b/data_files/remove_slovak_letters.py
@ -0,0 +1,24 @@
 # Maps each Slovak letter with a diacritic to its plain ASCII counterpart.
 # Every key is a single character: 'š' and 'ť' have separate entries so that
 # each is replaced wherever it occurs, not only when the two appear together
 # as the pair 'šť'. Upper-case 'Ť' is included alongside the other capitals.
 replacement_dict = {
    'á': 'a', 'ä': 'a', 'č': 'c', 'ď': 'd', 'é': 'e', 'í': 'i',
    'ĺ': 'l', 'ľ': 'l', 'ň': 'n', 'ó': 'o', 'ô': 'o', 'ŕ': 'r',
    'š': 's', 'ť': 't', 'ú': 'u', 'ý': 'y', 'ž': 'z',
    'Á': 'A', 'Ä': 'A', 'Č': 'C', 'Ď': 'D', 'É': 'E', 'Í': 'I',
    'Ĺ': 'L', 'Ľ': 'L', 'Ň': 'N', 'Ó': 'O', 'Ô': 'O', 'Ŕ': 'R',
    'Š': 'S', 'Ť': 'T', 'Ú': 'U', 'Ý': 'Y', 'Ž': 'Z',
 }
 # Load the whole input text; undecodable bytes are replaced instead of raising.
 with open('no_slovak_letters.txt', 'r', encoding='utf-8', errors='replace') as source:
    text_content = source.read()
 # Apply the mappings one after another: each key of replacement_dict found
 # in the text is replaced by its value.
 for accented, plain in replacement_dict.items():
    text_content = text_content.replace(accented, plain)
 # Save the transliterated text to a new UTF-8 file.
 with open('no_slovak_output.txt', 'w', encoding='utf-8') as target:
    target.write(text_content)
 print("Текст успешно нормализован!")
--- a/data_files/slovak_no_caps.txt
+++ b/data_files/slovak_no_caps.txt