# Bakalarska_praca/data_files/converter.py

import chardet
from bs4 import BeautifulSoup

# Convert 'file.html' to plain text: guess the file's encoding with chardet,
# strip the markup with BeautifulSoup and write the result to 'output.txt'
# as UTF-8.

# Read the file once as raw bytes. The same buffer is used both for encoding
# detection and for decoding, so the file is not opened a second time.
with open('file.html', 'rb') as file:
    raw_data = file.read()

# Guess the encoding. chardet reports None as the encoding when it cannot
# decide (e.g. for an empty file); fall back to UTF-8 instead of silently
# depending on the platform's locale default.
result = chardet.detect(raw_data)
file_encoding = result['encoding'] or 'utf-8'

# Decode the bytes already in memory. The detected encoding is only a guess,
# so undecodable bytes become U+FFFD replacement characters instead of
# aborting the whole conversion, and an encoding name that Python's codec
# registry does not know falls back to UTF-8.
try:
    html_content = raw_data.decode(file_encoding, errors='replace')
except LookupError:
    file_encoding = 'utf-8'
    html_content = raw_data.decode(file_encoding, errors='replace')

# Text-mode reading used to translate '\r\n' and '\r' to '\n'; do the same
# here so the line endings in the output stay as they were before.
html_content = html_content.replace('\r\n', '\n').replace('\r', '\n')

# Parse the HTML.
soup = BeautifulSoup(html_content, 'html.parser')

# Extract the text content of the document.
text = soup.get_text()

# Write the extracted text to the output file as UTF-8.
with open('output.txt', 'w', encoding='utf-8') as output_file:
    output_file.write(text)

# Report success together with the encoding that was actually used.
print(f"Текст успешно извлечён! Использована кодировка: {file_encoding}")