Bakalarska_praca/data_files/converter.py
2024-10-23 13:43:55 +02:00

28 lines
856 B
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import chardet
from bs4 import BeautifulSoup
# Convert 'file.html' to plain text in 'output.txt'.
#
# Steps: detect the source encoding with chardet, decode the HTML with it,
# strip the markup with BeautifulSoup, and write the result as UTF-8.

# Read the raw bytes first: the encoding must be guessed before decoding.
with open('file.html', 'rb') as file:
    raw_data = file.read()

# chardet reports encoding=None when it cannot make a guess (e.g. for an
# empty file). Passing None to open() would silently select the
# locale-dependent default, so fall back to an explicit UTF-8 instead.
result = chardet.detect(raw_data)
file_encoding = result['encoding'] or 'utf-8'

# Re-open in text mode so newlines are normalised the usual way.
# The detected encoding is only a heuristic; errors='replace' keeps a stray
# undecodable byte from aborting the whole conversion.
with open('file.html', 'r', encoding=file_encoding, errors='replace') as file:
    html_content = file.read()

# Parse the HTML with the parser bundled with the standard library.
soup = BeautifulSoup(html_content, 'html.parser')

# Extract the text content of the document, dropping all tags.
text = soup.get_text()

# Write the extracted text; the output is always UTF-8 regardless of input.
with open('output.txt', 'w', encoding='utf-8') as output_file:
    output_file.write(text)

# Report success together with the encoding that was actually used.
print(f"Текст успешно извлечён! Использована кодировка: {file_encoding}")