book dataset was generated

This commit is contained in:
Andrii Pervashov 2024-11-08 17:59:22 +01:00
parent 6137eeef63
commit 794c7d4c4d
2 changed files with 12719 additions and 3 deletions

View File

@@ -1,10 +1,10 @@
import csv import csv
# Открываем файлы с правильными и неправильными строками # Открываем файлы с правильными и неправильными строками
with open('slovak_no_caps.txt', 'r', encoding='utf-8', errors='replace') as correct_file: with open('cleaned_book.txt', 'r', encoding='utf-8', errors='replace') as correct_file:
correct_lines = correct_file.readlines() correct_lines = correct_file.readlines()
with open('no_slovak_no_caps.txt', 'r', encoding='utf-8', errors='replace') as incorrect_file: with open('book_no_slovak_output.txt', 'r', encoding='utf-8', errors='replace') as incorrect_file:
incorrect_lines = incorrect_file.readlines() incorrect_lines = incorrect_file.readlines()
# Убедимся, что оба списка строк имеют одинаковую длину # Убедимся, что оба списка строк имеют одинаковую длину
@@ -15,7 +15,7 @@ correct_lines += [''] * (max_length - len(correct_lines))
incorrect_lines += [''] * (max_length - len(incorrect_lines)) incorrect_lines += [''] * (max_length - len(incorrect_lines))
# Открываем CSV файл для записи # Открываем CSV файл для записи
with open('dataset.csv', 'w', newline='', encoding='utf-8') as csvfile: with open('dataset_book.csv', 'w', newline='', encoding='utf-8') as csvfile:
fieldnames = ['correct', 'incorrect'] fieldnames = ['correct', 'incorrect']
writer = csv.DictWriter(csvfile, fieldnames=fieldnames) writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

12716
dataset_book.csv Normal file

File diff suppressed because it is too large Load Diff