// usaa24/sk1/compressor.c
//
// File compression routines: LZ77 (compress_1 / decompress_1) and
// Huffman coding (compress_2 / decompress_2).

2025-01-17 18:08:33 +00:00
#include "compressor.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
2025-01-18 12:15:16 +00:00
#include <unistd.h>
2025-01-17 18:08:33 +00:00
#define WINDOW_SIZE 4096 // Sliding-window size in bytes (how far back a match may start)
#define LOOKAHEAD_BUFFER_SIZE 15 // Look-ahead buffer size (upper bound on the length of one match)
// One LZ77 token: a back-reference into already-processed data plus the
// literal byte that follows it.
typedef struct
{
int offset; // distance back from the current position to the start of the match
int length; // number of matched bytes; 0 when the token carries only a literal
char next_char; // literal byte that follows the match
} LZ77Token;
// Number of distinct byte values; used in this file as the size of the
// frequency table, the heap capacity and the dimensions of the code table.
#define MAX_TREE_NODES 256
// Huffman tree node. Leaves carry a byte value; internal nodes created while
// building the tree carry the summed frequency of their two children.
typedef struct Node
{
unsigned char symbol; // byte value (internal nodes are created with symbol 0)
int frequency; // occurrence count of the symbol, or the sum over the subtree
struct Node* left; // child reached on bit 0 (NULL for a leaf)
struct Node* right; // child reached on bit 1 (NULL for a leaf)
} Node;
// Binary min-heap of Huffman nodes ordered by frequency, stored as an
// implicit tree in a fixed-size array (children of index i are 2i+1 and 2i+2).
typedef struct MinHeap
{
Node* nodes[MAX_TREE_NODES]; // heap storage; nodes[0] is the minimum when size > 0
int size; // number of valid entries in nodes
} MinHeap;
// Exchanges the two node pointers that a and b refer to.
void swap_nodes(Node** a, Node** b)
{
    Node* first = *a;
    Node* second = *b;

    *a = second;
    *b = first;
}
2025-01-18 12:15:16 +00:00
// Restores the min-heap property for the subtree rooted at idx by sifting the
// node down: it is repeatedly exchanged with its lower-frequency child until
// neither child has a smaller frequency.
void heapify(MinHeap* heap, int idx)
{
    for (;;)
    {
        const int left_child = 2 * idx + 1;
        const int right_child = 2 * idx + 2;
        int lowest = idx;

        if (left_child < heap->size &&
            heap->nodes[left_child]->frequency < heap->nodes[lowest]->frequency)
        {
            lowest = left_child;
        }
        if (right_child < heap->size &&
            heap->nodes[right_child]->frequency < heap->nodes[lowest]->frequency)
        {
            lowest = right_child;
        }
        if (lowest == idx)
        {
            return; // heap property holds here, nothing left to fix
        }

        // Exchange parent and child, then continue from the child's slot.
        Node* moved = heap->nodes[lowest];
        heap->nodes[lowest] = heap->nodes[idx];
        heap->nodes[idx] = moved;
        idx = lowest;
    }
}
// Removes and returns the node with the smallest frequency (the heap root).
// The last element is moved into the root slot and sifted down with heapify().
// Returns NULL when the heap is empty; without this guard an empty heap would
// read nodes[-1] and leave size at -1.
Node* extract_min(MinHeap* heap)
{
    if (heap->size <= 0)
    {
        return NULL;
    }

    Node* smallest = heap->nodes[0];
    heap->size--;
    heap->nodes[0] = heap->nodes[heap->size];
    heapify(heap, 0);
    return smallest;
}
// Inserts node into the heap. A hole is opened at the end of the array and
// moved towards the root, shifting down every parent whose frequency is
// strictly greater than the new node's; the node is then stored in the hole.
// The caller must ensure the heap holds fewer than MAX_TREE_NODES entries.
void insert_min_heap(MinHeap* heap, Node* node)
{
    int slot = heap->size;
    heap->size = slot + 1;

    while (slot != 0)
    {
        const int parent = (slot - 1) / 2;
        if (!(node->frequency < heap->nodes[parent]->frequency))
        {
            break; // parent is not heavier: the hole is in the right place
        }
        heap->nodes[slot] = heap->nodes[parent];
        slot = parent;
    }

    heap->nodes[slot] = node;
}
MinHeap* create_min_heap()
{
MinHeap* heap = (MinHeap*)malloc(sizeof(MinHeap));
heap->size = 0;
return heap;
}
// Allocates a childless tree node holding the given symbol and frequency.
// Returns NULL if the allocation fails (previously the NULL result of malloc
// was dereferenced). The caller owns the node; free_tree() releases it.
Node* create_node(unsigned char symbol, int frequency)
{
    Node* node = malloc(sizeof *node);
    if (!node)
    {
        return NULL;
    }

    node->symbol = symbol;
    node->frequency = frequency;
    node->left = NULL;
    node->right = NULL;
    return node;
}
// Collapses the heap into a single Huffman tree: the two lowest-frequency
// nodes are repeatedly removed and re-inserted as the children of a new
// internal node whose frequency is their sum. On success at most one node
// (the tree root) remains, in heap->nodes[0].
//
// If an internal node cannot be allocated, the two removed nodes are put back
// and the function returns early; the heap then still holds more than one
// node, which callers can detect through heap->size > 1.
void build_huffman_tree(MinHeap* heap)
{
    while (heap->size > 1)
    {
        Node* left = extract_min(heap);
        Node* right = extract_min(heap);
        Node* parent = create_node(0, left->frequency + right->frequency);
        if (!parent)
        {
            // Keep ownership of both subtrees inside the heap so nothing leaks.
            insert_min_heap(heap, left);
            insert_min_heap(heap, right);
            return;
        }
        parent->left = left;
        parent->right = right;
        insert_min_heap(heap, parent);
    }
}
// Walks the tree depth-first and records, for every leaf, the path from the
// root as a string of '0' (left edge) and '1' (right edge) characters.
//   root  - subtree to walk (must not be NULL)
//   code  - scratch buffer holding the path built so far
//   top   - number of characters of code currently in use
//   codes - output table, indexed by symbol, receiving NUL-terminated strings
// A tree that consists of a single leaf yields the empty string for its symbol.
void build_codes(Node* root, char* code, int top, char codes[MAX_TREE_NODES][MAX_TREE_NODES])
{
    if (root->left == NULL && root->right == NULL)
    {
        // Leaf: terminate the accumulated path and store it for this symbol.
        code[top] = '\0';
        strcpy(codes[root->symbol], code);
        return;
    }

    if (root->left != NULL)
    {
        code[top] = '0';
        build_codes(root->left, code, top + 1, codes);
    }
    if (root->right != NULL)
    {
        code[top] = '1';
        build_codes(root->right, code, top + 1, codes);
    }
}
// Releases every node of the tree rooted at root, children before parent.
// Passing NULL is allowed and does nothing.
void free_tree(Node* root)
{
    if (root == NULL)
    {
        return;
    }

    Node* left_subtree = root->left;
    Node* right_subtree = root->right;

    free_tree(left_subtree);
    free_tree(right_subtree);
    free(root);
}
2025-01-18 12:15:16 +00:00
int compress_2(const char* input_file_name, const char* output_file_name)
2025-01-17 18:08:33 +00:00
{
FILE* input = fopen(input_file_name, "rb");
2025-01-18 12:15:16 +00:00
if (!input)
{
return -1;
}
2025-01-17 18:08:33 +00:00
int freq[MAX_TREE_NODES] = {0};
unsigned char buffer;
2025-01-18 12:15:16 +00:00
// Читаем файл и подсчитываем частоты символов
2025-01-17 18:08:33 +00:00
while (fread(&buffer, 1, 1, input))
{
freq[buffer]++;
}
MinHeap* heap = create_min_heap();
for (int i = 0; i < MAX_TREE_NODES; i++)
{
if (freq[i] > 0)
{
insert_min_heap(heap, create_node((unsigned char)i, freq[i]));
}
}
build_huffman_tree(heap);
char codes[MAX_TREE_NODES][MAX_TREE_NODES] = {0};
char code[MAX_TREE_NODES];
build_codes(heap->nodes[0], code, 0, codes);
2025-01-18 12:15:16 +00:00
// Вернемся в начало входного файла
2025-01-17 18:08:33 +00:00
fseek(input, 0, SEEK_SET);
2025-01-18 12:15:16 +00:00
2025-01-17 18:08:33 +00:00
FILE* output = fopen(output_file_name, "wb");
if (!output)
{
fclose(input);
return -1;
}
2025-01-18 12:15:16 +00:00
// Записываем частоты символов
2025-01-17 18:08:33 +00:00
fwrite(freq, sizeof(freq), 1, output);
unsigned char out_buffer = 0;
int bit_count = 0;
2025-01-18 12:15:16 +00:00
// Записываем закодированные данные
2025-01-17 18:08:33 +00:00
while (fread(&buffer, 1, 1, input))
{
char* symbol_code = codes[buffer];
2025-01-18 12:15:16 +00:00
2025-01-17 18:08:33 +00:00
for (int i = 0; symbol_code[i] != '\0'; i++)
{
out_buffer <<= 1;
if (symbol_code[i] == '1')
{
out_buffer |= 1;
}
bit_count++;
2025-01-18 12:15:16 +00:00
if (bit_count == 8)
2025-01-17 18:08:33 +00:00
{
fwrite(&out_buffer, 1, 1, output);
bit_count = 0;
out_buffer = 0;
}
}
}
if (bit_count > 0)
{
out_buffer <<= (8 - bit_count);
fwrite(&out_buffer, 1, 1, output);
}
2025-01-18 12:15:16 +00:00
fseek(input, 0, SEEK_END);
long sizeIN = ftell(input);
fseek(input, 0, SEEK_SET);
// Записываем реальный размер входного файла для декомпрессии
fwrite(&sizeIN, sizeof(sizeIN), 1, output);
2025-01-17 18:08:33 +00:00
fseek(output, 0, SEEK_END);
int sizeOUT = ftell(output);
fseek(output, 0, SEEK_SET);
fclose(input);
fclose(output);
free_tree(heap->nodes[0]);
free(heap);
return sizeOUT;
}
int decompress_2(const char* input_file_name, const char* output_file_name)
{
FILE* input = fopen(input_file_name, "rb");
2025-01-18 12:15:16 +00:00
if (!input)
{
return -1;
}
2025-01-17 18:08:33 +00:00
int freq[MAX_TREE_NODES];
fread(freq, sizeof(freq), 1, input);
MinHeap* heap = create_min_heap();
for (int i = 0; i < MAX_TREE_NODES; i++)
{
2025-01-18 12:15:16 +00:00
if (freq[i] > 0)
2025-01-17 18:08:33 +00:00
{
insert_min_heap(heap, create_node((unsigned char)i, freq[i]));
}
}
build_huffman_tree(heap);
FILE* output = fopen(output_file_name, "wb");
if (!output)
{
fclose(input);
2025-01-18 12:15:16 +00:00
free_tree(heap->nodes[0]);
free(heap);
2025-01-17 18:08:33 +00:00
return -1;
}
Node* root = heap->nodes[0];
Node* current = root;
2025-01-18 12:15:16 +00:00
// Перемещаемся к концу файла, чтобы прочитать оригинальный размер
fseek(input, -sizeof(long), SEEK_END);
long original_size;
fread(&original_size, sizeof(long), 1, input);
// Вернемся к началу сжатых данных
long data_end = ftell(input) - sizeof(long);
fseek(input, sizeof(freq), SEEK_SET);
long written_bytes = 0;
2025-01-17 18:08:33 +00:00
unsigned char buffer;
2025-01-18 12:15:16 +00:00
int bit_count = 0;
2025-01-17 18:08:33 +00:00
2025-01-18 12:15:16 +00:00
while (ftell(input) < data_end || (bit_count > 0 && written_bytes < original_size))
2025-01-17 18:08:33 +00:00
{
2025-01-18 12:15:16 +00:00
if (bit_count == 0 && fread(&buffer, 1, 1, input) == 1)
2025-01-17 18:08:33 +00:00
{
2025-01-18 12:15:16 +00:00
bit_count = 8;
}
if (bit_count > 0)
{
int bit = (buffer >> (bit_count - 1)) & 1;
bit_count--;
current = bit ? current->right : current->left;
2025-01-17 18:08:33 +00:00
if (!current->left && !current->right)
{
2025-01-18 12:15:16 +00:00
if (written_bytes < original_size)
{
fwrite(&current->symbol, 1, 1, output);
written_bytes++;
}
2025-01-17 18:08:33 +00:00
current = root;
2025-01-18 12:15:16 +00:00
// Остановимся, если записали все байты
if (written_bytes == original_size)
{
break;
}
2025-01-17 18:08:33 +00:00
}
}
}
fseek(output, 0, SEEK_END);
int sizeOUT = ftell(output);
fseek(output, 0, SEEK_SET);
fclose(input);
fclose(output);
free_tree(root);
free(heap);
return sizeOUT;
}
2025-01-18 12:15:16 +00:00
2025-01-17 18:08:33 +00:00
// Функция для записи токена в файл в компактном формате
// Writes one token in its compact 3-byte form: an unsigned short holding the
// offset in the upper bits and the length in the low 4 bits (stored in the
// machine's native byte order), followed by the literal byte.
void write_token(FILE *file, LZ77Token token)
{
    const int offset_bits = token.offset << 4;
    const int length_bits = token.length & 0xF;
    unsigned short packed = offset_bits | length_bits;

    fwrite(&packed, sizeof packed, 1, file);
    fwrite(&token.next_char, sizeof token.next_char, 1, file);
}
// Функция для чтения токена из файла
LZ77Token read_token(FILE *file)
{
LZ77Token token;
unsigned short offset_length;
2025-01-18 18:11:07 +00:00
if (fread(&offset_length, sizeof(unsigned short), 1, file) != 1)
{
token.offset = 0;
token.length = 0;
token.next_char = 0;
return token;
}
2025-01-17 18:08:33 +00:00
token.offset = offset_length >> 4;
token.length = offset_length & 0xF;
2025-01-18 18:11:07 +00:00
if (fread(&token.next_char, sizeof(char), 1, file) != 1)
{
token.next_char = 0;
}
2025-01-17 18:08:33 +00:00
return token;
}
2025-01-18 18:11:07 +00:00
int compress_1(const char* input_file_name, const char* output_file_name) {
2025-01-17 18:08:33 +00:00
FILE *input_file = fopen(input_file_name, "rb");
FILE *output_file = fopen(output_file_name, "wb");
2025-01-18 18:11:07 +00:00
if (!input_file || !output_file) {
perror("Ошибка открытия файла");
2025-01-17 18:08:33 +00:00
return -1;
}
fseek(input_file, 0, SEEK_END);
long file_size = ftell(input_file);
fseek(input_file, 0, SEEK_SET);
2025-01-18 18:11:07 +00:00
if (file_size > 10 * 1024 * 1024) {
fprintf(stderr, "Файл слишком большой (больше 10 МБ)\n");
2025-01-17 18:08:33 +00:00
return -1;
}
char *data = (char*)malloc(file_size);
2025-01-18 18:11:07 +00:00
if (!data) {
perror("Ошибка выделения памяти");
fclose(input_file);
fclose(output_file);
return -1;
}
2025-01-17 18:08:33 +00:00
fread(data, 1, file_size, input_file);
int pos = 0;
2025-01-18 18:11:07 +00:00
while (pos < file_size) {
2025-01-17 18:08:33 +00:00
LZ77Token token = {0, 0, data[pos]};
int max_length = 0;
int max_offset = 0;
int start = (pos - WINDOW_SIZE) > 0 ? (pos - WINDOW_SIZE) : 0;
2025-01-18 18:11:07 +00:00
for (int i = start; i < pos; i++) {
2025-01-17 18:08:33 +00:00
int length = 0;
2025-01-18 18:11:07 +00:00
while (length < LOOKAHEAD_BUFFER_SIZE && pos + length < file_size && data[i + length] == data[pos + length]) {
2025-01-17 18:08:33 +00:00
length++;
}
2025-01-18 18:11:07 +00:00
if (length > max_length) {
2025-01-17 18:08:33 +00:00
max_length = length;
max_offset = pos - i;
}
}
2025-01-18 18:11:07 +00:00
if (max_length > 1) {
2025-01-17 18:08:33 +00:00
token.offset = max_offset;
token.length = max_length;
token.next_char = data[pos + max_length];
pos += max_length + 1;
2025-01-18 18:11:07 +00:00
} else {
2025-01-17 18:08:33 +00:00
pos++;
}
2025-01-18 18:11:07 +00:00
// Записываем токен
2025-01-17 18:08:33 +00:00
write_token(output_file, token);
}
2025-01-18 18:11:07 +00:00
fseek(input_file, 0, SEEK_END);
long sizeIN = ftell(input_file);
fseek(input_file, 0, SEEK_SET);
// Записываем реальный размер входного файла для декомпрессии
fwrite(&sizeIN, sizeof(sizeIN), 1, output_file);
2025-01-17 18:08:33 +00:00
fseek(output_file, 0, SEEK_END);
int sizeOUT = ftell(output_file);
fseek(output_file, 0, SEEK_SET);
free(data);
fclose(input_file);
fclose(output_file);
return sizeOUT;
}
2025-01-18 18:11:07 +00:00
int decompress_1(const char* input_file_name, const char* output_file_name) {
2025-01-17 18:08:33 +00:00
FILE *input_file = fopen(input_file_name, "rb");
FILE *output_file = fopen(output_file_name, "wb");
2025-01-18 18:11:07 +00:00
if (!input_file || !output_file) {
perror("Ошибка открытия файла");
2025-01-17 18:08:33 +00:00
return -1;
}
char *window = (char*)malloc(WINDOW_SIZE);
2025-01-18 18:11:07 +00:00
if (!window) {
perror("Ошибка выделения памяти");
fclose(input_file);
fclose(output_file);
return -1;
}
memset(window, 0, WINDOW_SIZE);
fseek(input_file, -sizeof(long), SEEK_END);
long original_size;
fread(&original_size, sizeof(long), 1, input_file);
int byte_written = 0;
fseek(input_file, 0, SEEK_SET);
2025-01-17 18:08:33 +00:00
int window_pos = 0;
2025-01-18 18:11:07 +00:00
while (1) {
unsigned short offset_length;
char next_char;
2025-01-17 18:08:33 +00:00
2025-01-18 18:11:07 +00:00
// Читаем offset и length (2 байта)
if (fread(&offset_length, sizeof(unsigned short), 1, input_file) != 1) {
break; // Конец файла
}
// Читаем следующий символ (1 байт)
if (fread(&next_char, sizeof(char), 1, input_file) != 1) {
break; // Конец файла
}
// Распаковываем offset и length
int offset = offset_length >> 4; // Старшие 12 бит
int length = offset_length & 0xF; // Младшие 4 бита
// Обрабатываем токен
if (length > 0) {
int start = window_pos - offset;
for (int i = 0; i < length; i++) {
//доп проверка на лишние символы
if (byte_written < original_size) {
char c = window[(start + i) % WINDOW_SIZE];
fputc(c, output_file);
byte_written++;
// Обновляем окно
window[window_pos % WINDOW_SIZE] = c;
window_pos++;
}
2025-01-17 18:08:33 +00:00
}
}
2025-01-18 18:11:07 +00:00
// Записываем следующий символ только если не достигнут конец файла
if (byte_written < original_size) {
fputc(next_char, output_file);
byte_written++;
// Обновляем окно
window[window_pos % WINDOW_SIZE] = next_char;
window_pos++;
}
else
{
break;
}
2025-01-17 18:08:33 +00:00
}
fseek(output_file, 0, SEEK_END);
int sizeOUT = ftell(output_file);
fseek(output_file, 0, SEEK_SET);
free(window);
fclose(input_file);
fclose(output_file);
return sizeOUT;
}