usaa21/sk2a/compressor.c

280 lines
7.0 KiB
C
Raw Normal View History

2022-01-19 21:30:15 +00:00
#include <assert.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "compressor.h"
#define BUFSIZE 1024
#define SIZE 256
#define MAX_WORDS 100000
// One codeword is written as "-<index of the already known prefix phrase>-<next character>\n"
// Trie (a.k.a prefix tree)
// One node of the dictionary trie; each node other than the root stands for one phrase.
struct dict {
// Phrase number of this node. The compressor numbers new nodes 1, 2, 3, ...
// in creation order; the zero-initialised root keeps index 0 (the empty phrase).
int index;
// Child nodes, indexed by the value of the next character.
// A non-NULL entry means the phrase extended by that character is in the dictionary.
struct dict* characters[SIZE];
};
// Free trie
// Free the whole trie rooted at `tree`, including the `tree` node itself.
// Passing NULL is allowed and does nothing.
// After the call every pointer into the trie is dangling and must not be used.
void delete(struct dict* tree){
    if (tree == NULL){
        return;
    }
    // Release all children first; the node's own memory must stay valid
    // while its child table is still being read.
    for (int i = 0; i < SIZE; i++){
        delete(tree->characters[i]);
    }
    free(tree);
}
// Walk the trie and render one codeword string per node.
//
// For every child `characters[i]` of `tree` the text "-<tree->index>-<i as char>\n"
// is stored in a freshly allocated string at words[child->index]; the walk then
// recurses into the child.
//
// tree  - subtree to walk; NULL is allowed and does nothing.
// words - array indexed by node index; it must have room for every index
//         present in the trie. The caller owns (and must free) the strings.
//
// If a string cannot be allocated the corresponding slot is set to NULL.
void get_words (struct dict* tree,char* words[]){
    if (tree == NULL){
        return;
    }
    for (int i = 0; i < SIZE; i++){
        struct dict *child = tree->characters[i];
        if (child == NULL){
            continue;
        }
        // First call only measures how long the formatted codeword will be.
        int need_size = snprintf(NULL,0,"-%d-%c\n",tree->index,i);
        char *word = NULL;
        if (need_size >= 0){
            word = calloc((size_t)need_size + 1,sizeof(char));
        }
        if (word != NULL){
            snprintf(word,(size_t)need_size + 1,"-%d-%c\n",tree->index,i);
        }
        words[child->index] = word;
        get_words(child,words);
    }
}
// LZ78-compress `size` bytes from `buff` into a newly allocated text buffer.
//
// The output is a sequence of codewords "-<prefix index>-<character>\n", one per
// dictionary phrase in creation order, optionally followed by one final codeword
// for an input tail that was already in the dictionary.
//
// buff   - input bytes.
// size   - number of input bytes.
// outbuf - receives the NUL-terminated result; the caller must free() it.
//          Set to NULL if memory could not be allocated.
//
// Returns the length of the result in bytes (0 on allocation failure).
//
// Limitation: codewords are handled as C strings, so an input byte of value 0
// cannot be represented and truncates its codeword.
int mylz78compress(char* buff,int size,char** outbuf){
    // https://medium.com/@dbudhrani/how-data-compression-works-exploring-lz78-e97e539138
    // The trie represents the current phrase.
    // The first node represents the empty string with index 0.
    struct dict *root = calloc(1,sizeof *root);
    struct dict *current = root;
    char **words = NULL;
    char *last = NULL;
    char *temp = NULL;
    size_t size_needed = 0;
    int index = 1;
    int pos = 0;

    *outbuf = NULL;
    if (root == NULL){
        return 0;
    }

    for (int i = 0; i < size; i++){
        // Index with the unsigned byte value: a plain (possibly signed) char
        // would produce negative indices for bytes >= 0x80.
        int c = (unsigned char)buff[i];
        struct dict *next = current->characters[c];
        if (next != NULL){
            // The character is in the dictionary, keep extending the phrase.
            // If this is also the end of the input, the unfinished phrase
            // still has to be emitted as a final codeword.
            if (i == size - 1){
                int sz_needed = snprintf(NULL,0,"-%d-%c\n",current->index,c);
                if (sz_needed < 0){
                    goto cleanup;
                }
                last = calloc((size_t)sz_needed + 1,sizeof(char));
                if (last == NULL){
                    goto cleanup;
                }
                snprintf(last,(size_t)sz_needed + 1,"-%d-%c\n",current->index,c);
            }
            current = next;
        }
        else {
            // New phrase: add it to the dictionary and restart from the root.
            struct dict *node = calloc(1,sizeof *node);
            if (node == NULL){
                goto cleanup;
            }
            node->index = index;
            current->characters[c] = node;
            index += 1;
            current = root;
        }
    }

    // Render every phrase as text; words[k] is the codeword of phrase k.
    words = calloc((size_t)index,sizeof *words);
    if (words == NULL){
        goto cleanup;
    }
    get_words(root,words);

    for (int i = 1; i < index; i++){
        if (words[i] != NULL){
            size_needed += strlen(words[i]);
        }
    }
    if (last != NULL){
        size_needed += strlen(last);
    }
    if (size_needed > (size_t)INT_MAX){
        // The length could not be reported through the int return value.
        goto cleanup;
    }

    temp = calloc(size_needed + 1,sizeof(char));
    if (temp == NULL){
        goto cleanup;
    }
    for (int i = 1; i < index; i++){
        if (words[i] != NULL){
            size_t len = strlen(words[i]);
            memcpy(temp + pos,words[i],len);
            pos += (int)len;
        }
    }
    if (last != NULL){
        size_t len = strlen(last);
        memcpy(temp + pos,last,len);
        pos += (int)len;
    }
    *outbuf = temp;

cleanup:
    if (words != NULL){
        for (int i = 0; i < index; i++){
            free(words[i]);
        }
    }
    free(words);
    free(last);
    delete(root);
    return pos;
}
// Run-length encode `size` bytes from `buff`.
//
// The output is a sequence of (byte, count) pairs, count being 1..255 stored in
// a single byte; runs longer than 255 are split into several pairs.
//
// buff   - input bytes.
// size   - number of input bytes.
// outbuf - receives a newly allocated buffer of 2 * size + 1 bytes (zero
//          filled past the encoded data); the caller must free() it.
//          Set to NULL if size is negative, too large to encode, or if
//          memory could not be allocated.
//
// Returns the number of bytes written to *outbuf (always even), 0 on failure.
int myrlcompress(char* buff,int size,char** outbuf){
    enum { MAX_RUN = 255 };   // largest count that fits into one output byte

    *outbuf = NULL;
    // 2 * size must still fit into the int return value.
    if (size < 0 || size > INT_MAX / 2){
        return 0;
    }
    // 2 * size is always enough: in the worst case every byte is a run of one.
    char *out = calloc(2 * (size_t)size + 1,sizeof(char));
    if (out == NULL){
        return 0;
    }
    *outbuf = out;

    int pos = 0;
    int i = 0;
    while (i < size){
        // Compare bytes as unsigned char so that every value 0..255 is an
        // ordinary byte and none can be mistaken for a "no run yet" marker.
        unsigned char head = (unsigned char)buff[i];
        int length = 1;
        while (length < MAX_RUN && i + length < size
               && (unsigned char)buff[i + length] == head){
            length += 1;
        }
        out[pos] = (char)head;
        out[pos + 1] = (char)length;
        pos += 2;
        i += length;
    }
    return pos;
}
// Compress the whole content of `infile` and write the result to `outfile`.
//
// The input is read into memory, run-length encoded with myrlcompress() and the
// result is then LZ78 encoded with mylz78compress().
//
// infile  - seekable input stream, read from its beginning.
// outfile - stream the compressed data is written to.
//
// On an I/O or allocation problem a message is printed to stderr and nothing
// (or only part of the data, for a failed write) is written.
void compress(FILE* infile,FILE* outfile){
    if (fseek(infile, 0, SEEK_END) != 0){
        perror("compress: fseek");
        return;
    }
    long filesize = ftell(infile);
    if (filesize < 0){
        perror("compress: ftell");
        return;
    }
    // The run-length stage may need up to 2 bytes per input byte and reports
    // its output size as an int.
    if (filesize > INT_MAX / 2){
        fprintf(stderr,"compress: input is too large\n");
        return;
    }
    rewind(infile);

    char *buffer = calloc((size_t)filesize + 1,sizeof(char));
    if (buffer == NULL){
        perror("compress: calloc");
        return;
    }
    size_t got = fread(buffer,sizeof(char),(size_t)filesize,infile);
    if (ferror(infile)){
        perror("compress: fread");
        free(buffer);
        return;
    }
    int insize = (int)got;

    char *tempbuf = NULL;
    int tempsize = myrlcompress(buffer,insize,&tempbuf);
    char *outbuf = NULL;
    int outsize = mylz78compress(tempbuf,tempsize,&outbuf);
    if (outsize > 0){
        if (fwrite(outbuf,sizeof(char),(size_t)outsize,outfile) != (size_t)outsize){
            perror("compress: fwrite");
        }
    }

    free(outbuf);
    free(tempbuf);
    free(buffer);
}
// Expand run-length encoded data produced as (byte, count) pairs.
//
// buff   - input: pairs of a byte followed by its repeat count (0..255,
//          read as an unsigned byte). A trailing unpaired byte is ignored.
// size   - number of input bytes.
// outbuf - receives a newly allocated, NUL-terminated buffer holding the
//          expanded data; the caller must free() it. Set to NULL if the
//          expanded size does not fit into an int or memory could not be
//          allocated.
//
// Returns the number of expanded bytes, 0 on failure.
int myrldecompress(char* buff,int size,char** outbuf){
    *outbuf = NULL;

    // First pass: the expanded size is the sum of all counts. It can be far
    // larger than the input, so the buffer must be sized from the data.
    size_t total = 0;
    for (int i = 0; i < size - 1; i += 2){
        total += (unsigned char)buff[i + 1];
    }
    if (total > (size_t)INT_MAX){
        return 0;
    }

    char *out = calloc(total + 1,sizeof(char));
    if (out == NULL){
        return 0;
    }

    // Second pass: write each byte `count` times.
    int pos = 0;
    for (int i = 0; i < size - 1; i += 2){
        char c = buff[i];
        // The count is an unsigned byte; a plain char could turn counts
        // above 127 into negative numbers.
        int length = (unsigned char)buff[i + 1];
        for (int j = 0; j < length; j++){
            out[pos] = c;
            pos += 1;
        }
    }
    *outbuf = out;
    return pos;
}
// Decode LZ78 codewords of the form "-<prefix index>-<character>" separated by
// optional whitespace (the compressor writes one per line).
//
// Each codeword defines a new phrase: the phrase with the given index followed
// by the character. Phrase 0 is the empty phrase, new phrases are numbered
// 1, 2, 3, ... in the order they appear. Every new phrase is appended to the
// output.
//
// buff   - NUL-terminated input text; decoding stops at the terminator, at the
//          first malformed codeword, or at a codeword that refers to a phrase
//          that has not been defined yet.
// size   - length of the input; only used as a hint for the initial size of
//          the output buffer.
// outbuf - receives a newly allocated, NUL-terminated buffer with the decoded
//          data; the caller must free() it. Set to NULL only if the initial
//          allocation fails.
//
// Returns the number of decoded bytes. If memory runs out while decoding, the
// data decoded so far is returned.
int mylz78decompress(char* buff,int size,char** outbuf){
    // A phrase is stored as a link to its prefix plus one character instead
    // of a full copy of its text.
    struct phrase {
        int prefix;    // index of the phrase this one extends
        int length;    // number of characters in the whole phrase
        char symbol;   // last character of the phrase
    };

    size_t out_cap = size > 0 ? (size_t)size + 1 : 1;
    size_t dict_cap = 1024;
    char *out = calloc(out_cap,sizeof(char));
    struct phrase *dict = calloc(dict_cap,sizeof *dict);
    int count = 1;     // dict[0] is the empty phrase (all zero from calloc)
    int pos = 0;
    const char *cursor = buff;

    *outbuf = out;
    if (out == NULL || dict == NULL){
        free(dict);
        return 0;
    }

    for (;;){
        // Parse "-<number>-<char>".
        if (*cursor != '-'){
            break;
        }
        char *end = NULL;
        long number = strtol(cursor + 1,&end,10);
        if (end == cursor + 1 || *end != '-' || end[1] == '\0'){
            break;
        }
        char symbol = end[1];
        cursor = end + 2;
        // Skip the separator after the codeword.
        while (*cursor == ' ' || *cursor == '\t' || *cursor == '\n'
               || *cursor == '\v' || *cursor == '\f' || *cursor == '\r'){
            cursor++;
        }

        // A codeword may only refer to a phrase that already exists.
        if (number < 0 || number >= count || count == INT_MAX){
            break;
        }

        if ((size_t)count == dict_cap){
            size_t new_cap = dict_cap * 2;
            struct phrase *grown = realloc(dict,new_cap * sizeof *grown);
            if (grown == NULL){
                break;
            }
            dict = grown;
            dict_cap = new_cap;
        }
        dict[count].prefix = (int)number;
        dict[count].length = dict[number].length + 1;
        dict[count].symbol = symbol;

        int len = dict[count].length;
        if (len > INT_MAX - pos){
            // The total could not be reported through the int return value.
            break;
        }
        // Make room for the phrase and the terminating NUL.
        size_t needed = (size_t)pos + (size_t)len + 1;
        if (needed > out_cap){
            size_t new_cap = out_cap;
            while (new_cap < needed){
                new_cap *= 2;
            }
            char *grown = realloc(out,new_cap);
            if (grown == NULL){
                break;
            }
            out = grown;
            out_cap = new_cap;
        }

        // Following the prefix links yields the phrase from its last
        // character to its first, so fill the output slot back to front.
        int node = count;
        for (int w = pos + len - 1; w >= pos; w--){
            out[w] = dict[node].symbol;
            node = dict[node].prefix;
        }
        pos += len;
        count += 1;
    }

    out[pos] = '\0';
    free(dict);
    *outbuf = out;
    return pos;
}
// Decompress the whole content of `infile` and write the result to `outfile`.
//
// The input is read into memory, LZ78 decoded with mylz78decompress() and the
// result is then run-length expanded with myrldecompress().
//
// infile  - seekable input stream, read from its beginning.
// outfile - stream the decompressed data is written to.
//
// On an I/O or allocation problem a message is printed to stderr and nothing
// (or only part of the data, for a failed write) is written.
void decompress(FILE* infile,FILE* outfile){
    if (fseek(infile, 0, SEEK_END) != 0){
        perror("decompress: fseek");
        return;
    }
    long filesize = ftell(infile);
    if (filesize < 0){
        perror("decompress: ftell");
        return;
    }
    // The decoding stages take the input length as an int.
    if (filesize > INT_MAX - 1){
        fprintf(stderr,"decompress: input is too large\n");
        return;
    }
    rewind(infile);

    // One extra zero byte keeps the buffer NUL-terminated for the text parser.
    char *buffer = calloc((size_t)filesize + 1,sizeof(char));
    if (buffer == NULL){
        perror("decompress: calloc");
        return;
    }
    size_t got = fread(buffer,sizeof(char),(size_t)filesize,infile);
    if (ferror(infile)){
        perror("decompress: fread");
        free(buffer);
        return;
    }
    int insize = (int)got;

    char *tempbuf = NULL;
    int tempsize = mylz78decompress(buffer,insize,&tempbuf);
    char *outbuf = NULL;
    int outsize = myrldecompress(tempbuf,tempsize,&outbuf);
    if (outsize > 0){
        if (fwrite(outbuf,sizeof(char),(size_t)outsize,outfile) != (size_t)outsize){
            perror("decompress: fwrite");
        }
    }

    free(outbuf);
    free(tempbuf);
    free(buffer);
}