#include <assert.h>
#include <ctype.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Project header. The guard only lets this translation unit also be built
 * stand-alone (e.g. by a unit-test harness that does not have the project
 * include path); compilers without __has_include include it unconditionally.
 */
#if defined(__has_include)
#  if __has_include("compressor.h")
#    include "compressor.h"
#  endif
#else
#  include "compressor.h"
#endif

#define BUFSIZE 1024 /* file read chunk and initial capacity of growable buffers */
#define SIZE 256     /* number of distinct byte values, i.e. the trie fan-out */

/*
 * Pipeline:  input --RLE--> (byte, count) pairs --LZ78--> text codewords
 *
 * RLE stage:  every run is stored as two bytes, the value and a count 1..255.
 * LZ78 stage: every codeword is the text "-<prefix index>-<byte>\n", where
 *             <prefix index> is the decimal number of an earlier phrase
 *             (0 = empty phrase) and <byte> is one raw byte appended to it.
 *             Codeword k (counting from 1) defines phrase k.
 *
 * All stage functions hand back a buffer allocated with the C allocator that
 * the caller must free(). On allocation failure (or if the result would not
 * fit in an int) they set *outbuf to NULL and return 0.
 */

/* Longest run a single RLE pair can describe (the count is one byte). */
enum { RLE_MAX_RUN = 255 };

// Trie (a.k.a prefix tree) holding the LZ78 dictionary
struct dict {
    // Phrase number of this node; the root is the empty phrase, number 0
    int index;
    // Child for every possible next byte, NULL when that phrase is unknown
    struct dict* characters[SIZE];
};

/* Growable byte buffer; data[len] is always kept as a terminating NUL. */
struct bytebuf {
    unsigned char *data;
    size_t len;
    size_t cap;
};

/* One decoder dictionary entry: phrase = phrase[parent] followed by tail. */
struct lzentry {
    int parent;
    int length;
    unsigned char tail;
};

/* Signature shared by all four stage functions. */
typedef int (*stage_fn)(char *buff, int size, char **outbuf);

/*
 * Make room for `extra` more bytes plus the terminating NUL.
 * Returns 1 on success, 0 if memory is exhausted or the buffer would grow
 * past what the int-returning public API can report.
 */
static int bytebuf_reserve(struct bytebuf *buf, size_t extra)
{
    const size_t limit = (size_t)INT_MAX - 1;

    if (buf->len > limit || extra > limit - buf->len) {
        return 0;
    }
    size_t need = buf->len + extra + 1;
    if (need <= buf->cap) {
        return 1;
    }
    size_t cap = buf->cap != 0 ? buf->cap : BUFSIZE;
    while (cap < need) {
        cap *= 2;
    }
    unsigned char *grown = realloc(buf->data, cap);
    if (grown == NULL) {
        return 0;
    }
    buf->data = grown;
    buf->cap = cap;
    return 1;
}

/* Append one LZ78 codeword "-<prefix>-<c>\n". Returns 1 on success, else 0. */
static int emit_codeword(struct bytebuf *out, int prefix, unsigned char c)
{
    char head[32];
    int n = snprintf(head, sizeof head, "-%d-", prefix);

    if (n < 0 || (size_t)n >= sizeof head) {
        return 0;
    }
    if (!bytebuf_reserve(out, (size_t)n + 2)) {
        return 0;
    }
    memcpy(out->data + out->len, head, (size_t)n);
    out->len += (size_t)n;
    out->data[out->len++] = c;
    out->data[out->len++] = '\n';
    out->data[out->len] = '\0';
    return 1;
}

// Free the trie: all descendants first, then the node itself
void delete(struct dict* tree){
    if (tree == NULL){
        return;
    }
    for (int i = 0; i < SIZE; i++){
        delete(tree->characters[i]);
    }
    free(tree);
}

/*
 * Store the codeword text of every non-root node of `tree` into
 * words[node->index] as a freshly allocated string "-<parent index>-<byte>\n".
 *
 * `words` must have room for every index present in the trie; the caller owns
 * (and frees) the strings. A slot is set to NULL if its allocation fails.
 * Note: for the byte value 0 the string contains an embedded NUL, so its
 * length cannot be recovered with strlen().
 */
void get_words (struct dict* tree,char* words[]){
    if (tree == NULL){
        return;
    }
    for (int i = 0; i < SIZE; i++){
        struct dict *child = tree->characters[i];
        if (child == NULL){
            continue;
        }
        char *word = NULL;
        int need_size = snprintf(NULL, 0, "-%d-%c\n", tree->index, i);
        if (need_size >= 0){
            word = calloc((size_t)need_size + 1, sizeof(char));
        }
        if (word != NULL){
            snprintf(word, (size_t)need_size + 1, "-%d-%c\n", tree->index, i);
        }
        words[child->index] = word;
        get_words(child, words);
    }
}

/*
 * LZ78-encode `size` bytes of `buff` into text codewords.
 *
 * The trie holds every phrase seen so far; `current` is the node of the
 * phrase matched up to now. A byte that extends the match just descends; a
 * byte that does not creates a new phrase, emits its codeword and restarts
 * at the root. Codewords are emitted in phrase-number order, which is the
 * order the decoder assigns numbers in.
 *
 * Returns the number of bytes stored in *outbuf (which is additionally
 * NUL-terminated). Input may contain any byte value, including 0.
 */
int mylz78compress(char* buff,int size,char** outbuf){
    // https://medium.com/@dbudhrani/how-data-compression-works-exploring-lz78-e97e539138
    struct bytebuf out = { NULL, 0, 0 };
    struct dict *root = calloc(1, sizeof *root);
    struct dict *current = root;
    int next_index = 1;
    int ok = root != NULL && bytebuf_reserve(&out, 0);

    if (ok){
        out.data[0] = '\0';
    }
    for (int i = 0; ok && i < size; i++){
        // Index with the unsigned value: plain char may be negative
        unsigned char c = (unsigned char)buff[i];
        struct dict *child = current->characters[c];

        if (child != NULL){
            // Input ends inside a known phrase: emit it as (prefix, last byte)
            if (i == size - 1){
                ok = emit_codeword(&out, current->index, c);
            }
            current = child;
        } else {
            child = calloc(1, sizeof *child);
            if (child == NULL){
                ok = 0;
                break;
            }
            child->index = next_index++;
            current->characters[c] = child;
            ok = emit_codeword(&out, current->index, c);
            current = root;
        }
    }
    delete(root);
    if (!ok){
        free(out.data);
        *outbuf = NULL;
        return 0;
    }
    *outbuf = (char *)out.data;
    return (int)out.len;
}

/*
 * Run-length encode `size` bytes of `buff` as (byte, count) pairs with
 * count in 1..255; longer runs are split into several pairs.
 *
 * Returns the number of bytes written to *outbuf (always even).
 */
int myrlcompress(char* buff,int size,char** outbuf){
    const unsigned char *in = (const unsigned char *)buff;

    *outbuf = NULL;
    if (size < 0){
        size = 0;
    }
    if (size > (INT_MAX - 1) / 2){
        return 0;
    }
    // 2 * size is always enough: in the worst case every byte is a run of one
    unsigned char *out = calloc(2 * (size_t)size + 1, sizeof *out);
    if (out == NULL){
        return 0;
    }
    int pos = 0;
    int i = 0;
    while (i < size){
        unsigned char head = in[i];
        int run = 1;
        while (run < RLE_MAX_RUN && i + run < size && in[i + run] == head){
            run += 1;
        }
        out[pos] = head;
        out[pos + 1] = (unsigned char)run;
        pos += 2;
        i += run;
    }
    *outbuf = (char *)out;
    return pos;
}

/*
 * Read the whole stream from its beginning into a NUL-terminated buffer.
 * Reads in chunks, so it does not rely on ftell() reporting a size.
 * Returns the buffer (caller frees) and stores its length in *size,
 * or returns NULL on read error, memory exhaustion or oversized input.
 */
static char *read_stream(FILE *infile, int *size)
{
    struct bytebuf in = { NULL, 0, 0 };

    // Fails harmlessly on non-seekable streams; also clears the error flag
    rewind(infile);
    for (;;) {
        if (!bytebuf_reserve(&in, BUFSIZE)) {
            free(in.data);
            return NULL;
        }
        size_t got = fread(in.data + in.len, sizeof(char), BUFSIZE, infile);
        in.len += got;
        if (got < BUFSIZE) {
            break;
        }
    }
    if (ferror(infile)) {
        free(in.data);
        return NULL;
    }
    in.data[in.len] = '\0';
    *size = (int)in.len;
    return (char *)in.data;
}

/*
 * Read infile, pass it through `first` and then `second`, and write the
 * result to outfile. Problems are reported on stderr, prefixed with `who`.
 */
static void run_stages(FILE *infile, FILE *outfile,
                       stage_fn first, stage_fn second, const char *who)
{
    char *middle = NULL;
    char *result = NULL;
    int insize = 0;
    int midsize = 0;
    int outsize = 0;
    char *input = read_stream(infile, &insize);

    if (input == NULL) {
        fprintf(stderr, "%s: cannot read input\n", who);
        return;
    }
    midsize = first(input, insize, &middle);
    if (middle != NULL) {
        outsize = second(middle, midsize, &result);
    }
    if (middle == NULL || result == NULL) {
        fprintf(stderr, "%s: out of memory\n", who);
    } else if (outsize > 0 &&
               fwrite(result, sizeof(char), (size_t)outsize, outfile) != (size_t)outsize) {
        fprintf(stderr, "%s: cannot write output\n", who);
    }
    free(result);
    free(middle);
    free(input);
}

/* Compress all of infile (RLE, then LZ78) and write the result to outfile. */
void compress(FILE* infile,FILE* outfile){
    run_stages(infile, outfile, myrlcompress, mylz78compress, "compress");
}

/*
 * Expand `size` bytes of (byte, count) pairs from `buff`.
 * A trailing unpaired byte is ignored; a count of 0 produces nothing.
 *
 * Returns the number of bytes stored in *outbuf (which is additionally
 * NUL-terminated).
 */
int myrldecompress(char* buff,int size,char** outbuf){
    // Counts are unsigned bytes: a signed char would turn 128..255 negative
    const unsigned char *in = (const unsigned char *)buff;
    int pairs = size > 0 ? size / 2 : 0;
    size_t total = 0;

    *outbuf = NULL;
    // The output can be far larger than the input, so size it exactly first
    for (int p = 0; p < pairs; p++){
        total += in[2 * p + 1];
        if (total > (size_t)INT_MAX - 1){
            return 0;
        }
    }
    unsigned char *out = calloc(total + 1, sizeof *out);
    if (out == NULL){
        return 0;
    }
    size_t pos = 0;
    for (int p = 0; p < pairs; p++){
        size_t length = in[2 * p + 1];
        memset(out + pos, in[2 * p], length);
        pos += length;
    }
    *outbuf = (char *)out;
    return (int)pos;
}

/*
 * Decode LZ78 text codewords ("-<prefix index>-<byte>\n") from the first
 * `size` bytes of `buff`.
 *
 * Phrases are kept as (parent, last byte) links instead of full strings, so
 * any byte value (including 0) survives, and the table grows on demand.
 * Decoding stops at the first malformed codeword or at a prefix index that
 * has not been defined yet; everything decoded up to there is returned.
 *
 * Returns the number of bytes stored in *outbuf (which is additionally
 * NUL-terminated).
 */
int mylz78decompress(char* buff,int size,char** outbuf){
    const unsigned char *in = (const unsigned char *)buff;
    struct bytebuf out = { NULL, 0, 0 };
    size_t cap = BUFSIZE;
    size_t count = 1;
    struct lzentry *table = malloc(cap * sizeof *table);
    int ok = table != NULL && bytebuf_reserve(&out, 0);
    int pos = 0;

    if (ok){
        out.data[0] = '\0';
        // Phrase 0 is the empty phrase
        table[0].parent = 0;
        table[0].length = 0;
        table[0].tail = 0;
    }
    while (ok && pos < size){
        int cur = pos;
        size_t number = 0;
        int digits = 0;

        if (in[cur] != '-'){
            break;
        }
        cur += 1;
        while (cur < size && in[cur] >= '0' && in[cur] <= '9'){
            number = number * 10 + (size_t)(in[cur] - '0');
            if (number >= count){
                break;
            }
            cur += 1;
            digits += 1;
        }
        // Need a known prefix, the second '-' and one payload byte after it
        if (digits == 0 || number >= count || cur + 1 >= size || in[cur] != '-'){
            break;
        }
        unsigned char c = in[cur + 1];
        cur += 2;
        // The separator is "\n"; accept any run of whitespace as sscanf would
        while (cur < size && isspace(in[cur])){
            cur += 1;
        }

        if (count == cap){
            if (cap > (size_t)-1 / sizeof *table / 2){
                ok = 0;
                break;
            }
            struct lzentry *grown = realloc(table, 2 * cap * sizeof *table);
            if (grown == NULL){
                ok = 0;
                break;
            }
            table = grown;
            cap *= 2;
        }

        int length = table[number].length + 1;
        // If the output would not fit, allocate more
        if (!bytebuf_reserve(&out, (size_t)length)){
            ok = 0;
            break;
        }
        table[count].parent = (int)number;
        table[count].length = length;
        table[count].tail = c;

        // Walking the parent links yields the phrase back to front
        size_t write = out.len + (size_t)length;
        for (size_t k = count; k != 0; k = (size_t)table[k].parent){
            write -= 1;
            out.data[write] = table[k].tail;
        }
        out.len += (size_t)length;
        out.data[out.len] = '\0';

        count += 1;
        pos = cur;
    }
    free(table);
    if (!ok){
        free(out.data);
        *outbuf = NULL;
        return 0;
    }
    *outbuf = (char *)out.data;
    return (int)out.len;
}

/* Decompress all of infile (LZ78, then RLE) and write the result to outfile. */
void decompress(FILE* infile,FILE* outfile){
    run_stages(infile, outfile, mylz78decompress, myrldecompress, "decompress");
}