#include <assert.h>
#include <ctype.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// The project header is pulled in whenever it is reachable. The guard only
// lets this translation unit be built stand-alone (e.g. by a unit-test
// harness) when the header is not on the include path.
#if defined(__has_include)
#  if __has_include("compressor.h")
#    include "compressor.h"
#  endif
#else
#  include "compressor.h"
#endif

#define BUFSIZE 1024      // chunk size used when reading the input stream
#define SIZE 256          // number of distinct byte values (fan-out of a trie node)
#define MAX_WORDS 100000  // kept for compatibility; the decoder dictionary now grows on demand

// One LZ78 codeword is written as: '-' <prefix index in decimal> '-' <one raw byte> '\n'

// Trie (a.k.a. prefix tree) used as the LZ78 encoder dictionary.
struct dict {
    int index;
    // A non-NULL slot means "the phrase of this node followed by that byte exists".
    struct dict* characters[SIZE];
};

/**
 * Release a trie node together with all of its descendants.
 * Passing NULL is allowed and does nothing.
 */
void delete(struct dict* tree)
{
    if (tree == NULL) {
        return;
    }
    for (int i = 0; i < SIZE; i++) {
        delete(tree->characters[i]);
    }
    free(tree);
}

/**
 * Render every trie edge as codeword text "-<parent index>-<byte>\n".
 *
 * The text for the child with index k is stored in words[k] as a freshly
 * allocated buffer that the caller must free; the caller must supply an array
 * with room for every index present in the trie. A slot is set to NULL when
 * its buffer cannot be allocated.
 *
 * NOTE: the byte is written raw, so the entry for byte 0 contains an embedded
 * NUL and must not be measured with strlen(). mylz78compress() no longer uses
 * this function; it is kept so existing callers keep working.
 */
void get_words(struct dict* tree, char* words[])
{
    if (tree == NULL) {
        return;
    }
    for (int i = 0; i < SIZE; i++) {
        struct dict *child = tree->characters[i];
        if (child == NULL) {
            continue;
        }
        char *text = NULL;
        int need_size = snprintf(NULL, 0, "-%d-%c\n", tree->index, i);
        if (need_size >= 0) {
            text = calloc((size_t)need_size + 1, sizeof *text);
        }
        if (text != NULL) {
            snprintf(text, (size_t)need_size + 1, "-%d-%c\n", tree->index, i);
        }
        words[child->index] = text;
        get_words(child, words);
    }
}

// Growable byte buffer; lengths are tracked explicitly so NUL bytes are ordinary data.
struct bytebuf {
    char *data;
    size_t len;
    size_t cap;
};

// Make room for `extra` more bytes plus a terminating NUL. Returns 0 on success, -1 on failure.
static int bytebuf_reserve(struct bytebuf *b, size_t extra)
{
    const size_t limit = (size_t)-1;

    if (extra >= limit - b->len) {
        return -1;
    }
    size_t need = b->len + extra + 1;
    if (need <= b->cap) {
        return 0;
    }
    size_t cap = b->cap ? b->cap : 64;
    while (cap < need) {
        cap = (cap > limit / 2) ? need : cap * 2;
    }
    char *grown = realloc(b->data, cap);
    if (grown == NULL) {
        return -1;
    }
    b->data = grown;
    b->cap = cap;
    return 0;
}

// Hand the buffer over to the caller. Returns its length, or -1 (buffer freed, *outbuf NULL).
static int bytebuf_finish(struct bytebuf *b, char **outbuf)
{
    if (b->len > (size_t)INT_MAX || bytebuf_reserve(b, 0) != 0) {
        free(b->data);
        *outbuf = NULL;
        return -1;
    }
    b->data[b->len] = '\0';  // convenience terminator, not counted in the length
    *outbuf = b->data;
    return (int)b->len;
}

// Append one codeword "-<prefix>-<byte>\n". Returns 0 on success, -1 on failure.
static int lz78_emit(struct bytebuf *out, int prefix, unsigned char byte)
{
    char head[32];
    int n = snprintf(head, sizeof head, "-%d-", prefix);

    if (n < 0 || (size_t)n >= sizeof head) {
        return -1;
    }
    if (bytebuf_reserve(out, (size_t)n + 2) != 0) {
        return -1;
    }
    memcpy(out->data + out->len, head, (size_t)n);
    out->len += (size_t)n;
    out->data[out->len++] = (char)byte;
    out->data[out->len++] = '\n';
    return 0;
}

/**
 * LZ78-compress `size` bytes of `buff`.
 *
 * Algorithm overview: https://medium.com/@dbudhrani/how-data-compression-works-exploring-lz78-e97e539138
 * The trie root is the empty phrase with index 0; every new phrase gets the
 * next index and is written out immediately, so codewords appear in index
 * order. If the input ends in the middle of an already known phrase, one
 * extra codeword describing that phrase is appended so nothing is lost.
 *
 * @param buff   input bytes (may contain any byte value); NULL is treated as empty
 * @param size   number of input bytes; values <= 0 are treated as empty
 * @param outbuf receives a newly allocated buffer the caller must free()
 * @return number of bytes in *outbuf, or -1 on allocation failure (*outbuf is NULL)
 */
int mylz78compress(char* buff, int size, char** outbuf)
{
    struct bytebuf out = {0};
    struct dict *root = calloc(1, sizeof *root);
    struct dict *current = root;
    int next_index = 1;
    int failed = (root == NULL);

    if (buff == NULL) {
        size = 0;
    }

    for (int i = 0; !failed && i < size; i++) {
        // Index with the unsigned value: a plain (possibly signed) char would
        // address memory before the table for bytes >= 0x80.
        unsigned char c = (unsigned char)buff[i];
        struct dict *child = current->characters[c];

        if (child != NULL) {
            // Known phrase: keep extending it, unless this is the last byte.
            if (i == size - 1) {
                failed = lz78_emit(&out, current->index, c) != 0;
            }
            current = child;
            continue;
        }

        child = calloc(1, sizeof *child);
        if (child == NULL) {
            failed = 1;
            break;
        }
        child->index = next_index++;
        current->characters[c] = child;
        failed = lz78_emit(&out, current->index, c) != 0;
        current = root;
    }

    delete(root);
    if (failed) {
        free(out.data);
        *outbuf = NULL;
        return -1;
    }
    return bytebuf_finish(&out, outbuf);
}

/**
 * Run-length encode `size` bytes of `buff` as (byte, count) pairs, count in 1..255.
 * Runs longer than 255 are split into several pairs.
 *
 * @param buff   input bytes; NULL is treated as empty
 * @param size   number of input bytes; values <= 0 are treated as empty
 * @param outbuf receives a newly allocated buffer the caller must free()
 * @return number of bytes in *outbuf, or -1 if the buffer cannot be allocated
 *         or the worst-case output (2 * size) would not fit in an int
 */
int myrlcompress(char* buff, int size, char** outbuf)
{
    *outbuf = NULL;
    if (buff == NULL || size < 0) {
        size = 0;
    }
    if (size > (INT_MAX - 1) / 2) {
        return -1;
    }

    // 2 * size is always enough: in the worst case every byte stands alone.
    char *out = calloc(2 * (size_t)size + 1, sizeof *out);
    if (out == NULL) {
        return -1;
    }

    int pos = 0;
    int i = 0;
    while (i < size) {
        unsigned char byte = (unsigned char)buff[i];
        int run = 1;
        while (run < 255 && i + run < size && (unsigned char)buff[i + run] == byte) {
            run++;
        }
        out[pos++] = (char)byte;
        out[pos++] = (char)(unsigned char)run;
        i += run;
    }

    *outbuf = out;
    return pos;
}

// Read the whole stream from its beginning. Returns a malloc'ed buffer, or NULL on failure.
static char *read_stream(FILE *in, size_t *length)
{
    struct bytebuf buf = {0};

    rewind(in);  // has no effect on streams that cannot seek; they are read from the current position
    for (;;) {
        if (bytebuf_reserve(&buf, BUFSIZE) != 0) {
            free(buf.data);
            return NULL;
        }
        size_t got = fread(buf.data + buf.len, sizeof(char), BUFSIZE, in);
        buf.len += got;
        if (got < BUFSIZE) {
            break;
        }
    }
    if (ferror(in)) {
        free(buf.data);
        return NULL;
    }
    buf.data[buf.len] = '\0';
    *length = buf.len;
    return buf.data;
}

typedef int (*codec_stage)(char *, int, char **);

// Read `in`, push the data through two stages and write the result to `out`.
static void run_stages(FILE *in, FILE *out, codec_stage first, codec_stage second,
                       const char *who)
{
    const char *problem = NULL;
    char *input = NULL;
    char *middle = NULL;
    char *result = NULL;
    size_t insize = 0;

    if (in == NULL || out == NULL) {
        problem = "missing stream";
    } else if ((input = read_stream(in, &insize)) == NULL) {
        problem = "cannot read input";
    } else if (insize > (size_t)INT_MAX) {
        problem = "input too large";
    } else {
        int midsize = first(input, (int)insize, &middle);
        int outsize = (midsize < 0) ? -1 : second(middle, midsize, &result);

        if (outsize < 0) {
            problem = "out of memory or output too large";
        } else if (outsize > 0 &&
                   fwrite(result, sizeof(char), (size_t)outsize, out) != (size_t)outsize) {
            problem = "cannot write output";
        }
    }

    if (problem != NULL) {
        fprintf(stderr, "%s: %s\n", who, problem);
    }
    free(input);
    free(middle);
    free(result);
}

/**
 * Compress the whole of `infile` (run-length encoding followed by LZ78) and
 * write the result to `outfile`. Nothing is written for empty input. Failures
 * are reported on stderr because the function has no return value.
 */
void compress(FILE* infile, FILE* outfile)
{
    run_stages(infile, outfile, myrlcompress, mylz78compress, "compress");
}

/**
 * Expand (byte, count) pairs produced by myrlcompress().
 * The count is read as an unsigned value, so runs up to 255 are restored.
 * A dangling final byte without a count is ignored.
 *
 * @param buff   encoded pairs; NULL is treated as empty
 * @param size   number of bytes in buff
 * @param outbuf receives a newly allocated buffer the caller must free()
 * @return number of decoded bytes, or -1 on allocation failure / oversize output
 */
int myrldecompress(char* buff, int size, char** outbuf)
{
    struct bytebuf out = {0};

    *outbuf = NULL;
    if (buff == NULL) {
        size = 0;
    }

    for (int i = 0; i + 1 < size; i += 2) {
        size_t run = (unsigned char)buff[i + 1];

        if (out.len + run > (size_t)INT_MAX || bytebuf_reserve(&out, run) != 0) {
            free(out.data);
            return -1;
        }
        memset(out.data + out.len, (unsigned char)buff[i], run);
        out.len += run;
    }
    return bytebuf_finish(&out, outbuf);
}

// Decoder dictionary entry: the phrase is entry[prefix]'s phrase followed by `last`.
struct lz78_entry {
    size_t prefix;
    size_t length;
    unsigned char last;
};

/**
 * Decode the text produced by mylz78compress().
 *
 * Codewords are parsed by hand within the first `size` bytes, so the input
 * needs no terminating NUL and may carry any byte value as the codeword's
 * character. As with the scanf "\n" directive used previously, any run of
 * whitespace after a codeword is skipped. Decoding stops at the first
 * malformed codeword or at a reference to a phrase that does not exist yet;
 * whatever was decoded up to that point is returned.
 *
 * @param buff   encoded text; NULL is treated as empty
 * @param size   number of bytes in buff
 * @param outbuf receives a newly allocated buffer the caller must free()
 * @return number of decoded bytes, or -1 on allocation failure / oversize output
 */
int mylz78decompress(char* buff, int size, char** outbuf)
{
    struct bytebuf out = {0};
    size_t total = (buff != NULL && size > 0) ? (size_t)size : 0;
    size_t cap = 256;
    size_t count = 1;
    size_t p = 0;
    int failed = 0;

    *outbuf = NULL;
    struct lz78_entry *entries = malloc(cap * sizeof *entries);
    if (entries == NULL) {
        return -1;
    }
    entries[0].prefix = 0;  // entry 0 is the empty phrase
    entries[0].length = 0;
    entries[0].last = 0;

    while (p < total) {
        size_t q = p;
        size_t number = 0;

        if (buff[q] != '-') {
            break;
        }
        q++;
        if (q >= total || buff[q] < '0' || buff[q] > '9') {
            break;
        }
        while (q < total && buff[q] >= '0' && buff[q] <= '9') {
            // Once the value reaches `count` it is invalid anyway; stop
            // accumulating so the arithmetic can never overflow.
            if (number < count) {
                number = number * 10 + (size_t)(buff[q] - '0');
            }
            q++;
        }
        if (number >= count) {
            break;
        }
        if (q + 1 >= total || buff[q] != '-') {
            break;
        }
        q++;
        unsigned char c = (unsigned char)buff[q++];
        while (q < total && isspace((unsigned char)buff[q])) {
            q++;
        }

        if (count == cap) {
            if (cap > (size_t)-1 / (2 * sizeof *entries)) {
                failed = 1;
                break;
            }
            struct lz78_entry *grown = realloc(entries, 2 * cap * sizeof *entries);
            if (grown == NULL) {
                failed = 1;
                break;
            }
            entries = grown;
            cap *= 2;
        }

        size_t length = entries[number].length + 1;
        entries[count].prefix = number;
        entries[count].length = length;
        entries[count].last = c;
        count++;

        if (out.len + length > (size_t)INT_MAX || bytebuf_reserve(&out, length) != 0) {
            failed = 1;
            break;
        }
        // Write the phrase back to front by following the prefix chain.
        size_t w = out.len + length;
        out.data[--w] = (char)c;
        for (size_t k = number; k != 0; k = entries[k].prefix) {
            out.data[--w] = (char)entries[k].last;
        }
        out.len += length;
        p = q;
    }

    free(entries);
    if (failed) {
        free(out.data);
        return -1;
    }
    return bytebuf_finish(&out, outbuf);
}

/**
 * Reverse of compress(): read the whole of `infile`, undo LZ78 and then the
 * run-length encoding, and write the original bytes to `outfile`. Failures
 * are reported on stderr because the function has no return value.
 */
void decompress(FILE* infile, FILE* outfile)
{
    run_stages(infile, outfile, mylz78decompress, myrldecompress, "decompress");
}