// Entropy Deflator // (C) 2009, Andrew Polar under GPL ver. 3. // First version released June 12, 2009 // This version May 2010 // // LICENSE // // This program is free software; you can redistribute it and/or // modify it under the terms of the GNU General Public License as // published by the Free Software Foundation; either version 3 of // the License, or (at your option) any later version. // // This program is distributed in the hope that it will be useful, but // WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // General Public License for more details at // Visit . // // The program demonstrates how to reduce entropy of data without changing the size #include #include #include #include #include ///////auxiliary functions/////////////////////////////// template double calculate_entropy(T* data, int data_size) { int min, max, counter, buffer_size; int* buffer; double entropy; double log2 = log(2.0); double prob; min = data[0]; max = data[0]; for (counter=0; counter max) max = data[counter]; } buffer_size = max - min + 1; buffer = (int*)malloc(buffer_size * sizeof(int)); memset(buffer, 0x00, buffer_size * sizeof(int)); for (counter=0; counter 0) { prob = (double)buffer[counter]; prob /= (double)data_size; entropy += log(prob)/log2*prob; } } entropy *= -1.0; if (buffer) free(buffer); return entropy; } int getFileSize(char* fileName) { FILE* f; if ((f = fopen(fileName, "rb")) == NULL) return -1; fseek(f, 0, SEEK_END); int bytes = ftell(f); fclose(f); return bytes; } //end auxiliary //This is generalization for Move-To-Front algorithm. class Predictor { public: Predictor(); ~Predictor(); void SetZeros(); bool Initialize(unsigned char memory = 8); void ReleaseMemory(); unsigned char deflate(unsigned char symbol); unsigned char inflate(unsigned char position); bool m_bInitialized; private: void ResortSymbols(unsigned char position); void RescaleFrequencies(); unsigned char m_memory; unsigned char *m_symbol, *m_frequency; }; Predictor::Predictor() { } Predictor::~Predictor() { ReleaseMemory(); } void Predictor::ReleaseMemory() { if (m_symbol) free(m_symbol); if (m_frequency) free(m_frequency); } void Predictor::SetZeros() { m_symbol = m_frequency = 0; m_bInitialized = false; } bool Predictor::Initialize(unsigned char memory) { m_symbol = (unsigned char*)malloc(256); m_frequency = (unsigned char*)malloc(256); if (m_symbol == 0 || m_frequency == 0) { printf("Failed to allocate memory.\n"); return false; } for (int i=0; i<256; ++i) { m_frequency[i] = 0; m_symbol[i] = i; } m_memory = memory; m_bInitialized = true; return m_bInitialized; } unsigned char Predictor::deflate(unsigned char symbol) { unsigned char cnt = 0; while (m_symbol[cnt] != symbol) {++cnt;} ResortSymbols(cnt); RescaleFrequencies(); return cnt; } unsigned char Predictor::inflate(unsigned char position) { unsigned char symbol = m_symbol[position]; ResortSymbols(position); RescaleFrequencies(); return symbol; } void Predictor::ResortSymbols(unsigned char position) { ++m_frequency[position]; if (position == 0) { return; } int position_to_switch = -1; for (int i=position-1; i>=0; --i) { if (m_frequency[position] > m_frequency[i]) { position_to_switch = i; } else { break; } } if (position_to_switch < 0) { return; } unsigned char buf = m_symbol[position_to_switch]; m_symbol[position_to_switch] = m_symbol[position]; m_symbol[position] = buf; // buf = m_frequency[position_to_switch]; m_frequency[position_to_switch] = m_frequency[position]; m_frequency[position] = buf; } void Predictor::RescaleFrequencies() { if (m_frequency[0] < m_memory) return; for (int i=0; i<256; ++i) { if (m_frequency[i] > 0) m_frequency[i] >>= 1; else break; } } //end predictor class class EntropyDeflator { public: EntropyDeflator(); ~EntropyDeflator(); bool Initialize(int PRECISION); void ReleaseMemory(); unsigned char deflate(unsigned char symbol); unsigned char inflate(unsigned char symbol); private: int hash_executable(int v); int hash_text(int v); int context_manager(); Predictor* m_predictor; int STAT_SIZE, STAT_BIT_MASK, m_context, m_counter; long long m_ht, m_he, m_ht2, m_he2; bool m_isText; }; EntropyDeflator::EntropyDeflator() { m_predictor = 0; m_context = 0; m_counter = 0; m_isText = false; m_ht = m_he = m_ht2 = m_he2 = 0; } EntropyDeflator::~EntropyDeflator() { ReleaseMemory(); } bool EntropyDeflator::Initialize(int PRECISION) { STAT_SIZE = 1<>5)^v) & STAT_BIT_MASK; } inline int EntropyDeflator::context_manager() { int he = hash_executable(m_context); int ht = hash_text(m_context); //Here I try to identify file according to standard deviation //in values returned by hash functions. It actually works but //in the future I'm thinking to implement much more sophisticated //technique of flexible selection of the best hash function. if (m_counter < 20000) { ++m_counter; m_ht += ht; m_he += he; m_ht2 += ht * ht; m_he2 += he * he; long long m2_ht = 2* m_ht * m_ht - m_ht2; long long m2_he = 2* m_he * m_he - m_he2; //printf("%I64d %I64d\n", m2_ht, m2_he); if (m2_ht < m2_he) m_isText = true; else m_isText = false; if (m_isText) return ht; else return he; } if (m_isText) return ht; else return he; } unsigned char EntropyDeflator::deflate(unsigned char symbol) { int context = context_manager(); if (!m_predictor[context].m_bInitialized) { if (!m_predictor[context].Initialize()) { printf("Failed to allocate memory for predictor, unexpected termination\n"); exit(1); } } unsigned char result = m_predictor[context].deflate(symbol); m_context <<= 8; m_context |= symbol; return result; } unsigned char EntropyDeflator::inflate(unsigned char symbol) { int context = context_manager(); if (!m_predictor[context].m_bInitialized) { if (!m_predictor[context].Initialize()) { printf("Failed to allocate memory for predictor, unexpected termination\n"); exit(1); } } unsigned char result = m_predictor[context].inflate(symbol); m_context <<= 8; m_context |= result; return result; } int main() { char fileName[] = "C:\\Development\\SAMPLES\\world95.txt"; int data_size = getFileSize(fileName); if (data_size < 0) { printf("File not found\n"); return 0; } unsigned char* data = (unsigned char*)malloc(data_size * sizeof(*data)); FILE* f = fopen(fileName, "rb"); fread(data, 1, data_size, f); fclose(f); int precision = 15; //encoding clock_t start = clock(); unsigned char* result = (unsigned char*)malloc(data_size * sizeof(*result)); EntropyDeflator* deflator = new EntropyDeflator(); if (!deflator->Initialize(precision)) { printf("Failed to allocate memory\n"); return 1; } for (int i=0; ideflate(data[i]); } delete deflator; clock_t end = clock(); printf("Encoding, time %2.3f s.\n", (double)(end - start)/CLOCKS_PER_SEC); //end encoding /* just research showing that statistics noticably different after 0 value int stat[256]; for (int i=0; i<256; ++i) stat[i] = 0; int stat1[256]; for (int i=0; i<256; ++i) stat1[i] = 0; int cont = 0; for (int i=0; iInitialize(precision)) { printf("Failed to allocate memory\n"); return 1; } for (int i=0; iinflate(result[i]); } delete inflator; end = clock(); printf("Decoding, time %2.3f s.\n", (double)(end - start)/CLOCKS_PER_SEC); //end decoding bool isOK = true; for (int i=0; i