using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;

namespace ConceptualProximity
{
    /// <summary>
    /// Bit-level encoder/decoder for 32-bit integers. Each integer is split into
    /// a "high" part (<c>x &gt;&gt; 8</c>) and a "low" byte (<c>x &amp; 0xff</c>).
    /// High parts 0..14 are written with a fixed prefix code; any other high part
    /// is written as the escape code followed by three raw bytes. The low byte is
    /// always written raw. Encoded bytes are kept in an internal queue that can be
    /// saved to, or loaded from, a file.
    /// </summary>
    class PrimitiveHuffman
    {
        // Upper bound kept on the padding counter. It only has to exceed the largest
        // number of unread bits the holding buffer can contain (less than 16).
        private const int MaxTrackedPaddingBits = 32;

        // Code length in bits for each table entry; the last entry is the escape code.
        private int[] m_len = { 1, 3, 4, 4, 4, 5, 5, 5, 5, 6, 7, 7, 7, 7, 7, 7 };

        // Code value for each table entry, derived from m_len by makeValues().
        private int[] m_val = null;

        // Decoding tables indexed by the next m_max_code_len bits of the stream.
        private int[] m_lookupLen;
        private int[] m_lookupVal;

        // Encoded bytes: appended by the encoder, consumed by the decoder and by saveData().
        private Queue<byte> m_bQueue = new Queue<byte>();

        // Number of bits in m_bit_holding_buffer not yet written (encoding) or consumed (decoding).
        private int m_bit_counter;
        private int m_bit_holding_buffer;
        private int m_max_code_len;

        // Number of zero bits appended to the holding buffer since the last real byte,
        // because the queue was empty. They always occupy the low end of the buffer.
        private int m_padding_bits;

        /// <summary>Creates a coder with its code table already built.</summary>
        public PrimitiveHuffman()
        {
            // Build the table eagerly so encodeSymbol() cannot dereference a null m_val.
            makeValues();
        }

        /// <summary>
        /// Derives the code values from the code lengths: each code is the previous
        /// code plus one, shifted left by the growth in length. Does nothing if the
        /// values were already computed.
        /// </summary>
        private void makeValues()
        {
            if (m_val != null)
            {
                return;
            }

            m_val = new int[m_len.Length];
            m_val[0] = 0;
            for (int i = 1; i < m_len.Length; ++i)
            {
                m_val[i] = (m_val[i - 1] + 1) << (m_len[i] - m_len[i - 1]);
            }
        }

        /// <summary>Resets the bit buffer and empties the byte queue before encoding.</summary>
        public void initializeEncoding()
        {
            m_bit_counter = 0;
            m_bit_holding_buffer = 0;
            m_padding_bits = 0;
            m_bQueue.Clear();
            makeValues();
        }

        /// <summary>
        /// Appends the low <paramref name="codeLen"/> bits of <paramref name="code"/>
        /// to the stream and moves every completed byte into the queue.
        /// </summary>
        private void writeBits(int codeLen, int code)
        {
            m_bit_holding_buffer <<= codeLen;
            m_bit_holding_buffer |= code;
            m_bit_counter += codeLen;
            while (m_bit_counter >= 8)
            {
                // Only the low 8 bits of the shifted value are wanted; the truncation is intentional.
                m_bQueue.Enqueue(unchecked((byte)(m_bit_holding_buffer >> (m_bit_counter - 8))));
                m_bit_counter -= 8;
            }
        }

        /// <summary>
        /// Pads the pending bits with zeros up to a byte boundary and queues the byte.
        /// NOTE(review): when no bits are pending this still appends one full zero byte;
        /// kept as-is so the produced bytes do not change.
        /// </summary>
        public void flushBuffer()
        {
            writeBits(8 - m_bit_counter, 0);
        }

        /// <summary>Encodes one integer into the bit stream.</summary>
        /// <param name="x">Value to encode. Negative values always take the escape path.</param>
        public void encodeSymbol(int x)
        {
            int escapeIndex = m_val.Length - 1;
            int high = x >> 8;
            high &= 0xfffffff;
            int low = x & 0xff;

            if (high < escapeIndex)
            {
                writeBits(m_len[high], m_val[high]);
            }
            else
            {
                // Escape code followed by the low 24 bits of the high part, most significant byte first.
                writeBits(m_len[escapeIndex], m_val[escapeIndex]);
                writeBits(8, (high >> 16) & 0xff);
                writeBits(8, (high >> 8) & 0xff);
                writeBits(8, high & 0xff);
            }

            writeBits(8, low);
        }

        /// <summary>
        /// Resets the bit buffer and builds the lookup tables used by
        /// <see cref="decodeSymbol"/>. Does not touch the byte queue.
        /// </summary>
        public void initializeDecoding()
        {
            makeValues();

            m_max_code_len = 0;
            for (int i = 0; i < m_len.Length; ++i)
            {
                if (m_max_code_len < m_len[i])
                {
                    m_max_code_len = m_len[i];
                }
            }

            m_bit_holding_buffer = 0;
            m_bit_counter = 0;
            m_padding_bits = 0;

            // One table slot for every possible value of the next m_max_code_len bits.
            int tableSize = 1 << m_max_code_len;
            m_lookupVal = new int[tableSize];
            m_lookupLen = new int[tableSize];

            for (int i = 0; i < m_val.Length; ++i)
            {
                // Entry i owns every slot from its own left-aligned code up to the
                // left-aligned code of the next entry (or the end of the table).
                int first = m_val[i] << (m_max_code_len - m_len[i]);
                int end = (i + 1 < m_val.Length)
                    ? (m_val[i + 1] << (m_max_code_len - m_len[i + 1]))
                    : tableSize;
                for (int k = first; k < end; ++k)
                {
                    m_lookupVal[k] = i;
                    m_lookupLen[k] = m_len[i];
                }
            }
        }

        /// <summary>
        /// Returns the next <paramref name="bit_len"/> bits of the stream without
        /// consuming them; the caller consumes bits by decreasing m_bit_counter.
        /// When the queue is empty the stream is extended with zero bits, and the
        /// number of such padding bits is recorded.
        /// </summary>
        private int readBits(int bit_len)
        {
            while (m_bit_counter < bit_len)
            {
                m_bit_holding_buffer <<= 8;
                if (m_bQueue.Count > 0)
                {
                    m_bit_holding_buffer |= m_bQueue.Dequeue();
                    // Real data arrived, so nothing after it is padding.
                    m_padding_bits = 0;
                }
                else if (m_padding_bits < MaxTrackedPaddingBits)
                {
                    m_padding_bits += 8;
                }
                m_bit_counter += 8;
            }

            return (m_bit_holding_buffer >> (m_bit_counter - bit_len)) & ((1 << bit_len) - 1);
        }

        /// <summary>Decodes the next integer from the bit stream.</summary>
        /// <returns>
        /// The decoded value. If the queue ran out of data the missing bits are read
        /// as zeros; use <see cref="isPastEndOfData"/> to detect that.
        /// </returns>
        /// <exception cref="InvalidOperationException">
        /// <see cref="initializeDecoding"/> has not been called.
        /// </exception>
        public int decodeSymbol()
        {
            if (m_lookupLen == null)
            {
                throw new InvalidOperationException("initializeDecoding() must be called before decodeSymbol().");
            }

            int valRead = readBits(m_max_code_len);
            int len = m_lookupLen[valRead];
            int high = m_lookupVal[valRead];
            m_bit_counter -= len;

            if (high >= m_val.Length - 1)
            {
                // Escape: the high part follows as three raw bytes. The escape index itself
                // ends up above bit 31 after the final shift below and is discarded.
                for (int i = 0; i < 3; ++i)
                {
                    int x = readBits(8);
                    m_bit_counter -= 8;
                    high <<= 8;
                    high |= x;
                }
            }

            valRead = readBits(8);
            m_bit_counter -= 8;
            return (high << 8) | valRead;
        }

        /// <summary>
        /// Tells whether decoding has consumed bits that were not present in the data,
        /// i.e. zero bits supplied because the byte queue was empty.
        /// </summary>
        /// <returns>true if the most recently decoded symbol used such padding bits.</returns>
        public bool isPastEndOfData()
        {
            // Padding sits at the low end of the buffer; if fewer bits remain unread
            // than were padded, some of the padding has been consumed.
            return m_bit_counter < m_padding_bits;
        }

        /// <summary>Returns the number of bytes currently held in the queue.</summary>
        public int getSize()
        {
            return m_bQueue.Count;
        }

        /// <summary>
        /// Writes all queued bytes to <paramref name="fileName"/>, replacing any existing
        /// file. The queue is empty afterwards.
        /// </summary>
        /// <returns>true when the data was written; I/O failures surface as exceptions.</returns>
        public bool saveData(string fileName)
        {
            if (File.Exists(fileName))
            {
                File.Delete(fileName);
            }

            using (FileStream fileStream = File.Open(fileName, FileMode.Create, FileAccess.Write))
            {
                int nSize = m_bQueue.Count;
                for (int i = 0; i < nSize; ++i)
                {
                    fileStream.WriteByte(m_bQueue.Dequeue());
                }
            }

            return true;
        }

        /// <summary>
        /// Replaces the queue content with the bytes of <paramref name="fileName"/>.
        /// </summary>
        /// <returns>false if the file does not exist (the queue is left untouched); otherwise true.</returns>
        public bool readData(string fileName)
        {
            if (!File.Exists(fileName))
            {
                return false;
            }

            using (FileStream fileStream = File.Open(fileName, FileMode.Open, FileAccess.Read))
            {
                m_bQueue.Clear();

                const int ChunkSize = 1024;
                byte[] chunk = new byte[ChunkSize];
                int bytesRead;
                while ((bytesRead = fileStream.Read(chunk, 0, ChunkSize)) > 0)
                {
                    for (int i = 0; i < bytesRead; ++i)
                    {
                        m_bQueue.Enqueue(chunk[i]);
                    }
                }
            }

            return true;
        }
    }

    /// <summary>
    /// Ranks the encoded files of a folder by the distance between their decoded
    /// integer sequences and the sequence of one chosen file, and prints the closest ones.
    /// </summary>
    class Program
    {
        // Distances at or above this value count as "remote" in getDistance().
        private const int RemoteDistanceThreshold = 1000;

        // Scan position inside the longer array; shared between successive
        // getClosestDist() calls and reset by getDistance().
        static int m_pos;

        /// <summary>
        /// Advances the shared scan position through <paramref name="b"/> for as long as
        /// the next element is strictly closer to <paramref name="a"/>, and returns the
        /// smallest distance reached.
        /// NOTE(review): this finds the true nearest element only if both sequences are
        /// ascending, which the cumulative sums built by the caller suggest - confirm.
        /// </summary>
        static int getClosestDist(int a, int[] b)
        {
            int best = Math.Abs(a - b[m_pos]);
            while (m_pos < b.Length - 1)
            {
                int next = Math.Abs(a - b[m_pos + 1]);
                if (next >= best)
                {
                    break;
                }
                best = next;
                ++m_pos;
            }
            return best;
        }

        /// <summary>
        /// Computes a dissimilarity between two sequences: the mean closest distance of
        /// the shorter sequence's points that lie within the threshold of the longer one,
        /// scaled up by (all points / close points).
        /// </summary>
        /// <returns>1000.0 when no point of the shorter sequence is close (including when it is empty).</returns>
        static double getDistance(int[] a, int[] b)
        {
            int[] min;
            int[] max;
            if (a.Length < b.Length)
            {
                min = a;
                max = b;
            }
            else
            {
                min = b;
                max = a;
            }

            m_pos = 0;
            double sumOfDistances = 0.0;
            int countForClosePoints = 0;
            int countForRemotePoints = 0;
            for (int i = 0; i < min.Length; ++i)
            {
                int dist = getClosestDist(min[i], max);
                if (dist < RemoteDistanceThreshold)
                {
                    sumOfDistances += dist;
                    ++countForClosePoints;
                }
                else
                {
                    ++countForRemotePoints;
                }
            }

            if (countForClosePoints == 0)
            {
                return 1000.0;
            }

            double distance = sumOfDistances / (double)countForClosePoints;
            return distance * (double)(countForClosePoints + countForRemotePoints) / (double)countForClosePoints;
        }

        /// <summary>
        /// Lists every file under <paramref name="rootFolder"/> (subfolders included),
        /// largest first.
        /// </summary>
        /// <returns>
        /// Paths relative to <paramref name="rootFolder"/>, so that combining them with the
        /// root yields an existing file; for files directly in the root this is the bare
        /// file name. Returns null if the folder does not exist.
        /// </returns>
        static string[] getSortedListOfFiles(string rootFolder)
        {
            DirectoryInfo dirInfo = new DirectoryInfo(rootFolder);
            if (!dirInfo.Exists)
            {
                Console.WriteLine("Folder {0} not exists.", rootFolder);
                return null;
            }

            string rootFull = dirInfo.FullName;
            FileInfo[] fiArray = dirInfo.GetFiles("*.*", SearchOption.AllDirectories);
            string[] names = new string[fiArray.Length];
            long[] sizes = new long[fiArray.Length];
            for (int i = 0; i < fiArray.Length; ++i)
            {
                string full = fiArray[i].FullName;
                if (full.StartsWith(rootFull, StringComparison.OrdinalIgnoreCase))
                {
                    // Keep the subfolder part; a bare name would point at the wrong file.
                    names[i] = full.Substring(rootFull.Length)
                                   .TrimStart(Path.DirectorySeparatorChar, Path.AltDirectorySeparatorChar);
                }
                else
                {
                    names[i] = fiArray[i].Name;
                }
                sizes[i] = fiArray[i].Length;
            }

            // Ascending by size, then reversed to get the largest files first.
            Array.Sort(sizes, names);
            Array.Reverse(names);
            return names;
        }

        /// <summary>
        /// Decodes a whole sequence: the first symbol is the starting value and every
        /// following symbol is an increment. Stops at the first negative symbol or when
        /// the data runs out.
        /// </summary>
        private static List<int> decodeSequence(PrimitiveHuffman decoder)
        {
            List<int> values = new List<int>();

            int x = decoder.decodeSymbol();
            if (decoder.isPastEndOfData())
            {
                // Not even one complete symbol is available.
                return values;
            }
            values.Add(x);

            while (true)
            {
                int y = decoder.decodeSymbol();
                // Without the end-of-data check a missing terminator would make this loop endless.
                if (y < 0 || decoder.isPastEndOfData())
                {
                    break;
                }
                x += y;
                values.Add(x);
            }

            return values;
        }

        /// <summary>
        /// Finds the files under <paramref name="rootFolder"/> whose decoded sequences are
        /// closest to the sequence stored in <paramref name="fileName"/>.
        /// </summary>
        /// <param name="rootFolder">Folder holding the encoded files.</param>
        /// <param name="fileName">Reference file, relative to <paramref name="rootFolder"/>.</param>
        /// <param name="nHowMany">Maximum number of results.</param>
        /// <returns>
        /// Up to <paramref name="nHowMany"/> relative file paths, closest first, or null if
        /// the folder or the reference file is missing.
        /// </returns>
        static string[] findClosestWords(string rootFolder, string fileName, int nHowMany)
        {
            string[] filesList = getSortedListOfFiles(rootFolder);
            if (filesList == null)
            {
                return null;
            }

            string fullFileName = Path.Combine(rootFolder, fileName);
            if (!File.Exists(fullFileName))
            {
                Console.WriteLine("File {0} not exists.", fullFileName);
                return null;
            }

            PrimitiveHuffman phMain = new PrimitiveHuffman();
            phMain.initializeDecoding();
            if (!phMain.readData(fullFileName))
            {
                Console.WriteLine("File {0} not exists.", fullFileName);
                return null;
            }

            // The reference sequence does not change, so convert it once.
            int[] main = decodeSequence(phMain).ToArray();

            double[] distances = new double[filesList.Length];
            for (int i = 0; i < filesList.Length; ++i)
            {
                PrimitiveHuffman phCurrent = new PrimitiveHuffman();
                phCurrent.initializeDecoding();
                if (phCurrent.readData(Path.Combine(rootFolder, filesList[i])))
                {
                    distances[i] = getDistance(main, decodeSequence(phCurrent).ToArray());
                }
                else
                {
                    // Unreadable file: rank it after everything else.
                    distances[i] = double.MaxValue;
                }

                if (i % 1000 == 0)
                {
                    Console.Write("words: {0} \r", i);
                }
            }
            Console.WriteLine("...................");

            Array.Sort(distances, filesList);

            // Never ask for more entries than exist, nor for a negative count.
            int resultCount = Math.Max(0, Math.Min(nHowMany, filesList.Length));
            string[] sortedFileList = new string[resultCount];
            Array.Copy(filesList, sortedFileList, resultCount);
            return sortedFileList;
        }

        /// <summary>
        /// Entry point. Optional arguments override the built-in defaults:
        /// [0] reference file name, [1] root folder, [2] number of results.
        /// </summary>
        static void Main(string[] args)
        {
            string testFileName = "microcomput";
            string rootFolder = @"C:\APOLAR\UTILITIES\Qscreener\Qscreener\bin\Release\data\bin";
            int N = 20;

            if (args != null)
            {
                if (args.Length > 0)
                {
                    testFileName = args[0];
                }
                if (args.Length > 1)
                {
                    rootFolder = args[1];
                }
                if (args.Length > 2)
                {
                    int requested;
                    if (int.TryParse(args[2], out requested) && requested >= 0)
                    {
                        N = requested;
                    }
                }
            }

            string[] files = findClosestWords(rootFolder, testFileName, N);
            if (files == null)
            {
                Console.WriteLine("Error");
                return;
            }

            foreach (string file in files)
            {
                Console.WriteLine(file);
            }
        }
    }
}