using System; using System.Collections.Generic; using System.Linq; using System.Text; using System.IO; using System.Collections; namespace Cluster { class HACBuilder { private class Row { public int nClusterIndex = -1; public List m_files = new List(0); public List m_data = new List(0); public List m_pos = new List(0); public double norm = 0.0; } private List m_rows = new List(0); private int[] m_nTabooList = { 0, -1, 0, 0, 0, -1, -1, 0, -1, 0, -1, 0, -1, -1, 0, -1, -1, 1, -1, -1, 1, 1, 1, -1, -1, 1, 1, -1, -1, -1, 1, 1, -1, 2, 2, -1, -1, 2, -1, 2, -1, -1, 2, 2, 2, 2, -1, -1, -1, 3, -1, 3, -1, -1, 3, -1, 3, 3, -1, 3, 3, -1, 3, -1, -1, -1, 4, 4, 4, -1, -1, 4, 4, -1, 4, -1, 4, -1, 4, -1, -1, 5, 5, -1, -1, -1, 5, -1, -1, 5, 5, -1, 5, 5, 5, -1, -1, 6, -1, -1, 6, 6, 6, 6, -1, 6, -1, -1, 6, -1, 6, -1, 7, 7, 7, -1, 7, -1, -1, -1, 7, -1, -1, 7, -1, 7, 7, -1}; private float getCosine(Row r1, Row r2) { double product = 0.0f; int index1 = 0; int index2 = 0; while (true) { if (r1.m_pos[index1] == r2.m_pos[index2]) { product += r1.m_data[index1] * r2.m_data[index2]; ++index1; ++index2; } else if (r1.m_pos[index1] > r2.m_pos[index2]) { ++index2; } else if (r1.m_pos[index1] < r2.m_pos[index2]) { ++index1; } if (index1 >= r1.m_pos.Count) break; if (index2 >= r2.m_pos.Count) break; } product /= r1.norm; product /= r2.norm; double min = r1.m_files.Count; if (min > r2.m_files.Count) min = r2.m_files.Count; product /= min; return (float)(product); } private bool checkTabooList(int clusterI, int clusterJ) { if (m_rows[clusterI].nClusterIndex < 0) return true; if (m_rows[clusterJ].nClusterIndex < 0) return true; if (m_rows[clusterI].nClusterIndex == m_rows[clusterJ].nClusterIndex) return true; return false; } private void getMaxElement(ref int nRow, ref int nCol, ref float fMax, float[,] fMatrix, int nSize) { nRow = 0; nCol = 0; fMax = fMatrix[nRow, nCol]; for (int i = 0; i < nSize; ++i) { for (int j = i + 1; j < nSize; ++j) { if (fMax < fMatrix[i, j] && checkTabooList(i,j) == true) { fMax = fMatrix[i, j]; nRow = i; nCol = j; } } } } private Row mergeTwoRows(Row r1, Row r2) { Row merged = new Row(); for (int i = 0; i < r1.m_files.Count; ++i) { merged.m_files.Add(r1.m_files[i]); } for (int i = 0; i < r2.m_files.Count; ++i) { merged.m_files.Add(r2.m_files[i]); } int index1 = 0; int index2 = 0; while (true) { if (r1.m_pos[index1] == r2.m_pos[index2]) { merged.m_pos.Add(r1.m_pos[index1]); int sum = r1.m_data[index1]; sum += r2.m_data[index2]; merged.m_data.Add(sum); ++index1; ++index2; } else if (r1.m_pos[index1] > r2.m_pos[index2]) { merged.m_pos.Add(r2.m_pos[index2]); merged.m_data.Add(r2.m_data[index2]); ++index2; } else if (r1.m_pos[index1] < r2.m_pos[index2]) { merged.m_pos.Add(r1.m_pos[index1]); merged.m_data.Add(r1.m_data[index1]); ++index1; } if (index1 >= r1.m_pos.Count) break; if (index2 >= r2.m_pos.Count) break; } for (int i = index1; i < r1.m_pos.Count; ++i) { merged.m_pos.Add(r1.m_pos[i]); merged.m_data.Add(r1.m_data[i]); } for (int i = index2; i < r2.m_pos.Count; ++i) { merged.m_pos.Add(r2.m_pos[i]); merged.m_data.Add(r2.m_data[i]); } merged.norm = 0.0; foreach (short s in merged.m_data) { merged.norm += s * s; } merged.norm = Math.Sqrt(merged.norm); if (r1.nClusterIndex >= 0 && r2.nClusterIndex >= 0) { if (r1.nClusterIndex != r2.nClusterIndex) { Console.WriteLine("Error: merging two knowingly different clusters {0} {1}", r1.nClusterIndex, r2.nClusterIndex); Environment.Exit(0); } } if (r1.nClusterIndex >= 0) merged.nClusterIndex = r1.nClusterIndex; if (r2.nClusterIndex >= 0) merged.nClusterIndex = r2.nClusterIndex; return merged; } public bool executeHAC(string fileName, int nCategories) { if (!File.Exists(fileName)) { return false; } FileInfo fi = new FileInfo(fileName); long nBytes = fi.Length; int nNonZeros = (int)(nBytes / 10); ArrayList list = new ArrayList(); list.Clear(); //read data int nR = 0; int nC = 0; short sum = 0; int current_row = 0; m_rows.Add(new Row()); m_rows[current_row].m_files.Add(current_row); if (m_rows[current_row].nClusterIndex < 0) { m_rows[current_row].nClusterIndex = m_nTabooList[current_row]; } using (FileStream stream = new FileStream(fileName, FileMode.Open)) { using (BinaryReader reader = new BinaryReader(stream)) { for (long i = 0; i < nNonZeros; ++i) { int nRnew = reader.ReadInt32(); nC = reader.ReadInt32(); sum = reader.ReadInt16(); if (nRnew != nR) { m_rows.Add(new Row()); ++current_row; m_rows[current_row].m_files.Add(current_row); if (m_rows[current_row].nClusterIndex < 0) { m_rows[current_row].nClusterIndex = m_nTabooList[current_row]; } } nR = nRnew; m_rows[current_row].m_pos.Add(nC); m_rows[current_row].m_data.Add(sum); } reader.Close(); } stream.Close(); } //end //compute norms foreach (Row row in m_rows) { row.norm = 0.0; foreach (short s in row.m_data) { row.norm += s * s; } row.norm = Math.Sqrt(row.norm); } //end Console.WriteLine("Start HAC procedure...\n"); int NN = m_rows.Count - nCategories; int nMatrixSize = m_rows.Count; float[,] fMatrix = new float[nMatrixSize, nMatrixSize]; for (int i = 0; i < nMatrixSize; ++i) { for (int j = i + 1; j < nMatrixSize; ++j) { fMatrix[i, j] = getCosine(m_rows[i], m_rows[j]); } } for (int loop = 0; loop < NN; ++loop) { nMatrixSize = m_rows.Count; int nrow = 0; int ncol = 0; float fMax = 0.0f; getMaxElement(ref nrow, ref ncol, ref fMax, fMatrix, nMatrixSize); Row r1 = m_rows[nrow]; Row r2 = m_rows[ncol]; Row merged = mergeTwoRows(r1, r2); m_rows.Remove(r1); m_rows.Remove(r2); //reassign repeated matrix elements int curRow = 0; int curCol = 0; for (int i = 0; i < nMatrixSize; ++i) { if (i != nrow && i != ncol) { curCol = curRow + 1; for (int j = i + 1; j < nMatrixSize; ++j) { if (j != nrow && j != ncol) { fMatrix[curRow, curCol] = fMatrix[i, j]; ++curCol; } } ++curRow; } } m_rows.Add(merged); //recompute new matrix elements curRow = 0; for (int i = 0; i < nMatrixSize; ++i) { if (i != nrow && i != ncol) { fMatrix[curRow, nMatrixSize - 2] = getCosine(m_rows[curRow], m_rows[nMatrixSize - 2]); ++curRow; } } Console.Write("Step {0} \r", loop); } Console.WriteLine(); Console.WriteLine("\nList of categories\n"); int cnt = 1; foreach (Row row in m_rows) { Console.WriteLine("Category {0}\n", cnt); foreach (int n in row.m_files) { Console.Write("{0,4}", n); } Console.WriteLine("\n"); ++cnt; } return true; } //this function works only when number of files in //each category is equal and if in the list of processed //files categories follow each other sequentially public double computeF_measure(int nCat, int nFiles) { int[,] nAccuracy = new int[nCat, nCat]; int cnt = 1; foreach (Row row in m_rows) { for (int k = 0; k < nCat; ++k) { nAccuracy[cnt - 1, k] = 0; } foreach (int n in row.m_files) { int nLower = 0; int nUpper = nFiles; for (int k = 0; k < nCat; ++k) { if (n < nUpper && n >= nLower) ++nAccuracy[cnt - 1, k]; nLower += nFiles; nUpper += nFiles; } } ++cnt; } for (int i = 0; i < nCat; ++i) { for (int j = 0; j < nCat; ++j) { Console.Write(" {0,4} ", nAccuracy[i, j]); } Console.WriteLine(); } Console.WriteLine(); int[] sumInRows = new int[nCat]; int[] sumInCols = new int[nCat]; for (int i = 0; i < nCat; ++i) { sumInRows[i] = 0; sumInCols[i] = 0; for (int j = 0; j < nCat; ++j) { sumInRows[i] += nAccuracy[i, j]; sumInCols[i] += nAccuracy[j, i]; } } double[,] f_measure = new double[nCat, nCat]; for (int i = 0; i < nCat; ++i) { for (int j = 0; j < nCat; ++j) { if (nAccuracy[i, j] > 0) { double f = (double)nAccuracy[i, j]; double prec = f / (double)(sumInRows[i]); double recall = f / (double)(sumInCols[j]); f_measure[i, j] = 2.0 * prec * recall / (prec + recall); } else { f_measure[i, j] = 0.0; } } } double total = 0.0; for (int loop = 0; loop < nCat; ++loop) { int RR = 0; int CC = 0; double max = f_measure[RR, CC]; for (int i = 0; i < nCat; ++i) { for (int j = 0; j < nCat; ++j) { if (f_measure[i, j] > max) { max = f_measure[i, j]; RR = i; CC = j; } } } total += max; for (int i = 0; i < nCat; ++i) { f_measure[RR, i] = 0.0; } for (int j = 0; j < nCat; ++j) { f_measure[j, CC] = 0.0; } } return total / (double)(nCat); } } class Program { static void Main(string[] args) { const int nCategories = 8; const int nFilesPerCat = 16; string datafile = @"C:\APOLAR\UTILITIES\Qlango\Qlango\bin\Release\data\DocWordMatrix.dat"; HACBuilder hacBuilder = new HACBuilder(); if (!hacBuilder.executeHAC(datafile, nCategories)) { Console.WriteLine("Failed to execute HAC"); return; } else { Console.WriteLine("HAC is OK"); } Console.WriteLine("Accuracy {0}", hacBuilder.computeF_measure(nCategories, nFilesPerCat)); } } }