//Andrew Polar, semanticsearchart.com, Mar. 20, 2012. using System; using System.Collections.Generic; using System.Linq; using System.Text; using System.IO; using System.Collections; namespace Cluster { class Row { public int nClusterIndex = -1; public List m_files = new List(0); public List m_data = new List(0); public List m_pos = new List(0); public double norm = 0.0; } class HACBuilder { public List readData(string datafile, int[] files, int[] tabooList) { List rows = new List(); FileInfo fi = new FileInfo(datafile); long nBytes = fi.Length; int nNonZeros = (int)(nBytes / 10); int nRold = -1; int current_row = -1; bool isStarted = false; using (FileStream stream = new FileStream(datafile, FileMode.Open)) { using (BinaryReader reader = new BinaryReader(stream)) { for (long i = 0; i < nNonZeros; ++i) { int nR = reader.ReadInt32(); int nC = reader.ReadInt32(); short sum = reader.ReadInt16(); if (nR != nRold) { bool isFound = false; int index = 0; foreach (int k in files) { if (k == nR) { isFound = true; break; } ++index; } if (isFound) { rows.Add(new Row()); ++current_row; rows[current_row].m_files.Add(nR); if (rows[current_row].nClusterIndex < 0) { rows[current_row].nClusterIndex = tabooList[index]; } isStarted = true; } else { isStarted = false; } nRold = nR; } else { if (isStarted) { rows[current_row].m_pos.Add(nC); rows[current_row].m_data.Add(sum); } } } reader.Close(); } stream.Close(); } //compute norms foreach (Row row in rows) { row.norm = 0.0; foreach (short s in row.m_data) { row.norm += s * s; } row.norm = Math.Sqrt(row.norm); } //end return rows; } private float getCosine(Row r1, Row r2) { double product = 0.0f; int index1 = 0; int index2 = 0; while (true) { if (r1.m_pos[index1] == r2.m_pos[index2]) { product += r1.m_data[index1] * r2.m_data[index2]; ++index1; ++index2; } else if (r1.m_pos[index1] > r2.m_pos[index2]) { ++index2; } else if (r1.m_pos[index1] < r2.m_pos[index2]) { ++index1; } if (index1 >= r1.m_pos.Count) break; if (index2 >= r2.m_pos.Count) break; } product /= r1.norm; product /= r2.norm; double min = r1.m_files.Count; if (min > r2.m_files.Count) min = r2.m_files.Count; product /= min; return (float)(product); } private bool checkTabooList(List rows, int clusterI, int clusterJ) { if (rows[clusterI].nClusterIndex < 0) return true; if (rows[clusterJ].nClusterIndex < 0) return true; if (rows[clusterI].nClusterIndex == rows[clusterJ].nClusterIndex) return true; return false; } private void getMaxElement(List rows, ref int nRow, ref int nCol, ref float fMax, float[,] fMatrix, int nSize) { nRow = 0; nCol = 0; fMax = fMatrix[nRow, nCol]; for (int i = 0; i < nSize; ++i) { for (int j = i + 1; j < nSize; ++j) { if (fMax < fMatrix[i, j] && checkTabooList(rows, i, j) == true) { fMax = fMatrix[i, j]; nRow = i; nCol = j; } } } } private Row mergeTwoRows(Row r1, Row r2) { Row merged = new Row(); for (int i = 0; i < r1.m_files.Count; ++i) { merged.m_files.Add(r1.m_files[i]); } for (int i = 0; i < r2.m_files.Count; ++i) { merged.m_files.Add(r2.m_files[i]); } int index1 = 0; int index2 = 0; while (true) { if (r1.m_pos[index1] == r2.m_pos[index2]) { merged.m_pos.Add(r1.m_pos[index1]); int sum = r1.m_data[index1]; sum += r2.m_data[index2]; merged.m_data.Add(sum); ++index1; ++index2; } else if (r1.m_pos[index1] > r2.m_pos[index2]) { merged.m_pos.Add(r2.m_pos[index2]); merged.m_data.Add(r2.m_data[index2]); ++index2; } else if (r1.m_pos[index1] < r2.m_pos[index2]) { merged.m_pos.Add(r1.m_pos[index1]); merged.m_data.Add(r1.m_data[index1]); ++index1; } if (index1 >= r1.m_pos.Count) break; if (index2 >= r2.m_pos.Count) break; } for (int i = index1; i < r1.m_pos.Count; ++i) { merged.m_pos.Add(r1.m_pos[i]); merged.m_data.Add(r1.m_data[i]); } for (int i = index2; i < r2.m_pos.Count; ++i) { merged.m_pos.Add(r2.m_pos[i]); merged.m_data.Add(r2.m_data[i]); } merged.norm = 0.0; foreach (short s in merged.m_data) { merged.norm += s * s; } merged.norm = Math.Sqrt(merged.norm); if (r1.nClusterIndex >= 0 && r2.nClusterIndex >= 0) { if (r1.nClusterIndex != r2.nClusterIndex) { Console.WriteLine("Error: merging two knowingly different clusters {0} {1}", r1.nClusterIndex, r2.nClusterIndex); Environment.Exit(0); } } if (r1.nClusterIndex >= 0) merged.nClusterIndex = r1.nClusterIndex; if (r2.nClusterIndex >= 0) merged.nClusterIndex = r2.nClusterIndex; return merged; } public List executeHAC(List rows, int nCategories) { if (nCategories >= rows.Count) { Console.WriteLine("Number of requested categories is larger data size"); return rows; } Console.WriteLine("Start HAC procedure...\n"); int NN = rows.Count - nCategories; int nMatrixSize = rows.Count; float[,] fMatrix = new float[nMatrixSize, nMatrixSize]; for (int i = 0; i < nMatrixSize; ++i) { for (int j = i + 1; j < nMatrixSize; ++j) { fMatrix[i, j] = getCosine(rows[i], rows[j]); } } for (int loop = 0; loop < NN; ++loop) { nMatrixSize = rows.Count; int nrow = 0; int ncol = 0; float fMax = 0.0f; getMaxElement(rows, ref nrow, ref ncol, ref fMax, fMatrix, nMatrixSize); Row r1 = rows[nrow]; Row r2 = rows[ncol]; Row merged = mergeTwoRows(r1, r2); rows.Remove(r1); rows.Remove(r2); //reassign repeated matrix elements int curRow = 0; int curCol = 0; for (int i = 0; i < nMatrixSize; ++i) { if (i != nrow && i != ncol) { curCol = curRow + 1; for (int j = i + 1; j < nMatrixSize; ++j) { if (j != nrow && j != ncol) { fMatrix[curRow, curCol] = fMatrix[i, j]; ++curCol; } } ++curRow; } } rows.Add(merged); //recompute new matrix elements curRow = 0; for (int i = 0; i < nMatrixSize; ++i) { if (i != nrow && i != ncol) { fMatrix[curRow, nMatrixSize - 2] = getCosine(rows[curRow], rows[nMatrixSize - 2]); ++curRow; } } Console.Write("Step {0} \r", loop); } Console.WriteLine(); Console.WriteLine("\nList of categories\n"); int cnt = 1; foreach (Row row in rows) { Console.WriteLine("Category {0}\n", cnt); foreach (int n in row.m_files) { Console.Write("{0,5}", n); } Console.WriteLine("\n"); ++cnt; } return rows; } public double computeF_measure(List rows, int[] nExpected, int[] nFiles) { Dictionary dict = new Dictionary(); int cnt = 0; int nCategories = 0; int nMax = nExpected[0]; for (int i = 0; i < nExpected.Length; ++i) { if (!dict.ContainsKey(nExpected[i])) { dict.Add(nExpected[i], cnt); ++cnt; ++nCategories; } if (nExpected[i] > nMax) nMax = nExpected[i]; } int[] chart = new int[nMax + 1]; foreach (KeyValuePair pair in dict) { chart[pair.Key] = pair.Value; } for (int i = 0; i < nExpected.Length; ++i) { nExpected[i] = chart[nExpected[i]]; } int[,] nAccuracy = new int[nCategories, nCategories]; int nC = 0; foreach (Row row in rows) { for (int i = 0; i < nCategories; ++i) { nAccuracy[nC, i] = 0; } foreach (int nFile in row.m_files) { int index = 0; foreach (int n in nFiles) { if (n == nFile) break; ++index; } ++nAccuracy[nC, nExpected[index]]; } ++nC; } for (int i = 0; i < nCategories; ++i) { for (int j = 0; j < nCategories; ++j) { Console.Write(" {0,4} ", nAccuracy[i, j]); } Console.WriteLine(); } Console.WriteLine(); int[] sumInRows = new int[nCategories]; int[] sumInCols = new int[nCategories]; for (int i = 0; i < nCategories; ++i) { sumInRows[i] = 0; sumInCols[i] = 0; for (int j = 0; j < nCategories; ++j) { sumInRows[i] += nAccuracy[i, j]; sumInCols[i] += nAccuracy[j, i]; } } double[,] f_measure = new double[nCategories, nCategories]; for (int i = 0; i < nCategories; ++i) { for (int j = 0; j < nCategories; ++j) { if (nAccuracy[i, j] > 0) { double f = (double)nAccuracy[i, j]; double prec = f / (double)(sumInRows[i]); double recall = f / (double)(sumInCols[j]); f_measure[i, j] = 2.0 * prec * recall / (prec + recall); } else { f_measure[i, j] = 0.0; } } } double total = 0.0; for (int loop = 0; loop < nCategories; ++loop) { int RR = 0; int CC = 0; double max = f_measure[RR, CC]; for (int i = 0; i < nCategories; ++i) { for (int j = 0; j < nCategories; ++j) { if (f_measure[i, j] > max) { max = f_measure[i, j]; RR = i; CC = j; } } } total += max; for (int i = 0; i < nCategories; ++i) { f_measure[RR, i] = 0.0; } for (int j = 0; j < nCategories; ++j) { f_measure[j, CC] = 0.0; } } return total / (double)(nCategories); } } class Program { static void Main(string[] args) { string datafile = @"C:\APOLAR\UTILITIES\Qlango\Qlango\bin\Release\data\DocWordMatrix.dat"; int nCategories = 3; int[] nFiles = { 0, 72, 2, 3, 6, 7, 8, 9, 11, 10, 12, 13, 14, 15, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 64, 1, 66, 79, 68, 69, 70, 71, 65, 73, 74, 75, 76, 77, 78, 67}; int[] nTabooList = { -1, -1, 9, 9, -1, -1, 9, -1, -1, -1, -1, -1, -1, -1, 3, -1, -1, 3, 3, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, -1, -1, -1, -1, 5, -1, -1, -1, -1, -1, -1, -1}; int[] nExpected = { 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4}; HACBuilder hacBuilder = new HACBuilder(); List rows = hacBuilder.readData(datafile, nFiles, nTabooList); rows = hacBuilder.executeHAC(rows, nCategories); Console.WriteLine("Accuracy {0}", hacBuilder.computeF_measure(rows, nExpected, nFiles)); } } }