// The code concept is taken from:
// http://code.google.com/p/mltool4j/source/browse/trunk/src/edu/thu/mltool4j/topicmodel/plsa
// Converted to C# by Andrew Polar for educational purposes.
//
// An important logical bug in the original code, which affected accuracy, is fixed here. In the
// original code the Pz[z] values are always equal. According to the theoretical concept they
// should be computed and are not necessarily equal. However, the computation must start with
// equal Pz[z] values, run several steps, and only then switch to unequal Pz[z]. This solution
// uses equal Pz[z] until the result converges and then switches to unequal Pz[z] until the next
// stable result is obtained.
//
// The algorithm is also adjusted for more convenient reading of the documents. While processing
// the documents, the words are counted as shown in the matrix below: each row is a document,
// not the other way around. During computation the data is fed row by row, in the same order in
// which it is recorded.

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace PLSA
{
    class ProbabilisticLSA
    {
        // Word counts n(d, w): each row is a document, each column a word.
        private int[,] data = new int[,]
        {
            {9, 2, 1, 0, 0, 0},
            {8, 3, 2, 1, 0, 0},
            {0, 0, 3, 3, 4, 8},
            {0, 2, 0, 2, 4, 7},
            {2, 0, 1, 1, 0, 3}
        };

        private int nDocs = -1;   // number of documents, or rows
        private int nWords = -1;  // number of words, or columns
        private int nTopics = -1; // number of topics

        // Sparse representation of one nonzero matrix cell: a word index and its count.
        private class PositionsAndWords
        {
            public int position;
            public int nWords;
            public PositionsAndWords(int pos, int nW)
            {
                position = pos;
                nWords = nW;
            }
        }

        private List<PositionsAndWords>[] row = null;

        public Boolean doPLSA(int ntopics, double accuracy)
        {
            nDocs = data.GetLength(0);
            nWords = data.GetLength(1);
            nTopics = ntopics;
            InitializeDataStructure();
            return EM(accuracy);
        }

        private void InitializeDataStructure()
        {
            row = new List<PositionsAndWords>[nDocs];
            for (int d = 0; d < nDocs; ++d)
            {
                row[d] = new List<PositionsAndWords>();
                for (int w = 0; w < nWords; ++w)
                {
                    if (data[d, w] > 0)
                    {
                        row[d].Add(new PositionsAndWords(w, data[d, w]));
                    }
                }
            }
        }

        private Boolean EM(double accuracy)
        {
            double[] Pz = new double[nTopics];                // P(z)
            double[,] Pz_d = new double[nTopics, nDocs];      // P(d|z)
            double[,] Pz_w = new double[nTopics, nWords];     // P(w|z)
            double[,][] Pz_dw = new double[nTopics, nDocs][]; // P(z|d,w), nonzero cells only

            // run EM algorithm
            init(Pz, Pz_d, Pz_w, Pz_dw);
            double Likelihood = calcLoglikelihood(Pz, Pz_d, Pz_w);
            Boolean isEqualPz = true;
            Boolean isOccurredOnce = false;
            while (true)
            {
                // E-step
                if (!Estep(Pz, Pz_d, Pz_w, Pz_dw))
                {
                    Console.WriteLine("E-step failed");
                    return false;
                }
                // M-step
                if (!Mstep(Pz, Pz_d, Pz_w, Pz_dw, isEqualPz))
                {
                    Console.WriteLine("M-step failed");
                    return false;
                }
                double newLikelihood = calcLoglikelihood(Pz, Pz_d, Pz_w);
                Console.WriteLine("Likelihood: " + newLikelihood);
                // Converged when the relative change in log-likelihood drops below the threshold.
                if (Math.Abs(Likelihood / newLikelihood - 1.0) < accuracy)
                {
                    if (!isOccurredOnce)
                    {
                        // First convergence: switch from equal to computed Pz and keep iterating.
                        isOccurredOnce = true;
                        isEqualPz = false;
                        Console.WriteLine("Switch to unequal Pz");
                    }
                    else
                    {
                        break; // second convergence: done
                    }
                }
                Likelihood = newLikelihood;
            }
            Console.WriteLine();
            Normalize(Pz_d, Pz_w);
            Likelihood = calcLoglikelihood(Pz, Pz_d, Pz_w);
            Console.WriteLine("Final likelihood after renormalization: {0}\n", Likelihood);
            PrintResult(Pz_d, Pz_w, Pz);
            return true;
        }

        private Boolean init(double[] Pz, double[,] Pz_d, double[,] Pz_w, double[,][] Pz_dw)
        {
            Random rnd = new Random();
            // Start with equal topic priors.
            double zvalue = 1.0 / (double)nTopics;
            for (int z = 0; z < nTopics; ++z)
            {
                Pz[z] = zvalue;
            }
            // Random normalized P(d|z).
            for (int z = 0; z < nTopics; ++z)
            {
                double norm = 0.0;
                for (int d = 0; d < nDocs; ++d)
                {
                    Pz_d[z, d] = (double)(rnd.Next(100)) / 100.0;
                    norm += Pz_d[z, d];
                }
                for (int d = 0; d < nDocs; ++d)
                {
                    Pz_d[z, d] /= norm;
                }
            }
            // Random normalized P(w|z).
            for (int z = 0; z < nTopics; ++z)
            {
                double norm = 0.0;
                for (int w = 0; w < nWords; ++w)
                {
                    Pz_w[z, w] = (double)(rnd.Next(100)) / 100.0;
                    norm += Pz_w[z, w];
                }
                for (int w = 0; w < nWords; ++w)
                {
                    Pz_w[z, w] /= norm;
                }
            }
            // One posterior slot per nonzero cell of each document.
            for (int z = 0; z < nTopics; ++z)
            {
                for (int d = 0; d < nDocs; ++d)
                {
                    Pz_dw[z, d] = new double[row[d].Count];
                }
            }
            return true;
        }
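        // For reference, the E-step below computes the standard PLSA posterior of a topic
        // given a (document, word) pair:
        //
        //   P(z|d,w) = P(z) * P(d|z) * P(w|z) / sum over z' of P(z') * P(d|z') * P(w|z')
        //
        // Only cells with n(d,w) > 0 are stored, so Pz_dw[z, d] holds one posterior entry per
        // nonzero word position of document d, indexed by 'counter'.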
        private Boolean Estep(double[] Pz, double[,] Pz_d, double[,] Pz_w, double[,][] Pz_dw)
        {
            for (int d = 0; d < nDocs; ++d)
            {
                int counter = 0;
                foreach (PositionsAndWords pw in row[d])
                {
                    int w = pw.position;
                    double norm = 0.0;
                    for (int z = 0; z < nTopics; ++z)
                    {
                        Pz_dw[z, d][counter] = Pz[z] * Pz_d[z, d] * Pz_w[z, w];
                        norm += Pz_dw[z, d][counter];
                    }
                    for (int z = 0; z < nTopics; ++z)
                    {
                        Pz_dw[z, d][counter] /= norm;
                    }
                    ++counter;
                }
            }
            return true;
        }

        // M-step: re-estimate the parameters from the posteriors,
        //   P(w|z) proportional to the sum over d of n(d,w) * P(z|d,w)
        //   P(d|z) proportional to the sum over w of n(d,w) * P(z|d,w)
        //   P(z)   proportional to the sum over d and w of n(d,w) * P(z|d,w)
        private Boolean Mstep(double[] Pz, double[,] Pz_d, double[,] Pz_w, double[,][] Pz_dw, Boolean isEqualPz)
        {
            // P(w|z)
            for (int z = 0; z < nTopics; ++z)
            {
                for (int w = 0; w < nWords; ++w)
                {
                    Pz_w[z, w] = 0.0;
                }
            }
            for (int z = 0; z < nTopics; ++z)
            {
                for (int d = 0; d < nDocs; ++d)
                {
                    int counter = 0;
                    foreach (PositionsAndWords pw in row[d])
                    {
                        Pz_w[z, pw.position] += pw.nWords * Pz_dw[z, d][counter];
                        ++counter;
                    }
                }
            }
            // Normalization must be done this way: over words for each topic.
            for (int z = 0; z < nTopics; ++z)
            {
                double norm = 0.0;
                for (int w = 0; w < nWords; ++w)
                {
                    norm += Pz_w[z, w];
                }
                for (int w = 0; w < nWords; ++w)
                {
                    Pz_w[z, w] /= norm;
                }
            }

            // P(d|z), accumulating the unnormalized P(z) along the way.
            for (int z = 0; z < nTopics; ++z)
            {
                Pz[z] = 0.0;
                for (int d = 0; d < nDocs; ++d)
                {
                    Pz_d[z, d] = 0.0;
                    int counter = 0;
                    foreach (PositionsAndWords pw in row[d])
                    {
                        Pz_d[z, d] += pw.nWords * Pz_dw[z, d][counter];
                        ++counter;
                    }
                    Pz[z] += Pz_d[z, d];
                }
            }
            // Normalization must also be done this way: over documents for each topic.
            for (int z = 0; z < nTopics; ++z)
            {
                double norm = 0.0;
                for (int d = 0; d < nDocs; ++d)
                {
                    norm += Pz_d[z, d];
                }
                for (int d = 0; d < nDocs; ++d)
                {
                    Pz_d[z, d] /= norm;
                }
            }

            // We apply equal Pz until the result converges, then continue with unequal Pz.
            if (isEqualPz)
            {
                for (int z = 0; z < nTopics; ++z)
                {
                    Pz[z] = 1.0 / (double)(nTopics);
                }
            }
            else
            {
                double norm = 0.0;
                for (int z = 0; z < nTopics; ++z)
                {
                    norm += Pz[z];
                }
                for (int z = 0; z < nTopics; ++z)
                {
                    Pz[z] /= norm;
                }
            }
            return true;
        }
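        // For reference, the quantity monitored for convergence is the log-likelihood
        //
        //   L = sum over d,w of n(d,w) * log( sum over z of P(z) * P(d|z) * P(w|z) )
        //
        // calcLoglikelihood uses Math.Log10; the base only scales L by a constant factor, so
        // the relative-change convergence test in EM is unaffected.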
        private double calcLoglikelihood(double[] Pz, double[,] Pz_d, double[,] Pz_w)
        {
            double L = 0.0;
            for (int d = 0; d < nDocs; ++d)
            {
                foreach (PositionsAndWords pw in row[d])
                {
                    double sum = 0.0;
                    for (int z = 0; z < nTopics; ++z)
                    {
                        sum += Pz[z] * Pz_d[z, d] * Pz_w[z, pw.position];
                    }
                    L += pw.nWords * Math.Log10(sum);
                }
            }
            return L;
        }

        // Renormalize for presentation: Pz_d sums to 1 over topics for each document, and
        // Pz_w sums to 1 over topics for each word.
        void Normalize(double[,] Pz_d, double[,] Pz_w)
        {
            for (int d = 0; d < nDocs; ++d)
            {
                double norm = 0.0;
                for (int z = 0; z < nTopics; ++z)
                {
                    norm += Pz_d[z, d];
                }
                if (norm <= 0.0) norm = 1.0;
                for (int z = 0; z < nTopics; ++z)
                {
                    Pz_d[z, d] /= norm;
                }
            }
            for (int w = 0; w < nWords; ++w)
            {
                double norm = 0.0;
                for (int z = 0; z < nTopics; ++z)
                {
                    norm += Pz_w[z, w];
                }
                if (norm <= 0.0) norm = 1.0;
                for (int z = 0; z < nTopics; ++z)
                {
                    Pz_w[z, w] /= norm;
                }
            }
        }

        private void PrintResult(double[,] Pz_d, double[,] Pz_w, double[] Pz)
        {
            Console.WriteLine("Documents in rows, topics in columns");
            for (int d = 0; d < nDocs; ++d)
            {
                for (int z = 0; z < nTopics; ++z)
                {
                    Console.Write(" {0:#0.0000} ", Pz_d[z, d]);
                }
                Console.WriteLine();
            }
            Console.WriteLine();
            Console.WriteLine("Topics in rows, words in columns");
            for (int z = 0; z < nTopics; ++z)
            {
                for (int w = 0; w < nWords; ++w)
                {
                    Console.Write(" {0:#0.0000} ", Pz_w[z, w]);
                }
                Console.WriteLine();
            }
            Console.WriteLine();
            Console.WriteLine("Topics");
            for (int z = 0; z < nTopics; ++z)
            {
                Console.Write(" {0:#0.0000} ", Pz[z]);
            }
            Console.WriteLine('\n');
        }
    }

    class Program
    {
        static void Main(string[] args)
        {
            ProbabilisticLSA plsa = new ProbabilisticLSA();
            // Two topics; iterate until the relative likelihood change is below 1e-7.
            if (!plsa.doPLSA(2, 0.0000001))
            {
                Console.WriteLine("PLSA failed");
            }
            else
            {
                Console.WriteLine("PLSA completed");
            }
        }
    }
}