/* PLSI: Probabilistic Latent Semantic Indexing Tool $Id: plsi.cpp,v 1.9 2003/03/27 08:23:57 taku-ku Exp $; Copyright (C) 2002 Taku Kudo All rights reserved. This is free software with ABSOLUTELY NO WARRANTY. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA */ #include "plsi.h" #include "feature_node.h" #include "common.h" #include "config.h" #include #include #include #include #include #include #include #include #include #define RAND ((double)rand()/(double)RAND_MAX) #define LOG_E_2 0.69314718055994530942 #define log2(x) (std::log((double)(x))/LOG_E_2) namespace PLSI { void throw_error (const char *filename, const char *message) { std::strstream ss; ss << message << " [" << filename << "]"; throw std::runtime_error (ss.str()); } void suffix_open (std::ofstream &os, const char *filename, const char *suffix, unsigned int p) { std::string fname = filename; fname += "."; fname += suffix; os.setf(std::ios::fixed,std::ios::floatfield); os.precision(p); os.open (fname.c_str()); std::cout << fname << " "; if (! os) throw_error (fname.c_str(), "suffix_open(): Cannot open"); } void suffix_open (std::ifstream &is, const char *filename, const char *suffix) { std::string fname = filename; fname += "."; fname += suffix; is.open (fname.c_str()); if (! is) throw_error (fname.c_str(), "suffix_open(): Cannot open"); } void write_vector (const char* filename, const char* suffix, unsigned int p, unsigned int r, unsigned int z, double **d, bool inverse) { std::ofstream os; suffix_open (os, filename, suffix, p); if (inverse) { for (unsigned int j = 0; j < z; ++j) { for (unsigned int i = 0; i < r; ++i) { if (i != 0) os << ' '; os << d[i][j]; } os << std::endl; } } else { for (unsigned int i = 0; i < r; ++i) { for (unsigned int j = 0; j < z; ++j) { if (j != 0) os << ' '; os << d[i][j]; } os << std::endl; } } os.close (); } void write_vector (const char* filename, const char* suffix, unsigned int p, unsigned int z, double *d) { std::ofstream os; suffix_open (os, filename, suffix, p); for (unsigned int j = 0; j < z; ++j) { if (j != 0) os << ' '; os << d[j]; } os << std::endl; os.close (); } void read_vector (const char *filename, const char *suffix, unsigned int r, unsigned int z, double **d) { std::ifstream is; suffix_open (is, filename, suffix); std::string line; unsigned int i = 0; while (i < z && std::getline (is, line)) { std::istrstream istrs ((char *)line.c_str()); unsigned int j = 0; while (j < r && istrs >> d[++j][i]); if (j != r) throw_error (filename, "read_vector(): # colum is invalid"); ++i; } is.close(); if (i != z) throw_error (filename, "read_vector(): # row is invalid"); } void read_vector (const char *filename, const char *suffix, unsigned int r, double *d) { std::ifstream is; suffix_open (is, filename, suffix); std::string line; unsigned int j = 0; if (! std::getline (is, line)) throw_error (filename, "read_vector(): # row is invalid"); std::istrstream istrs ((char *)line.c_str()); while (j < r && istrs >> d[++j]); if (j != r) throw_error (filename, "read_vector(): # colum is invalid"); is.close(); return; } double PLSI::gain () { double gain = 0; double sum = 0.0; for (unsigned int id = 0; id < D; ++id) { feature_node *f = matrix.getFeatureNode(id); for (unsigned int j = 0; f[j].index != -1; ++j) { if ((sum = pdw (id, f[j].index)) != 0) gain -= (f[j].value/R * log2(sum)); } } return gain/log2(D * W); } double PLSI::init (const char *filename) { try { clear (); D = matrix.getRowNum(); W = matrix.getColNum(); if (D == 0) throw std::runtime_error ("Number of documents is 0"); if (W == 0) throw std::runtime_error ("Number of words is 0"); new_pdz = new double * [D]; old_pdz = new double * [D]; for (unsigned int i = 0; i < D ; ++i) { new_pdz[i] = new double [Z]; old_pdz[i] = new double [Z]; } new_pwz = new double * [W]; old_pwz = new double * [W]; for (unsigned i = 0; i < W; ++i) { new_pwz[i] = new double [Z]; old_pwz[i] = new double [Z]; } new_pz = new double [Z]; old_pz = new double [Z]; double sum2 = 0.0; if (! filename) { for (unsigned int iz = 0; iz < Z; ++iz) { double sum = 0; for (unsigned int id = 0; id < D ; ++id) sum += (old_pdz[id][iz] = RAND), new_pdz[id][iz] = 0.0; for (unsigned int id = 0; id < D; ++id) old_pdz[id][iz] /= sum; sum = 0; for (unsigned int iw= 0; iw < W ; ++iw) sum += (old_pwz[iw][iz] = RAND), new_pwz[iw][iz] = 0.0; for (unsigned int iw = 0; iw < W; ++iw) old_pwz[iw][iz] /= sum; sum2 += (old_pz[iz] = RAND), new_pz[iz] = 0.0; } for (unsigned int iz = 0; iz < Z; ++iz) old_pz[iz] /= sum2; } else { read_vector (filename, PDZ_SUFFIX, D, Z, old_pdz); read_vector (filename, PWZ_SUFFIX, W, Z, old_pwz); read_vector (filename, PZ_SUFFIX, Z, old_pz); } for (unsigned int id = 0; id < D; ++id) { feature_node *f = matrix.getFeatureNode(id); for (unsigned int j = 0; f[j].index != -1; ++j) R += f[j].value; } double target_gain = 0; for (unsigned int id = 0; id < D; ++id) { feature_node *f = matrix.getFeatureNode(id); for (unsigned int j = 0; f[j].index != -1; ++j) target_gain -= (f[j].value/R * log2(f[j].value/R)); } target_gain /= log2 (D * W); return target_gain; } catch (std::exception &e) { clear (); throw std::runtime_error (e.what ()); return false; } } bool PLSI::learn (unsigned int _Z, double _beta, double _eps, unsigned int _max_iter, const char *resume) { try { std::cout.setf(std::ios::fixed,std::ios::floatfield); std::cout.precision(10); Z = _Z; beta = _beta; double eps = _eps; unsigned int max_iter = _max_iter; if (Z <= 1) throw std::runtime_error ("Z is invalid."); if (beta <= 0.0) throw std::runtime_error ("beta is invalid."); if (eps <= 0.0) throw std::runtime_error ("eps is invalid."); if (max_iter <= 0) throw std::runtime_error ("max_iter is invalid."); double target_gain = init (resume); double current_gain; double old_gain = gain (); unsigned int iter = 0; std::cout << COPYRIGHT << std::endl; std::cout << "Number of documents: " << D << std::endl; std::cout << "Number of words: " << W << std::endl; std::cout << "Number of clusters: " << Z << std::endl; std::cout << "Target gain: " << target_gain << std::endl; std::cout << "eps: " << eps << std::endl; std::cout << "beta: " << beta << std::endl; if (resume) std::cout << "resume: " << resume << std::endl; std::cout << "\nStarting EM iterations ..." << std::endl; while (++iter) { for (unsigned int id =0; id < D; ++id) { if (id % 100 == 0) std::cout << "." << std::flush; feature_node *f = matrix.getFeatureNode(id); for (unsigned int j = 0; f[j].index != -1; ++j) { double sum = pzdw_norm (id, f[j].index); for (unsigned int iz = 0; iz < Z; ++iz) { double score = (f[j].value * pzdw (iz, id, f[j].index, sum)); new_pz[iz] += score; new_pwz[f[j].index][iz] += score; new_pdz[id][iz] += score; } } } for (unsigned int iz = 0; iz < Z; ++iz) { for (unsigned int id = 0; id < D; ++id) old_pdz[id][iz] = new_pdz[id][iz]/new_pz[iz], new_pdz[id][iz] = 0.0; for (unsigned int iw = 0; iw < W; ++iw) old_pwz[iw][iz] = new_pwz[iw][iz]/new_pz[iz], new_pwz[iw][iz] = 0.0; old_pz[iz] = new_pz[iz]/(double)R; new_pz[iz] = 0.0; } current_gain = gain (); double e = fabs (current_gain - old_gain); std::cout << " " << iter << " " << old_gain << " " << current_gain << " " << e << std::endl; if (iter >= max_iter || e <= eps) break; old_gain = current_gain; } std::cout << "Done!" << std::endl; return true; } catch (std::exception &e) { clear (); throw std::runtime_error (e.what()); return false; } } void PLSI::clear () { for (unsigned int i = 0; i < D; ++i) { delete [] new_pdz[i]; delete [] old_pdz[i]; } delete [] new_pdz; new_pdz = 0; delete [] old_pdz; old_pdz = 0; for (unsigned int i = 0; i < W; ++i) { delete [] new_pwz[i]; delete [] old_pwz[i]; } delete [] new_pwz; new_pwz = 0; delete [] old_pwz; old_pwz = 0; delete [] new_pz; new_pz = 0; delete [] old_pz; old_pz = 0; D = W = 0; R = 0.0; } bool PLSI::read (const char *filename) { return matrix.read (filename); } bool PLSI::write (const char *filename, unsigned int prec) { double *pd = new double [D]; for (unsigned int id = 0; id < D; ++id) { double sum = 0; for (unsigned int iz = 0; iz < Z; ++iz) sum += (old_pdz[id][iz] * old_pz[iz]); for (unsigned int iz = 0; iz < Z; ++iz) new_pdz[id][iz] = (sum == 0 ? 0.0 : (old_pdz[id][iz] * old_pz[iz])/sum); pd[id] = sum; } double *pw = new double [W]; for (unsigned int iw = 0; iw < W; ++iw) { double sum = 0; for (unsigned int iz = 0; iz < Z; ++iz) sum += (old_pwz[iw][iz] * old_pz[iz]); for (unsigned int iz = 0; iz < Z; ++iz) new_pwz[iw][iz] = (sum == 0 ? 0.0 : (old_pwz[iw][iz] * old_pz[iz])/sum); pw[iw] = sum; } write_vector (filename, PDZ_SUFFIX, prec, D, Z, old_pdz, true); write_vector (filename, PWZ_SUFFIX, prec, W, Z, old_pwz, true); write_vector (filename, PZD_SUFFIX, prec, D, Z, new_pdz, false); write_vector (filename, PZW_SUFFIX, prec, W, Z, new_pwz, false); write_vector (filename, PZ_SUFFIX, prec, Z, old_pz); write_vector (filename, PD_SUFFIX, prec, D, pd); write_vector (filename, PW_SUFFIX, prec, W, pw); std::cout << std::endl; delete [] pw; delete [] pd; return true; } }