@@ -258,6 +258,11 @@ def _gibbs_sample_training(self):
 
         p_vector = beta_vector * theta_vector
         # print p_vector
+        """
+        For some special document m (one that contains only a single word), p_vector may be
+        all zeros here, and then sum(p_vector) will be zero too;
+        1.0 * p_vector / sum(p_vector) would be [...nan...], so we should avoid feeding in such a document.
+        """
         p_vector = 1.0 * p_vector / sum(p_vector)
         # print p_vector
         sample_z = LldaModel._multinomial_sample(p_vector)
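
To make the failure mode concrete, here is a minimal standalone sketch (toy vector, not the model's real `beta_vector`/`theta_vector`); the `safe_normalize` fallback is a hypothetical alternative guard, not what this commit does:

```python
import numpy as np

# An all-zero p_vector reproduces the NaN case described in the new docstring:
# 0/0 division yields NaN for every entry (NumPy emits a RuntimeWarning).
p_vector = np.zeros(4)
print(1.0 * p_vector / sum(p_vector))  # [nan nan nan nan]

# Hypothetical guard: fall back to a uniform distribution when the
# unnormalized probabilities sum to zero, instead of skipping the document.
def safe_normalize(p_vector):
    total = np.sum(p_vector)
    if total == 0:
        return np.ones_like(p_vector) / len(p_vector)
    return 1.0 * p_vector / total
```
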
@@ -513,6 +518,8 @@ def theta_m(self, m):
     @property
     def beta(self):
         """
+        The name "beta" comes from
+        "Labeled LDA: A supervised topic model for credit attribution in multi-labeled corpora" (Daniel Ramage et al.)
         topic-term distribution
         beta[k, t] is the probability of term t (word) being generated from topic k
         :return: a matrix, shape is K * T
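
For orientation, the topic-term distribution in LDA-family models is usually the row-normalized, Dirichlet-smoothed matrix of topic-term assignment counts; a minimal sketch under assumed names (`n_kt` and `eta` are illustrative, not this class's actual fields):

```python
import numpy as np

def estimate_beta(n_kt, eta):
    """beta[k, t] = (n_kt[k, t] + eta) / sum_t'(n_kt[k, t'] + eta)

    n_kt: K x T matrix of topic-term assignment counts
    eta:  scalar symmetric Dirichlet smoothing prior over terms
    """
    smoothed = n_kt + eta
    # each row of the result sums to 1: a distribution over the T terms
    return smoothed / smoothed.sum(axis=1, keepdims=True)
```
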
@@ -542,25 +549,27 @@ def theta(self):
     def log_perplexity(self):
         """
         log perplexity of LDA topic model
+        Reference: Parameter estimation for text analysis, Gregor Heinrich.
         :return: a float value
         """
         beta = self.beta
         # theta = self.theta
         log_likelihood = 0
-        word_count = 0
+        # word_count = 0
         for m, theta_m in enumerate(self.theta):
             for t in self.W[m]:
                 likelihood_t = np.inner(beta[:, t], theta_m)
                 # print likelihood_t
                 log_likelihood += -np.log(likelihood_t)
-                word_count += 1
-        assert word_count == self.WN, "word_count: %s\t self.WN: %s" % (word_count, self.WN)
+                # word_count += 1
+        # assert word_count == self.WN, "word_count: %s\tself.WN: %s" % (word_count, self.WN)
         return 1.0 * log_likelihood / self.WN
 
     @property
     def perplexity(self):
         """
         perplexity of LDA topic model
+        Reference: Parameter estimation for text analysis, Gregor Heinrich.
         :return: a float value, perplexity = exp{log_perplexity}
         """
         return np.exp(self.log_perplexity)
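
As a sanity check of the two properties above, here is a self-contained toy computation (made-up 2-topic, 3-term model; `W` holds term ids per document, mirroring the method's loop but using no real trained model):

```python
import numpy as np

beta = np.array([[0.5, 0.3, 0.2],    # K x T topic-term distribution
                 [0.1, 0.1, 0.8]])
theta = np.array([[0.9, 0.1],        # M x K document-topic distribution
                  [0.2, 0.8]])
W = [[0, 1], [2]]                    # term ids of each document's words

log_likelihood = 0.0
WN = sum(len(doc) for doc in W)      # total word count, as self.WN counts
for m, theta_m in enumerate(theta):
    for t in W[m]:
        # p(term t | document m) = sum_k beta[k, t] * theta[m, k]
        log_likelihood += -np.log(np.inner(beta[:, t], theta_m))

log_perplexity = log_likelihood / WN
print(np.exp(log_perplexity))        # perplexity = exp{log_perplexity}
```
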