LDA 模型的 Python 模块

首先需要安装 lda 包：在命令行中执行 `pip install lda`。

# -*- encoding:utf-8 -*-
import lda
import numpy as np
import lda.datasets

# Load the document-term matrix that ships with the lda package.
# The "reuters" data set is built from Reuters news articles.
X = lda.datasets.load_reuters()
print("type(X): {}".format(type(X)))
print("shape: {}\n".format(X.shape))
# Preview the top-left 5x5 corner of the matrix.
print(X[0:5, 0:5])

# 输出：
'''
type(X): <type 'numpy.ndarray'>
shape: (395L, 4258L)

[[ 1  0  1  0  0]
 [ 7  0  2  0  0]
 [ 0  0  0  1 10]
 [ 6  0  1  0  0]
 [ 0  0  0  2 14]]
'''

X 有 395 行、4258 列，即语料中共有 395 篇文档，词汇表中共有 4258 个词。

# Vocabulary of the Reuters corpus.
vocab = lda.datasets.load_reuters_vocab()
print("type(vocab): {}".format(type(vocab)))
print("len(vocab): {}\n".format(len(vocab)))
# Preview the first six words.
print(vocab[0:6])


# 输出
'''
type(vocab): <type 'tuple'>
len(vocab): 4258

('church', 'pope', 'years', 'people', 'mother', 'last')
'''

# Titles of the documents.
titles = lda.datasets.load_reuters_titles()
print("type(titles): {}".format(type(titles)))
print("len(titles): {}\n".format(len(titles)))
# Preview the titles of the first two articles.
print(titles[0:2])

# 输出
'''
type(titles): <type 'tuple'>
len(titles): 395

('0 UK: Prince Charles spearheads British royal revolution. LONDON 1996-08-20', '1 GERMANY: Historic Dresden church rising from WW2 ashes. DRESDEN, Germany 1996-08-21')
'''

# Train the model: 20 topics, 1000 iterations.
# alpha and eta (also called beta) are not passed, so they keep their
# defaults of alpha=0.1 and eta=0.01.
model = lda.LDA(
    n_topics=20,
    n_iter=1000,
    random_state=1,
)
model.fit(X)

# Topic-word matrix of the fitted model: each row holds the word
# probabilities of one topic and sums to 1.
topic_word = model.topic_word_
print("type(topic_word): {}".format(type(topic_word)))
print("shape: {}".format(topic_word.shape))

# 输出:
'''
type(topic_word): <type 'numpy.ndarray'>
shape: (20L, 4258L)
'''

# Show the n most probable words of every topic.
n = 5
# Convert the vocabulary to an array once, outside the loop, so it can be
# indexed with an array of word indices on every iteration.
vocab_array = np.array(vocab)
for i, topic_dist in enumerate(topic_word):
    # Sorting the negated distribution orders the word indices by descending
    # probability; slice to n first so only n words are looked up.
    top_indices = np.argsort(-topic_dist)[:n]
    topic_words = vocab_array[top_indices]
    print('*Topic {}\n- {}'.format(i, ' '.join(topic_words)))
    
# 输出：
'''
*Topic 0
- church conversion neighbouring statements outrage
*Topic 1
- deliver voted ladies flight 960
*Topic 2
- deliver protesters complex taught grandmother
*Topic 3
- church stuff cdu ss performances
*Topic 4
- deliver golf honorary hill hailed
*Topic 5
- deliver conversion neighbouring statements outrage
*Topic 6
- church kim fast walk recall
*Topic 7
- deliver hermannsburg zealand philip lasted
*Topic 8
- deliver hailed filled kim enjoyed
*Topic 9
- church golf honorary hill hailed
*Topic 10
- church listed surrounded golf honorary
*Topic 11
- church enjoyed fast walk recall
*Topic 12
- church performances listed surrounded honorary
*Topic 13
- jailed managed practice obviously chanted
*Topic 14
- church breaking rebuild milos quality
*Topic 15
- church midway tribune protesters complex
*Topic 16
- church bubis croatian leon celebrates
*Topic 17
- deliver protesters complex grandmother neighbour
*Topic 18
- church protesters complex taught grandmother
*Topic 19
- church 1998 worried citizens prigione
'''

# Document-topic matrix of the fitted model: each row is the topic
# distribution of one document and sums to 1.
doc_topic = model.doc_topic_
print("type(doc_topic): {}".format(type(doc_topic)))
print("shape: {}".format(doc_topic.shape))

# 输出：
'''
type(doc_topic): <type 'numpy.ndarray'>
shape: (395, 20)
'''

# Show the single most probable topic for each of the first 10 documents.
# Slicing (instead of indexing with range(10)) stays valid when there are
# fewer than 10 documents, and the dedicated loop name keeps the script-level
# `n` (the top-words count) from being overwritten.
for doc_id, topic_most_pr in enumerate(doc_topic[:10].argmax(axis=1)):
    print("doc: {} topic: {}".format(doc_id, topic_most_pr))


# 输出：
'''
doc: 0 topic: 8
doc: 1 topic: 1
doc: 2 topic: 14
doc: 3 topic: 8
doc: 4 topic: 14
doc: 5 topic: 14
doc: 6 topic: 14
doc: 7 topic: 14
doc: 8 topic: 14
doc: 9 topic: 8
'''