Topic Model + TF-IDF for Extracting Keywords from Text
Preface
As the title says, this is the second article on the LDA (Latent Dirichlet Allocation) topic model; the first is Elbow Method + Perplexity for Determining the Number of Topics of an LDA Topic Model. That article briefly introduced the idea of LDA and how to choose the number of topics with the elbow method and a perplexity curve. Building on that method, this article fixes the number of topics for the given text at 5 and then extracts its keywords.
Theory
How does an LDA topic model extract keywords from a text? This article takes the following approach:
- Weight every word in the dataset with TF-IDF to obtain a weighted vector representation;
- Build the word space and vectorize the corpus to obtain the topic-word distribution of the given dataset;
- Compute the similarity between each word's topic distribution and the document's topic distribution, and take the keyword_num words with the highest similarity as keywords (a small similarity sketch follows this list).
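The similarity in the last step is ordinary cosine similarity between two topic-weight vectors. A minimal sketch with made-up numbers (the helper and the 5-topic vectors below are purely illustrative, not part of the implementation that follows):

import math

def cosine_sim(v1, v2):
    """Cosine similarity of two dense topic-weight vectors."""
    dot = sum(a * b for a, b in zip(v1, v2))
    norm = math.sqrt(sum(a * a for a in v1)) * math.sqrt(sum(b * b for b in v2))
    return dot / norm if norm else 0.0

# Hypothetical 5-topic distributions of a word and of a document
word_topics = [0.10, 0.05, 0.70, 0.10, 0.05]
doc_topics = [0.15, 0.05, 0.60, 0.15, 0.05]
print(cosine_sim(word_topics, doc_topics))  # close to 1.0, so the word is a strong keyword candidate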
Code
0. Load the dependencies
import gensim
import math
import jieba
import jieba.posseg as posseg
from jieba import analyse
from gensim import corpora, models
import functools
import numpy as np
import os
import time
from tqdm import tqdm
1. Topic model class definition
class TopicModel(object):
    """Use the gensim interfaces to turn the documents into vectorized representations."""
    def __init__(self, doc_list, keyword_num, model='LSI', num_topics=5):
        """Parameters: tokenized dataset, number of keywords, topic model type ('LSI' or 'LDA'), number of topics."""
        # Build the word space and vectorize with the BOW model
        self.dictionary = corpora.Dictionary(doc_list)
        corpus = [self.dictionary.doc2bow(doc) for doc in doc_list]
        # Weight every word in the corpus with TF-IDF to obtain the weighted vector representation
        self.tfidf_model = models.TfidfModel(corpus)
        self.tfidf_corpus = self.tfidf_model[corpus]
        self.keyword_num = keyword_num
        self.num_topics = num_topics
        # Load the topic model
        if model == 'LSI':
            self.model = self.train_lsi()
        else:
            self.model = self.train_lda()
        # Topic-word distribution of the given dataset
        word_dic = self.word_dictionary(doc_list)
        self.wordtopic_dic = self.get_wordtopic(word_dic)

    def doc2bowvec(self, word_list):
        """BOW vectorization: 0/1 indicator vector over the dictionary vocabulary."""
        vec_list = [1 if word in word_list else 0 for word in self.dictionary.values()]
        return vec_list
"""詞空間構建方法和向量化方法"""
def word_dictionary(self, doc_list):
dictionary = []
for doc in doc_list:
dictionary.extend(doc)
dictionary = list(set(dictionary))

return dictionary
"""給定數據集的主題-詞分布"""
def get_wordtopic(self, word_dic):
wordtopic_dic = {}
for word in word_dic:
singlist = [word]
# 計算每個詞的加權向量
word_corpus = self.tfidf_model[self.dictionary.doc2bow(singlist)]
# 計算每個詞的主題向量
word_topic = self.model[word_corpus]
wordtopic_dic[word] = word_topic
return wordtopic_dic
"""加載主題模型"""
def train_lsi(self):
lsi = models.LsiModel(self.tfidf_corpus, id2word=self.dictionary, num_topics=self.num_topics)
return lsi
def train_lda(self):
lda = models.LdaModel(self.tfidf_corpus, id2word=self.dictionary, num_topics=self.num_topics)
return lda
"""計算詞的分布和文檔的分布的相似度,將相似度最高的keyword_num個詞作為關鍵詞"""
def get_simword(self, word_list):
# 文檔的加權向量
sentcorpus = self.tfidf_model[self.dictionary.doc2bow(word_list)]
# 文檔主題
senttopic = self.model[sentcorpus]
def calsim(l1, l2):
a, b, c = 0.0, 0.0, 0.0
for t1, t2 in zip(l1, l2):
x1 = t1[1]
x2 = t2[1]
a += x1 * x1
b += x1 * x1
c += x2 * x2
sim = a / math.sqrt(b * c) if not (b * c) == 0.0 else 0.0
return sim
# 輸入文本和每個詞的主題分布相似度
sim_dic = {}
for k, v in self.wordtopic_dic.items():
# 計算每個文檔中的詞和文檔的相識度
if k not in word_list:
continue
sim = calsim(v, senttopic)
sim_dic[k] = sim
counts = {}
keyWordDict = []
for k, v in sorted(sim_dic.items(), key=functools.cmp_to_key(cmp), reverse=True)[:self.keyword_num]:
if k is not None:
keyWordDict.append(k)
return keyWordDict
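For reference, each value in wordtopic_dic (and senttopic inside get_simword) is the sparse list of (topic_id, weight) pairs that gensim returns for a document; the numbers below are made up purely to show the shape of the data that calsim consumes:

# Hypothetical content of wordtopic_dic under a 5-topic model:
# each value is a sparse [(topic_id, weight), ...] list; near-zero topics may be omitted
wordtopic_dic = {
    '學習': [(0, 0.02), (2, 0.91), (4, 0.05)],
    '新聞': [(1, 0.80), (3, 0.15)],
}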
2. Data preprocessing
# Load the dataset: read every file under the given folder line by line
def load_whole_dataSet(datafolder_path):
    prepared_data = []
    files = os.listdir(datafolder_path)
    for file in files:
        file_path = os.path.join(datafolder_path, file)
        if not os.path.isdir(file_path):
            for line in open(file_path, 'r', encoding='utf-8'):
                prepared_data.append(line)
    return prepared_data
# Data preprocessing: segment every line and filter out noise words
def pre_dataSet(prepared_data, stopword_path, pos=False):
    doc_list = []
    for line in prepared_data:
        content = line.strip()
        seg_list = seg_to_list(content, pos)
        filter_list = word_filter(seg_list, stopword_path, pos)
        doc_list.append(filter_list)
    return doc_list
# Load the stop word list
def get_stopword_list(stopword_path):
    stopword_list = [stopword.strip() for stopword in open(stopword_path, encoding='gbk').readlines()]
    return stopword_list
# jieba word segmentation (optionally with POS tagging)
def seg_to_list(sentence, pos=False):
    if not pos:
        seg_list = jieba.cut(sentence)
    else:
        seg_list = posseg.cut(sentence)
    return seg_list
# Filter out noise words: keep nouns, drop stop words and single-character words
def word_filter(seg_list, stopword_path, pos=False):
    stopword_list = get_stopword_list(stopword_path)
    filter_list = []
    for seg in seg_list:
        if not pos:
            word = seg
            flag = 'n'
        else:
            word = seg.word
            flag = seg.flag
        # Keep nouns only (when POS tagging is enabled)
        if not flag.startswith('n'):
            continue
        if word not in stopword_list and len(word) > 1:
            filter_list.append(word)
    return filter_list
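A quick sketch of the two preprocessing helpers on a single sentence; the sentence and the stop word path below are made up for illustration:

sample = "自然語言處理是人工智能的一個重要方向"
stopword_path = './stop_words.txt'  # hypothetical stop word file, one word per line
seg_list = seg_to_list(sample, pos=False)
print(word_filter(seg_list, stopword_path, pos=False))
# e.g. ['自然語言', '處理', '人工智能', '方向'] -- the exact output depends on jieba's segmentation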
3. Build the keyword dictionary and extract keywords from the given dataset
# Comparison function for sorting: rank by similarity, break ties lexicographically; used to pick the topK keywords
def cmp(e1, e2):
    res = np.sign(e1[1] - e2[1])
    if res != 0:
        return res
    else:
        a = e1[0] + e2[0]
        b = e2[0] + e1[0]
        if a > b:
            return 1
        elif a == b:
            return 0
        else:
            return -1
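With the cmp helper defined (get_simword sorts with it), the TopicModel class from section 1 can be exercised end to end. A minimal sketch on a tiny made-up corpus that is already tokenized and filtered:

# Hypothetical tokenized corpus: each document is a list of kept words
doc_list = [
    ['主題', '模型', '關鍵詞', '提取'],
    ['深度', '學習', '神經', '網絡'],
    ['新聞', '文本', '關鍵詞', '抽取'],
]
topic_model = TopicModel(doc_list, keyword_num=2, model='LDA', num_topics=2)
# The 2 words of the first document most similar to its topic distribution
print(topic_model.get_simword(doc_list[0]))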

# Build the keyword dictionary: extract keywords from every file in the given dataset
def extract_dataset_keywords(datafolder_path, stopword_path, pos=False, model='LDA', keyword_num=100):
    files = os.listdir(datafolder_path)
    counts = {}
    for file in tqdm(files):
        prepared_data = []
        file_path = os.path.join(datafolder_path, file)
        if os.path.isdir(file_path):
            continue
        for line in open(file_path, 'r', encoding='utf-8'):
            content = line.strip()
            seg_list = seg_to_list(content, pos)
            filter_list = word_filter(seg_list, stopword_path, pos)
            prepared_data.append(filter_list)
        if prepared_data:
            try:
                topic_model = TopicModel(prepared_data, keyword_num, model=model)
            except ValueError:
                # Skip files whose vocabulary is too small to train a topic model
                continue
            for doc_list_i in prepared_data:
                keyWordDict = topic_model.get_simword(doc_list_i)
                for word in keyWordDict:
                    if len(word) == 1:
                        continue
                    counts[word] = counts.get(word, 0) + 1
    return counts
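A usage sketch of the function above, assuming a folder of UTF-8 text files and a stop word file at the hypothetical paths below:

# Hypothetical paths
dataset_dir = os.path.expanduser('~/dataSet/news')
stopword_path = os.path.expanduser('~/dataSet/stop_words/stop_words.txt')
counts = extract_dataset_keywords(dataset_dir, stopword_path, pos=True, model='LDA', keyword_num=100)
# Show the 20 words that were selected as keywords most often across the whole dataset
for word, freq in sorted(counts.items(), key=lambda kv: kv[1], reverse=True)[:20]:
    print(word, freq)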
4. Main entry point
if __name__ == '__main__':
    # Path of the stop word file
    stopword_path = os.path.expanduser('~/dataSet/stop_words/stop_words.txt')
    keyword_num = 5
    # text is the text from which keywords are to be extracted
    text = "^ ^"
    prepared_data = []
    content = text.strip()
    seg_list = seg_to_list(content, pos=False)
    filter_list = word_filter(seg_list, stopword_path, pos=False)
    prepared_data.append(filter_list)
    topic_model = TopicModel(prepared_data, keyword_num, model='LDA')
    counts = {}
    for item in prepared_data:
        keyWordDict = topic_model.get_simword(item)
        for word in keyWordDict:
            if len(word) == 1:
                continue
            counts[word] = counts.get(word, 0) + 1
    print(counts)
Notes
The stop word file used in the main entry point can come from HanLP or another open-source resource, or it can be a custom file of your own. LDA and LSI in the topic model class are both topic models; see gensim.models.LdaModel and gensim.models.LsiModel for details.
Conclusion