
    Topic Model + TF-IDF for Extracting Keywords from Text

    Preface

    As the title suggests, this is the second article on the LDA (Latent Dirichlet Allocation) topic model; the first was "Elbow Method + Perplexity for Choosing the Number of Topics in an LDA Topic Model". That article briefly introduced the concept of the LDA model and a way to determine the number of topics: the elbow method combined with a perplexity curve. Building on that method, this article fixes the number of topics for the given text at 5 and then extracts keywords.

    Theory

    How does an LDA topic model extract keywords from a text? This article takes the following approach:

    - weight every word in the dataset with tf-idf to obtain a weighted vector representation;

    - build the word space and vectorize it to obtain the topic-word distribution of the given dataset;

    - compute the similarity between each word's topic distribution and the document's topic distribution, and take the keyword_num most similar words as the keywords (a small similarity sketch follows this list).
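    The similarity used in the last step is cosine similarity over gensim-style sparse topic vectors, i.e. lists of (topic_id, weight) pairs, and it assumes both vectors enumerate the same topics in the same order. A minimal sketch; the helper name cosine_sim and the toy vectors are illustrative only:

    import math

    def cosine_sim(vec1, vec2):
        """Cosine similarity between two aligned lists of (topic_id, weight) pairs."""
        dot, norm1, norm2 = 0.0, 0.0, 0.0
        for (_, w1), (_, w2) in zip(vec1, vec2):
            dot += w1 * w2
            norm1 += w1 * w1
            norm2 += w2 * w2
        if norm1 == 0.0 or norm2 == 0.0:
            return 0.0
        return dot / math.sqrt(norm1 * norm2)

    # two toy 3-topic distributions with similar shapes
    print(cosine_sim([(0, 0.1), (1, 0.7), (2, 0.2)],
                     [(0, 0.2), (1, 0.6), (2, 0.2)]))   # close to 1.0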

    Code

    0. Load the dependencies

    import gensim
    import math
    import jieba
    import jieba.posseg as posseg
    from jieba import analyse
    from gensim import corpora, models
    import functools
    import numpy as np
    import os
    import time
    from tqdm import tqdm
    

    1. Topic model class definition

    class TopicModel(object):
        """Turn a tokenized dataset into vector representations via the gensim interfaces."""
        def __init__(self, doc_list, keyword_num, model='LSI', num_topics=5):
            """Parameters: tokenized dataset, number of keywords, topic model to use (LSI or LDA), number of topics."""
            # Build the word space and vectorize the documents with the BOW model
            self.dictionary = corpora.Dictionary(doc_list)
            corpus = [self.dictionary.doc2bow(doc) for doc in doc_list]

            # Weight every word with tf-idf to get the weighted vector representation
            self.tfidf_model = models.TfidfModel(corpus)
            self.tfidf_corpus = self.tfidf_model[corpus]
            self.keyword_num = keyword_num
            self.num_topics = num_topics

            # Load the topic model
            if model == 'LSI':
                self.model = self.train_lsi()
            else:
                self.model = self.train_lda()

            # Topic-word distribution of the given dataset
            word_dic = self.word_dictionary(doc_list)
            self.wordtopic_dic = self.get_wordtopic(word_dic)

        def doc2bowvec(self, word_list):
            """Dense 0/1 BOW vectorization over the whole dictionary."""
            # iterate over token ids and map them back to the words themselves
            vec_list = [1 if self.dictionary[idx] in word_list else 0 for idx in self.dictionary.keys()]
            print("vec_list", vec_list)
            return vec_list

        def word_dictionary(self, doc_list):
            """Build the word space: the list of unique words in the dataset."""
            dictionary = []
            for doc in doc_list:
                dictionary.extend(doc)
            dictionary = list(set(dictionary))


            return dictionary

        def get_wordtopic(self, word_dic):
            """Topic distribution of every word in the given dataset."""
            wordtopic_dic = {}
            for word in word_dic:
                singlist = [word]
                # tf-idf weighted vector of the single word
                word_corpus = self.tfidf_model[self.dictionary.doc2bow(singlist)]
                # topic vector of the single word
                word_topic = self.model[word_corpus]
                wordtopic_dic[word] = word_topic
            return wordtopic_dic

        def train_lsi(self):
            """Train the LSI topic model."""
            lsi = models.LsiModel(self.tfidf_corpus, id2word=self.dictionary, num_topics=self.num_topics)
            return lsi

        def train_lda(self):
            """Train the LDA topic model."""
            lda = models.LdaModel(self.tfidf_corpus, id2word=self.dictionary, num_topics=self.num_topics)
            return lda

        def get_simword(self, word_list):
            """Compute the similarity between each word's topic distribution and the document's,
            and return the keyword_num most similar words as the keywords."""
            # tf-idf weighted vector of the document
            sentcorpus = self.tfidf_model[self.dictionary.doc2bow(word_list)]
            # topic distribution of the document
            senttopic = self.model[sentcorpus]

            def calsim(l1, l2):
                """Cosine similarity between two (topic_id, weight) lists."""
                a, b, c = 0.0, 0.0, 0.0
                for t1, t2 in zip(l1, l2):
                    x1 = t1[1]
                    x2 = t2[1]
                    a += x1 * x2
                    b += x1 * x1
                    c += x2 * x2
                sim = a / math.sqrt(b * c) if not (b * c) == 0.0 else 0.0
                return sim

            # similarity between the input text and each word's topic distribution
            sim_dic = {}
            for k, v in self.wordtopic_dic.items():
                # only score words that actually occur in the document
                if k not in word_list:
                    continue
                sim = calsim(v, senttopic)
                sim_dic[k] = sim

            # cmp is the comparator defined in section 3 below
            keyWordDict = []
            for k, v in sorted(sim_dic.items(), key=functools.cmp_to_key(cmp), reverse=True)[:self.keyword_num]:
                if k is not None:
                    keyWordDict.append(k)
            return keyWordDict
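    A quick way to sanity-check the class is to train it on a few toy, hand-tokenized documents and inspect the topic distribution of a single word. The documents below are made up purely for illustration:

    docs = [['cat', 'dog', 'pet'],
            ['dog', 'bone', 'pet'],
            ['stock', 'market', 'trade']]
    tm = TopicModel(docs, keyword_num=2, model='LDA', num_topics=2)
    # sparse (topic_id, weight) distribution of the word 'dog'
    print(tm.wordtopic_dic['dog'])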

    2. Data preprocessing

    # Load the data: every line in every file under datafolder_path is one document
    def load_whole_dataSet(datafolder_path):
        prepared_data = []
        files = os.listdir(datafolder_path)
        for file in files:
            if not os.path.isdir(os.path.join(datafolder_path, file)):
                for line in open(os.path.join(datafolder_path, file), 'r', encoding='utf-8'):
                    prepared_data.append(line)
        return prepared_data

    # Preprocess the data: segment and filter every document
    def pre_dataSet(prepared_data, pos=False):
        doc_list = []
        for line in prepared_data:
            content = line.strip()
            seg_list = seg_to_list(content, pos)
            filter_list = word_filter(seg_list, pos)
            doc_list.append(filter_list)
        return doc_list

    # Stopword list
    def get_stopword_list(stopword_path):
        stopword_list = [stopword.strip() for stopword in open(os.path.expanduser(stopword_path), encoding='gbk').readlines()]
        return stopword_list

    # jieba word segmentation, optionally with part-of-speech tags
    def seg_to_list(sentence, pos=False):
        if not pos:
            seg_list = jieba.cut(sentence)
        else:
            seg_list = posseg.cut(sentence)
        return seg_list

    # Filter out noise words: stopwords, non-nouns and single characters
    # (the default stopword path is a placeholder; point it at your own file)
    def word_filter(seg_list, pos=False, stopword_path='./stop_words.txt'):
        stopword_list = get_stopword_list(stopword_path)
        filter_list = []
        for seg in seg_list:
            if not pos:
                word = seg
                flag = 'n'
            else:
                word = seg.word
                flag = seg.flag
            if not flag.startswith('n'):
                continue
            if word not in stopword_list and len(word) > 1:
                filter_list.append(word)
        return filter_list
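    As a sketch of the preprocessing on a single sentence (the sentence and the stopword file path are placeholders; adjust them to your own data):

    sentence = '自然语言处理是人工智能的一个重要方向'
    seg_list = seg_to_list(sentence, pos=True)
    # keep only nouns that are not stopwords and are longer than one character
    nouns = word_filter(seg_list, pos=True, stopword_path='./stop_words.txt')
    print(nouns)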
    

    3. Build the keyword dictionary and extract keywords from the given dataset

    # Comparison function used to rank words and pick the top-K keywords
    def cmp(e1, e2):
        res = np.sign(e1[1] - e2[1])
        if res != 0:
            return res
        else:
            a = e1[0] + e2[0]
            b = e2[0] + e1[0]
            if a > b:
                return 1
            elif a == b:
                return 0
            else:
                return -1
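    get_simword feeds this comparator to sorted through functools.cmp_to_key: pairs are ordered by similarity first, and ties are broken by comparing the concatenated words, which keeps the ranking deterministic. A small illustration with made-up scores:

    sim_dic = {'apple': 0.91, 'pear': 0.91, 'plum': 0.42}
    top2 = sorted(sim_dic.items(), key=functools.cmp_to_key(cmp), reverse=True)[:2]
    print(top2)   # the two tied words come first, in a fixed order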
        
    


    # Build the keyword dictionary: extract keywords for every document in a corpus directory.
    # This reuses the name load_whole_dataSet from section 2, but runs the full pipeline per file.
    def load_whole_dataSet(datafolder_path, pos=False, model='LDA', keyword_num=100):
        files = os.listdir(datafolder_path)
        counts = {}
        for file in tqdm(files):
            prepared_data = []
            if not os.path.isdir(os.path.join(datafolder_path, file)):
                for line in open(os.path.join(datafolder_path, file), 'r', encoding='utf-8'):
                    content = line.strip()
                    seg_list = seg_to_list(content, pos)
                    filter_list = word_filter(seg_list, pos)
                    prepared_data.append(filter_list)
            if prepared_data:
                try:
                    topic_model = TopicModel(prepared_data, keyword_num, model=model)
                except ValueError:
                    # skip files whose vocabulary is too small to train a topic model
                    continue
                for doc_list_i in prepared_data:
                    keyWordDict = topic_model.get_simword(doc_list_i)
                    for word in keyWordDict:
                        if len(word) == 1:
                            continue
                        else:
                            counts[word] = counts.get(word, 0) + 1
        return counts
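    Running this over a directory of text files (the directory path below is a placeholder) yields a count of how often each word was selected as a keyword; sorting the counts gives the most representative words of the corpus:

    counts = load_whole_dataSet('./corpus/', pos=True, model='LDA', keyword_num=100)
    top_words = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)[:20]
    print(top_words)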

    4. Main entry point

    if __name__ == '__main__':
        # path to the stopword file
        stopword_path = '~/dataSet/stop_words/stop_words.txt'
        keyword_num = 5
        # text is the document to extract keywords from
        text = "^ ^"
        prepared_data = []
        content = text.strip()
        seg_list = seg_to_list(content, pos=False)
        filter_list = word_filter(seg_list, pos=False, stopword_path=stopword_path)
        prepared_data.append(filter_list)
        topic_model = TopicModel(prepared_data, keyword_num, model='LDA')
        counts = {}
        for item in prepared_data:
            keyWordDict = topic_model.get_simword(item)
            for word in keyWordDict:
                if len(word) == 1:
                    continue
                else:
                    counts[word] = counts.get(word, 0) + 1
        print(counts)
    

    Notes

    In the main entry point, the stopword file can come from HanLP or another open-source resource, or be a custom file of your own. In the topic model class definition, LDA and LSI are both topic models; see gensim.models.LdaModel and gensim.models.LsiModel for details.

    Conclusion
