python中如何使用TF-IDF和BM25提取文章關鍵詞

發布時間：2020-10-28 02:46:43　來源：億速云　閱讀：401　作者：小新　欄目：編程語言

小編給大家分享一下python中如何使用TF-IDF和BM25提取文章關鍵詞，希望大家閱讀完這篇文章后有所收獲，下面讓我們一起去探討吧！

使用TF-IDF和BM25提取文章關鍵詞

評估方法：

人工從文章中提取1-5個關鍵詞，和機器提取的關鍵詞做比較

召回 = |機器提詞 ∩ 人工提詞| / |人工提詞|（兩者共同提到的關鍵詞個數，除以人工提詞的個數）

準確 = |機器提詞 ∩ 人工提詞| / |機器提詞|（兩者共同提到的關鍵詞個數，除以機器提詞的個數）

TF-IDF

原理參考：http://www.ruanyifeng.com/blog/2013/03/tf-idf.html

實現參考：tf-idf-keyword

其他參考：使用不同的方法計算TF-IDF值

第一版標題和正文加權計算tf-idf

主要策略

（1）使用nlpc切詞服務（可用jieba切詞代替）+TF-IDF提取關鍵詞。

（2）去除停用詞

（3）按照體裁+年級分成若干類型，來訓練模型，示例用高中+敘事類，取了20000條數據訓練

（4）對標題進行加權，標題的每個詞匯頻率+6，再合一起計算tf-idf

（5）按照權重取前4個關鍵詞，再把這4個關鍵詞中權重小于「最小頻率(5) × 平均IDF / 總詞數」的過濾掉

注：以上數據均為調節后最優解

代碼實現

config.py

# Name under which this tool's log records are emitted.
program = 'composition_term_weight'

# Timestamped records go to stderr; the root logger lets INFO and above through.
logging.basicConfig(
    stream=sys.stderr,
    datefmt='%a, %d %b %Y %H:%M:%S',
    format='%(asctime)s: %(levelname)s: %(message)s',
)
logging.root.setLevel(logging.INFO)

# Shared logger used by the loader and the TF-IDF / BM25 code.
logger = logging.getLogger(program)

IDFLoader.py

class IDFLoader(object):
    """Load a ``word idf`` table from a text file.

    Every line is expected to hold two fields separated by a single space:
    ``<word> <idf>``.  A line whose word is ``LEN_AVG`` carries the average
    document length instead of an IDF value; reading stops at that line.

    Attributes:
        idf_path: Path of the IDF file.
        idf_freq: Mapping of word -> IDF value.
        mean_len: Average document length from the ``LEN_AVG`` line, or 0
            when the file has no such line.
        mean_idf: Sum of the loaded IDF values divided by the number of
            loaded lines, or 0.0 when nothing could be loaded.
    """

    def __init__(self, idf_path):
        """Remember ``idf_path`` and load the table from it immediately."""
        self.idf_path = idf_path
        self.idf_freq = {}
        self.mean_len = 0
        self.mean_idf = 0.0
        self.load_idf()

    def load_idf(self):
        """Read the IDF file into ``idf_freq``, ``mean_len`` and ``mean_idf``.

        Lines that do not split into exactly two fields, or whose value is
        not numeric, are skipped; the number of skipped lines is logged as a
        warning so that a damaged file does not go unnoticed.
        """
        cnt = 0
        skipped = 0
        with open(self.idf_path, 'rb') as f:
            for line in f:
                try:
                    word, freq = line.strip().decode('utf-8', errors='ignore').split(' ')
                    if word == 'LEN_AVG':
                        # Accept "123" as well as "123.0" so a file written
                        # with true division can still be read.
                        self.mean_len = int(float(freq))
                        break
                    self.idf_freq[word] = float(freq)
                    cnt += 1
                except (ValueError, OverflowError):
                    # Best-effort load: a malformed line must not abort it.
                    skipped += 1
                    continue
        if skipped:
            logger.warning('%d malformed line(s) skipped in %s', skipped, self.idf_path)
        # Guard against an empty or fully malformed file, which previously
        # raised ZeroDivisionError here.
        if cnt:
            self.mean_idf = sum(self.idf_freq.values()) / cnt
        else:
            self.mean_idf = 0.0
        # mean_idf is a float; formatting it with %d truncated the fraction.
        logger.info('Vocabularies %s loaded: %d mean_idf: %f', self.idf_path, cnt, self.mean_idf)

class TfIdf(object):
    """Extract article keywords by TF-IDF over the title and the body.

    The title is segmented together with the body, and every title word is
    given ``title_add_times`` extra occurrences before weights are computed.

    NOTE(review): the string handling keeps the original Python 2
    byte-string conventions (``encode``/``decode`` calls, files opened in
    binary mode).  Whether segmented words are byte strings or unicode
    depends on the external ``cut`` helper, which is not visible here --
    confirm before changing any of the conversions or porting to Python 3.
    """

    # Removes ASCII letters and digits from text before segmentation.
    p_cut = re.compile(r'[a-zA-Z0-9]', re.VERBOSE)
    # Removes boilerplate fragments from titles.
    # NOTE(review): this pattern is applied to UTF-8 encoded bytes, so '.'
    # presumably matches a single byte rather than a whole character --
    # TODO confirm.
    p_title = re.compile(r'作文|\d+字|.年級|_', re.VERBOSE)
    # Punctuation and filler tokens dropped after segmentation; these could
    # also live in the stop-word file.
    ignored = ['', ' ', '', '。', '：', '，', '）', '（', '！', '?', '”', '“', '＂', '―', '．', '說', '好', '時']
    # Minimum occurrence count used to build the weight threshold:
    # a keyword must weigh more than min_times * mean_idf / word_count.
    min_times = 5.0
    # Extra occurrences credited to every word of the title.
    title_add_times = 6.0
    # Number of top-weighted keywords considered.
    words_num = 4

    def __init__(self):
        """Load the stop-word list from ``stop_words.utf8.txt``.

        Entries are kept as the stripped raw lines read in binary mode.
        """
        my_stop_words_path = 'stop_words.utf8.txt'
        self.stop_words_dict = []
        with open(my_stop_words_path, 'rb') as fr:
            for line in fr:
                self.stop_words_dict.append(line.strip())

    def my_cut(self, inTxt):
        """Segment ``inTxt`` into words and drop stop words and noise.

        Args:
            inTxt: Text to segment; it is passed through ``str()`` first.

        Returns:
            List of words in order of appearance, without stop words,
            ``ignored`` tokens or whitespace-only tokens.
        """
        inTxt = self.p_cut.sub('', str(inTxt))
        words_list = []
        # Segment sentence by sentence for performance reasons.
        for sentence in inTxt.split('。'):
            # NLPC segmentation service; jieba can be used instead.
            segmented = cut(sentence)
            if segmented is not None:
                words_list += segmented
        # List membership (==) is kept on purpose: switching to a set would
        # change matching if words and stop words differ in string type
        # (bytes vs unicode) -- TODO confirm what cut() returns.
        return [w for w in words_list
                if w not in self.stop_words_dict and w not in self.ignored and len(w.strip()) > 0]

    def get_tfidf(self, idf_loader, title, content):
        """Compute a TF-IDF weight for every word of one article.

        Args:
            idf_loader: Object exposing ``idf_freq`` (word -> IDF) and
                ``mean_idf`` (used for words missing from ``idf_freq``).
            title: Article title; ``p_title`` matches are removed first.
            content: Article body.

        Returns:
            A 3-tuple ``(weights, word_count, title_words)``: ``weights`` is
            a list of ``(word, weight)`` pairs sorted by descending weight,
            ``word_count`` is the number of segmented words (title plus
            body) and ``title_words`` is the list of segmented title words.
        """
        filter_title = self.p_title.sub('', title.encode('utf-8', errors='ignore'))
        title_words = self.my_cut(filter_title)
        corpus0 = title_words + self.my_cut(content)
        freq = {}
        for w in corpus0:
            freq[w] = freq.get(w, 0.0) + 1.0
        # Boost title words.
        # NOTE(review): the two info-level log calls look like leftover
        # debugging output; kept so logging behaviour is unchanged.
        for w in title_words:
            logger.info(freq[w])
            freq[w] = freq.get(w, 0.0) + self.title_add_times
            logger.info(freq[w])
        total = sum(freq.values())
        for k in freq:
            # weight = (boosted count / total boosted count) * idf
            freq[k] *= idf_loader.idf_freq.get(k, idf_loader.mean_idf) / total
        return sorted(freq.items(), key=lambda d: d[1], reverse=True), len(corpus0), title_words

    def get_term_weight(self, idf_loader, title, content):
        """Pick the keywords of one article and format them.

        The ``words_num`` heaviest words are kept when their weight exceeds
        ``min_times * mean_idf / word_count``.  When none qualifies, every
        word that occurs in the title is returned instead.

        Args:
            idf_loader: See ``get_tfidf``.
            title: Article title.
            content: Article body.

        Returns:
            List of ``'word:offset:weight'`` strings, where ``offset`` is
            the summed character length of the preceding keywords and
            ``weight`` is rounded to four decimals.  Empty when the article
            yields no words.
        """
        result, words_number, title_words = self.get_tfidf(idf_loader, title, content)
        if not result:
            # No words at all: the threshold would divide by zero.
            return []
        bound = self.min_times * idf_loader.mean_idf / words_number
        # Use the configurable words_num rather than a hard-coded 4.
        machine_words = [item for item in result[:self.words_num] if item[1] > bound]
        if not machine_words:
            # Fall back to the title words.  The membership test must use
            # the word (item[0]); the weight (item[1]) never matches a word.
            machine_words = [item for item in result if item[0] in title_words]
        data = []
        offset = 0
        for word in machine_words:
            data.append('%s:%d:%s' % (word[0], offset, str(round(word[1], 4))))
            offset += len(word[0].decode('utf-8', errors='ignore'))
        return data

    def getCorpus(self, data_path):
        """Read the training file and segment every record.

        Each line of ``data_path`` is parsed as a JSON object with the keys
        ``title`` and ``@merge_text``; the cleaned title and the text are
        joined with '。' and segmented.

        Args:
            data_path: Path of the JSON-lines training file.

        Returns:
            List of word lists, one per record that produced any word.
        """
        count = 0
        corpus_list = []
        with open(data_path, 'rb') as f:
            for line in f:
                info = json.loads(line.decode('utf-8', errors='ignore'))
                sentence = self.p_title.sub('', info.get('title').encode('utf-8', errors='ignore')) + '。' + info.get(
                    '@merge_text').encode('utf-8', errors='ignore')
                words = self.my_cut(sentence)
                if not words:
                    continue
                corpus_list.append(words)
                count += 1
                if count % 1000 == 0:
                    logger.info("processd " + str(count) + " segment_sentence")
        return corpus_list

    def train(self, dir_name, data_path):
        """Build ``data/<dir_name>/idf.txt`` from the training file.

        For each word the file gets a line ``<word> <log2(N / df)>``, where
        ``N`` is the number of documents and ``df`` the number of documents
        containing the word.  A final ``LEN_AVG <n>`` line stores the
        average document length in words.

        Args:
            dir_name: Sub-directory of ``data/`` that receives ``idf.txt``.
            data_path: Path of the JSON-lines training file.

        Raises:
            ZeroDivisionError: If the training file yields no document.  It
                is raised before the IDF file is opened, so an existing
                file is left untouched.
        """
        idf_path = 'data/%s/idf.txt' % dir_name
        documents = self.getCorpus(data_path)
        id_freq = {}
        i = 0
        len_sum = 0
        for doc in documents:
            len_sum += len(doc)
            # Count each word once per document (document frequency).
            for x in set(doc):
                id_freq[x] = id_freq.get(x, 0) + 1
            if i % 1000 == 0:
                logger.info('Documents processed: ' + str(i) + ', time: ' + str(datetime.datetime.now()))
            i += 1
        del documents
        # Computed before opening the file so that an empty corpus fails
        # without truncating a previous idf.txt.  Floor division keeps the
        # value integral so it can be read back with int().
        mean_len = len_sum // i
        with open(idf_path, 'wb') as f:
            for key, value in id_freq.items():
                # float() forces true division; plain int / int floors under
                # Python 2 and distorts the IDF.
                f.write(key + ' ' + str(math.log(float(i) / value, 2)) + '\n')
            logger.info(str(i) + ' ' + str(len_sum))
            f.write('LEN_AVG ' + str(mean_len))

    def test_one(self, dir_name, method='tfidf'):
        """Score articles read from stdin and print the top 20 words.

        Every stdin line is parsed as a JSON object with the keys ``title``
        and ``@merge_text``.

        Args:
            dir_name: Sub-directory of ``data/`` holding ``idf.txt``.
            method: ``'tfidf'`` selects ``get_tfidf``; any other value
                selects ``get_bm25``.
                NOTE(review): ``get_bm25`` is not defined in this class
                body; it has to be added for other values to work.
        """
        idf_loader = IDFLoader('data/%s/idf.txt' % dir_name)
        for raw in sys.stdin:
            info = json.loads(raw.decode('utf-8', errors='ignore'))
            title = info['title']
            content = info['@merge_text']
            if method == 'tfidf':
                result, words_number, title_words = self.get_tfidf(idf_loader, title, content)
            else:
                result, words_number, title_words = self.get_bm25(idf_loader, title, content)
            # An article without any word would divide by zero here.
            if words_number:
                bound = self.min_times * idf_loader.mean_idf / words_number
            else:
                bound = 0.0
            # Single-argument print calls emit the same text as the former
            # print statements under Python 2.
            print('_____words_number bound_____')
            print('%s %s' % (words_number, bound))
            print('_____tfidf_result_____')
            for term, weight in result[:20]:
                print('%s %s' % (term.encode('utf-8', errors='ignore'), weight))

經調優，最優解為：min_times=5 title_add_times=6.0 words_num=4

結果

人工抽樣評估了100個
TF-IDF召回率：0.2778
TF-IDF準確率：0.2778

BM25

算法參考：搜索中的權重度量利器: TF-IDF和BM25

第一版

TfIdf.py 增加方法：

    def get_bm25(self, idf_loader, title, content):
        """計算bm25"""
        k = 1.2  # 用來限制TF值的增長極限
        b = 0.75  # b是一個常數，它的作用是規(guī)定L對評分的影響有多大。
        # L是文檔長度與平均長度的比值
        EPSILON = 0.25  # 如果idf詞表中沒有，則平均idf*該值
        filter_title = self.p_title.sub('', title.encode('utf-8', errors='ignore'))
        title_words = self.my_cut(filter_title)
        corpus0 = title_words + self.my_cut(content)
        freq = {}
        for w in corpus0:
            freq[w] = freq.get(w, 0.0) + 1.0
        # 對標題進行加權
        for w in title_words:
            freq[w] = freq.get(w, 0.0) + self.title_add_times
        total = sum(freq.values())
        logger.info(str((k, b, total, idf_loader.mean_len)))
        for i in freq:
            tf = freq[i] / total
            idf = idf_loader.idf_freq.get(i, idf_loader.mean_idf * EPSILON)
            freq[i] = idf * ((k + 1) * tf) / (k * (1.0 - b + b * (total / idf_loader.mean_len)) + tf)
        return sorted(freq.items(), key=lambda d: d[1], reverse=True), len(corpus0), title_words

經調優，最優解為：min_times=2.5 title_add_times=6.0 words_num=4 k=1.2 b=0.75 EPSILON=0.25

結果

人工抽樣評估了100個
BM25召回率：0.2889
BM25準確率：0.3333

看完了這篇文章，相信你對python中如何使用TF-IDF和BM25提取文章關鍵詞有了一定的了解，想了解更多相關知識，歡迎關注億速云行業資訊頻道，感謝各位的閱讀！

向AI問一下細節

python中如何使用TF-IDF和BM25提取文章關鍵詞

猜你喜歡

最新資訊

相關推薦

相關標簽