IT教程 ·

文本相似性热度统计算法实现(一)-整句热度统计

Linux系统之LVS+Keepalived实现

1. 场景形貌

软件老王在上一节介绍了相似性热度统计的4个需求,依据需求要从不同维度进行统计:

(1)分组不分句热度统计(依据某列起首举行分组,然后再对形貌类列举行相似性统计);
(2)分组分句热度统计(依据某列起首举行分组,然后对形貌类列根据标点符号举行拆分,然后再对这些句举行热度统计);
(3)整句及分句热度统计;(对形貌类列/按标点符号举行分句,举行热度统计)
(4)热词统计(对描述类列进行热词统计,反映效果不大)

2. 解决计划

热词统计统计对营业没啥协助,软件老王就是用了jieba分词,已包括在其他几个需求中了,不再引见了,直接引见整句及分句热度统计,计划包括完全的excel读入,结果写入到excel及导航到明细等。

2.1 完全代码

完全代码,有须要的朋侪能够直接拿走,不想看代码引见的,能够直接拿走实行。

import jieba.posseg as pseg
import jieba.analyse
import xlwt
import openpyxl
from gensim import corpora, models, similarities
import re

# Stop-word loader
def StopWordsList(filepath):
    """Load a stop-word list from a UTF-8 text file, one word per line.

    Args:
        filepath: path to the stop-word file.
    Returns:
        list of whitespace-stripped words (blank lines become '').
    """
    # BUGFIX: use a context manager so the file handle is closed
    # (the original left the file open).
    with open(filepath, 'r', encoding='utf8') as f:
        wlst = [w.strip() for w in f.readlines()]
    return wlst

def str_to_hex(s):
    """Concatenate the lowercase hex code point of every character in *s*."""
    return ''.join(format(ord(ch), 'x') for ch in s)

# jieba tokenizer: keep content words only
def seg_sentence(sentence, stop_words):
    """Cut *sentence* with jieba POS tagging and drop noise words.

    A token is kept only when its text is not in *stop_words* and its POS
    flag is not one of the filtered categories (punctuation, conjunctions,
    particles, adverbs, prepositions, time words, locatives, pronouns).
    Returns the kept tokens as a list of strings.
    """
    filtered_flags = ('x', 'c', 'u', 'd', 'p', 't', 'uj', 'f', 'r')
    return [word for word, flag in pseg.cut(sentence)
            if word not in stop_words and flag not in filtered_flags]

if __name__ == '__main__':
    # 1 Custom jieba dictionaries holding domain-specific terms, one word
    #   per line.  The files are not shipped with the article; comment the
    #   three lines out if you do not have them.
    jieba.load_userdict("g1.txt")
    jieba.load_userdict("g2.txt")
    jieba.load_userdict("g3.txt")

    # 2 Stop-word list: these words are dropped during tokenisation.
    spPath = 'stop.txt'
    stop_words = StopWordsList(spPath)

    # 3 Excel setup: target workbook (xlwt) with a summary sheet, and the
    #   source workbook (openpyxl) whose column B holds the sentences.
    wbk = xlwt.Workbook(encoding='ascii')
    sheet = wbk.add_sheet("软件老王sheet")  # summary sheet name
    sheet.write(0, 0, '表头-软件老王1')
    sheet.write(0, 1, '表头-软件老王2')
    sheet.write(0, 2, '导航-链接到明细sheet表')
    wb = openpyxl.load_workbook('软件老王-source.xlsx')
    ws = wb.active
    col = ws['B']

    # 4 Similarity processing
    rcount = 1        # next free row in the summary sheet
    texts = []        # tokenised sentences
    orig_txt = []     # original sentences, aligned with `texts`
    key_list = []
    name_list = []
    sheet_list = []

    for cell in col:
        if cell.value is None:
            continue
        if not isinstance(cell.value, str):
            continue
        # BUGFIX: the backslashes were lost in the published code
        # ('nr' / 't'); strip CR/LF and split on tabs as intended.
        item = cell.value.strip('\n\r').split('\t')  # tab split
        string = item[0]
        if string is None or len(string) == 0:
            continue
        textstr = seg_sentence(string, stop_words)
        texts.append(textstr)
        orig_txt.append(string)

    # Build the bag-of-words corpus and the similarity index.
    dictionary = corpora.Dictionary(texts)
    feature_cnt = len(dictionary.token2id.keys())
    corpus = [dictionary.doc2bow(text) for text in texts]
    # NOTE(review): the variable is named `tfidf` but an LSI model is
    # built; presumably models.TfidfModel(corpus) was intended — confirm.
    tfidf = models.LsiModel(corpus)
    index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=feature_cnt)

    result_lt = []    # sentences already assigned to some group
    word_dict = {}    # representative sentence -> group size (heat)
    count = 0
    for keyword in orig_txt:
        count = count + 1
        print('入手下手实行,第' + str(count) + '行')
        if keyword in result_lt or keyword is None or len(keyword) == 0:
            continue
        kw_vector = dictionary.doc2bow(seg_sentence(keyword, stop_words))
        sim = index[tfidf[kw_vector]]
        result_list = []
        for i in range(len(sim)):
            if sim[i] > 0.5:  # similarity threshold for "same topic"
                if orig_txt[i] in result_lt and orig_txt[i] not in result_list:
                    continue
                result_list.append(orig_txt[i])
                result_lt.append(orig_txt[i])
        if len(result_list) > 0:
            word_dict[keyword] = len(result_list)
        if len(result_list) >= 1:
            # Detail-sheet name: first 10 chars reduced to CJK/alnum plus a
            # suffix derived from the hex form, to keep names unique.
            # BUGFIX: regex escapes restored (\u....) and the expression
            # wrapped in parentheses — the bare line continuation in the
            # published code was a syntax error.
            sname = (re.sub(u"([^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a])", "", keyword[0:10]) + '_'
                     + str(len(result_list) + len(str_to_hex(keyword))) + str_to_hex(keyword)[-5:])
            sheet_t = wbk.add_sheet(sname)  # one detail sheet per group
            for i in range(len(result_list)):
                sheet_t.write(i, 0, label=result_list[i])

    # 5 Sort the groups by heat (descending group size).
    with open("rjlw.txt", 'w', encoding='utf-8') as wf2:
        orderList = list(word_dict.values())
        orderList.sort(reverse=True)
        count = len(orderList)
        for i in range(count):
            for key in word_dict:
                if word_dict[key] == orderList[i]:
                    key_list.append(key)
                    word_dict[key] = 0  # consume so equal counts stay aligned
        wf2.truncate()  # NOTE(review): mode 'w' already truncates; likely redundant

    # 6 Write the summary rows, linking each one to its detail sheet.
    for i in range(len(key_list)):
        sheet.write(i + rcount, 0, label=key_list[i])
        sheet.write(i + rcount, 1, label=orderList[i])
        if orderList[i] >= 1:
            # Must rebuild exactly the name used when the detail sheet was
            # created above, otherwise the hyperlink dangles.
            shname = (re.sub(u"([^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a])", "", key_list[i][0:10])
                      + '_' + str(orderList[i] + len(str_to_hex(key_list[i]))) + str_to_hex(key_list[i])[-5:])
            link = 'HYPERLINK("#%s!A1";"%s")' % (shname, shname)
            sheet.write(i + rcount, 2, xlwt.Formula(link))
    rcount = rcount + len(key_list)
    key_list = []
    orderList = []
    texts = []
    orig_txt = []
    wbk.save('软件老王-target.xls')

2.2 代码申明

(1) #1 以下代码 是jieba分词的自定义辞书,软件老王这里增加的花样行业术语,花样就是文档,就一列,一个词一行就好了, 这个几个行业辞书软件老王就不上传了,可解释掉。

    jieba.load_userdict("g1.txt")
    jieba.load_userdict("g2.txt")
    jieba.load_userdict("g3.txt")

(2) #2 停用词,简朴明白就是这些词不拆分,这个文件软件老王是从网上找的通用的,也能够不必。

    spPath = 'stop.txt'
    stop_words = StopWordsList(spPath)

(3) #3 excel处置惩罚,这里新增了称号为“软件老王sheet”的sheet,表头有三个,分别为“表头-软件老王1”,“表头-软件老王2”,“导航-链接到明细sheet表”,个中“导航-链接到明细sheet表”带超链接,能够导航到明细数据。

    wbk = xlwt.Workbook(encoding='ascii')
    sheet = wbk.add_sheet("软件老王sheet")  # sheet称号
    sheet.write(0, 0, '表头-软件老王1')
    sheet.write(0, 1, '表头-软件老王2')
    sheet.write(0, 2, '导航-链接到明细sheet表')
    wb = openpyxl.load_workbook('软件老王-source.xlsx')
    ws = wb.active
    col = ws['B']

(4)# 4 相似性处置惩罚

算法原理在前文中有详细说明。

    # 4 Similarity processing (excerpt of the complete code above)
    rcount = 1        # next free row in the summary sheet
    texts = []        # tokenised sentences
    orig_txt = []     # original sentences, aligned with `texts`
    key_list = []
    name_list = []
    sheet_list = []
    for cell in col:
        if cell.value is None:
            continue
        if not isinstance(cell.value, str):
            continue
        # BUGFIX: the backslashes were lost in the published code
        # ('nr' / 't'); strip CR/LF and split on tabs as intended.
        item = cell.value.strip('\n\r').split('\t')  # tab split
        string = item[0]
        if string is None or len(string) == 0:
            continue
        textstr = seg_sentence(string, stop_words)
        texts.append(textstr)
        orig_txt.append(string)
    dictionary = corpora.Dictionary(texts)
    feature_cnt = len(dictionary.token2id.keys())
    corpus = [dictionary.doc2bow(text) for text in texts]
    # NOTE(review): named `tfidf` but builds an LSI model — confirm intent.
    tfidf = models.LsiModel(corpus)
    index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=feature_cnt)
    result_lt = []    # sentences already assigned to some group
    word_dict = {}    # representative sentence -> group size (heat)
    count = 0
    for keyword in orig_txt:
        count = count + 1
        print('入手下手实行,第' + str(count) + '行')
        if keyword in result_lt or keyword is None or len(keyword) == 0:
            continue
        kw_vector = dictionary.doc2bow(seg_sentence(keyword, stop_words))
        sim = index[tfidf[kw_vector]]
        result_list = []
        for i in range(len(sim)):
            if sim[i] > 0.5:  # similarity threshold for "same topic"
                if orig_txt[i] in result_lt and orig_txt[i] not in result_list:
                    continue
                result_list.append(orig_txt[i])
                result_lt.append(orig_txt[i])
        if len(result_list) > 0:
            word_dict[keyword] = len(result_list)
        if len(result_list) >= 1:
            # BUGFIX: regex escapes restored and expression parenthesised
            # (the bare line continuation was a syntax error).
            sname = (re.sub(u"([^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a])", "", keyword[0:10]) + '_'
                     + str(len(result_list) + len(str_to_hex(keyword))) + str_to_hex(keyword)[-5:])
            sheet_t = wbk.add_sheet(sname)  # one detail sheet per group
            for i in range(len(result_list)):
                sheet_t.write(i, 0, label=result_list[i])

(5) #5 根据热度上下排序 -软件老王

  
    # 5 Sort the groups by heat (descending group size).
    with open("rjlw.txt", 'w', encoding='utf-8') as wf2:
        orderList = list(word_dict.values())
        orderList.sort(reverse=True)
        count = len(orderList)
        for i in range(count):
            for key in word_dict:
                # Zeroing a matched key "consumes" it, so keys that share
                # the same count are appended once and stay aligned with
                # the sorted orderList.
                if word_dict[key] == orderList[i]:
                    key_list.append(key)
                    word_dict[key] = 0
        # NOTE(review): mode 'w' already truncates on open; this call is
        # likely redundant — confirm.
        wf2.truncate()

(6) #6 写入目的excel-软件老王

# 6 Write the summary rows, linking each one to its detail sheet.
for i in range(len(key_list)):
    sheet.write(i + rcount, 0, label=key_list[i])
    sheet.write(i + rcount, 1, label=orderList[i])
    if orderList[i] >= 1:
        # BUGFIX: regex escapes restored, expression parenthesised (the
        # bare line continuation was a syntax error), and the mangled
        # indentation of this excerpt repaired.
        shname = (re.sub(u"([^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a])", "", key_list[i][0:10])
                  + '_' + str(orderList[i] + len(str_to_hex(key_list[i]))) + str_to_hex(key_list[i])[-5:])
        link = 'HYPERLINK("#%s!A1";"%s")' % (shname, shname)
        sheet.write(i + rcount, 2, xlwt.Formula(link))
rcount = rcount + len(key_list)
key_list = []
orderList = []
texts = []
orig_txt = []
wbk.save('软件老王-target.xls')

2.3 结果图

(1)软件老王-source.xlsx

文本相似性热度统计算法实现(一)-整句热度统计 IT教程 第1张

(2)软件老王-target.xls

文本相似性热度统计算法实现(一)-整句热度统计 IT教程 第2张

(3)简朴申明

​ 实在数据不太轻易宣布,随便造了几个演示数据申明下结果花样。

I’m 「软件老王」,假如以为还能够的话,关注下呗,后续更新秒知!迎接讨论区、同名民众号留言交换!

spark sql 执行计划生成案例

参与评论