爬虫

NLP

import jieba
import numpy as np
import pymongo

# The star import supplies the configuration constants used below:
# MONGO_URL, MONGO_DB, MONGO_TABLE, POSI_VALUE, NEGA_VALUE, EXTREME_VALUE,
# VERY_VALUE, MORE_VALUE, ISH_VALUE and REFUTE_VALUE.
from NLP.Config import *

# Database handle, created once when the module is imported.
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]


def get_comments_from_db(count=100):
    """Return the ``comment`` field of up to *count* documents in MONGO_TABLE.

    Any exception raised while querying is printed (its ``args``) and
    ``None`` is returned instead of a list, so callers must check for it.
    """
    try:
        comments_list = [msg['comment'] for msg in db[MONGO_TABLE].find().limit(count)]
        return comments_list
    except Exception as e:
        print(e.args)
        return None


def get_list_from_file(name=''):
    """Read ``./<name>.txt`` as UTF-8 and return its lines, stripped, as a list."""
    path = './{}.txt'.format(name)
    with open(path, mode='r', encoding='utf-8') as f:
        return [line.strip() for line in f]


# Word lists, each loaded from a text file in the current working directory.
stop_words = get_list_from_file(name='stopwords')
refute_words = get_list_from_file(name='refute')
nega_words = get_list_from_file(name='negative')
posi_words = get_list_from_file(name='positive')
degree_words = get_list_from_file(name='degree')

# The degree file is a flat list in which each marker line below opens a
# section; the words of a section are the lines between its marker and the
# next marker.  'last' only terminates the final section.
degree_index_list = ['extreme', 'very', 'more', 'ish', 'last']
degree_dict = {}
for first_index, second_index in zip(degree_index_list, degree_index_list[1:]):
    degree_dict[first_index] = degree_words[
        degree_words.index(first_index) + 1:degree_words.index(second_index)
    ]


def sentiment_value(**kwargs):
    """Score one comment and summarise the per-word sentiment values.

    Steps:
      1. Tokenise the comment with jieba and drop stop words.
      2. For every positive/negative word, start from POSI_VALUE/NEGA_VALUE
         and multiply by the weight of each degree or negation word found
         between the previous sentiment word and this one.
      3. Return the sum, mean and standard deviation of those values.

    Keyword Args:
        comment: the comment text to analyse (required; ``KeyError`` if absent).

    Returns:
        A dict with keys ``'sum'``, ``'avg'`` and ``'std'``, or ``None`` when
        the comment contains no word with a non-zero sentiment value.
    """
    comment = kwargs['comment']
    words = [w.strip() for w in jieba.cut(comment, cut_all=False) if w not in stop_words]
    sent_value_list = []
    # Start of the window of modifier words that belong to the next sentiment word.
    pre_index = 0
    print(words)
    # enumerate() yields the true position of each token.  Looking the position
    # up with words.index(word) returns the FIRST occurrence, which breaks the
    # modifier window whenever a sentiment word appears more than once.
    for index, word in enumerate(words):
        seg_sent_value = 0
        if word in posi_words:
            seg_sent_value += POSI_VALUE
        elif word in nega_words:
            seg_sent_value += NEGA_VALUE
        if seg_sent_value == 0:
            # Not a sentiment word (or its configured value is zero): it stays
            # in the window as a potential modifier of a later sentiment word.
            continue
        for w in words[pre_index:index]:
            if w in degree_dict['extreme']:
                seg_sent_value *= EXTREME_VALUE
            elif w in degree_dict['very']:
                seg_sent_value *= VERY_VALUE
            elif w in degree_dict['more']:
                seg_sent_value *= MORE_VALUE
            elif w in degree_dict['ish']:
                seg_sent_value *= ISH_VALUE
            elif w in refute_words:
                seg_sent_value *= REFUTE_VALUE
        pre_index = index + 1
        sent_value_list.append(seg_sent_value)
    if sent_value_list:
        arr = np.array(sent_value_list)
        print(arr)
        words_value_dict = {
            'sum': arr.sum(),
            'avg': arr.mean(),
            'std': arr.std(),
        }
        return words_value_dict
    else:
        return None


def run():
    """Print the sentiment summary of every comment fetched from the database."""
    comments = get_comments_from_db()
    if comments is None:
        # The query failed and the error was already printed by
        # get_comments_from_db(); there is nothing to iterate over.
        return
    for comment in comments:
        print(sentiment_value(comment=comment))


if __name__ == '__main__':
    run()

链接: 密码: y4xa

转载地址：https://lipenglin.blog.csdn.net/article/details/74380276 如侵犯您的版权，请留言回复原文章的地址，我们会给您删除此文章，给您带来不便请您谅解！

上一篇：python - 制作简单 ‘词云图’

下一篇：python - selenium 抓取‘楚乔传’ 评论

发表评论

关于作者

喝酒易醉，品茶养心，人生如梦，品茶悟道，何以解忧？唯有杜康！

-- 愿君每日到此一游！

爬虫

NLP

发表评论

最新留言

关于作者

推荐文章