python3 - 使用 jieba3k 对直播平台房间标题进行分词
发布日期:2021-06-30 19:50:46 浏览次数:2 分类:技术文章

本文共 3925 字,大约阅读时间需要 13 分钟。

python3 安装jieba:

pip3 install jieba

或者,先从 PyPI(https://pypi.org/project/jieba/)下载源码包,解压后在解压目录中运行 python setup.py install

参考:jieba 项目主页 https://github.com/fxsjy/jieba

实例:

得到标签和创建mydict

import requests
from pyquery import PyQuery as pq
from db import MongoClient
from config import MY_DICT

db = MongoClient()


def get_label(url, timeout=10):
    """Scrape the hero table found at *url* and save one document per row.

    Every ``<tr>`` of the selected table that contains at least one ``<td>``
    is turned into a dict with the keys ``id``, ``hero_name``,
    ``hero_name_list`` and ``join_time`` and handed to ``db.save``.

    Args:
        url: Address of the page that holds the table.
        timeout: Seconds to wait for the server before ``requests`` gives
            up. Without a timeout a stalled connection blocks forever.
    """
    r = requests.get(url, timeout=timeout)
    r.encoding = 'utf-8'  # force the page encoding through r.encoding
    doc = pq(r.text)
    rows = doc.find(
        'body > div.body-wrapper > div.content-wrapper > div > '
        'div.main-content > table:nth-child(154) > tr'
    ).items()

    # Running counter stored as the document's 'id' (named hero_id so the
    # builtin id() is not shadowed); only data rows advance it.
    hero_id = 0
    for tr in rows:
        if not tr.find('td'):  # rows without <td> are header (<th>) rows
            continue

        # The page's table is malformed for one entry, so that document was
        # corrected by hand in the mongo shell after scraping:
        #
        #   db.getCollection('hero').update(
        #       // query
        #       {"id" : 2},
        #       // update
        #       {'$set' : {'hero_name' : '幻翎',
        #                  'hero_name_list' : ['幻翎', '洛'],
        #                  "join_time" : "2017年4月18日"}},
        #       // options
        #       {"multi" : false,   // update only one document
        #        "upsert" : false}  // do not insert when nothing matches
        #   );
        hero_id += 1
        hero_name = tr.find('td:nth-child(2)').text().strip()
        # Third column is stored as a second name for the same hero
        # (presumably an alias/title -- confirm against the page).
        second_name = tr.find('td:nth-child(3)').text().strip()
        join_time = tr.find('td:nth-child(6)').text().strip()

        db.save({
            'id': hero_id,
            'hero_name': hero_name,
            'hero_name_list': [hero_name, second_name],
            'join_time': join_time,
        })


def make_mydict():
    """Write every name yielded by ``db.get_hero_name_list()`` to MY_DICT.

    The file is (re)created in UTF-8 with one name per line.
    """
    with open(MY_DICT, mode='w', encoding='utf-8') as f:
        for name in db.get_hero_name_list():
            print(name, file=f)  # print appends the newline for us


if __name__ == '__main__':
    # Scraping is left disabled here; enable it to (re)populate the
    # collection before building the dictionary.
    # get_label('http://baike.baidu.com/item/英雄联盟/4615671#4')
    make_mydict()  # build the user dictionary file
    print('ok...')

分词器

import jieba
import jieba.analyse
from db import MongoClient
from config import MY_DICT


class Tokenizer(object):
    """Label rooms with the user-dictionary words found in their titles."""

    def __init__(self):
        """Open the database wrapper and load MY_DICT into jieba."""
        self._db = MongoClient()
        # Load our own word list so jieba keeps those names as single tokens.
        jieba.load_userdict(MY_DICT)

    def get_hero_list(self):
        """Return the lines of MY_DICT as a list of stripped strings."""
        with open(MY_DICT, mode='r', encoding='utf-8') as f:
            return [line.strip() for line in f]

    def participle(self):
        """Tokenize every room name and store the matching labels.

        For each room yielded by ``get_rooms()`` the ``r_name`` is cut with
        jieba; the distinct tokens that also appear in the hero list are
        written to that room's ``r_label`` field via ``set_label``.
        """
        hero_list = self.get_hero_list()
        print('/'.join(hero_list))

        # Build the lookup set once: membership tests against the list would
        # rescan it for every token of every room.
        hero_names = set(hero_list)

        for room in self._db.get_rooms():
            # jieba.lcut uses the default (accurate) mode.
            msg = jieba.lcut(room['r_name'])
            label_list = {w for w in msg if w in hero_names}  # de-duplicated
            self._db.set_label(query={'r_id': room['r_id']},
                               data={'$set': {'r_label': list(label_list)}})
            print(msg, label_list)


if __name__ == '__main__':
    # Run the tokenizer over all rooms.
    tokenizer = Tokenizer()
    tokenizer.participle()

db

import pymongo
from config import *


class MongoClient(object):
    """Small wrapper around ``pymongo.MongoClient`` for this project.

    The connection URL, database name and collection names come from the
    ``config`` module (MONGO_URL, MONGO_DB, MONGO_TABLE, MONGO_HERO_NAME).
    """

    def __init__(self):
        """Create the underlying pymongo client from MONGO_URL."""
        self._client = pymongo.MongoClient(MONGO_URL)

    def get_rooms(self):
        """Yield ``{'r_id': ..., 'r_name': ...}`` for every room document."""
        db = self._client[MONGO_DB]
        for room in db[MONGO_TABLE].find():  # no limit: walk the whole collection
            yield {
                'r_id': room['r_id'],
                'r_name': room['r_name'],
            }

    def set_label(self, **kwargs):
        """Update a single room document without upserting.

        Keyword Args:
            query: Filter selecting the document to update.
            data: Update document; it must use update operators such as
                ``$set``, which ``update_one`` requires.
        """
        # Collection.update() is deprecated since PyMongo 3.0 and removed in
        # PyMongo 4; update_one() is the single-document replacement.
        self._client[MONGO_DB][MONGO_TABLE].update_one(
            kwargs['query'], kwargs['data'], upsert=False)

    def save(self, msg):
        """Insert the single document *msg* into the hero-name collection.

        Failures are reported on stdout and otherwise ignored (best effort).
        """
        try:
            # Collection.insert() is deprecated since PyMongo 3.0 and removed
            # in PyMongo 4; insert_one() is the single-document replacement.
            self._client[MONGO_DB][MONGO_HERO_NAME].insert_one(msg)
        except Exception as e:
            print("e: ", e)

    def get_hero_name_list(self):
        """Yield each entry of ``hero_name_list`` from every hero document."""
        for hero_name in self._client[MONGO_DB][MONGO_HERO_NAME].find():
            for name in hero_name['hero_name_list']:
                yield name

问题:

1. 运行的文件名和import xxx 的包名重复

import jieba
jieba.cut("我来到北京清华大学")
AttributeError: 'module' object has no attribute 'cut'
不要将自己运行的脚本命名为 jieba.py:否则 import jieba 导入的是该脚本自身而不是 jieba 库,脚本里没有定义 cut,于是报 AttributeError。把脚本改成其他文件名即可。

转载地址:https://lipenglin.blog.csdn.net/article/details/72625344 如侵犯您的版权,请留言回复原文章的地址,我们会给您删除此文章,给您带来不便请您谅解!

上一篇:python3 - jieba:去停词,词性判断,计算词频
下一篇:python - scrapy 入门

发表评论

最新留言

哈哈,博客排版真的漂亮呢~
[***.90.31.176]2024年05月01日 10时28分51秒