python3 - 使用 jieba3k 对直播平台房间标题进行分词
发布日期:2021-06-30 19:50:46
浏览次数:2
分类:技术文章
本文共 3925 字,大约阅读时间需要 13 分钟。
python3 安装jieba:
pip3 install jieba;或者先下载 jieba 源码包,解压后运行 python setup.py install。参考:jieba 项目主页的安装说明。
实例:
得到标签和创建mydict
"""Scrape the hero table from a wiki page into MongoDB and build the jieba user dictionary.

Workflow: run ``get_label`` once to store the heroes, then ``make_mydict`` to
dump every stored hero name into the file named by ``MY_DICT``.
"""
import requests
from pyquery import PyQuery as pq

from db import MongoClient
from config import MY_DICT

db = MongoClient()


def get_label(url, timeout=10):
    """Fetch ``url``, parse the hero table and save one document per data row.

    Args:
        url: Address of the page that contains the hero table.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block forever on a stalled connection.

    Each saved document has the keys ``id`` (1-based running number of the
    data row), ``hero_name``, ``hero_name_list`` and ``join_time``.
    """
    r = requests.get(url, timeout=timeout)
    r.encoding = 'utf-8'  # force the page encoding before reading r.text
    doc = pq(r.text)
    rows = doc.find('body > div.body-wrapper > div.content-wrapper > div > div.main-content > table:nth-child(154) > tr').items()

    # NOTE: the table on the page is known to be malformed for one row; that
    # record was corrected by hand afterwards in the mongo shell:
    #   db.getCollection('hero').update(
    #       { "id" : 2 },
    #       { '$set' : {'hero_name' : '幻翎', 'hero_name_list' : ['幻翎', '洛'],
    #                   "join_time" : "2017年4月18日"} },
    #       { "multi" : false, "upsert" : false }
    #   );
    hero_id = 0
    for tr in rows:
        if not tr.find('td'):
            # Rows without any <td> (header rows made of <th>) are skipped.
            continue
        hero_id += 1
        hero_name = tr.find('td:nth-child(2)').text().strip()
        # Column 3 is stored as a second name for the same hero
        # (presumably an alias/short name -- confirm against the page).
        second_name = tr.find('td:nth-child(3)').text().strip()
        join_time = tr.find('td:nth-child(6)').text().strip()
        msg = {
            'id': hero_id,
            'hero_name': hero_name,
            'hero_name_list': [hero_name, second_name],
            'join_time': join_time,
        }
        db.save(msg)


def make_mydict():
    """Write every name yielded by ``db.get_hero_name_list()`` to ``MY_DICT``, one per line."""
    with open(MY_DICT, mode='w', encoding='utf-8') as f:
        for name in db.get_hero_name_list():
            print(name, file=f)  # print() appends the newline


if __name__ == '__main__':
    # Step 1 (uncomment to (re)scrape the heroes into the database):
    # get_label('http://baike.baidu.com/item/英雄联盟/4615671#4')
    make_mydict()  # step 2: build the dictionary file
    print('ok...')
分词器
"""Label live-stream rooms with the hero names found in their titles."""
import jieba
import jieba.analyse

from db import MongoClient
from config import MY_DICT


class Tokenizer(object):
    """Segments room titles with jieba and stores the matching hero names."""

    def __init__(self):
        self._db = MongoClient()
        # Load the custom dictionary so jieba keeps hero names as single tokens.
        jieba.load_userdict(MY_DICT)

    def get_hero_list(self):
        """Return the stripped lines of the ``MY_DICT`` file as a list."""
        with open(MY_DICT, mode='r', encoding='utf-8') as f:
            return [hero.strip() for hero in f]

    def participle(self):
        """Segment every room title and save the hero names it contains.

        For each room yielded by the database wrapper, the title (``r_name``)
        is cut with jieba's default mode and the tokens that are hero names
        are written back to the room's ``r_label`` field (duplicates removed).
        """
        hero_list = self.get_hero_list()
        print('/'.join(hero_list))
        # Build the lookup set once: membership tests against the list would
        # rescan the whole dictionary for every token of every title.
        hero_names = set(hero_list)
        for room in self._db.get_rooms():
            msg = jieba.lcut(room['r_name'])  # default (accurate) mode
            label_list = {w for w in msg if w in hero_names}  # set removes duplicates
            self._db.set_label(
                query={'r_id': room['r_id']},
                data={'$set': {'r_label': list(label_list)}},
            )
            print(msg, label_list)


if __name__ == '__main__':
    tokenizer = Tokenizer()
    tokenizer.participle()
db
"""Thin MongoDB wrapper used by the scraper and the tokenizer."""
import pymongo
from pymongo.errors import PyMongoError

from config import MONGO_DB, MONGO_HERO_NAME, MONGO_TABLE, MONGO_URL


class MongoClient(object):
    """Wraps a ``pymongo.MongoClient`` connected to ``MONGO_URL``."""

    def __init__(self):
        self._client = pymongo.MongoClient(MONGO_URL)

    def get_rooms(self):
        """Yield ``{'r_id': ..., 'r_name': ...}`` for every document in the room collection."""
        db = self._client[MONGO_DB]
        for room in db[MONGO_TABLE].find():  # no limit: iterate all rooms
            yield {
                'r_id': room['r_id'],
                'r_name': room['r_name'],
            }

    def set_label(self, **kwargs):
        """Update a single room document, never inserting a new one.

        Keyword Args:
            query: Filter selecting the document to change.
            data: Either an update document made of ``$`` operators
                (e.g. ``{'$set': {...}}``) or a full replacement document.

        Raises:
            KeyError: If ``query`` or ``data`` is missing.
        """
        query = kwargs['query']
        data = kwargs['data']
        collection = self._client[MONGO_DB][MONGO_TABLE]
        # ``Collection.update`` is deprecated in PyMongo 3 and removed in
        # PyMongo 4. It accepted both operator documents and replacement
        # documents; the modern API splits those into two calls.
        if any(key.startswith('$') for key in data):
            collection.update_one(query, data, upsert=False)
        else:
            collection.replace_one(query, data, upsert=False)

    def save(self, msg):
        """Insert ``msg`` (one document, or a list of documents) into the hero collection.

        Database errors are reported on stdout and otherwise ignored, so one
        failed insert does not abort a whole scraping run.
        """
        collection = self._client[MONGO_DB][MONGO_HERO_NAME]
        try:
            # ``Collection.insert`` is deprecated in PyMongo 3 and removed in
            # PyMongo 4; it took either a single document or a list of them.
            if isinstance(msg, list):
                collection.insert_many(msg)
            else:
                collection.insert_one(msg)
        except PyMongoError as e:
            print("e: ", e)

    def get_hero_name_list(self):
        """Yield every entry of ``hero_name_list`` from every hero document."""
        for hero_name in self._client[MONGO_DB][MONGO_HERO_NAME].find():
            for name in hero_name['hero_name_list']:
                yield name
问题:
1. 运行的文件名和import xxx 的包名重复
# Minimal reproduction: when this script itself is saved as jieba.py,
# "import jieba" resolves to the script instead of the installed library,
# so the attribute lookup below fails.
import jieba

jieba.cut("我来到北京清华大学")
AttributeError: 'module' object has no attribute 'cut'
不要将运行的脚本命名为 jieba.py:否则 import jieba 导入的是脚本自身而不是 jieba 库,自然找不到 cut 属性。
转载地址:https://lipenglin.blog.csdn.net/article/details/72625344 如侵犯您的版权,请留言回复原文章的地址,我们会给您删除此文章,给您带来不便请您谅解!
发表评论
最新留言
哈哈,博客排版真的漂亮呢~
[***.90.31.176]2024年05月01日 10时28分51秒
关于作者
喝酒易醉,品茶养心,人生如梦,品茶悟道,何以解忧?唯有杜康!
-- 愿君每日到此一游!
推荐文章
JAVA学习笔记10 - 继承
2019-04-30
JAVA学习笔记11 - 接口interface
2019-04-30
JAVA学习笔记12 - 包package
2019-04-30
Android 开发学习笔记 00 - Getting Started
2019-04-30
【学习笔记】Android Activity
2021-07-03
【学习笔记】Android Fragments
2021-07-03
Android使用Retrofit_00_Getting Started
2021-07-03
Android使用Retrofit_01_OAuth2 + GitHub
2021-07-03
Django + REST学习笔记
2021-07-03
【转载】将Ubuntu16.04 中gedit在仅显示一个文件时显示文件名tab
2021-07-03
fstream 对象多次使用时注意clear
2021-07-03
调试 LenaCV 3D Camera (Linux)
2021-07-03
OpenCV杂记 - Mat in C++
2021-07-03
lnmp部署
2021-07-03
location区段
2021-07-03
nginx访问控制、基于用户认证、https配置
2021-07-03