Python3--爬取海词信息
发布日期:2021-07-01 04:21:15
浏览次数:2
分类:技术文章
本文共 4630 字,大约阅读时间需要 15 分钟。
上代码:
#!/usr/bin/python3
"""Scrape English-name records from ename.dict.cn with a pool of threads.

Names are read from a '|'-delimited CSV file, each name's page is fetched
through a randomly chosen proxy, and the extracted fields are appended to
'haici_infor.csv' as one '|'-delimited row per name.
"""
import csv
import queue
import random
import threading
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

# Set to 1 by the main thread once the queue is drained; workers poll it.
exitFlag = 0
# Serialises appends to the output CSV across worker threads.
queueLock = threading.Lock()
# Shared work queue of names still to be scraped.
workQueue = queue.Queue(100000)

# One entry per download attempt:
# (seconds to sleep before the attempt, request timeout, notice to print).
# A timeout of None means "wait indefinitely", as the original last attempt did.
_RETRY_SCHEDULE = (
    (1, 20, None),
    (10, 40, "重新运行"),
    (15, None, "第二次重新运行"),
)


def getNames(csvfile):
    """Read the 'EnName' column from a '|'-delimited CSV file using pandas."""
    # TODO: the encoding of the input file has not been considered yet.
    data = pd.read_csv(csvfile, delimiter='|')
    return data['EnName']


def get_ip_list():
    """Return the proxy addresses listed in 'ip.txt', one per line.

    Surrounding whitespace is stripped and blank lines are skipped so that an
    empty string can never be picked as a proxy.
    """
    with open('ip.txt', 'r') as f:
        return [line.strip() for line in f if line.strip()]


def get_random_ip(ip_list):
    """Pick a random proxy from ip_list and return a requests proxies dict.

    Raises:
        IndexError: if ip_list is empty.
    """
    proxy_ip = random.choice(ip_list).strip('\n')
    # requests selects the proxy by the scheme of the target URL. The scraped
    # URLs are plain http, so an 'https'-only mapping would never be applied.
    return {'http': proxy_ip, 'https': proxy_ip}


def write_file(filePath, row):
    """Append row to the '|'-delimited CSV file at filePath."""
    with open(filePath, 'a+', encoding='utf-8', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter='|', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(row)


def get_content(url, ip_list):
    """Download url through a random proxy and return its 'mbox' sections.

    Up to three attempts are made, each with a fresh proxy and User-Agent and
    a longer pause before it. The error from the last attempt is propagated.

    Returns:
        A (status_code, content) tuple where content is the list of
        <div class="mbox"> elements found in the page.
    """
    last_attempt = len(_RETRY_SCHEDULE) - 1
    for attempt, (delay, timeout, notice) in enumerate(_RETRY_SCHEDULE):
        if notice:
            print(notice)
        time.sleep(delay)
        try:
            proxies = get_random_ip(ip_list)
            headers = {'User-Agent': str(UserAgent().random)}
            req = requests.get(url=url, proxies=proxies, headers=headers,
                               timeout=timeout)
            break
        except Exception:
            # Proxy selection, User-Agent generation and the request itself
            # are all retried. Unlike a bare "except:", KeyboardInterrupt and
            # SystemExit are not swallowed.
            if attempt == last_attempt:
                raise
    req.encoding = 'utf-8'
    soup = BeautifulSoup(req.text, 'lxml')
    content = soup.find_all('div', class_='mbox')
    return req.status_code, content


def _span_text(spans, index):
    """Return the text of spans[index], or '' when that span is absent."""
    return spans[index].get_text() if index < len(spans) else ''


def get_infor_header(content):
    """Extract the English name, Chinese name, gender, source and meaning.

    Returns:
        [EnName, CnName, Gender, Source, Meaning]. Any field whose <span>
        (or, for gender, whose <em title=...>) is missing is returned as ''.
    """
    spans = content.find_all('span')
    en_name = _span_text(spans, 0)
    cn_name = _span_text(spans, 1)
    meaning = _span_text(spans, 2)
    source = _span_text(spans, 3)
    gender = ''
    if len(spans) > 4 and spans[4].em is not None:
        gender = spans[4].em.get('title', '')
    # Column order of the output row: EnName|CnName|Gender|Source|Meaning
    return [en_name, cn_name, gender, source, meaning]


def get_infor_celebrity(content):
    """Return a one-element list with the celebrities sharing the name.

    The text of every <li> is joined with '@'; empty entries are skipped.
    """
    texts = (item.get_text() for item in content.find_all('li'))
    return ['@'.join(text for text in texts if text)]


class myThread(threading.Thread):
    """Worker thread that scrapes names taken from a shared queue."""

    def __init__(self, threadID, name, q, ip_list):
        super().__init__()
        self.threadID = threadID
        self.name = name
        self.q = q
        self.ip_list = ip_list

    def run(self):
        print("开启线程:" + self.name)
        # Use the proxy list given to this thread, not a module-level global.
        process_data(self.name, self.q, self.ip_list)
        print("退出线程:" + self.name)


def process_data(threadName, q, ip_list):
    """Scrape names from q until the module-level exitFlag is set.

    Each successfully fetched page becomes one row in 'haici_infor.csv'.
    A failure on one name is reported and skipped, so the worker keeps
    draining the queue instead of dying.
    """
    while not exitFlag:
        try:
            # queue.Queue is thread-safe, so no external lock is needed here.
            data = q.get_nowait()
        except queue.Empty:
            time.sleep(1)
            continue
        print("%s processing %s" % (threadName, data))
        url = 'http://ename.dict.cn/{}'.format(data)
        try:
            status_code, content = get_content(url, ip_list)
            if status_code == 200 and content:
                # Basic information: names, gender, source and meaning.
                list_header = get_infor_header(content[0])
                # Celebrity information; kept as an empty column when the
                # page has no second section, so every row has equal width.
                if len(content) > 1:
                    list_celebrity = get_infor_celebrity(content[1])
                else:
                    list_celebrity = ['']
                with queueLock:
                    write_file('haici_infor.csv', list_header + list_celebrity)
        except Exception as exc:
            print("%s failed on %s: %r" % (threadName, data, exc))
        time.sleep(1)


def main():
    """Start the workers, feed them every name and wait until they finish."""
    global exitFlag
    threadList = ["Thread-{}".format(i) for i in range(1, 11)]
    nameList = getNames('A-Z.csv')
    ip_list = get_ip_list()
    threads = []

    # Create and start the worker threads.
    for threadID, tName in enumerate(threadList, start=1):
        thread = myThread(threadID, tName, workQueue, ip_list)
        thread.start()
        threads.append(thread)

    # Fill the queue. Workers consume while it is being filled, so a name
    # list longer than the queue capacity cannot block put() forever.
    for word in nameList:
        workQueue.put(word)

    # Wait for the queue to drain without spinning a CPU core.
    while not workQueue.empty():
        time.sleep(0.5)

    # Tell the workers it is time to exit, then wait for all of them.
    exitFlag = 1
    for t in threads:
        t.join()
    print("退出主线程")


if __name__ == "__main__":
    main()
转载地址:https://mtyjkh.blog.csdn.net/article/details/80009804 如侵犯您的版权,请留言回复原文章的地址,我们会给您删除此文章,给您带来不便请您谅解!
发表评论
最新留言
做的很好,不错不错
[***.243.131.199]2024年05月07日 10时41分02秒
关于作者
喝酒易醉,品茶养心,人生如梦,品茶悟道,何以解忧?唯有杜康!
-- 愿君每日到此一游!
推荐文章
NVMe技术架构深度分析
2019-05-02
技术爆炸时代如何做技术的掌控者?
2019-05-02
机柜服务器如何选择,有哪些学问?
2019-05-02
Ceph存储系统Scrub机制分析
2019-05-02
OpenStack重组,敢问未来路在何方?
2019-05-02
CTO,是怎样炼成的?
2019-05-02
选择GPU服务器的基本原则
2019-05-02
关于数据中台系统,需要了解哪些技术?
2019-05-02
全面分析HDFS基本技术原理
2019-05-02
详解以太网介质技术发展史!
2019-05-02
详解“硬核”虚拟化技术SR-IOV原理
2019-05-02
SAP HANA解决方案设计10问详解
2019-05-02
详解内存运算架构、挑战和趋势
2019-05-02
Lightbits能否让NVMe/TCP新标准旗开得胜?
2019-05-02
数据中台,何为正解?!
2019-05-02
架构师进阶必看!架构师的工作都干些什么?
2019-05-02
详解RDMA架构和技术原理
2019-05-02
Virtio技术架构简明分析
2019-05-02
浅谈数据库高可用性(HA)技术
2019-05-02
许式伟的架构课
2019-05-02