问题导读：

抓取‘君不见’性福模块文章，列表页文章图片

URL 队列由 1 个进程负责填充（抓取列表页），文章抓取和图片下载各使用 10 个进程。

解决方案：

#!/usr/bin/env python
# coding=utf-8
"""Multi-process crawler for the 'xingfu' section of www.junbujian.cc.

One producer process walks the list pages and fills two queues (article
links and image links); ten worker processes per queue then download the
articles and the images.

The module runs on Python 2 and Python 3: ``print`` is always called with a
single argument (valid in both), and the urllib2 / Queue names are imported
through small compatibility shims.
"""
import multiprocessing
import os
import re
import time
import uuid

try:
    import urllib2  # Python 2
except ImportError:
    import urllib.request as urllib2  # Python 3: same Request/urlopen API

try:
    from Queue import Empty  # Python 2
except ImportError:
    from queue import Empty  # Python 3


class Spider(object):
    """Crawl list pages, then fetch the queued articles and images in parallel."""

    # Seconds a worker waits on an idle queue before it decides it is done.
    QUEUE_TIMEOUT = 2

    # NOTE(review): the three patterns below are reproduced exactly as they
    # were published.  The two list-page patterns are empty and the article
    # pattern contains only whitespace around its group -- this looks like
    # HTML markup that was stripped when the snippet was published.  Restore
    # the real patterns before relying on the crawler; an empty pattern only
    # yields empty matches, which getUrls() skips.
    ARTICLE_URL_PATTERN = re.compile(r'')
    IMG_URL_PATTERN = re.compile(r'')
    # re.S makes '.' match every character, newlines included.
    ARTICLE_BODY_PATTERN = re.compile(r'\n    \n     (.*?)\n    \n', re.S)

    def __init__(self):
        self.counter = 0
        # Serialises console output between the worker processes.
        self.lock = multiprocessing.Lock()
        # Article hyperlinks found on the list pages.
        self.queue = multiprocessing.Queue()
        # Image links found on the list pages.
        self.imgs = multiprocessing.Queue()
        self.filename = './data/' + str(uuid.uuid1()) + '.txt'
        self.headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0'}

    def _fetch(self, url):
        """Return the raw body of *url*, requested with the spider's headers."""
        request = urllib2.Request(url, headers=self.headers)
        response = urllib2.urlopen(request)
        try:
            return response.read()
        finally:
            response.close()

    @staticmethod
    def _ensure_dir(path):
        """Create *path* if needed; tolerate another worker creating it first."""
        try:
            os.makedirs(path)
        except OSError:
            if not os.path.isdir(path):
                raise

    def _drain(self, work_queue):
        """Yield items from *work_queue* until it stays empty for QUEUE_TIMEOUT."""
        # Checking empty() before get() is racy with several consumers: another
        # worker can take the last item in between.  Blocking with a timeout
        # and stopping on Empty is the reliable way to end the loop.
        while True:
            try:
                yield work_queue.get(timeout=self.QUEUE_TIMEOUT)
            except Empty:
                return

    def getUrls(self):
        """Scan list pages 1-50 and queue every article and image link found."""
        for page in range(1, 51):
            url = 'http://www.junbujian.cc/xingfu/' + str(page) + '.html'
            try:
                content = self._fetch(url).decode('utf-8')
            except Exception:
                # Best effort: report the page and move on to the next one.
                print('error:' + url)
                continue
            for a_url in self.ARTICLE_URL_PATTERN.findall(content):
                if a_url:  # an empty match can never be fetched
                    self.queue.put(a_url, timeout=2)
            for i_url in self.IMG_URL_PATTERN.findall(content):
                if i_url:
                    self.imgs.put(i_url, timeout=2)

    def getImg(self):
        """Download queued images into ./img/ until the image queue runs dry."""
        for img_url in self._drain(self.imgs):
            try:
                # Download before creating the file so that a failed request
                # does not leave an empty .jpg behind.
                data = self._fetch(img_url)
                self._ensure_dir('./img')
                img_name = './img/' + str(uuid.uuid1()) + '.jpg'
                with open(img_name, 'wb') as f:
                    f.write(data)
            except Exception:
                print('error:' + img_url)
                continue
            with self.lock:  # released even if printing fails
                print(' '.join((multiprocessing.current_process().name, ' ', img_name, ' 已保存...')))

    def getArticle(self):
        """Fetch queued articles and store each extracted body under ./data/.

        URLs whose page cannot be fetched, or whose body pattern does not
        match, are appended to ./data/error.txt instead.
        """
        for art_url in self._drain(self.queue):
            try:
                content = self._fetch(art_url).decode('utf-8')
            except Exception:
                print(' '.join(('error:', art_url)))
                content = ''
            article = self.ARTICLE_BODY_PATTERN.findall(content)
            self._ensure_dir('./data')
            if article:
                header = multiprocessing.current_process().name + '\n'
                payload = header.encode('utf-8') + article[0].encode('utf-8')
                # Only create the .html file once there is something to write.
                with open('./data/' + str(uuid.uuid1()) + '.html', 'wb') as f:
                    f.write(payload)
            else:
                with open('./data/error.txt', 'a') as e:
                    e.write('url:' + art_url + '\n')
            print('...')

    def run(self):
        """Start one URL producer, then ten image and ten article workers."""
        urls_proc = multiprocessing.Process(target=self.getUrls)
        urls_proc.daemon = True
        urls_proc.start()
        print('starting urls_proc...')
        # Presumably a head start for the producer so the queues are not
        # empty when the workers begin.
        time.sleep(20)

        imgs_proc_list = []
        art_proc_list = []
        for i in range(10):
            imgs_proc = multiprocessing.Process(target=self.getImg)
            imgs_proc_list.append(imgs_proc)
            imgs_proc.daemon = True
            imgs_proc.start()
            print(' '.join(('starting proc', str(i))))
        for i in range(10):
            art_proc = multiprocessing.Process(target=self.getArticle)
            art_proc_list.append(art_proc)
            art_proc.daemon = True
            art_proc.start()
            print(' '.join(('staring proc_a', str(i))))

        # Every process is a daemon, so anything still running after these
        # bounded joins is terminated when the main process exits.
        urls_proc.join(15)
        for proc in imgs_proc_list:
            proc.join(15)
        for proc in art_proc_list:
            proc.join(15)
        print('end...')


if __name__ == '__main__':
    spider = Spider()
    spider.run()

转载地址：https://lipenglin.blog.csdn.net/article/details/53328455 如侵犯您的版权，请留言回复原文章的地址，我们会给您删除此文章，给您带来不便请您谅解！

上一篇：Python - 静态IP池

下一篇：python －多进程

发表评论

关于作者

喝酒易醉，品茶养心，人生如梦，品茶悟道，何以解忧？唯有杜康！

-- 愿君每日到此一游！

问题导读：

解决方案：

发表评论

最新留言

关于作者

推荐文章