一个下载百度贴吧指定帖子全部回复的工具(JavaScript 实现)
(.*?)<\/div>/g); var posts = []; for (var i = 0; i < rawPosts.length; i ++) { posts.push(this.purify(rawPosts[i])); } return posts; }, writeFile: function(fileName, data) { fs.appendFile((fileName || 'output') + '.txt', data, function(err) { if (err) { throw err; } }) }};var spider = new Spider(process.argv[2] || 3138733512);spider.crawl();
发布日期:2021-06-30 14:16:02
浏览次数:2
分类:技术文章
本文共 3677 字,大约阅读时间需要 12 分钟。
var http = require('http'), fs = require('fs');function Spider(postId, seeLz) { this.currentPageNum = 1; this.numOfPagesToCrawl = 0; this.baseUrl = 'http://tieba.baidu.com/p/' + postId + '?see_lz=' + (seeLz || 1) + '&pn='; this.data = '';}Spider.prototype = { constructor: Spider, crawl: function(pageNum) { var self = this; var url = this.baseUrl + (pageNum || this.currentPageNum); http.request(url, function(res){ res.setEncoding("utf8"); // response returns chunks res.on('data', function(chunk){ this.data += chunk; }); res.on('end', function(){ var that = this; // because of the nature of asynchronous, can't simply return the data here, but bring in Processor object for data processing var processor = new Processor(this.data); var title = processor.getTitle(); this.numOfPagesToCrawl = processor.getPageCount(); console.log(title); console.log('===================='); console.log(''); var posts = processor.getPosts(); var index = 0; var interval = setInterval(function() { console.log(posts[index]); console.log("> posts left on the current page: " + (posts.length - index)); console.log('~~~~~~~~~~~~~~~~~~~~'); processor.writeFile(title, posts[index] + '\n\n'); index++; if (index >= posts.length) { console.log("end of this page"); this.data = ''; self.currentPageNum++; clearInterval(interval); if (self.currentPageNum < that.numOfPagesToCrawl) { console.log("get ready to the next page"); self.crawl(self.currentPageNum); } else { console.log("that's all there's to it..."); } } }, 500); }); }).end(); } };function Processor(data) { this.data = data;}Processor.prototype = { constructor: Processor, // to extract page content that only exists in a single location matchSingle: function(regex) { var matched = this.data.match(regex); var result = !!matched? matched[1] : ''; return result; }, // to extract page contents that exist in multiple locations matchMulti: function(regex) { var matched = this.data.match(regex); var results = !!matched? 
matched : []; return results; }, // to remove rubbish contents purify: function(str) { var htmlTags = /<.*?>/g; var spaces = /\s+/g; var purified = str.replace(htmlTags, '') .replace(spaces, ''); return purified; }, // to extract total page counts getPageCount: function() { var pageCount = this.matchSingle(/
转载地址:https://jerry.blog.csdn.net/article/details/104892014 如侵犯您的版权,请留言回复原文章的地址,我们会给您删除此文章,给您带来不便请您谅解!
发表评论
最新留言
表示我来过!
[***.240.166.169]2024年04月22日 17时32分34秒
关于作者
喝酒易醉,品茶养心,人生如梦,品茶悟道,何以解忧?唯有杜康!
-- 愿君每日到此一游!
推荐文章
openSession和getCurrentSession
2019-05-01
Spring事务传播行为和隔离机制
2019-05-01
Session的策略
2019-05-01
虚拟机类加载机制
2019-05-01
类加载器和双亲委派机制
2019-05-01
虚拟机字节码执行引擎
2019-05-01
方法区
2019-05-01
Servlet的单例实现多线程
2019-05-01
What is java语法糖
2019-05-01
深入理解Javac编译器
2019-05-01
Servlet的认识
2019-05-01
HotSpot虚拟机即时编译器4大问题解决-即时编译器的学习
2019-05-01
使用json web token
2019-05-01
解密ThreadLocal
2019-05-01
redis 数据类型详解 以及 redis适用场景场合
2019-05-01
SSE(Server-Sent Events):服务器推送数据的新方式
2019-05-01
REST 推送实现--Jersey SSE
2019-05-01
BlockingQueue解析
2019-05-01
Java并发包中的同步队列SynchronousQueue实现原理
2019-05-01
jwt(json-web-token)在rest中的实现--jersey
2019-05-01