Java - OnlyLady Spider (HttpClient 4.5)
发布日期:2021-06-30 19:51:24
浏览次数:3
分类:技术文章
本文共 9120 字,大约阅读时间需要 30 分钟。
pom
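<project>
  <modelVersion>4.0.0</modelVersion>
  <groupId>peerslee</groupId>
  <artifactId>Onlylady</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <packaging>jar</packaging>
  <name>Onlylady</name>
  <url>http://maven.apache.org</url>
  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>
  <dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>3.8.1</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.httpcomponents</groupId>
      <artifactId>httpclient</artifactId>
      <version>4.5</version>
    </dependency>
    <dependency>
      <groupId>org.jsoup</groupId>
      <artifactId>jsoup</artifactId>
      <version>1.10.2</version>
    </dependency>
    <dependency>
      <groupId>org.mongodb</groupId>
      <artifactId>mongo-java-driver</artifactId>
      <version>3.4.2</version>
    </dependency>
  </dependencies>
</project>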
Spider
package peerslee.Onlylady;import java.text.SimpleDateFormat;import java.util.ArrayList;import java.util.Date;import java.util.HashMap;import java.util.List;import java.util.Map;import java.util.regex.Matcher;import java.util.regex.Pattern;import org.apache.http.HttpEntity;import org.apache.http.HttpResponse;import org.apache.http.client.HttpClient;import org.apache.http.client.config.RequestConfig;import org.apache.http.client.methods.HttpGet;import org.apache.http.impl.client.HttpClients;import org.apache.http.util.EntityUtils;import org.jsoup.Jsoup;import org.jsoup.nodes.Document;import org.jsoup.nodes.Element;import org.jsoup.select.Elements;public class Spider { private static HttpClient client = HttpClients.createDefault(); private static MongoUtil util = new MongoUtil(); Pattern pattern = null; Matcher matcher = null; Integer Max_Value = 500; //每个帖子,最多抓500页评论,最大不要过3000条,单条数据插入容量溢出 // util,返回jsoup document Document get_doc(String url) { RequestConfig requestConfig = RequestConfig.custom() .setConnectTimeout(5000) //设置连接超时时间,单位毫秒 .setConnectionRequestTimeout(5000) // 设置从connect Manager获取Connection 超时时间,单位毫秒 .setSocketTimeout(5000) //请求获取数据的超时时间,单位毫秒 .build(); HttpGet get = new HttpGet(url); get.setConfig(requestConfig); try { HttpResponse response = client.execute(get); HttpEntity entity = response.getEntity(); String html = EntityUtils.toString(entity); return Jsoup.parse(html); } catch (Exception e) { System.out.println("------超时------"); return get_doc(url); //重新抓取 } } // 评论 Listutil
package peerslee.Onlylady;import com.mongodb.MongoClient;import com.mongodb.client.MongoCollection;import com.mongodb.client.MongoDatabase;import org.bson.Document;import java.util.ArrayList;import java.util.List;import java.util.Map;/** * Created by PeersLee on 2017/1/30. */public class MongoUtil { private MongoClient client = null; private String dbName = null; public MongoUtil() { dbName = "Onlylady"; this.client = new MongoClient("127.0.0.1", 27017); } //插入(去重复) public void insertCol(String colName, Mapmsg) { try { MongoDatabase db = client.getDatabase(dbName); MongoCollection col = db.getCollection(colName); Document doc = new Document(); for(Map.Entry entry : msg.entrySet()) { doc.append(entry.getKey(), entry.getValue()); } List docs = new ArrayList (); docs.add(doc); col.insertMany(docs); System.out.println("Doc insert" + colName + " ok..."); } catch (Exception e) { System.out.println(e.getClass().getName() + ":" + e.getMessage()); } }}
代理
// Configure the proxy: every request made by this client is routed through ip:port
// (both values are supplied by the surrounding code).
HttpHost proxy = new HttpHost(ip, port);
DefaultProxyRoutePlanner routePlanner = new DefaultProxyRoutePlanner(proxy);

// Build the proxy-routed client directly via HttpClientBuilder instead of first
// creating a default client that would only be thrown away unused.
// setConnectionTimeToLive(2, SECONDS) sets the maximum time to live of persistent connections.
HttpClient client = HttpClients.custom()
        .setRoutePlanner(routePlanner)
        .setConnectionTimeToLive(2, TimeUnit.SECONDS)
        .build();
转载地址:https://lipenglin.blog.csdn.net/article/details/78222090 如侵犯您的版权,请留言回复原文章的地址,我们会给您删除此文章,给您带来不便请您谅解!
发表评论
最新留言
很好
[***.229.124.182]2024年04月29日 15时55分03秒
关于作者
喝酒易醉,品茶养心,人生如梦,品茶悟道,何以解忧?唯有杜康!
-- 愿君每日到此一游!
推荐文章
Python的__future__模块
2019-04-30
计算机视觉中的cost-volume的概念具体指什么(代价体积)
2019-04-30
启发函数heuristic 与 A*
2019-04-30
Image Pyramid(图像金字塔)
2019-04-30
Oracle 作业记录
2019-04-30
putty连接AWS配置(multimedia project)
2019-04-30
Hourglass Network 沙漏网络 (pose estimation姿态估计)
2019-04-30
OpenCV实战(二)——答题卡识别判卷
2019-04-30
目标检测神经网络的发展历程(52 个目标检测模型)
2019-04-30
Boundary loss 损失函数
2019-04-30
tensorflow使用tensorboard进行可视化
2019-04-30
凸优化 convex optimization
2019-04-30
数据库索引 & 为什么要对数据库建立索引 / 数据库建立索引为什么会加快查询速度
2019-04-30
IEEE与APA引用格式
2019-04-30
research gap
2019-04-30
pytorch训练cifar10数据集查看各个种类图片的准确率
2019-04-30