Java - OnlyLady Spider (HttpClient 4.5)


pom

<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>peerslee</groupId>
    <artifactId>Onlylady</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <packaging>jar</packaging>

    <name>Onlylady</name>
    <url>http://maven.apache.org</url>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <dependencies>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>3.8.1</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5</version>
        </dependency>
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.10.2</version>
        </dependency>
        <dependency>
            <groupId>org.mongodb</groupId>
            <artifactId>mongo-java-driver</artifactId>
            <version>3.4.2</version>
        </dependency>
    </dependencies>
</project>
Spider

package peerslee.Onlylady;

import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class Spider {

    private static HttpClient client = HttpClients.createDefault();
    private static MongoUtil util = new MongoUtil();

    Pattern pattern = null;
    Matcher matcher = null;
    Integer Max_Value = 500; // crawl at most 500 comment pages per topic; stay under roughly 3000 comments, or the single-document insert overflows

    // Helper: fetch a URL and return a Jsoup Document
    Document get_doc(String url) {
        RequestConfig requestConfig = RequestConfig.custom()
                .setConnectTimeout(5000)           // connect timeout, in milliseconds
                .setConnectionRequestTimeout(5000) // timeout for getting a connection from the connection manager, in milliseconds
                .setSocketTimeout(5000)            // socket (read) timeout, in milliseconds
                .build();
        HttpGet get = new HttpGet(url);
        get.setConfig(requestConfig);
        try {
            HttpResponse response = client.execute(get);
            HttpEntity entity = response.getEntity();
            String html = EntityUtils.toString(entity);
            return Jsoup.parse(html);
        } catch (Exception e) {
            System.out.println("------timeout------");
            return get_doc(url); // re-fetch
        }
    }

    // Comments on one page
    List<Map<String, String>> crawl_comment(String url) {
        List<Map<String, String>> list = new ArrayList<Map<String, String>>();
        Document doc = get_doc(url);
        Elements elements = doc.select("#postlist > div");
        for (int i = 0; i < elements.size() - 1; i++) { // the last div is not a comment block
            Map<String, String> map = new HashMap<String, String>();
            Element element = elements.get(i);

            String comment_id = element.attr("id");
            pattern = Pattern.compile("\\d+");
            matcher = pattern.matcher(comment_id);
            matcher.find();
            comment_id = matcher.group(0);
            map.put("comment_id", comment_id);

            String comment_author = element.select("div.authi > a.xw1").text();
            map.put("comment_author", comment_author);

            String comment_content = element.select("td.t_f").text();
            map.put("comment_content", comment_content);

            String comment_time = doc.select("#authorposton" + comment_id).text()
                    .replace("发表于", "").trim();
            map.put("comment_time", comment_time);

            list.add(map);
            // System.out.println(comment_id + " " + comment_author + " " + comment_content + " " + comment_time);
        }
        System.out.println(list);
        return list;
    }

    // One topic (post)
    void crawl_info(String url) {
        Document doc = get_doc(url);

        // info
        Map<String, Object> info_map = new HashMap<String, Object>();

        // topic title
        String title = doc.select("div.ts > a > h1").text();
        info_map.put("title", title);

        // topic id
        pattern = Pattern.compile("thread-(\\d+)-\\d+-\\d+");
        matcher = pattern.matcher(url);
        matcher.find();
        String topic_id = matcher.group(1);
        info_map.put("topic_id", topic_id);

        // author
        String author = doc.select("#threadstamp + table div.authi > a:eq(0)").text();
        info_map.put("author", author);

        // view count
        String look = doc.select("div.hm > span:eq(1)").text();
        info_map.put("look", look);

        // reply count
        String reply = doc.select("div.hm > span:eq(4)").text();
        info_map.put("reply", reply);

        // post time
        String first_div_id = doc.select("#postlist > div").attr("id");
        pattern = Pattern.compile("\\d+");
        matcher = pattern.matcher(first_div_id);
        matcher.find();
        first_div_id = matcher.group(0);
        String time = doc.select("#authorposton" + first_div_id).text().trim()
                .replace("发表于", "").trim();
        info_map.put("time", time);

        // current (crawl) time
        SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        String date = df.format(new Date());
        info_map.put("date", date);

        // comments on the first page
        List<Map<String, String>> comment_list = new ArrayList<Map<String, String>>();
        Elements elements = doc.select("#postlist > div");
        for (int i = 1; i < elements.size() - 1; i++) {
            Map<String, String> map = new HashMap<String, String>();
            Element element = elements.get(i);

            // comment id
            String comment_id = element.attr("id");
            pattern = Pattern.compile("\\d+");
            matcher = pattern.matcher(comment_id);
            matcher.find();
            comment_id = matcher.group(0);
            map.put("comment_id", comment_id);

            // comment author
            String comment_author = element.select("div.authi > a.xw1").text();
            map.put("comment_author", comment_author);

            // comment content
            String comment_content = element.select("td.t_f").text();
            map.put("comment_content", comment_content);

            // comment time
            String comment_time = element.select("#authorposton" + comment_id).text().trim()
                    .replace("发表于", "").trim();
            map.put("comment_time", comment_time);

            comment_list.add(map);
            // System.out.println(comment_id + " " + comment_author + " " + comment_content + " " + comment_time);
        }
        System.out.println(comment_list);

        // comment pagination
        String str;
        try {
            str = doc.select("a.last").first().text();  // 1. more than 10 pages
        } catch (java.lang.NullPointerException e) {
            Elements es = doc.select("div.pg > a");
            if (es.isEmpty()) {
                str = "1";                              // 3. only one page of comments
            } else {
                str = es.get(es.size() - 2).text();     // 2. fewer than 10 pages
            }
        }
        pattern = Pattern.compile("\\d+");
        matcher = pattern.matcher(str);
        matcher.find();
        Integer total = Integer.parseInt(matcher.group(0)); // total pages
        System.out.println("Topic " + topic_id + " has " + total + " comment pages...");
        for (int i = 2; i <= (total < Max_Value ? total : Max_Value); i++) {
            String comment_url = "http://bbs.onlylady.com/thread-" + topic_id + "-" + i + "-1.html";
            System.out.println("crawl comment page " + i + "...");
            List<Map<String, String>> comment_per_list = crawl_comment(comment_url);
            comment_list.addAll(comment_per_list); // append this page's comments
        }
        info_map.put("comment_list", comment_list);

        // post body
        String post_content = doc.select("td.t_f").text();
        pattern = Pattern.compile("(\\d+-){2}\\d+\\s(\\d+:){2}\\d+\\s上传\\s下载附件 \\(.*?\\)");
        matcher = pattern.matcher(post_content);
        post_content = matcher.replaceAll("").trim();
        info_map.put("post_content", post_content);

        util.insertCol("test", info_map); // db
        System.out.println("==============");
    }

    // One listing page
    void crawl_topic_url(String url) {
        List<String> topic_list = new ArrayList<String>(); // declared in the original but never used
        Document doc = get_doc(url);
        Elements links = doc.select("#moderate tbody > tr > td.icn > a"); // all topic links on this page
        for (Element e : links) {
            String link = e.attr("href");
            System.out.println(link);
            crawl_info(link);
        }
    }

    // One category (board)
    void crawl_category(Integer type) {
        String url = "http://bbs.onlylady.com/forum-" + type + "-1.html";
        Document doc = get_doc(url);
        String str = doc.select("a.last").first().text();
        Matcher matcher = Pattern.compile("\\d+").matcher(str);
        matcher.find();
        Integer total = Integer.parseInt(matcher.group(0));
        System.out.println("Category " + type + " has " + total + " pages...");
        for (int i = 1; i <= total; i++) {
            String topic_url = "http://bbs.onlylady.com/forum-" + type + "-" + i + ".html";
            System.out.println("crawl listing page " + i + "...");
            crawl_topic_url(topic_url);
        }
    }

    public static void main(String[] args) {
        Spider spider = new Spider();
        spider.crawl_category(86);
        // spider.crawl_info("http://bbs.onlylady.com/thread-4030747-1-1.html");
        // spider.crawl_info("http://bbs.onlylady.com/thread-4031594-1-1.html");
    }
}
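One thing worth noting in get_doc: on any exception it calls itself again with no limit, so a URL that keeps timing out will recurse until the stack overflows. Below is a minimal sketch of a bounded-retry variant; the retry count of 3, the releaseConnection() call, and the null return on final failure are my additions rather than part of the original code, so callers would need to check for null before using the returned Document.

    // Sketch: get_doc with a bounded retry loop instead of unbounded recursion.
    // The retry limit and the null return on final failure are assumptions.
    Document get_doc(String url, int retries) {
        RequestConfig requestConfig = RequestConfig.custom()
                .setConnectTimeout(5000)
                .setConnectionRequestTimeout(5000)
                .setSocketTimeout(5000)
                .build();
        HttpGet get = new HttpGet(url);
        get.setConfig(requestConfig);
        for (int attempt = 0; attempt <= retries; attempt++) {
            try {
                HttpResponse response = client.execute(get);
                String html = EntityUtils.toString(response.getEntity());
                return Jsoup.parse(html);
            } catch (Exception e) {
                System.out.println("------timeout, attempt " + (attempt + 1) + "------");
            } finally {
                get.releaseConnection(); // reset the request so it can be re-executed
            }
        }
        return null; // caller must handle a failed fetch
    }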
util

package peerslee.Onlylady;

import com.mongodb.MongoClient;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoDatabase;

import org.bson.Document;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;

/**
 * Created by PeersLee on 2017/1/30.
 */
public class MongoUtil {

    private MongoClient client = null;
    private String dbName = null;

    public MongoUtil() {
        dbName = "Onlylady";
        this.client = new MongoClient("127.0.0.1", 27017);
    }

    // insert (the original note says "de-duplicate", but as written this is a plain insert)
    public void insertCol(String colName, Map<String, Object> msg) {
        try {
            MongoDatabase db = client.getDatabase(dbName);
            MongoCollection<Document> col = db.getCollection(colName);
            Document doc = new Document();
            for (Map.Entry<String, Object> entry : msg.entrySet()) {
                doc.append(entry.getKey(), entry.getValue());
            }
            List<Document> docs = new ArrayList<Document>();
            docs.add(doc);
            col.insertMany(docs);
            System.out.println("Doc insert " + colName + " ok...");
        } catch (Exception e) {
            System.out.println(e.getClass().getName() + ": " + e.getMessage());
        }
    }
}
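Because insertMany always appends, crawling the same category twice stores each topic twice. Below is a minimal sketch of an upsert variant against the 3.4 driver, assuming topic_id is the key to de-duplicate on; the method name upsertCol and that key choice are my assumptions, not part of the original code.

    // Sketch: upsert by topic_id so re-crawling a topic replaces the stored
    // document instead of adding a duplicate. Keying on "topic_id" is an assumption.
    public void upsertCol(String colName, Map<String, Object> msg) {
        try {
            MongoDatabase db = client.getDatabase(dbName);
            MongoCollection<Document> col = db.getCollection(colName);
            Document doc = new Document();
            for (Map.Entry<String, Object> entry : msg.entrySet()) {
                doc.append(entry.getKey(), entry.getValue());
            }
            col.replaceOne(new Document("topic_id", msg.get("topic_id")),
                    doc,
                    new com.mongodb.client.model.UpdateOptions().upsert(true));
            System.out.println("Doc upsert " + colName + " ok...");
        } catch (Exception e) {
            System.out.println(e.getClass().getName() + ": " + e.getMessage());
        }
    }

Spider.crawl_info could then call util.upsertCol("test", info_map) instead of insertCol.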

Proxy

// set up the proxy
HttpHost proxy = new HttpHost(ip, port);
DefaultProxyRoutePlanner routePlanner = new DefaultProxyRoutePlanner(proxy);
// build the client via HttpClientBuilder
HttpClient client = HttpClients.custom()
        .setRoutePlanner(routePlanner)
        .setConnectionTimeToLive(2, TimeUnit.SECONDS)
        .build();
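For completeness, here is a self-contained sketch of the same idea with its imports, wrapped in a small factory; the class name ProxyClientFactory and the ip/port parameters are placeholders of my own, not part of the original post.

    import java.util.concurrent.TimeUnit;

    import org.apache.http.HttpHost;
    import org.apache.http.client.HttpClient;
    import org.apache.http.impl.client.HttpClients;
    import org.apache.http.impl.conn.DefaultProxyRoutePlanner;

    public class ProxyClientFactory {
        // Build an HttpClient that routes every request through the given proxy.
        public static HttpClient create(String ip, int port) {
            HttpHost proxy = new HttpHost(ip, port);
            DefaultProxyRoutePlanner routePlanner = new DefaultProxyRoutePlanner(proxy);
            return HttpClients.custom()
                    .setRoutePlanner(routePlanner)
                    .setConnectionTimeToLive(2, TimeUnit.SECONDS)
                    .build();
        }
    }

Spider could then initialize its static client field with ProxyClientFactory.create(...) instead of HttpClients.createDefault().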

Source: https://lipenglin.blog.csdn.net/article/details/78222090
