Java使用HttpClient和Jsoup爬取豆瓣小组的帖子并存入Mysql (3)

这里的数据库操作和第一步的很类似,故从略。只展示最后的测试类。

package com.fan;

import com.fan.mapper.DoubanPostMapper;
import com.fan.mapper.PostMapper;
import com.fan.pojo.DoubanPost;
import com.fan.pojo.Post;
import com.fan.util.HttpUtils;
import org.apache.http.protocol.HTTP;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;

import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Crawls the main-post content of Douban group topics.
 *
 * <p>For each topic link previously stored in the {@code douban_post} table
 * (ids 11..14067), fetches the topic page, extracts the article body and the
 * author's display name with Jsoup, and persists the result via
 * {@link PostMapper#addPost}.
 *
 * @author
 * @date 2020/8/24 - 14:08
 */
@SpringBootTest
public class CrawDoubanMainPosts {

    @Autowired
    private DoubanPostMapper doubanPostMapper;

    @Autowired
    private PostMapper postMapper;

    /**
     * Batch-fetches each stored topic page and saves the main post
     * (title, author, content, topic id) into the database.
     *
     * @throws URISyntaxException propagated from {@link HttpUtils#doGetHtml}
     */
    @Test
    public void test01() throws URISyntaxException {
        // Query-string parameters (none needed) and request headers.
        Map<String, String> map = new HashMap<>();
        Map<String, String> mapTitle = new HashMap<>();
        mapTitle.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36 Edg/84.0.522.61");
        // NOTE(review): hard-coded login-session cookie. This is a credential —
        // it should be externalized to configuration and never committed.
        mapTitle.put("Cookie", "ll=\'118171\'; bid=s9mINaPcmtA; __utmv=30149280.21555; douban-fav-remind=1; push_doumail_num=0; push_noty_num=0; ct=y; douban-profile-remind=1; ck=PnCH; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1598334460%2C%22https%3A%2F%2Faccounts.douban.com%2Fpassport%2Flogin%3Fredir%3Dhttps%253A%252F%252F%252Fgroup%252Ftopic%252F175432568%252F%22%5D; _pk_id.100001.8cb4=9fc580371d86dd7a.1592482197.27.1598334460.1598257938.; _pk_ses.100001.8cb4=*; __utma=30149280.1877381396.1592482199.1598255406.1598334460.26; __utmc=30149280; __utmz=30149280.1598334460.26.3.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=http://www.likecs.com/passport/login; __utmt=1; __utmb=30149280.4.8.1598334460");

        // Reused carrier object; every field is overwritten on each iteration.
        Post post = new Post();
        for (int i = 11; i < 14068; i++) {
            DoubanPost doubanPost = doubanPostMapper.queryDoubanPostById(i);
            // Fix: the original dereferenced the result unconditionally and
            // would NPE on a missing row, aborting the whole batch.
            if (doubanPost == null) {
                System.out.println("第" + i + "条链接记录不存在");
                continue;
            }
            post.setTitle(doubanPost.getTitle());
            post.setAuthorHref(doubanPost.getAuthor());
            post.setPostHref(doubanPost.getPostHref());

            // Fix: derive the topic id from the last path segment instead of
            // the brittle substring(35, length-1), which silently breaks if
            // the URL prefix ever differs in length.
            String postHref = doubanPost.getPostHref();
            String trimmed = postHref.endsWith("/")
                    ? postHref.substring(0, postHref.length() - 1)
                    : postHref;
            int postId = Integer.parseInt(trimmed.substring(trimmed.lastIndexOf('/') + 1));
            post.setPostId(postId);

            // Fetch and parse the topic page; only author and content remain to fill.
            String html = HttpUtils.doGetHtml(postHref, map, mapTitle);
            Document document = Jsoup.parse(html);
            Element content = document.getElementById("content");
            // Deleted topics render without a #content element.
            if (content == null) {
                System.out.println("内容不存在");
                post.setContent("内容不存在");
                post.setAuthor("此人已删帖");
                postMapper.addPost(post);
                System.out.println("添加第" + i + "条数据成功!");
                continue;
            }

            // Fix: guard #link-report too — the original only null-checked
            // #content and NPE'd when the report block was absent.
            Element report = content.getElementById("link-report");
            // Fix: StringBuilder instead of O(n^2) String concatenation.
            StringBuilder article = new StringBuilder();
            if (report != null) {
                for (Element paragraph : report.getElementsByTag("p")) {
                    article.append(paragraph.html()).append('\n');
                }
            }
            // Images and links are deliberately dropped; text only for now.
            post.setContent(article.toString());

            String username;
            try {
                // Position-dependent walk to the author's display name; any
                // markup change breaks it, so fail soft per-post instead of
                // letting one layout variation abort the remaining ~14k posts.
                username = content.child(0).child(0).child(2).child(1).child(0).child(0).child(0).html();
            } catch (IndexOutOfBoundsException e) {
                username = "未知用户";
            }
            post.setAuthor(username);

            postMapper.addPost(post);
            System.out.println("添加第" + i + "条数据成功!");
        }
        // Fix: the original printed "前1000条数据获取完成" although the loop
        // covers ids 11..14067 — the message was simply wrong.
        System.out.println("数据获取完成");
    }
}

数据库的结果:

内容版权声明:除非注明,否则皆为本站原创文章。

转载注明出处:https://www.heiqu.com/zwjsgd.html