Java爬虫 (2)

创建mybatis配置文件 mybatis-config.xml

<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE configuration PUBLIC "-//mybatis.org//DTD Config 3.0//EN" "http://mybatis.org/dtd/mybatis-3-config.dtd">
<configuration>
    <!-- The ${...} placeholders below are resolved from jdbc.properties on the classpath. -->
    <properties resource="jdbc.properties"></properties>

    <environments default="development">
        <!-- The id is required and must match the "default" attribute of <environments>. -->
        <environment id="development">
            <transactionManager type="JDBC"/>
            <dataSource type="POOLED">
                <!-- Each property needs a name so the pooled data source knows which setting it configures. -->
                <property name="driver" value="${driver}"/>
                <property name="url" value="${url}"/>
                <property name="username" value="${username}"/>
                <property name="password" value="${password}"/>
            </dataSource>
        </environment>
    </environments>

    <mappers>
        <mapper resource="MovieMapper.xml"/>
    </mappers>
</configuration>

创建mapper.xml映射文件

<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<mapper namespace="com.cn.scitc.mapper.MovieMapper">

    <!-- Maps rows of the movie table onto Movie objects; referenced by the select below. -->
    <resultMap id="MovieMapperMap" type="com.cn.scitc.model.Movie">
        <!-- Only the key column is an <id>; the remaining columns are ordinary <result> mappings. -->
        <id column="id" property="id" jdbcType="VARCHAR"/>
        <result column="title" property="title" jdbcType="VARCHAR"/>
        <result column="cover" property="cover" jdbcType="VARCHAR"/>
        <result column="rate" property="rate" jdbcType="VARCHAR"/>
        <result column="casts" property="casts" jdbcType="VARCHAR"/>
        <result column="directors" property="directors" jdbcType="VARCHAR"/>
    </resultMap>

    <!-- Statement ids must equal the method names of the MovieMapper interface. -->
    <insert id="insert" keyProperty="id" parameterType="com.cn.scitc.model.Movie">
        INSERT INTO movie(id,title,cover,rate,casts,directors)
        VALUES (#{id},#{title},#{cover},#{rate},#{casts},#{directors})
    </insert>

    <!-- NOTE(review): "findAll" is assumed; confirm it matches the query method declared in MovieMapper. -->
    <select id="findAll" resultMap="MovieMapperMap">
        SELECT * FROM movie
    </select>
</mapper>

由于这里没有使用任何第三方爬虫框架,而是直接用 Java 原生的 HttpURLConnection 发送 HTTP 请求进行爬取,所以我写了一个工具类

public class GetJson { public JSONObject getHttpJson(String url, int comefrom) throws Exception { try { URL realUrl = new URL(url); HttpURLConnection connection = (HttpURLConnection) realUrl.openConnection(); connection.setRequestProperty("accept", "*/*"); connection.setRequestProperty("connection", "Keep-Alive"); connection.setRequestProperty("user-agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)"); // 建立实际的连接 connection.connect(); //请求成功 if (connection.getResponseCode() == 200) { InputStream is = connection.getInputStream(); ByteArrayOutputStream baos = new ByteArrayOutputStream(); //10MB的缓存 byte[] buffer = new byte[10485760]; int len = 0; while ((len = is.read(buffer)) != -1) { baos.write(buffer, 0, len); } String jsonString = baos.toString(); baos.close(); is.close(); //转换成json数据处理 // getHttpJson函数的后面的参数1,表示返回的是json数据,2表示http接口的数据在一个()中的数据 JSONObject jsonArray = getJsonString(jsonString, comefrom); return jsonArray; } } catch (MalformedURLException e) { e.printStackTrace(); } catch (IOException ex) { ex.printStackTrace(); } return null; } public JSONObject getJsonString(String str, int comefrom) throws Exception{ JSONObject jo = null; if(comefrom==1){ return new JSONObject(str); }else if(comefrom==2){ int indexStart = 0; //字符处理 for(int i=0;i<str.length();i++){ if(str.charAt(i)=='('){ indexStart = i; break; } } String strNew = ""; //分割字符串 for(int i=indexStart+1;i<str.length()-1;i++){ strNew += str.charAt(i); } return new JSONObject(strNew); } return jo; } }

爬取豆瓣电影的启动类

public class Main { public static void main(String [] args) { String resource = "mybatis-config.xml"; 定义配置文件路径 InputStream inputStream = null; try { inputStream = Resources.getResourceAsStream(resource);//读取配置文件 } catch (IOException e) { e.printStackTrace(); } SqlSessionFactory sqlSessionFactory = new SqlSessionFactoryBuilder().build(inputStream);//注册mybatis 工厂 SqlSession sqlSession = sqlSessionFactory.openSession();//得到连接对象 MovieMapper movieMapper = sqlSession.getMapper(MovieMapper.class);//从mybatis中得到dao对象 int start;//每页多少条 int total = 0;//记录数 int end = 9979;//总共9979条数据 for (start = 0; start <= end; start += 20) { try { String address = "https://Movie.douban.com/j/new_search_subjects?sort=U&range=0,10&tags=&start=" + start; JSONObject dayLine = new GetJson().getHttpJson(address, 1); System.out.println("start:" + start); JSONArray json = dayLine.getJSONArray("data"); List<Movie> list = JSON.parseArray(json.toString(), Movie.class); if (start <= end){ System.out.println("已经爬取到底了"); sqlSession.close(); } for (Movie movie : list) { movieMapper.insert(movie); sqlSession.commit(); } total += list.size(); System.out.println("正在爬取中---共抓取:" + total + "条数据"); } catch (Exception e) { e.printStackTrace(); } } } }

内容版权声明:除非注明,否则皆为本站原创文章。

转载注明出处:https://www.heiqu.com/wpfgpz.html