时下互联网第一波浪潮已经消逝,随之而来的是基于海量数据的物联网时代,数据因而成为企业的重要战略资源之一。本文围绕数据抓取技术,介绍了几种 Java 相关的抓取工具,并附上 demo 源码供感兴趣的朋友测试!
1)JDK 自带 HTTP 连接,获取页面或 Json
2) JDK 自带 URL 连接,获取页面或 Json
3)HttpClient Get 工具,获取页面或 Json
4)commons-io 工具,获取页面或 Json
5)Jsoup 工具(通常用于 html 字段解析),获取页面,非 Json 返回格式
完整代码:
- package com.yeezhao.common.http;
- import java.io.BufferedReader;
- import java.io.InputStream;
- import java.io.InputStreamReader;
- import java.net.HttpURLConnection;
- import java.net.URL;
- import org.apache.commons.httpclient.HttpClient;
- import org.apache.commons.httpclient.HttpMethod;
- import org.apache.commons.httpclient.methods.GetMethod;
- import org.apache.commons.io.IOUtils;
- import org.jsoup.Jsoup;
- /**
- * http工具对比
- *
- * @author Administrator -> junhong
- *
- * 2016年12月27日
- */
- public class HttpFetchUtil {
- /**
- * 获取访问的状态码
- * @param request
- * @return
- * @throws Exception
- */
- public static int getResponseCode(String request) throws Exception {
- URL url = new URL(request);
- HttpURLConnection conn = (HttpURLConnection) url.openConnection();
- return conn.getResponseCode();
- }
- /**
- * 1)JDK自带HTTP连接,获取页面或Json
- * @param request
- * @param charset
- * @return
- * @throws Exception
- */
- public static String JDKFetch(String request, String charset) throws Exception {
- URL url = new URL(request);
- HttpURLConnection conn = (HttpURLConnection) url.openConnection();
- //模拟浏览器参数
- conn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) ApplewebKit/537.36"
- + " (Khtml, like Gecko) Chrome/45.0.2454.101 Safari/537.36");
- if (conn.getResponseCode() == HttpURLConnection.HTTP_OK) {
- InputStream input = conn.getInputStream();
- StringBuffer sb = new StringBuffer();
- BufferedReader reader = new BufferedReader(new InputStreamReader(input, charset));
- String s;
- while ((s = reader.readLine()) != null) {
- sb.append(s + "\n");
- }
- input.close();
- conn.disconnect();
- return sb.toString();
- }
- return "";
- }
- /**
- * 2) JDK自带URL连接,获取页面或Json
- * @param request
- * @param charset
- * @return
- * @throws Exception
- */
- public static String URLFetch(String request, String charset) throws Exception {
- URL url = new URL(request);
- return IOUtils.toString(url.openStream());
- }
- /**
- * 3)HttpClient Get工具,获取页面或Json
- * @param url
- * @param charset
- * @return
- * @throws Exception
- */
- public static String httpClientFetch(String url, String charset) throws Exception {
- // GET
- HttpClient httpClient = new HttpClient();
- httpClient.getParams().setContentCharset(charset);
- HttpMethod method = new GetMethod(url);
- httpClient.executeMethod(method);
- return method.getResponseBodyAsString();
- }
- /**
- * 4)commons-io工具,获取页面或Json
- * @param url
- * @param charset
- * @return
- * @throws Exception
- */
- public static String commonsIOFetch(String url, String charset) throws Exception {
- return IOUtils.toString(new URL(url), charset);
- }
- /**
- * 5) Jsoup工具(通常用于html字段解析),获取页面,非Json返回格式
- * @param url
- * @return
- * @throws Exception
- */
- public static String jsoupFetch(String url) throws Exception {
- return Jsoup.parse(new URL(url), 2 * 1000).html();
- }
- }
测试代码:
- package com.yeezhao.common.http;
- import org.junit.After;
- import org.junit.Before;
- import org.junit.Test;
- /**
- * 测试类
- * 3个测试链接:
- * 1)百科网页
- * 2)浏览器模拟获取接口数据
- * 3)获取普通接口数据
- * @author Administrator -> junhong
- *
- * 2016年12月27日
- */
- public class HttpFetchUtilTest {
- String seeds[] = {"http://baike.baidu.com/view/1.htm","http://m.ximalaya.com/tracks/26096131.json","http://remyapi.yeezhao.com/api/query?wd=周星驰的电影"};
- final static String DEFAULT_CHARSET = "UTF-8";
- @Before
- public void setUp() throws Exception {
- }
- @After
- public void tearDown() throws Exception {
- System.out.println("--- down ---");
- }
- @Test
- public void testGetResponseCode() throws Exception{
- for(String seed:seeds){
- int responseCode = HttpFetchUtil.getResponseCode(seed);
- System.out.println("ret="+responseCode);
- }
- }
- @Test
- public void testJDKFetch() throws Exception{
- for(String seed:seeds){
- String ret = HttpFetchUtil.JDKFetch(seed, DEFAULT_CHARSET);
- System.out.println("ret="+ret);
- }
- }
- @Test
- public void testURLFetch() throws Exception{
- for(String seed:seeds){
- String ret = HttpFetchUtil.URLFetch(seed, DEFAULT_CHARSET);
- System.out.println("ret="+ret);
- }
- }
- @Test
- public void testHttpClientFetch()throws Exception {
- for(String seed:seeds){
- String ret = HttpFetchUtil.httpClientFetch(seed, DEFAULT_CHARSET);
- System.out.println("ret="+ret);
- }
- }
- @Test
- public void testCommonsIOFetch()throws Exception {
- for(String seed:seeds){
- String ret = HttpFetchUtil.commonsIOFetch(seed, DEFAULT_CHARSET);
- System.out.println("ret="+ret);
- }
- }
- @Test
- public void testJsoupFetch() throws Exception{
- for(String seed:seeds){
- String ret = HttpFetchUtil.jsoupFetch(seed);
- System.out.println("ret="+ret);
- }
- }
- }
附:相关 jar 依赖
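...
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.7.3</version>
</dependency>
<dependency>
    <groupId>commons-httpclient</groupId>
    <artifactId>commons-httpclient</artifactId>
    <version>3.1</version>
</dependency>
<dependency>
    <groupId>commons-io</groupId>
    <artifactId>commons-io</artifactId>
    <version>2.4</version>
</dependency>
...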
现在的数据时代,有着 "数据即财富" 的理念。因此,数据抓取技术将一直发展更新,基于此后续还将扩充针对 POST 方法的抓取方式,敬请期待!
来源: