Java 是一种可以撰写跨平台应用软件的面向对象的程序设计语言,是由 Sun Microsystems 公司于 1995 年 5 月推出的 Java 程序设计语言和 Java 平台(即 JavaEE(j2ee), JavaME(j2me), JavaSE(j2se))的总称。
本篇文章主要介绍了 Java 实现多种方式的 http 数据抓取,小编觉得挺不错的,现在分享给大家,也给大家做个参考。一起跟随小编过来看看吧。
前言:
时下互联网第一波的浪潮已消逝,随之而来的是基于万千数据的物联网时代,数据因而成为企业的重要战略资源之一。基于数据抓取技术,本文介绍了 Java 相关的抓取工具,并附上 demo 源码供感兴趣的朋友测试!
1)JDK 自带 HTTP 连接,获取页面或 Json
2) JDK 自带 URL 连接,获取页面或 Json
3)HttpClient Get 工具,获取页面或 Json
4)commons-io 工具,获取页面或 Json
5) Jsoup 工具(通常用于 html 字段解析),获取页面,非 Json 返回格式
--------------------------------------------------------------------------------
完整代码:
- package com.yeezhao.common.http;
- import java.io.BufferedReader;
- import java.io.InputStream;
- import java.io.InputStreamReader;
- import java.net.HttpURLConnection;
- import java.net.URL;
- import org.apache.commons.httpclient.HttpClient;
- import org.apache.commons.httpclient.HttpMethod;
- import org.apache.commons.httpclient.methods.GetMethod;
- import org.apache.commons.io.IOUtils;
- import org.jsoup.Jsoup;
- /**
- * http工具对比
- *
- * @author Administrator -> junhong
- *
- * 2016年12月27日
- */
- public class HttpFetchUtil {
- /**
- * 获取访问的状态码
- * @param request
- * @return
- * @throws Exception
- */
- public static int getResponseCode(String request) throws Exception {
- URL url = new URL(request);
- HttpURLConnection conn = (HttpURLConnection) url.openConnection();
- return conn.getResponseCode();
- }
- /**
- * 1)JDK自带HTTP连接,获取页面或Json
- * @param request
- * @param charset
- * @return
- * @throws Exception
- */
- public static String JDKFetch(String request, String charset) throws Exception {
- URL url = new URL(request);
- HttpURLConnection conn = (HttpURLConnection) url.openConnection();
- //模拟浏览器参数
- conn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) ApplewebKit/537.36" + " (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36");
- if (conn.getResponseCode() == HttpURLConnection.HTTP_OK) {
- InputStream input = conn.getInputStream();
- StringBuffer sb = new StringBuffer();
- BufferedReader reader = new BufferedReader(new InputStreamReader(input, charset));
- String s;
- while ((s = reader.readLine()) != null) {
- sb.append(s + "\n");
- }
- input.close();
- conn.disconnect();
- return sb.toString();
- }
- return "";
- }
- /**
- * 2) JDK自带URL连接,获取页面或Json
- * @param request
- * @param charset
- * @return
- * @throws Exception
- */
- public static String URLFetch(String request, String charset) throws Exception {
- URL url = new URL(request);
- return IOUtils.toString(url.openStream());
- }
- /**
- * 3)HttpClient Get工具,获取页面或Json
- * @param url
- * @param charset
- * @return
- * @throws Exception
- */
- public static String httpClientFetch(String url, String charset) throws Exception {
- // GET
- HttpClient httpClient = new HttpClient();
- httpClient.getParams().setContentCharset(charset);
- HttpMethod method = new GetMethod(url);
- httpClient.executeMethod(method);
- return method.getResponseBodyAsString();
- }
- /**
- * 4)commons-io工具,获取页面或Json
- * @param url
- * @param charset
- * @return
- * @throws Exception
- */
- public static String commonsIOFetch(String url, String charset) throws Exception {
- return IOUtils.toString(new URL(url), charset);
- }
- /**
- * 5) Jsoup工具(通常用于html字段解析),获取页面,非Json返回格式
- * @param url
- * @return
- * @throws Exception
- */
- public static String jsoupFetch(String url) throws Exception {
- return Jsoup.parse(new URL(url), 2 * 1000).html();
- }
- }
测试代码:
- package com.yeezhao.common.http;
- import org.junit.After;
- import org.junit.Before;
- import org.junit.Test;
- /**
- * 测试类
- * 3个测试链接:
- * 1)百科网页
- * 2)浏览器模拟获取接口数据
- * 3)获取普通接口数据
- * @author Administrator -> junhong
- *
- * 2016年12月27日
- */
- public class HttpFetchUtilTest {
- String seeds[] = {
- "http://baike.baidu.com/view/1.htm",
- "http://m.ximalaya.com/tracks/26096131.json",
- "http://remyapi.yeezhao.com/api/query?wd=周星驰的电影"
- };
- final static String DEFAULT_CHARSET = "UTF-8";@Before public void setUp() throws Exception {}
- @After public void tearDown() throws Exception {
- System.out.println("--- down ---");
- }
- @Test public void testGetResponseCode() throws Exception {
- for (String seed: seeds) {
- int responseCode = HttpFetchUtil.getResponseCode(seed);
- System.out.println("ret=" + responseCode);
- }
- }
- @Test public void testJDKFetch() throws Exception {
- for (String seed: seeds) {
- String ret = HttpFetchUtil.JDKFetch(seed, DEFAULT_CHARSET);
- System.out.println("ret=" + ret);
- }
- }
- @Test public void testURLFetch() throws Exception {
- for (String seed: seeds) {
- String ret = HttpFetchUtil.URLFetch(seed, DEFAULT_CHARSET);
- System.out.println("ret=" + ret);
- }
- }
- @Test public void testHttpClientFetch() throws Exception {
- for (String seed: seeds) {
- String ret = HttpFetchUtil.httpClientFetch(seed, DEFAULT_CHARSET);
- System.out.println("ret=" + ret);
- }
- }
- @Test public void testCommonsIOFetch() throws Exception {
- for (String seed: seeds) {
- String ret = HttpFetchUtil.commonsIOFetch(seed, DEFAULT_CHARSET);
- System.out.println("ret=" + ret);
- }
- }
- @Test public void testJsoupFetch() throws Exception {
- for (String seed: seeds) {
- String ret = HttpFetchUtil.jsoupFetch(seed);
- System.out.println("ret=" + ret);
- }
- }
- }
来源: http://www.phperz.com/article/17/1214/359042.html