package com.spider;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.List;
import java.util.UUID;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.io.IOUtils;
/**
 * URL caching helper: downloads a page, rewrites the dynamic links it contains
 * ({@code .do} / {@code .jsp}) to their static {@code .html} names, and appends a
 * cache-busting version parameter to the stylesheet, script and image URLs.
 *
 * @author lpf
 */
public class Spider {

    /** URLs that {@link #staticUrl(StringBuffer)} must leave exactly as they are. */
    static String[] refuseUrl = new String[] {"/shop/acart.do"};

    /**
     * URL rewrite rules. Each entry is {@code {regex, format}}: when the regex is found in a
     * URL, capture group 1 is substituted into the {@code %s} of the format string. Rules are
     * tried in index order and the first one that matches wins.
     */
    static String[][] convert = new String[4][2];

    /**
     * Patterns locating the resource URLs (capture group 1) that {@link #addV(StringBuffer)}
     * versions: {@code <link href>}, {@code <script src>}, {@code <img src>} and CSS
     * {@code url(...)}. The tag patterns stay inside one tag ({@code [^>]}) and are lazy, so
     * several tags on the same line are each matched on their own.
     */
    static String[] v = new String[] {
        "<link\\s(?:[^>]*?\\s)?href=\"([^\"]+)\"",
        "<script\\s(?:[^>]*?\\s)?src=\"([^\"]+)\"",
        "<img\\s(?:[^>]*?\\s)?src=\"([^\"]+)\"",
        // Optional quotes are kept outside group 1 so the version lands inside them.
        "url\\(\\s*[\"']?(.+?)[\"']?\\s*\\)"
    };

    /** Matches the {@code href} value (group 1) of a single {@code <a ...>} tag. */
    private static final Pattern ANCHOR_HREF =
            Pattern.compile("<a\\s(?:[^>]*?\\s)?href=\"([^\"]+)\"");

    static {
        // Path starting with '/' and ending in ".do" (no query string).
        convert[0] = new String[] {"^/([^?]+)\\.do$", "/%s.html"};
        // Path starting with '/' and ending in ".jsp" (no query string).
        convert[1] = new String[] {"^/([^?]+)\\.jsp$", "/%s.html"};
        // Accessory category listing.
        convert[2] = new String[] {
            "^/phone/peijian/index\\.jsp\\?type=(.+)", "/phone/peijian/index_type_%s.html"};
        // Product detail page.
        convert[3] = new String[] {"^/phone/product\\.do\\?id=(.+)", "/phone/product%s.html"};
    }

    /**
     * Downloads a sample page, makes its links static, versions its resources and prints the
     * result to standard output.
     *
     * @param args ignored
     * @throws Exception declared for compatibility with existing callers
     */
    public static void main(String[] args) throws Exception {
        String pageUrl = "http://www.daq.cn";
        StringBuffer html = getUrlHtml(pageUrl);
        if (html == null) {
            // getUrlHtml has already printed the stack trace of the failure.
            System.err.println("Failed to download " + pageUrl);
            return;
        }
        System.out.println(addV(staticUrl(html)));
    }

    /**
     * Appends a version parameter ({@code v=<current time in ms>}) to every stylesheet,
     * script, image and CSS {@code url(...)} reference found by the patterns in {@link #v}.
     * All references rewritten by one call receive the same version value. The input buffer
     * is not modified.
     *
     * @param sb page content; must not be {@code null}
     * @return the content with versioned resource URLs
     * @throws NullPointerException if {@code sb} is {@code null}
     */
    public static StringBuffer addV(StringBuffer sb) {
        if (sb == null) {
            throw new NullPointerException("sb");
        }
        long version = System.currentTimeMillis();
        StringBuffer current = sb;
        for (int i = 0; i < v.length; i++) {
            Matcher matcher = Pattern.compile(v[i]).matcher(current);
            StringBuffer rewritten = new StringBuffer(current.length() + 64);
            while (matcher.find()) {
                String versioned = appendVersion(matcher.group(1), version);
                matcher.appendReplacement(rewritten, replaceGroup1(matcher, versioned));
            }
            matcher.appendTail(rewritten);
            // Each pass works on the output of the previous one.
            current = rewritten;
        }
        return current;
    }

    /**
     * Collects the {@code href} value of every {@code <a>} tag of a page.
     *
     * @param url address of the page to download
     * @return the link targets in document order; empty when the page has no links
     * @throws Exception an {@link IOException} when the URL is malformed or the page cannot
     *         be downloaded
     */
    public static List<String> getPageUrl(String url) throws Exception {
        String html = download(url, Charset.defaultCharset());
        List<String> links = new ArrayList<String>();
        Matcher matcher = ANCHOR_HREF.matcher(html);
        while (matcher.find()) {
            links.add(matcher.group(1));
        }
        return links;
    }

    /**
     * Downloads a page and decodes it with the platform default charset.
     *
     * @param url address of the page to download
     * @return the page content, or {@code null} when the URL is malformed or the download
     *         fails (the stack trace is printed to standard error)
     */
    public static StringBuffer getUrlHtml(String url) {
        // NOTE(review): the platform default is kept for backward compatibility; callers that
        // know the page encoding should use the overload taking a Charset.
        return getUrlHtml(url, Charset.defaultCharset());
    }

    /**
     * Downloads a page and decodes it with the given charset.
     *
     * @param url address of the page to download
     * @param charset charset used to decode the downloaded bytes; must not be {@code null}
     * @return the page content, or {@code null} when the URL is malformed or the download
     *         fails (the stack trace is printed to standard error)
     */
    public static StringBuffer getUrlHtml(String url, Charset charset) {
        try {
            return new StringBuffer(download(url, charset));
        } catch (IOException e) {
            // MalformedURLException is an IOException, so this covers both failure modes.
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Rewrites the {@code href} of every {@code <a>} tag according to the rules in
     * {@link #convert}; URLs listed in {@link #refuseUrl} and URLs matching no rule are kept.
     * Only the attribute value is changed, the rest of the tag is copied verbatim. The input
     * buffer is not modified.
     *
     * @param sb page content; must not be {@code null}
     * @return the content with rewritten links
     * @throws NullPointerException if {@code sb} is {@code null}
     */
    public static StringBuffer staticUrl(StringBuffer sb) {
        if (sb == null) {
            throw new NullPointerException("sb");
        }
        Pattern[] rules = compileRules();
        Matcher matcher = ANCHOR_HREF.matcher(sb);
        StringBuffer rewritten = new StringBuffer(sb.length());
        while (matcher.find()) {
            String target = convertUrl(matcher.group(1), rules);
            matcher.appendReplacement(rewritten, replaceGroup1(matcher, target));
        }
        matcher.appendTail(rewritten);
        return rewritten;
    }

    /** Compiles the regex half of every {@link #convert} rule, once per rewritten page. */
    private static Pattern[] compileRules() {
        Pattern[] compiled = new Pattern[convert.length];
        for (int i = 0; i < convert.length; i++) {
            compiled[i] = Pattern.compile(convert[i][0]);
        }
        return compiled;
    }

    /**
     * Maps one URL to its static form.
     *
     * @param url the URL found in the page
     * @param rules compiled regexes, index-aligned with {@link #convert}
     * @return the rewritten URL, or {@code url} itself when it is refused or no rule matches
     */
    private static String convertUrl(String url, Pattern[] rules) {
        // Pages that must never be made static are filtered out first.
        for (int i = 0; i < refuseUrl.length; i++) {
            if (refuseUrl[i].equals(url)) {
                return url;
            }
        }
        // First matching rule wins.
        for (int i = 0; i < rules.length; i++) {
            Matcher matcher = rules[i].matcher(url);
            if (matcher.find()) {
                return String.format(convert[i][1], matcher.group(1));
            }
        }
        return url;
    }

    /** Adds {@code v=<version>} to a URL, before any fragment; {@code data:} URIs are kept. */
    private static String appendVersion(String url, long version) {
        if (url.startsWith("data:")) {
            return url;
        }
        int hash = url.indexOf('#');
        String base = hash < 0 ? url : url.substring(0, hash);
        String fragment = hash < 0 ? "" : url.substring(hash);
        String separator = base.indexOf('?') < 0 ? "?" : "&";
        return base + separator + "v=" + version + fragment;
    }

    /**
     * Builds the replacement for the current match: the matched text with capture group 1
     * swapped for {@code value}, quoted so that {@code $} and {@code \} are taken literally
     * by {@link Matcher#appendReplacement(StringBuffer, String)}.
     */
    private static String replaceGroup1(Matcher matcher, String value) {
        String whole = matcher.group();
        int offset = matcher.start();
        String spliced = whole.substring(0, matcher.start(1) - offset)
                + value
                + whole.substring(matcher.end(1) - offset);
        return Matcher.quoteReplacement(spliced);
    }

    /** Reads everything behind {@code url} and decodes it; the stream is always closed. */
    private static String download(String url, Charset charset) throws IOException {
        InputStream in = new URL(url).openStream();
        try {
            ByteArrayOutputStream bytes = new ByteArrayOutputStream();
            byte[] chunk = new byte[8192];
            int read;
            while ((read = in.read(chunk)) != -1) {
                bytes.write(chunk, 0, read);
            }
            return new String(bytes.toByteArray(), charset);
        } finally {
            in.close();
        }
    }
}
// This snippet comes from http://www.codesnippet.cn/detail/1310201513792.html
// Source: http://www.codesnippet.cn/detail/1310201513792.html