// A utility class for page staticization (turning dynamic pages into static HTML).

 
package com.spider;
  
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.List;
import java.util.UUID;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
  
import org.apache.commons.io.IOUtils;
/**
 * Page "staticization" helper.
 *
 * <p>Downloads an HTML page, rewrites dynamic {@code <a href>} targets to their static
 * {@code .html} counterparts according to {@link #convert}, and appends a cache-busting
 * {@code v=<timestamp>} query parameter to stylesheet, script, image and CSS {@code url(...)}
 * references.
 *
 * <p>The rewriting is regex based, not a real HTML parser: only double-quoted attribute
 * values are recognised.
 *
 * @author lpf
 */
public class Spider {
    /** URLs that must never be rewritten by {@link #staticUrl(StringBuffer)} (exact match). */
    static String[] refuseUrl = new String[] {"/shop/acart.do"};

    /**
     * URL rewrite rules, tried in order; the first rule whose regex (index 0) is found in the
     * URL wins, and its capture group 1 is substituted into the format string (index 1).
     */
    static String[][] convert = new String[][] {
        {"^/(.+?)\\.do$", "/%s.html"},   // starts with "/" and ends with ".do"
        {"^/(.+?)\\.jsp$", "/%s.html"},  // starts with "/" and ends with ".jsp"
        {"^/phone/peijian/index\\.jsp\\?type=(.+)", "/phone/peijian/index_type_%s.html"}, // category pages
        {"^/phone/product\\.do\\?id=(.+)", "/phone/product%s.html"},                      // product pages
    };

    /**
     * Patterns locating asset URLs that receive a version parameter (CSS, JS, images and CSS
     * {@code url(...)}). Capture group 1 of every pattern must be the URL itself.
     * {@code [^>]*?} keeps each match inside a single tag so several tags on one line are all
     * handled.
     */
    static String[] v = new String[] {
        "<link\\s(?:[^>]*?\\s)?href=\"([^\"]+)\"",
        "<script\\s(?:[^>]*?\\s)?src=\"([^\"]+)\"",
        "<img\\s(?:[^>]*?\\s)?src=\"([^\"]+)\"",
        "url\\(\\s*['\"]?([^'\")\\s]+)['\"]?\\s*\\)",
    };

    /** Matches the double-quoted {@code href} of a single {@code <a ...>} tag; group 1 is the URL. */
    private static final Pattern ANCHOR_HREF =
            Pattern.compile("<a\\s(?:[^>]*?\\s)?href=\"([^\"]+)\"");

    /** Extracts the {@code charset} parameter from a Content-Type header value. */
    private static final Pattern CHARSET_PARAM =
            Pattern.compile("charset\\s*=\\s*\"?([^\\s;\"]+)", Pattern.CASE_INSENSITIVE);

    /** Charset used when the server does not declare a usable one. */
    private static final Charset DEFAULT_CHARSET = Charset.forName("UTF-8");

    public static void main(String[] args) throws Exception {
        StringBuffer html = getUrlHtml("http://www.daq.cn");
        if (html == null) {
            // getUrlHtml has already reported the cause on stderr.
            System.err.println("Nothing to process: page could not be downloaded.");
            return;
        }
        System.out.println(addV(staticUrl(html)));
    }

    /**
     * Appends a version parameter ({@code v=<current millis>}) to every asset URL matched by
     * the patterns in {@link #v}.
     *
     * <p>All URLs rewritten by one call share the same timestamp. URLs that already carry a
     * query string get {@code &v=...}; a {@code #fragment} is kept at the end; {@code data:}
     * URIs and pure fragment references (e.g. {@code url(#gradient)}) are left untouched.
     *
     * @author lpf
     * @param sb page content; {@code null} is treated as an empty page
     * @return a new buffer with the versioned content (the input buffer is not modified)
     */
    public static StringBuffer addV(StringBuffer sb) {
        if (sb == null) {
            return new StringBuffer();
        }
        long stamp = System.currentTimeMillis();
        // Start from a copy so the caller's buffer is never returned or mutated,
        // even when there are no patterns to apply.
        StringBuffer current = new StringBuffer(sb);
        for (int i = 0; i < v.length; i++) {
            Matcher matcher = Pattern.compile(v[i]).matcher(current);
            StringBuffer rewritten = new StringBuffer(current.length() + 64);
            while (matcher.find()) {
                String assetUrl = matcher.group(1);
                if (assetUrl == null) {
                    continue; // group did not participate; text is copied unchanged later
                }
                matcher.appendReplacement(rewritten, spliceGroup(matcher, withVersion(assetUrl, stamp)));
            }
            matcher.appendTail(rewritten);
            current = rewritten;
        }
        return current;
    }

    /**
     * Collects the {@code href} values of all {@code <a>} tags of a page.
     *
     * @author lpf
     * @param url address of the page to download
     * @return the link targets in document order; empty if the page could not be downloaded
     * @throws Exception kept for backward compatibility with existing callers
     */
    public static List<String> getPageUrl(String url) throws Exception {
        List<String> links = new ArrayList<String>();
        StringBuffer html = getUrlHtml(url);
        if (html == null) {
            return links;
        }
        Matcher matcher = ANCHOR_HREF.matcher(html);
        while (matcher.find()) {
            links.add(matcher.group(1));
        }
        return links;
    }

    /**
     * Downloads the content of a page.
     *
     * <p>The body is decoded with the charset announced in the response's Content-Type
     * header, falling back to UTF-8 when none is announced or it is not supported.
     *
     * @author lpf
     * @param url address of the page
     * @return the page content, or {@code null} if the URL is malformed or the download fails
     *         (the failure is reported on stderr)
     */
    public static StringBuffer getUrlHtml(String url) {
        InputStream in = null;
        try {
            URLConnection connection = new URL(url).openConnection();
            in = connection.getInputStream();
            ByteArrayOutputStream body = new ByteArrayOutputStream();
            byte[] chunk = new byte[8192];
            for (int n = in.read(chunk); n != -1; n = in.read(chunk)) {
                body.write(chunk, 0, n);
            }
            Charset charset = charsetOf(connection.getContentType());
            return new StringBuffer(new String(body.toByteArray(), charset));
        } catch (IOException e) {
            // MalformedURLException is an IOException, so bad URLs end up here as well.
            System.err.println("Failed to download " + url + ": " + e);
            return null;
        } finally {
            if (in != null) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing a read-only stream fails.
                }
            }
        }
    }

    /**
     * Rewrites the {@code href} of every {@code <a>} tag to its static form, using
     * {@link #refuseUrl} and {@link #convert}.
     *
     * @author lpf
     * @param sb page content; {@code null} is treated as an empty page
     * @return a new buffer with the rewritten content (the input buffer is not modified)
     */
    public static StringBuffer staticUrl(StringBuffer sb) {
        if (sb == null) {
            return new StringBuffer();
        }
        Matcher matcher = ANCHOR_HREF.matcher(sb);
        StringBuffer rewritten = new StringBuffer(sb.length());
        while (matcher.find()) {
            matcher.appendReplacement(rewritten, spliceGroup(matcher, convertUrl(matcher.group(1))));
        }
        matcher.appendTail(rewritten);
        return rewritten;
    }

    /**
     * Maps one URL to its static counterpart.
     *
     * @author lpf
     * @param url URL as found in the page
     * @return the URL produced by the first matching rule in {@link #convert}, or {@code url}
     *         unchanged when it is listed in {@link #refuseUrl} or no rule matches
     */
    private static String convertUrl(String url) {
        // URLs on the refuse list are never rewritten.
        for (int i = 0; i < refuseUrl.length; i++) {
            if (refuseUrl[i].equals(url)) {
                return url;
            }
        }
        // First matching rule wins.
        for (int i = 0; i < convert.length; i++) {
            String[] rule = convert[i];
            Matcher matcher = Pattern.compile(rule[0]).matcher(url);
            if (matcher.find()) {
                return String.format(rule[1], matcher.group(1));
            }
        }
        return url;
    }

    /**
     * Builds the replacement for the current match with capture group 1 swapped for
     * {@code newValue} and everything else kept verbatim. The result is quoted so that
     * {@code $} and {@code \} in URLs are not interpreted by {@code appendReplacement}.
     */
    private static String spliceGroup(Matcher matcher, String newValue) {
        String whole = matcher.group();
        int from = matcher.start(1) - matcher.start();
        int to = matcher.end(1) - matcher.start();
        return Matcher.quoteReplacement(whole.substring(0, from) + newValue + whole.substring(to));
    }

    /** Adds {@code v=<stamp>} to {@code url}, before any fragment and after any existing query. */
    private static String withVersion(String url, long stamp) {
        if (url.startsWith("data:")) {
            return url; // inline data, a query string would corrupt it
        }
        int hash = url.indexOf('#');
        String base = hash < 0 ? url : url.substring(0, hash);
        String fragment = hash < 0 ? "" : url.substring(hash);
        if (base.length() == 0) {
            return url; // pure fragment reference such as "#gradient"
        }
        char separator = base.indexOf('?') < 0 ? '?' : '&';
        return base + separator + "v=" + stamp + fragment;
    }

    /** Resolves the charset named in a Content-Type header, defaulting to UTF-8. */
    private static Charset charsetOf(String contentType) {
        if (contentType != null) {
            Matcher matcher = CHARSET_PARAM.matcher(contentType);
            if (matcher.find()) {
                try {
                    return Charset.forName(matcher.group(1));
                } catch (IllegalArgumentException ignored) {
                    // Illegal or unsupported charset name announced by the server: use the default.
                }
            }
        }
        return DEFAULT_CHARSET;
    }
}
//该片段来自于http://www.codesnippet.cn/detail/1310201513792.html
// Source: http://www.codesnippet.cn/detail/1310201513792.html
// Related articles
//
// None yet - be the first to comment!