- import org.apache.commons.lang3.StringUtils;
- import org.jsoup.Jsoup;
- import org.jsoup.nodes.Document;
- import org.jsoup.nodes.Element;
- import org.jsoup.select.Elements;
- import java.io.File;
- import java.io.IOException;
- /**
- * 文件中地址替换解析
- * @author
- * Created by Administrator on 2014/10/21.
- */
- public class JsoupUtils
- {
- /**
- * 给页面头部增加base href
- * @param html
- * @return
- * @throws IOException
- */
- public static String addBaseHrefOfHtmlStr(String html, String prefix) throws IOException
- {
- Document doc = Jsoup.parse(html, "UTF-8");
- Element base=doc.select("base").first();
- if(base != null)
- {
- base.attr("href", prefix);
- }
- else
- {
- Element head = doc.select("head").first();
- Element node = doc.createElement("base");
- node.attr("href", prefix);
- head.prependChild(node);
- }
- return doc.toString();
- }
- /**
- * 给页面头部增加base href
- * @param file
- * @return
- * @throws IOException
- */
- public static String addBaseHref(File file, String prefix) throws IOException
- {
- Document doc = Jsoup.parse(file, "UTF-8");
- Element base=doc.select("base").first();
- if(base != null)
- {
- base.attr("href", prefix);
- }
- else
- {
- Element head = doc.select("head").first();
- Element node = doc.createElement("base");
- node.attr("href", prefix);
- head.prependChild(node);
- }
- return doc.toString();
- }
- /**
- * 网页中关于图片等地址的替换
- * @param file 网页文件
- * @param urlPrefix 路径前缀
- * @return
- * @throws IOException
- */
- public static String htmlContentURLReplace(File file, String urlPrefix) throws IOException
- {
- Document doc = Jsoup.parse(file, "UTF-8");
- //a href
- Element content = doc.tagName("html");
- Elements links = content.getElementsByTag("a");
- for (Element link : links)
- {
- String linkHref = link.attr("href");
- if(StringUtils.isNotBlank(linkHref) && JsoupUtils.isFilterPath(linkHref))
- {
- link.attr("href", urlPrefix + pathCharReplact(linkHref));
- }
- }
- //img src
- links = content.getElementsByTag("img");
- for (Element link : links)
- {
- String linkHref = link.attr("src");
- if(StringUtils.isNotBlank(linkHref) && JsoupUtils.isFilterPath(linkHref))
- {
- link.attr("src", urlPrefix + pathCharReplact(linkHref));
- }
- }
- //script src
- links = content.getElementsByTag("script");
- for (Element link : links)
- {
- String linkSrc = link.attr("src");
- String linkHref = link.attr("href");
- if(StringUtils.isNotBlank(linkSrc) && JsoupUtils.isFilterPath(linkHref))
- {
- link.attr("src", urlPrefix + pathCharReplact(linkSrc));
- }
- if(StringUtils.isNotBlank(linkHref) && JsoupUtils.isFilterPath(linkHref))
- {
- link.attr("href", urlPrefix + pathCharReplact(linkHref));
- }
- }
- links = content.getElementsByTag("link");
- for (Element link : links)
- {
- String linkSrc = link.attr("src");
- String linkHref = link.attr("href");
- if(StringUtils.isNotBlank(linkSrc) && JsoupUtils.isFilterPath(linkHref))
- {
- link.attr("src", urlPrefix + pathCharReplact(linkSrc));
- }
- else if(StringUtils.isNotBlank(linkHref) && JsoupUtils.isFilterPath(linkHref))
- {
- link.attr("href", urlPrefix + pathCharReplact(linkHref));
- }
- }
- return doc.toString();
- }
- /**
- * 网页中关于图片等地址的替换
- * @param htmlStr 网页文件
- * @param urlPrefix 路径前缀
- * @return
- * @throws IOException
- */
- public static String htmlContentURLReplace(String htmlStr, String urlPrefix) throws IOException
- {
- urlPrefix = urlPrefix.replaceAll("\\\\\\\\", "/");
- if(urlPrefix.endsWith("/"))
- {
- urlPrefix = urlPrefix.substring(0, urlPrefix.lastIndexOf("/") -1);
- }
- Document doc = Jsoup.parse(htmlStr, "UTF-8");
- //a href
- Element content = doc.tagName("html");
- Elements links = content.getElementsByTag("a");
- for (Element link : links)
- {
- String linkHref = link.attr("href");
- if(StringUtils.isNotBlank(linkHref) && JsoupUtils.isFilterPath(linkHref))
- {
- link.attr("href", urlPrefix + pathCharReplact(linkHref));
- }
- }
- //img src
- links = content.getElementsByTag("img");
- for (Element link : links)
- {
- String linkHref = link.attr("src");
- if(StringUtils.isNotBlank(linkHref) && JsoupUtils.isFilterPath(linkHref))
- {
- link.attr("src", urlPrefix + pathCharReplact(linkHref));
- }
- }
- //script src
- links = content.getElementsByTag("script");
- for (Element link : links)
- {
- String linkSrc = link.attr("src");
- String linkHref = link.attr("href");
- if(StringUtils.isNotBlank(linkSrc) && JsoupUtils.isFilterPath(linkHref))
- {
- link.attr("src", urlPrefix + pathCharReplact(linkSrc));
- }
- if(StringUtils.isNotBlank(linkHref) && JsoupUtils.isFilterPath(linkHref))
- {
- link.attr("href", urlPrefix + pathCharReplact(linkHref));
- }
- }
- links = content.getElementsByTag("link");
- for (Element link : links)
- {
- String linkSrc = link.attr("src");
- String linkHref = link.attr("href");
- if(StringUtils.isNotBlank(linkSrc) && JsoupUtils.isFilterPath(linkHref))
- {
- link.attr("src", urlPrefix + pathCharReplact(linkSrc));
- }
- else if(StringUtils.isNotBlank(linkHref) && JsoupUtils.isFilterPath(linkHref))
- {
- link.attr("href", urlPrefix + pathCharReplact(linkHref));
- }
- }
- return doc.toString();
- }
- /**
- * 检测内容是否是属于过滤掉
- * @param proValue
- * @return
- */
- @Deprecated
- public static boolean isFilterPath(String proValue)
- {
- boolean flag = true;
- if(StringUtils.containsIgnoreCase(proValue, "#") || StringUtils.containsIgnoreCase(proValue, "javascript") || StringUtils.containsIgnoreCase(proValue, "#"))
- {
- flag = false;
- }
- return flag;
- }
- /**
- * 路径中存在 ./等的处理。
- * @param path
- * @return
- */
- public static String pathCharReplact(String path)
- {
- int start = StringUtils.lastIndexOf(path, "./");
- if(start > -1)
- {
- path = path.substring(start + 1);
- }
- else if(!path.startsWith("/"))
- {
- path = "/" + path;
- }
- return path;
- }
- }
// This snippet comes from http://www.codesnippet.cn/detail/2601201614488.html
// Source: http://www.codesnippet.cn/detail/2601201614488.html