- package com.zhishang.lucene;
- import net.htmlparser.jericho.Element;
- import net.htmlparser.jericho.HTMLElementName;
- import net.htmlparser.jericho.Source;
- import org.junit.Test;
- import java.io.File;
- import java.io.IOException;
- /**
- * Created by Administrator on 2017/7/8.
- */
- public class HtmlBeanUtil {
- @Test
- public void parseHtml(){
- String path = "G:\\data\\index.html";
- try {
- Source sc = new Source(new File(path));
- Element element = sc.getFirstElement(HTMLElementName.TITLE);
- System.out.println(element.getTextExtractor().toString());
- System.out.println(sc.getTextExtractor().toString());
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
来源: http://www.bubuko.com/infodetail-2159046.html