- package com.zhishang.lucene;
- /**
- * Created by Administrator on 2017/7/8.
- */
/**
 * Plain data holder for one parsed HTML page: its title, its extracted text
 * content and the location it was read from.
 *
 * <p>All three properties are {@code null} until the matching setter is called.
 */
public class HtmlBean {
    /** Title of the page. */
    private String title;
    /** Text content of the page. */
    private String content;
    /** Location of the page. */
    private String url;

    /**
     * @return the page title, or {@code null} if none has been set
     */
    public String getTitle() {
        return title;
    }

    /**
     * @param title the page title to store
     */
    public void setTitle(String title) {
        this.title = title;
    }

    /**
     * @return the page content, or {@code null} if none has been set
     */
    public String getContent() {
        return content;
    }

    /**
     * @param content the page content to store
     */
    public void setContent(String content) {
        this.content = content;
    }

    /**
     * @return the page location, or {@code null} if none has been set
     */
    public String getUrl() {
        return url;
    }

    /**
     * @param url the page location to store
     */
    public void setUrl(String url) {
        this.url = url;
    }
}
- package com.zhishang.lucene;
- import net.htmlparser.jericho.Element;
- import net.htmlparser.jericho.HTMLElementName;
- import net.htmlparser.jericho.Source;
- import org.junit.Test;
- import java.io.File;
- import java.io.IOException;
- /**
- * Created by Administrator on 2017/7/8.
- */
- public class HtmlBeanUtil {
- public static HtmlBean parseHtml(File file){
- try {
- Source sc = new Source(file);
- Element element = sc.getFirstElement(HTMLElementName.TITLE);
- if (element == null || element.getTextExtractor() == null){
- return null;
- }
- HtmlBean htmlBean = new HtmlBean();
- htmlBean.setTitle(element.getTextExtractor().toString());
- htmlBean.setContent(sc.getTextExtractor().toString());
- htmlBean.setUrl(file.getAbsolutePath());
- return htmlBean;
- } catch (IOException e) {
- e.printStackTrace();
- }
- return null;
- }
- }
- package com.zhishang.lucene;
- import org.apache.commons.io.FileUtils;
- import org.apache.commons.io.filefilter.TrueFileFilter;
- import org.apache.lucene.analysis.Analyzer;
- import org.apache.lucene.analysis.standard.StandardAnalyzer;
- import org.apache.lucene.document.*;
- import org.apache.lucene.index.IndexWriter;
- import org.apache.lucene.index.IndexWriterConfig;
- import org.apache.lucene.store.Directory;
- import org.apache.lucene.store.FSDirectory;
- import org.apache.lucene.store.RAMDirectory;
- import org.apache.lucene.util.Version;
- import org.junit.Test;
- import org.wltea.analyzer.lucene.IKAnalyzer;
- import java.io.File;
- import java.io.IOException;
- import java.util.Collection;
- /**
- * Created by Administrator on 2017/7/7.
- */
- public class CreateIndex {
- public static final String indexDir = "G:/index";
- public static final String dataDir = "G:/data";
- public void createIndex(){
- try {
- Directory dir = FSDirectory.open(new File(indexDir));
- //分词器
- Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_4_9);
- IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_4_9,analyzer);
- config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
- IndexWriter writer = new IndexWriter(dir,config);
- File file = new File(dataDir);
- RAMDirectory ramdir = new RAMDirectory();
- Analyzer analyzer1 = new IKAnalyzer();
- IndexWriterConfig config1 = new IndexWriterConfig(Version.LUCENE_4_9,analyzer1);
- IndexWriter ramWriter = new IndexWriter(ramdir,config1);
- Collection<File> files = FileUtils.listFiles(file, TrueFileFilter.INSTANCE,TrueFileFilter.INSTANCE);
- int count = 0;
- for(File f:files){
- HtmlBean bean = HtmlBeanUtil.parseHtml(f);
- if(bean != null){
- Document document = new Document();
- document.add(new StringField("title",bean.getTitle(), Field.Store.YES));
- document.add(new TextField("content",bean.getContent(), Field.Store.YES));
- document.add(new StringField("url",bean.getUrl(), Field.Store.YES));
- ramWriter.addDocument(document);
- count++;
- if (count == 50){
- ramWriter.close();
- writer.addIndexes(ramdir);
- ramdir = new RAMDirectory();
- Analyzer analyzer2 = new IKAnalyzer();
- IndexWriterConfig config2 = new IndexWriterConfig(Version.LUCENE_4_9,analyzer2);
- ramWriter = new IndexWriter(ramdir,config2);
- count = 0;
- }
- }
- }
- writer.close();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
- package com.zhishang.lucene;
- import org.apache.lucene.analysis.Analyzer;
- import org.apache.lucene.analysis.standard.StandardAnalyzer;
- import org.apache.lucene.index.IndexWriter;
- import org.apache.lucene.index.IndexWriterConfig;
- import org.apache.lucene.store.Directory;
- import org.apache.lucene.store.FSDirectory;
- import org.apache.lucene.util.Version;
- import org.junit.Test;
- import java.io.File;
- /**
- * Created by Administrator on 2017/7/8.
- */
- public class LuceneBean {
- /*
- 创建索引
- */
- @Test
- public void createIndex(){
- File file = new File(CreateIndex.indexDir);
- if (file.exists()){
- file.delete();
- file.mkdirs();
- }
- CreateIndex createIndex = new CreateIndex();
- createIndex.createIndex();
- }
- }
来源: http://www.bubuko.com/infodetail-2159034.html