Lucene 系列 (一) 快速入门
Lucene 系列(二)luke 使用及索引文档的基本操作
一 准备
创建项目并添加 Maven 依赖
- <dependency>
- <groupId>junit</groupId>
- <artifactId>junit</artifactId>
- <version>4.12</version>
- <scope>test</scope>
- </dependency>
- <!-- https://mvnrepository.com/artifact/org.apache.lucene/lucene-core -->
- <!-- Lucene 核心库 -->
- <dependency>
- <groupId>org.apache.lucene</groupId>
- <artifactId>lucene-core</artifactId>
- <version>7.2.1</version>
- </dependency>
- <!-- Lucene 解析库 -->
- <dependency>
- <groupId>org.apache.lucene</groupId>
- <artifactId>lucene-queryparser</artifactId>
- <version>7.2.1</version>
- </dependency>
- <!-- Lucene 附加的分析库 -->
- <dependency>
- <groupId>org.apache.lucene</groupId>
- <artifactId>lucene-analyzers-common</artifactId>
- <version>7.2.1</version>
- </dependency>
- <!-- 高亮显示 -->
- <dependency>
- <groupId>org.apache.lucene</groupId>
- <artifactId>lucene-highlighter</artifactId>
- <version>7.2.1</version>
- </dependency>
- <!-- https://mvnrepository.com/artifact/org.apache.lucene/lucene-analyzers-smartcn -->
- <!-- 中文分词 -->
- <dependency>
- <groupId>org.apache.lucene</groupId>
- <artifactId>lucene-analyzers-smartcn</artifactId>
- <version>7.2.1</version>
- </dependency>
- </dependencies>
二 对特定单词查询 / 模糊查询和查询表达式
写索引
- import java.io.File;
- import java.io.FileReader;
- import java.nio.file.Paths;
- import org.apache.lucene.analysis.Analyzer;
- import org.apache.lucene.analysis.standard.StandardAnalyzer;
- import org.apache.lucene.document.Document;
- import org.apache.lucene.document.Field;
- import org.apache.lucene.document.TextField;
- import org.apache.lucene.index.IndexWriter;
- import org.apache.lucene.index.IndexWriterConfig;
- import org.apache.lucene.store.Directory;
- import org.apache.lucene.store.FSDirectory;
- public class Indexer {
-     private IndexWriter writer; // writes documents into the index
- 
-     /**
-      * Opens an IndexWriter over the given index directory.
-      * @param indexDir filesystem path where the index is stored
-      * @throws Exception if the index directory cannot be opened
-      */
-     public Indexer(String indexDir) throws Exception {
-         Directory dir = FSDirectory.open(Paths.get(indexDir));
-         Analyzer analyzer = new StandardAnalyzer(); // standard analyzer (whitespace/latin tokenization)
-         IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
-         writer = new IndexWriter(dir, iwc);
-     }
- 
-     /**
-      * Closes the underlying IndexWriter, committing pending documents.
-      * @throws Exception if the writer cannot be closed
-      */
-     public void close() throws Exception {
-         writer.close();
-     }
- 
-     /**
-      * Indexes every regular file directly under dataDir (not recursive).
-      * @param dataDir directory whose files are indexed
-      * @return total number of documents now in the index
-      * @throws Exception if dataDir is not a readable directory or a file fails to index
-      */
-     public int index(String dataDir) throws Exception {
-         File[] files = new File(dataDir).listFiles();
-         // listFiles() returns null when dataDir is missing or not a directory;
-         // fail fast with a clear message instead of an opaque NPE.
-         if (files == null) {
-             throw new IllegalArgumentException("Not a readable directory: " + dataDir);
-         }
-         for (File f : files) {
-             if (f.isFile()) { // skip subdirectories; a FileReader on a directory would fail
-                 indexFile(f);
-             }
-         }
-         return writer.numDocs();
-     }
- 
-     /**
-      * Adds a single file to the index.
-      * @param f file to index
-      */
-     private void indexFile(File f) throws Exception {
-         System.out.println("索引文件:" + f.getCanonicalPath());
-         writer.addDocument(getDocument(f));
-     }
- 
-     /**
-      * Builds a Document with three fields: the tokenized file contents plus
-      * the stored file name and full path (stored so search results can show them).
-      * @param f source file
-      */
-     private Document getDocument(File f) throws Exception {
-         Document doc = new Document();
-         // NOTE(review): FileReader uses the platform default charset; use an
-         // InputStreamReader with an explicit charset if files may be UTF-8.
-         doc.add(new TextField("contents", new FileReader(f)));
-         doc.add(new TextField("fileName", f.getName(), Field.Store.YES));
-         doc.add(new TextField("fullPath", f.getCanonicalPath(), Field.Store.YES));
-         return doc;
-     }
- 
-     public static void main(String[] args) {
-         String indexDir = "D:\\lucene\\searchindex";
-         String dataDir = "D:\\lucene\\data";
-         Indexer indexer = null;
-         int numIndexed = 0;
-         long start = System.currentTimeMillis();
-         try {
-             indexer = new Indexer(indexDir);
-             numIndexed = indexer.index(dataDir);
-         } catch (Exception e) {
-             e.printStackTrace();
-         } finally {
-             // indexer stays null if the constructor threw; guard against NPE
-             if (indexer != null) {
-                 try {
-                     indexer.close();
-                 } catch (Exception e) {
-                     e.printStackTrace();
-                 }
-             }
-         }
-         long end = System.currentTimeMillis();
-         System.out.println("索引:" + numIndexed + "个文件 花费了" + (end - start) + "毫秒");
-     }
- }
读取索引
- import java.nio.file.Paths;
- import org.apache.lucene.analysis.Analyzer;
- import org.apache.lucene.analysis.standard.StandardAnalyzer;
- import org.apache.lucene.document.Document;
- import org.apache.lucene.index.DirectoryReader;
- import org.apache.lucene.index.IndexReader;
- import org.apache.lucene.index.Term;
- import org.apache.lucene.queryparser.classic.QueryParser;
- import org.apache.lucene.search.FuzzyQuery;
- import org.apache.lucene.search.IndexSearcher;
- import org.apache.lucene.search.Query;
- import org.apache.lucene.search.ScoreDoc;
- import org.apache.lucene.search.TermQuery;
- import org.apache.lucene.search.TopDocs;
- import org.apache.lucene.store.Directory;
- import org.apache.lucene.store.FSDirectory;
- import org.junit.After;
- import org.junit.Before;
- import org.junit.Test;
- /**
-  * JUnit fixture: opens an IndexSearcher over the on-disk index before each
-  * test and releases the reader afterwards.
-  */
- public class SearchTest {
- // directory abstraction over the on-disk index
- private Directory dir;
- private IndexReader reader;
- private IndexSearcher is; // searcher shared by all test methods
- // runs before every test: open the index previously built by Indexer
- @Before
- public void setUp() throws Exception {
- dir=FSDirectory.open(Paths.get("D:\\lucene\\searchindex"));
- reader=DirectoryReader.open(dir);
- is=new IndexSearcher(reader);
- }
- // runs after every test: close the reader to free index resources
- @After
- public void tearDown() throws Exception {
- reader.close();
- }
- }
对特定单词查询和模糊查询
- /**
-  * Exact-term lookup versus fuzzy lookup on the "contents" field.
-  *
-  * @throws Exception on any Lucene search error
-  */
- @Test
- public void testTermQuery() throws Exception {
-     String searchField = "contents";
-     // a TermQuery matches a single indexed token exactly; anything else finds nothing
-     String word = "authorship";
-     Query exact = new TermQuery(new Term(searchField, word));
-     // FuzzyQuery's second argument is the maximum edit distance (0, 1 or 2):
-     // how many character edits the matched token may differ by
-     Query fuzzy = new FuzzyQuery(new Term(searchField, "authorshioo"), 1);
-     TopDocs hits = is.search(exact, 10);
-     // totalHits = number of documents containing the given word
-     System.out.println("匹配'" + word + "', 总共查询到" + hits.totalHits + "个文档");
-     printFullPaths(hits);
-     TopDocs fuzzyHits = is.search(fuzzy, 10);
-     System.out.println("匹配'" + "authorshioo" + "', 总共查询到" + fuzzyHits.totalHits + "个文档");
-     printFullPaths(fuzzyHits);
- }
- 
- /** Prints the stored fullPath field of every hit in the given result set. */
- private void printFullPaths(TopDocs hits) throws Exception {
-     for (ScoreDoc sd : hits.scoreDocs) {
-         System.out.println(is.doc(sd.doc).get("fullPath"));
-     }
- }
我们上面查询了单词 authorship 以及模糊查询了单词 "authorshioo", 结果如下:
可以看到只在 LICENSE.txt 文档下找到该单词
那么模糊查询为什么查不到单词 "authorshioo" 呢? 这是因为我们在这里把允许错误的字符数设置为 1 个, 而单词 "authorshioo" 与索引中的 "authorship" 相差 2 个编辑距离(一次替换加一次插入), 超出了允许的范围, 所以就查不到
Query query2 = new FuzzyQuery(new Term(searchField,"authorshioo"),1);
解析表达式的使用
- /**
- * 解析查询表达式
- *
- * @throws Exception
- */
- @Test
- public void testQueryParser() throws Exception {
- // 标准分词器
- Analyzer analyzer = new StandardAnalyzer();
- String searchField = "contents";
- String q = "atomic a atomicReader";
- String q2 = "AtomicReader and AtomicReaderContext";
- // 建立查询解析器
- //searchField: 要查询的字段;
- //analyzer: 标准分词器实例
- QueryParser parser = new QueryParser(searchField, analyzer);
- Query query = parser.parse(q);
- // 返回查询到的前 10 项(查到 100 个相关内容的话也只会返回 10 个)
- TopDocs hits = is.search(query, 10);
- System.out.println("匹配" + q + "查询到" + hits.totalHits + "个记录");
- for (ScoreDoc scoreDoc : hits.scoreDocs) {
- Document doc = is.doc(scoreDoc.doc);
- System.out.println(doc.get("fullPath"));
- }
- QueryParser parser2 = new QueryParser(searchField, analyzer);
- Query query2 = parser2.parse(q2);
- // 返回查询到的前 10 项(查到 100 个相关内容的话也只会返回 10 个)
- TopDocs hits2 = is.search(query2, 10);
- System.out.println("匹配" + q2 + "查询到" + hits2.totalHits + "个记录");
- for (ScoreDoc scoreDoc : hits2.scoreDocs) {
- Document doc = is.doc(scoreDoc.doc);
- System.out.println(doc.get("fullPath"));
- }
- }
我们上面分别查询了: atomic a atomicReader 和 AtomicReader and AtomicReaderContext, 通过查询结果可以看出即使稍微改变查询内容, 也还是可以查询到和我们给出的表达式相关的文档
三 中文查询及高亮
写索引
- import java.nio.file.Paths;
- import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
- import org.apache.lucene.document.Document;
- import org.apache.lucene.document.Field;
- import org.apache.lucene.document.StringField;
- import org.apache.lucene.document.TextField;
- import org.apache.lucene.index.IndexWriter;
- import org.apache.lucene.index.IndexWriterConfig;
- import org.apache.lucene.store.Directory;
- import org.apache.lucene.store.FSDirectory;
- public class Indexer {
-     // sample data: one document per city (parallel arrays joined by index)
-     private String[] ids={"1","2","3"};
-     private String citys[]={"青岛","南京","上海"};
-     private String descs[]={
-             "青岛是一个漂亮的城市",
-             "南京是一个文化的城市",
-             "上海是一个繁华的城市"
-     };
-     private Directory dir;
- 
-     /**
-      * Creates an IndexWriter over {@code dir} using the Chinese analyzer.
-      * @return a new IndexWriter (caller is responsible for closing it)
-      * @throws Exception if the writer cannot be created
-      */
-     private IndexWriter getWriter() throws Exception {
-         // CJK-aware tokenizer; must also be used at query time
-         SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer();
-         IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
-         return new IndexWriter(dir, iwc);
-     }
- 
-     /**
-      * Writes one document per sample city into the given index directory.
-      * @param indexDir filesystem path of the index
-      * @throws Exception if the index cannot be written
-      */
-     private void index(String indexDir) throws Exception {
-         dir = FSDirectory.open(Paths.get(indexDir));
-         // try-with-resources: the writer is closed (and changes committed)
-         // even if addDocument throws — the original leaked it on exception
-         try (IndexWriter writer = getWriter()) {
-             for (int i = 0; i < ids.length; i++) {
-                 Document doc = new Document();
-                 // id and city are exact-match keys -> StringField (not tokenized)
-                 doc.add(new StringField("id", ids[i], Field.Store.YES));
-                 doc.add(new StringField("city", citys[i], Field.Store.YES));
-                 // desc is full-text searchable -> TextField (tokenized), stored for display
-                 doc.add(new TextField("desc", descs[i], Field.Store.YES));
-                 writer.addDocument(doc);
-             }
-         }
-     }
- 
-     public static void main(String[] args) throws Exception {
-         new Indexer().index("D:\\lucene\\dataindex2");
-         System.out.println("Success Indexer");
-     }
- }
中文查询及高亮显示
- import java.io.StringReader;
- import java.nio.file.Paths;
- import org.apache.lucene.analysis.TokenStream;
- import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
- import org.apache.lucene.document.Document;
- import org.apache.lucene.index.DirectoryReader;
- import org.apache.lucene.index.IndexReader;
- import org.apache.lucene.queryparser.classic.QueryParser;
- import org.apache.lucene.search.IndexSearcher;
- import org.apache.lucene.search.Query;
- import org.apache.lucene.search.ScoreDoc;
- import org.apache.lucene.search.TopDocs;
- import org.apache.lucene.search.highlight.Fragmenter;
- import org.apache.lucene.search.highlight.Highlighter;
- import org.apache.lucene.search.highlight.QueryScorer;
- import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
- import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
- import org.apache.lucene.store.Directory;
- import org.apache.lucene.store.FSDirectory;
- /**
-  * Searches the Chinese-analyzed index and prints every hit with the matched
-  * keywords wrapped in red HTML highlighting tags.
-  *
-  * @author LXY
-  */
- public class SearchTest {
- 
-     /**
-      * Runs a query over the "desc" field and prints the city, the description
-      * and a highlighted fragment for every hit.
-      * @param indexDir path of the index created by Indexer
-      * @param par      query text (e.g. a city name)
-      * @throws Exception on index access or query parse failure
-      */
-     public static void search(String indexDir, String par) throws Exception {
-         // open the directory holding the index files
-         Directory dir = FSDirectory.open(Paths.get(indexDir));
-         // try-with-resources: the reader is released even when parsing or
-         // searching throws — the original leaked it on exception
-         try (IndexReader reader = DirectoryReader.open(dir)) {
-             IndexSearcher searcher = new IndexSearcher(reader);
-             // must match the analyzer used at index time
-             SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer();
-             // first argument: field to query; second: analyzer
-             QueryParser parser = new QueryParser("desc", analyzer);
-             Query query = parser.parse(par);
-             long start = System.currentTimeMillis();
-             // return at most the 10 best-scoring documents
-             TopDocs topDocs = searcher.search(query, 10);
-             long end = System.currentTimeMillis();
-             System.out.println("匹配"+par+", 总共花费了"+(end-start)+"毫秒, 共查到"+topDocs.totalHits+"条记录");
-             // --- highlighting setup ---
-             QueryScorer scorer = new QueryScorer(query);              // scores candidate fragments
-             Fragmenter fragmenter = new SimpleSpanFragmenter(scorer); // selects the best-scoring span
-             // wrap matched terms between the given prefix and suffix tags
-             SimpleHTMLFormatter simpleHTMLFormatter =
-                     new SimpleHTMLFormatter("<b><font color='red'>", "</font></b>");
-             Highlighter highlighter = new Highlighter(simpleHTMLFormatter, scorer);
-             highlighter.setTextFragmenter(fragmenter);
-             // each ScoreDoc carries a document id plus its relevance score
-             for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
-                 Document document = searcher.doc(scoreDoc.doc);
-                 System.out.println(document.get("city"));
-                 System.out.println(document.get("desc"));
-                 String desc = document.get("desc");
-                 if (desc != null) {
-                     // re-analyze the stored text so the highlighter can locate
-                     // the matched tokens inside it
-                     TokenStream tokenStream = analyzer.tokenStream("desc", new StringReader(desc));
-                     System.out.println(highlighter.getBestFragment(tokenStream, desc));
-                 }
-             }
-         }
-     }
- 
-     // manual entry point for trying the search
-     public static void main(String[] args) {
-         // index built by the companion Indexer class
-         String indexDir = "D:\\lucene\\dataindex2";
-         // keyword to search for
-         String par = "南京";
-         try {
-             search(indexDir, par);
-         } catch (Exception e) {
-             e.printStackTrace();
-         }
-     }
- }
结果会把我们查询的南京单词给高亮显示, 这在我们平时搜索中很常见了
我们平时搜索中的高亮就像下图:
来源: https://juejin.im/post/5ac0a066f265da238155c4f7