Lucene 系列 (一) 快速入门
Lucene 系列(二)luke 使用及索引文档的基本操作
一 准备
创建项目并添加 Maven 依赖
- <dependency>
- <groupId>junit</groupId>
- <artifactId>junit</artifactId>
- <version>4.12</version>
- <scope>test</scope>
- </dependency>
- <!-- https://mvnrepository.com/artifact/org.apache.lucene/lucene-core -->
- <!-- Lucene 核心库 -->
- <dependency>
- <groupId>org.apache.lucene</groupId>
- <artifactId>lucene-core</artifactId>
- <version>7.2.1</version>
- </dependency>
- <!-- Lucene 解析库 -->
- <dependency>
- <groupId>org.apache.lucene</groupId>
- <artifactId>lucene-queryparser</artifactId>
- <version>7.2.1</version>
- </dependency>
- <!-- Lucene 附加的分析库 -->
- <dependency>
- <groupId>org.apache.lucene</groupId>
- <artifactId>lucene-analyzers-common</artifactId>
- <version>7.2.1</version>
- </dependency>
- <!-- 高亮显示 -->
- <dependency>
- <groupId>org.apache.lucene</groupId>
- <artifactId>lucene-highlighter</artifactId>
- <version>7.2.1</version>
- </dependency>
- <!-- https://mvnrepository.com/artifact/org.apache.lucene/lucene-analyzers-smartcn -->
- <!-- 中文分词 -->
- <dependency>
- <groupId>org.apache.lucene</groupId>
- <artifactId>lucene-analyzers-smartcn</artifactId>
- <version>7.2.1</version>
- </dependency>
- </dependencies>
二 对特定单词查询 / 模糊查询和查询表达式
写索引
- import java.io.File;
- import java.io.FileReader;
- import java.nio.file.Paths;
- import org.apache.lucene.analysis.Analyzer;
- import org.apache.lucene.analysis.standard.StandardAnalyzer;
- import org.apache.lucene.document.Document;
- import org.apache.lucene.document.Field;
- import org.apache.lucene.document.TextField;
- import org.apache.lucene.index.IndexWriter;
- import org.apache.lucene.index.IndexWriterConfig;
- import org.apache.lucene.store.Directory;
- import org.apache.lucene.store.FSDirectory;
- public class Indexer {
-     private IndexWriter writer; // writes documents into the index
- 
-     /**
-      * Opens an IndexWriter over the given index directory.
-      * @param indexDir filesystem path where the index is stored
-      * @throws Exception if the index directory cannot be opened
-      */
-     public Indexer(String indexDir) throws Exception {
-         Directory dir = FSDirectory.open(Paths.get(indexDir));
-         Analyzer analyzer = new StandardAnalyzer(); // standard analyzer (whitespace/latin tokenization)
-         IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
-         writer = new IndexWriter(dir, iwc);
-     }
- 
-     /**
-      * Closes the underlying IndexWriter, committing pending documents.
-      * @throws Exception if the writer cannot be closed
-      */
-     public void close() throws Exception {
-         writer.close();
-     }
- 
-     /**
-      * Indexes every regular file directly under dataDir (not recursive).
-      * @param dataDir directory whose files are indexed
-      * @return total number of documents now in the index
-      * @throws Exception if dataDir is not a readable directory or a file fails to index
-      */
-     public int index(String dataDir) throws Exception {
-         File[] files = new File(dataDir).listFiles();
-         // listFiles() returns null when dataDir is missing or not a directory;
-         // fail fast with a clear message instead of an opaque NPE.
-         if (files == null) {
-             throw new IllegalArgumentException("Not a readable directory: " + dataDir);
-         }
-         for (File f : files) {
-             if (f.isFile()) { // skip subdirectories; a FileReader on a directory would fail
-                 indexFile(f);
-             }
-         }
-         return writer.numDocs();
-     }
- 
-     /**
-      * Adds a single file to the index.
-      * @param f file to index
-      */
-     private void indexFile(File f) throws Exception {
-         System.out.println("索引文件:" + f.getCanonicalPath());
-         writer.addDocument(getDocument(f));
-     }
- 
-     /**
-      * Builds a Document with three fields: the tokenized file contents plus
-      * the stored file name and full path (stored so search results can show them).
-      * @param f source file
-      */
-     private Document getDocument(File f) throws Exception {
-         Document doc = new Document();
-         // NOTE(review): FileReader uses the platform default charset; use an
-         // InputStreamReader with an explicit charset if files may be UTF-8.
-         doc.add(new TextField("contents", new FileReader(f)));
-         doc.add(new TextField("fileName", f.getName(), Field.Store.YES));
-         doc.add(new TextField("fullPath", f.getCanonicalPath(), Field.Store.YES));
-         return doc;
-     }
- 
-     public static void main(String[] args) {
-         String indexDir = "D:\\lucene\\searchindex";
-         String dataDir = "D:\\lucene\\data";
-         Indexer indexer = null;
-         int numIndexed = 0;
-         long start = System.currentTimeMillis();
-         try {
-             indexer = new Indexer(indexDir);
-             numIndexed = indexer.index(dataDir);
-         } catch (Exception e) {
-             e.printStackTrace();
-         } finally {
-             // indexer stays null if the constructor threw; guard against NPE
-             if (indexer != null) {
-                 try {
-                     indexer.close();
-                 } catch (Exception e) {
-                     e.printStackTrace();
-                 }
-             }
-         }
-         long end = System.currentTimeMillis();
-         System.out.println("索引:" + numIndexed + "个文件 花费了" + (end - start) + "毫秒");
-     }
- }
读取索引
- import java.nio.file.Paths;
- import org.apache.lucene.analysis.Analyzer;
- import org.apache.lucene.analysis.standard.StandardAnalyzer;
- import org.apache.lucene.document.Document;
- import org.apache.lucene.index.DirectoryReader;
- import org.apache.lucene.index.IndexReader;
- import org.apache.lucene.index.Term;
- import org.apache.lucene.queryparser.classic.QueryParser;
- import org.apache.lucene.search.FuzzyQuery;
- import org.apache.lucene.search.IndexSearcher;
- import org.apache.lucene.search.Query;
- import org.apache.lucene.search.ScoreDoc;
- import org.apache.lucene.search.TermQuery;
- import org.apache.lucene.search.TopDocs;
- import org.apache.lucene.store.Directory;
- import org.apache.lucene.store.FSDirectory;
- import org.junit.After;
- import org.junit.Before;
- import org.junit.Test;
- /**
-  * JUnit fixture: opens an IndexSearcher over the on-disk index before each
-  * test and releases the reader afterwards.
-  */
- public class SearchTest {
- // directory abstraction over the on-disk index
- private Directory dir;
- private IndexReader reader;
- private IndexSearcher is; // searcher shared by all test methods
- // runs before every test: open the index previously built by Indexer
- @Before
- public void setUp() throws Exception {
- dir=FSDirectory.open(Paths.get("D:\\lucene\\searchindex"));
- reader=DirectoryReader.open(dir);
- is=new IndexSearcher(reader);
- }
- // runs after every test: close the reader to free index resources
- @After
- public void tearDown() throws Exception {
- reader.close();
- }
- }
对特定单词查询和模糊查询
- /**
-  * Exact-term lookup versus fuzzy lookup on the "contents" field.
-  *
-  * @throws Exception on any Lucene search error
-  */
- @Test
- public void testTermQuery() throws Exception {
-     String searchField = "contents";
-     // a TermQuery matches a single indexed token exactly; anything else finds nothing
-     String word = "authorship";
-     Query exact = new TermQuery(new Term(searchField, word));
-     // FuzzyQuery's second argument is the maximum edit distance (0, 1 or 2):
-     // how many character edits the matched token may differ by
-     Query fuzzy = new FuzzyQuery(new Term(searchField, "authorshioo"), 1);
-     TopDocs hits = is.search(exact, 10);
-     // totalHits = number of documents containing the given word
-     System.out.println("匹配'" + word + "', 总共查询到" + hits.totalHits + "个文档");
-     printFullPaths(hits);
-     TopDocs fuzzyHits = is.search(fuzzy, 10);
-     System.out.println("匹配'" + "authorshioo" + "', 总共查询到" + fuzzyHits.totalHits + "个文档");
-     printFullPaths(fuzzyHits);
- }
- 
- /** Prints the stored fullPath field of every hit in the given result set. */
- private void printFullPaths(TopDocs hits) throws Exception {
-     for (ScoreDoc sd : hits.scoreDocs) {
-         System.out.println(is.doc(sd.doc).get("fullPath"));
-     }
- }
我们上面查询了单词 authorship 以及模糊查询了单词 "authorshioo", 结果如下:
可以看到只在 LICENSE.txt 文档下找到该单词
那么模糊查询为什么查不到单词 "authorshioo" 呢? 这是因为我们在这里把允许错误的字符数设置为 1 个, 而单词 "authorshioo" 与索引中的 "authorship" 相差 2 个编辑距离(一次替换加一次插入), 超出了允许的范围, 所以就查不到
Query query2 = new FuzzyQuery(new Term(searchField,"authorshioo"),1);
解析表达式的使用
- /**
- * 解析查询表达式
- *
- * @throws Exception
- */
- @Test
- public void testQueryParser() throws Exception {
- // 标准分词器
- Analyzer analyzer = new StandardAnalyzer();
- String searchField = "contents";
- String q = "atomic a atomicReader";
- String q2 = "AtomicReader and AtomicReaderContext";
- // 建立查询解析器
- //searchField: 要查询的字段;
- //analyzer: 标准分词器实例
- QueryParser parser = new QueryParser(searchField, analyzer);
- Query query = parser.parse(q);
- // 返回查询到的前 10 项(查到 100 个相关内容的话也只会返回 10 个)
- TopDocs hits = is.search(query, 10);
- System.out.println("匹配" + q + "查询到" + hits.totalHits + "个记录");
- for (ScoreDoc scoreDoc : hits.scoreDocs) {
- Document doc = is.doc(scoreDoc.doc);
- System.out.println(doc.get("fullPath"));
- }
- QueryParser parser2 = new QueryParser(searchField, analyzer);
- Query query2 = parser2.parse(q2);
- // 返回查询到的前 10 项(查到 100 个相关内容的话也只会返回 10 个)
- TopDocs hits2 = is.search(query2, 10);
- System.out.println("匹配" + q2 + "查询到" + hits2.totalHits + "个记录");
- for (ScoreDoc scoreDoc : hits2.scoreDocs) {
- Document doc = is.doc(scoreDoc.doc);
- System.out.println(doc.get("fullPath"));
- }
- }
我们上面分别查询了: atomic a atomicReader 和 AtomicReader and AtomicReaderContext, 通过查询结果可以看出即使稍微改变查询内容, 也还是可以查询到和我们给出的表达式相关的文档
三 中文查询及高亮
写索引
- import java.nio.file.Paths;
- import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
- import org.apache.lucene.document.Document;
- import org.apache.lucene.document.Field;
- import org.apache.lucene.document.StringField;
- import org.apache.lucene.document.TextField;
- import org.apache.lucene.index.IndexWriter;
- import org.apache.lucene.index.IndexWriterConfig;
- import org.apache.lucene.store.Directory;
- import org.apache.lucene.store.FSDirectory;
- public class Indexer {
-     // sample data: one document per city (parallel arrays joined by index)
-     private String[] ids={"1","2","3"};
-     private String citys[]={"青岛","南京","上海"};
-     private String descs[]={
-             "青岛是一个漂亮的城市",
-             "南京是一个文化的城市",
-             "上海是一个繁华的城市"
-     };
-     private Directory dir;
- 
-     /**
-      * Creates an IndexWriter over {@code dir} using the Chinese analyzer.
-      * @return a new IndexWriter (caller is responsible for closing it)
-      * @throws Exception if the writer cannot be created
-      */
-     private IndexWriter getWriter() throws Exception {
-         // CJK-aware tokenizer; must also be used at query time
-         SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer();
-         IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
-         return new IndexWriter(dir, iwc);
-     }
- 
-     /**
-      * Writes one document per sample city into the given index directory.
-      * @param indexDir filesystem path of the index
-      * @throws Exception if the index cannot be written
-      */
-     private void index(String indexDir) throws Exception {
-         dir = FSDirectory.open(Paths.get(indexDir));
-         // try-with-resources: the writer is closed (and changes committed)
-         // even if addDocument throws — the original leaked it on exception
-         try (IndexWriter writer = getWriter()) {
-             for (int i = 0; i < ids.length; i++) {
-                 Document doc = new Document();
-                 // id and city are exact-match keys -> StringField (not tokenized)
-                 doc.add(new StringField("id", ids[i], Field.Store.YES));
-                 doc.add(new StringField("city", citys[i], Field.Store.YES));
-                 // desc is full-text searchable -> TextField (tokenized), stored for display
-                 doc.add(new TextField("desc", descs[i], Field.Store.YES));
-                 writer.addDocument(doc);
-             }
-         }
-     }
- 
-     public static void main(String[] args) throws Exception {
-         new Indexer().index("D:\\lucene\\dataindex2");
-         System.out.println("Success Indexer");
-     }
- }
中文查询及高亮显示
- import java.io.StringReader;
- import java.nio.file.Paths;
- import org.apache.lucene.analysis.TokenStream;
- import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
- import org.apache.lucene.document.Document;
- import org.apache.lucene.index.DirectoryReader;
- import org.apache.lucene.index.IndexReader;
- import org.apache.lucene.queryparser.classic.QueryParser;
- import org.apache.lucene.search.IndexSearcher;
- import org.apache.lucene.search.Query;
- import org.apache.lucene.search.ScoreDoc;
- import org.apache.lucene.search.TopDocs;
- import org.apache.lucene.search.highlight.Fragmenter;
- import org.apache.lucene.search.highlight.Highlighter;
- import org.apache.lucene.search.highlight.QueryScorer;
- import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
- import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
- import org.apache.lucene.store.Directory;
- import org.apache.lucene.store.FSDirectory;
- /**
-  * Searches the Chinese-analyzed index and prints every hit with the matched
-  * keywords wrapped in red HTML highlighting tags.
-  *
-  * @author LXY
-  */
- public class SearchTest {
- 
-     /**
-      * Runs a query over the "desc" field and prints the city, the description
-      * and a highlighted fragment for every hit.
-      * @param indexDir path of the index created by Indexer
-      * @param par      query text (e.g. a city name)
-      * @throws Exception on index access or query parse failure
-      */
-     public static void search(String indexDir, String par) throws Exception {
-         // open the directory holding the index files
-         Directory dir = FSDirectory.open(Paths.get(indexDir));
-         // try-with-resources: the reader is released even when parsing or
-         // searching throws — the original leaked it on exception
-         try (IndexReader reader = DirectoryReader.open(dir)) {
-             IndexSearcher searcher = new IndexSearcher(reader);
-             // must match the analyzer used at index time
-             SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer();
-             // first argument: field to query; second: analyzer
-             QueryParser parser = new QueryParser("desc", analyzer);
-             Query query = parser.parse(par);
-             long start = System.currentTimeMillis();
-             // return at most the 10 best-scoring documents
-             TopDocs topDocs = searcher.search(query, 10);
-             long end = System.currentTimeMillis();
-             System.out.println("匹配"+par+", 总共花费了"+(end-start)+"毫秒, 共查到"+topDocs.totalHits+"条记录");
-             // --- highlighting setup ---
-             QueryScorer scorer = new QueryScorer(query);              // scores candidate fragments
-             Fragmenter fragmenter = new SimpleSpanFragmenter(scorer); // selects the best-scoring span
-             // wrap matched terms between the given prefix and suffix tags
-             SimpleHTMLFormatter simpleHTMLFormatter =
-                     new SimpleHTMLFormatter("<b><font color='red'>", "</font></b>");
-             Highlighter highlighter = new Highlighter(simpleHTMLFormatter, scorer);
-             highlighter.setTextFragmenter(fragmenter);
-             // each ScoreDoc carries a document id plus its relevance score
-             for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
-                 Document document = searcher.doc(scoreDoc.doc);
-                 System.out.println(document.get("city"));
-                 System.out.println(document.get("desc"));
-                 String desc = document.get("desc");
-                 if (desc != null) {
-                     // re-analyze the stored text so the highlighter can locate
-                     // the matched tokens inside it
-                     TokenStream tokenStream = analyzer.tokenStream("desc", new StringReader(desc));
-                     System.out.println(highlighter.getBestFragment(tokenStream, desc));
-                 }
-             }
-         }
-     }
- 
-     // manual entry point for trying the search
-     public static void main(String[] args) {
-         // index built by the companion Indexer class
-         String indexDir = "D:\\lucene\\dataindex2";
-         // keyword to search for
-         String par = "南京";
-         try {
-             search(indexDir, par);
-         } catch (Exception e) {
-             e.printStackTrace();
-         }
-     }
- }
结果会把我们查询的南京单词给高亮显示, 这在我们平时搜索中很常见了
我们平时搜索中的高亮就像下图:
来源: https://juejin.im/post/5ac0a066f265da238155c4f7