
Lucene Advanced Search Engine Factory (Part 2)

Tags:
Java


Recap:
The previous article covered how to use the MongoDB driver. This one takes a detailed look at Lucene's full-text search.

1. Scheduled task: rebuilding the index

package com.noseparte.lucene;

import com.mongodb.client.FindIterable;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoDatabase;
import com.noseparte.mongo.MongoDBConfig;
import lombok.extern.slf4j.Slf4j;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.bson.Document;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import org.wltea.analyzer.lucene.IKAnalyzer;

import java.io.IOException;
import java.nio.file.FileSystems;

/**
 * Copyright © 2018 noseparte © BeiJing BoLuo Network Technology Co. Ltd.
 * @Author Noseparte
 * @Compile 2018-12-24 -- 16:08
 * @Version 1.0
 * @Description Rebuilds the Lucene index library on a schedule
 */
@Slf4j
@Component
public class LuceneIndexesFactory {

    public static String INDEX_PATH = "";
    public static String GEOGRAPHY_INDEX_PATH = "";

    /** Pick the on-disk location of the Lucene indexes based on the operating system */
    static {
        String osName = System.getProperty("os.name");
        if (osName.toLowerCase().startsWith("win")) {
            INDEX_PATH = "D:\\data\\lucene\\lucene-db";
            GEOGRAPHY_INDEX_PATH = "D:\\data\\lucene\\lucene-geography-db";
        } else {
            INDEX_PATH = "/data/lucene/lucene-db";
            GEOGRAPHY_INDEX_PATH = "/data/lucene/lucene-geography-db";
        }
    }

    @Autowired
    private MongoDBConfig mongoDBConfig;

    public static IndexSearcher init(String path) throws IOException {
        // 1. Open the index directory on disk
        Directory directory = FSDirectory.open(FileSystems.getDefault().getPath(path));
        // 2. Open an IndexReader over it
        DirectoryReader directoryReader = DirectoryReader.open(directory);
        // 3. Build the IndexSearcher from the reader
        return new IndexSearcher(directoryReader);
    }

    public static void main(String[] args) {
        // Quick local check of what os.name reports on this machine
        System.out.println(System.getProperty("os.name"));
    }

    /**
     * Raw documents ==> index: 1. fetch the documents; 2. build Lucene Document objects;
     * 3. analyze them (tokenize); 4. write the index.
     *
     * <p>Rebuilds the local index files from the database every five minutes. On creating
     * indexes with IndexWriter and tuning indexing speed, see
     * https://www.cnblogs.com/dacc123/p/8228298.html
     */
    @Scheduled(cron = "0 0/5 * * * ? ")
    public void updateIndex() {
        IndexWriter indexWriter = null;
        try {
            Directory directory = FSDirectory.open(FileSystems.getDefault().getPath(INDEX_PATH));
            // StandardAnalyzer splits on whitespace and punctuation; it also handles numbers,
            // letters, e-mail addresses, IP addresses and CJK characters, and supports stop-word
            // lists, covering what StopAnalyzer offers.
            // Analyzer analyzer = new StandardAnalyzer();
            // IKAnalyzer is a third-party Chinese analyzer that extends Lucene's Analyzer. It
            // offers dictionary-based full segmentation and forward/backward maximum-match
            // segmentation; 'true' selects smart (coarse-grained) mode.
            Analyzer analyzer = new IKAnalyzer(true);
            IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
            indexWriterConfig.setRAMBufferSizeMB(16.0);
            indexWriter = new IndexWriter(directory, indexWriterConfig);
            long deleteCount = indexWriter.deleteAll(); // drop the previous index contents
            log.info("Index cleared; deleteCount,{}", deleteCount);

            MongoDatabase database = mongoDBConfig.getDatabase("h5wx_login_server");
            MongoCollection<Document> collection = database.getCollection("player_tree_node_info");
            FindIterable<Document> documents = collection.find();
            for (Document cursor : documents) {
                org.apache.lucene.document.Document document = new org.apache.lucene.document.Document();
                document.add(new Field("id", cursor.getObjectId("_id").toString(), TextField.TYPE_STORED));
                document.add(new Field("playerId", cursor.getString("playerId"), TextField.TYPE_STORED));
                indexWriter.addDocument(document);
            }
        } catch (Exception e) {
            log.error("创建索引失败。 errorMsg,{}", e.getMessage());
        } finally {
            try {
                if (null != indexWriter) {
                    indexWriter.close();
                }
            } catch (Exception ex) {
                log.error("索引流关闭失败,error,{}", ex.getMessage());
            }
        }
    }

    @Scheduled(cron = "0 0/5 * * * ? ")
    public void updateGeographyIndex() {
        IndexWriter indexWriter = null;
        try {
            Directory directory =
                    FSDirectory.open(FileSystems.getDefault().getPath(GEOGRAPHY_INDEX_PATH));
            // StandardAnalyzer splits on whitespace and punctuation; it also handles numbers,
            // letters, e-mail addresses, IP addresses and CJK characters, and supports stop-word
            // lists, covering what StopAnalyzer offers.
            // Analyzer analyzer = new StandardAnalyzer();
            // IKAnalyzer is a third-party Chinese analyzer that extends Lucene's Analyzer;
            // 'true' selects smart (coarse-grained) mode.
            Analyzer analyzer = new IKAnalyzer(true);
            IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
            indexWriterConfig.setRAMBufferSizeMB(16.0);
            indexWriter = new IndexWriter(directory, indexWriterConfig);
            long deleteCount = indexWriter.deleteAll(); // drop the previous index contents
            log.info("Index cleared; deleteCount,{}", deleteCount);

            MongoDatabase database = mongoDBConfig.getDatabase("depth-search");
            MongoCollection<Document> collection = database.getCollection("mg_national_geography_repo");
            FindIterable<Document> documents = collection.find();
            for (Document cursor : documents) {
                org.apache.lucene.document.Document document = new org.apache.lucene.document.Document();
                document.add(new Field("id", cursor.getObjectId("_id").toString(), TextField.TYPE_STORED));
                document.add(new Field("scenery", cursor.getString("scenery"), TextField.TYPE_STORED));
                document.add(new Field("geography", cursor.getString("geography"), TextField.TYPE_STORED));
                document.add(new Field("title", cursor.getString("title"), TextField.TYPE_STORED));
                document.add(new Field("author", cursor.getString("author"), TextField.TYPE_STORED));
                indexWriter.addDocument(document);
            }
        } catch (Exception e) {
            log.error("创建索引失败。 errorMsg,{}", e.getMessage());
        } finally {
            try {
                if (null != indexWriter) {
                    indexWriter.close();
                }
            } catch (Exception ex) {
                log.error("索引流关闭失败,error,{}", ex.getMessage());
            }
        }
    }
}
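
The choice of analyzer decides what actually lands in the index. A quick way to see the difference is to print the tokens an analyzer emits. The sketch below (the class name AnalyzerDemo and the sample phrase are made up for illustration) runs a Chinese phrase through IKAnalyzer in smart mode, the same configuration the scheduled jobs above use:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.wltea.analyzer.lucene.IKAnalyzer;

public class AnalyzerDemo {
    public static void main(String[] args) throws Exception {
        // true = smart mode: coarse-grained segmentation instead of full permutation
        Analyzer analyzer = new IKAnalyzer(true);
        try (TokenStream ts = analyzer.tokenStream("content", "中国上海自由行攻略")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                System.out.println(term); // one line per token the index would store
            }
            ts.end();
        }
    }
}

Swapping in new StandardAnalyzer() and comparing the output makes it clear why a dictionary-based analyzer is preferred for Chinese text: StandardAnalyzer falls back to single-character tokens for CJK input.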

2. Searching the index by keyword

package com.noseparte.lucene;

import com.noseparte.base.bean.Response;
import com.noseparte.base.controller.BaseController;
import com.noseparte.redis.RedisConstant;
import com.noseparte.redis.RedisUtils;
import lombok.extern.slf4j.Slf4j;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.search.*;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.configurationprocessor.json.JSONObject;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
import org.wltea.analyzer.lucene.IKAnalyzer;

import java.util.ArrayList;
import java.util.List;

/**
 * Copyright © 2018 noseparte © BeiJing BoLuo Network Technology Co. Ltd.
 *
 * @Author Noseparte
 * @Compile 2018-12-25 -- 11:15
 * @Version 1.0
 * @Description Keyword-search endpoints over the Lucene indexes
 */
@Slf4j
@RestController
@RequestMapping("api/lucene")
public class LuceneSearchController extends BaseController {

    @Autowired
    private RedisUtils redisUtils;

    @PostMapping("/search")
    public Response retrieval(@RequestParam("keyword") String keyword){
        log.info("本次检索的关键词为:keyword,{} ========= ",keyword);
        List<String> result = new ArrayList<>();
        try{
            Analyzer analyzer = new IKAnalyzer(true);
            // A simple single-field query would look like:
            // Query query = new QueryParser("content", analyzer).parse(keyword);
            String[] fields = {"playerId"};
            // MUST behaves like AND, MUST_NOT like NOT, and SHOULD like OR
            BooleanClause.Occur[] clauses = {BooleanClause.Occur.SHOULD};
            // MultiFieldQueryParser parses against several fields at once and also handles
            // space-separated input such as "上海 中国"
            Query multiFieldQuery = MultiFieldQueryParser.parse(keyword, fields, clauses, analyzer);
            IndexSearcher indexSearcher = LuceneIndexesFactory.init(LuceneIndexesFactory.INDEX_PATH);
            // 5. Run the search and return the TopDocs
            TopDocs topDocs = indexSearcher.search(multiFieldQuery, 100);
            log.info("Total hits: hitsCount,{}", topDocs.totalHits);
            // 6. Get the ScoreDoc array from the TopDocs
            ScoreDoc[] scoreDocs = topDocs.scoreDocs;
            log.info("Matching documents returned: docLength,{}", scoreDocs.length);
            // The scorer must reference a field that actually exists in this index ("playerId",
            // not "content"). Note the highlighter is configured here but never applied to the
            // response; see the sketch after this class.
            QueryScorer scorer = new QueryScorer(multiFieldQuery, "playerId");
            SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter("<span style=\"color:red\">", "</span>");
            Highlighter highlighter = new Highlighter(htmlFormatter, scorer);
            highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer));
            for(ScoreDoc scoreDoc : scoreDocs){
                // 7. Resolve each ScoreDoc to its stored Document
                Document document = indexSearcher.doc(scoreDoc.doc);
                String id = document.get("id");
                String playerId = document.get("playerId");
                JSONObject jsonObject = new JSONObject();
                jsonObject.put("id",id);
                jsonObject.put("playerId",playerId);
                result.add(jsonObject.toString());
            }
            return getResponse().success(result);
        }catch (Exception e){
            log.error("检索失败, 异常原因:errorMsg,{}",e.getMessage());
            return getResponse().failure(e.getMessage());
        }
    }

    @PostMapping("/searchGeo")
    public Response retrievalGeo(@RequestParam("keyword") String keyword){
        log.info("本次检索的关键词为:keyword,{} ========= ",keyword);
        List<String> result = new ArrayList<>();
        // The field array must be exactly as long as the Occur array passed to
        // MultiFieldQueryParser, and every entry must be non-null; these four names match the
        // fields stored by updateGeographyIndex. (The original loaded the field names from
        // Redis, which left most of a ten-element array null and the two array lengths
        // mismatched, so the parse call would fail.)
        String[] fields = {"scenery", "geography", "title", "author"};
        try {
            Analyzer analyzer = new IKAnalyzer(true);

            // MUST behaves like AND, MUST_NOT like NOT, and SHOULD like OR
            BooleanClause.Occur[] clauses = {BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD};
            // MultiFieldQueryParser parses against several fields at once and also handles
            // space-separated input such as "上海 中国"
            Query multiFieldQuery = MultiFieldQueryParser.parse(keyword, fields, clauses, analyzer);
            IndexSearcher indexSearcher = LuceneIndexesFactory.init(LuceneIndexesFactory.GEOGRAPHY_INDEX_PATH);
            // 5. Run the search and return the TopDocs
            TopDocs topDocs = indexSearcher.search(multiFieldQuery, 100);
            log.info("Total hits: hitsCount,{}", topDocs.totalHits);
            // 6. Get the ScoreDoc array from the TopDocs
            ScoreDoc[] scoreDocs = topDocs.scoreDocs;
            log.info("Matching documents returned: docLength,{}", scoreDocs.length);
            // The field-less QueryScorer scores terms from every query field; as above, the
            // highlighter is configured but never applied to the response.
            QueryScorer scorer = new QueryScorer(multiFieldQuery);
            SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter("<span style=\"color:red\">", "</span>");
            Highlighter highlighter = new Highlighter(htmlFormatter, scorer);
            highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer));
            for(ScoreDoc scoreDoc : scoreDocs){
                // 7. Resolve each ScoreDoc to its stored Document
                Document document = indexSearcher.doc(scoreDoc.doc);
                JSONObject jsonObject = new JSONObject();
                jsonObject.put("id", document.get("id"));
                jsonObject.put("scenery", document.get("scenery"));
                jsonObject.put("geography", document.get("geography"));
                jsonObject.put("link", document.get("link"));
                jsonObject.put("title", document.get("title"));
                jsonObject.put("author", document.get("author"));
                jsonObject.put("authorUrl",document.get("authorUrl"));
                jsonObject.put("publishedTime",document.get("publishedTime"));
                jsonObject.put("createTime",document.get("createTime"));
                result.add(jsonObject.toString());
            }
            return getResponse().success(result);
        }catch (Exception e){
            log.error("检索失败, 异常原因:errorMsg,{}",e.getMessage());
            return getResponse().failure(e.getMessage());
        }
    }



}
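
Both endpoints construct a Highlighter but never call it, so the JSON they return contains no markup. Below is a minimal sketch of how the highlighter could actually be applied to a stored field value; the helper class name HighlightUtil is hypothetical, not part of the original project:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.SimpleSpanFragmenter;

public final class HighlightUtil {

    /**
     * Wraps the query's matching terms in the given stored-field value with red span tags.
     * Falls back to the raw text when nothing in it matches.
     */
    public static String highlight(Query query, Analyzer analyzer,
                                   String fieldName, String text) throws Exception {
        QueryScorer scorer = new QueryScorer(query, fieldName);
        Highlighter highlighter = new Highlighter(
                new SimpleHTMLFormatter("<span style=\"color:red\">", "</span>"), scorer);
        highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer));
        // getBestFragment returns null when the query matches nothing in this text
        String fragment = highlighter.getBestFragment(analyzer, fieldName, text);
        return fragment != null ? fragment : text;
    }
}

Inside the result loop one could then write, for any non-null stored value, something like jsonObject.put("title", HighlightUtil.highlight(multiFieldQuery, analyzer, "title", document.get("title"))) so the caller receives the matched terms already marked up.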
