国产探花免费观看_亚洲丰满少妇自慰呻吟_97日韩有码在线_资源在线日韩欧美_一区二区精品毛片,辰东完美世界有声小说,欢乐颂第一季,yy玄幻小说排行榜完本

首頁 > 學院 > 開發設計 > 正文

Lucene基礎(三)--中文分詞及高亮顯示

2019-11-14 15:23:15
字體:
來源:轉載
供稿:網友

Lucene分詞器及高亮

分詞器

在lucene中我們按照分詞方式把文檔進行索引,不同的分詞器索引的效果不太一樣。之前的例子使用的都是標準分詞器,對于英文的效果很好,但是中文分詞效果就不怎麼樣:它會按照漢字逐字直接分詞,沒有詞語的概念。

使用分詞的地方只需要把Analyzer實例化成我們第三方的分詞器即可

中文分詞有很多,這里使用IKAnalyzer 為例, 
下載地址 https://git.oschina.net/wltea/IK-Analyzer-2012FF ,下載下來後,裡面有一篇教程。

高亮

導入lucene-highlighter-xxx.jar,再對查詢出來的結果實現高亮顯示

// HTML tags wrapped around every highlighted keyword; requires lucene-highlighter-xxx.jar
  SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<span style='color:red'>", "</span>");
  Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(query));

  for (int i = 0; i < hits.length; i++) {
    Document doc = isearcher.doc(hits[i].doc);

    // Re-tokenize the stored content so the highlighter can find the query terms in it
    TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(doc.get("content")));
    String content = highlighter.getBestFragment(tokenStream, doc.get("content"));
    System.out.println(content);
  }

 

Lucene中文分詞器

package lucene_demo04;

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryWrapperFilter;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;

/**
* 中文分詞,IKAnalayzer,對索引結果實現高亮顯示
*
* @author YipFun
*/
public class LuceneDemo04
{
  private static final Version version = Version.LUCENE_4_9;
  private Directory directory = null;
  private DirectoryReader ireader = null;
  private IndexWriter iwriter = null;
  private IKAnalyzer analyzer;

  // 測試數據
  private String[] content = { "你好,我是中共人", "中華人民共和國", "中國人民從此站起來了", "Lucene是一個不錯的全文檢索的工具", "全文檢索中文分詞" };

  /**
   * 構造方法
   */
  public LuceneDemo04()
  {
    directory = new RAMDirectory();
  }

  private IKAnalyzer getAnalyzer()
  {
    if (analyzer == null)
    {
      return new IKAnalyzer();
    } else
    {
      return analyzer;
    }
  }

  /**
  * 創建索引
  */
  public void createIndex()
  {
    Document doc = null;
    try
    {
      IndexWriterConfig iwConfig = new IndexWriterConfig(version, getAnalyzer());
      iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
      iwriter = new IndexWriter(directory, iwConfig);
      for (String text : content)
      {
      doc = new Document();
      doc.add(new TextField("content", text, Field.Store.YES));
      iwriter.addDocument(doc);
    }

  } catch (IOException e)
  {
    e.printStackTrace();
  } finally
  {
  try
  {
    if (iwriter != null)
    iwriter.close();
  } catch (IOException e)
  {
    e.printStackTrace();
  }
  }

  }

  public IndexSearcher getSearcher()
  {
    try
    {
      if (ireader == null)
      {
        ireader = DirectoryReader.open(directory);
      } else
      {
        DirectoryReader tr = DirectoryReader.openIfChanged(ireader);
        if (tr != null)
        {
          ireader.close();
          ireader = tr;
        }
      }
      return new IndexSearcher(ireader);
    } catch (CorruptIndexException e)
    {
      e.printStackTrace();
    } catch (IOException e)
    {
      e.printStackTrace();
    }
    return null;
  }

  public void searchByTerm(String field, String keyWord, int num) throws InvalidTokenOffsetsException
  {
    IndexSearcher isearcher = getSearcher();
    Analyzer analyzer = getAnalyzer();
    // 使用QueryParser查詢分析器構造Query對象
    QueryParser qp = new QueryParser(version, field, analyzer);
    // 這句所起效果?
    qp.setDefaultOperator(QueryParser.OR_OPERATOR);
    try
    {
      Query query = qp.parse(keyword);
      ScoreDoc[] hits;

      // 注意searcher的幾個方法
      hits = isearcher.search(query, null, num).scoreDocs;

      // 關鍵字高亮顯示的html標簽,需要導入lucene-highlighter-xxx.jar
      SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<span style='color:red'>", "</span>");
      Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(query));

      for (int i = 0; i < hits.length; i++)
      {
        Document doc = isearcher.doc(hits[i].doc);
        // 內容增加高亮顯示
        TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(doc.get("content")));
        String content = highlighter.getBestFragment(tokenStream, doc.get("content"));
        System.out.println(content);
      }

    } catch (IOException e)
    {
      e.printStackTrace();
    } catch (ParseException e)
    {
      e.printStackTrace();
    }
  }

  /**
  * 使用過濾器查詢
  *
  * @param field
  * @param keyword
  * @param num
  * @throws InvalidTokenOffsetsException
  */
  public void searchByTermFilter(String field, String keyword, int num) throws InvalidTokenOffsetsException
  {
    IndexSearcher isearcher = getSearcher();
    Analyzer analyzer = getAnalyzer();
    // 使用QueryParser查詢分析器構造Query對象
    QueryParser qp = new QueryParser(version, field, analyzer);
    // 這句所起效果?
    qp.setDefaultOperator(QueryParser.OR_OPERATOR);
    try
    {
      Query query = qp.parse(keyword);
      Query q2 = qp.parse("全文檢索");
      ScoreDoc[] hits;

      QueryWrapperFilter filter = new QueryWrapperFilter(q2);
      // 注意searcher的幾個方法
      hits = isearcher.search(query, filter, num).scoreDocs;

      // 關鍵字高亮顯示的html標簽,需要導入lucene-highlighter-xxx.jar
      SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<span style='color:red'>", "</span>");
      Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(query));

      for (int i = 0; i < hits.length; i++)
      {
        Document doc = isearcher.doc(hits[i].doc);
        // 內容增加高亮顯示
        TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(doc.get("content")));
        String content = highlighter.getBestFragment(tokenStream, doc.get("content"));
        System.out.println(content);
      }

    } catch (IOException e)
    {
      e.printStackTrace();
    } catch (ParseException e)
    {
      e.printStackTrace();
    }
  }

  public static void main(String[] args) throws InvalidTokenOffsetsException
  {
    System.out.println("start");
    LuceneDemo04 ld = new LuceneDemo04();
    ld.createIndex();
    long start = System.currentTimeMillis();
    ld.searchByTerm("content", "人民", 500);
    System.out.println("end search use " + (System.currentTimeMillis() - start) + "ms");
  }

}

運行結果:

 

start 加載擴展詞典:ext.dic

加載擴展停止詞典:stopword.dic

中華<span style='color:red'>人民</span>共和國

中國<span style='color:red'>人民</span>從此站起來了

end search use 129ms


發表評論 共有條評論
用戶名: 密碼:
驗證碼: 匿名發表
主站蜘蛛池模板: 阿拉善右旗| 桂林市| 开江县| 迁安市| 苍梧县| 同心县| 涟源市| 和硕县| 玛沁县| 沙田区| 穆棱市| 广饶县| 石台县| 台江县| 罗山县| 临泽县| 五常市| 惠安县| 句容市| 安丘市| 乳源| 什邡市| 临泉县| 根河市| 古蔺县| 樟树市| 体育| 安泽县| 新余市| 清远市| 皋兰县| 诏安县| 辽阳市| 湘潭市| 兰考县| 宜章县| 称多县| 页游| 新民市| 昌宁县| 天祝|