Java Artisan / Neil Chan: Hello Lucene 3.6.0

Lucene 不是一個應用程式，只是 Library，用來建立與查詢索引。

查詢要快，就要使用索引（Index）。就像書本後面常有的索引表，用索引表來找一個字，一定比從書本第一頁找起快得多；作為一個 Information Retrieval（IR）Library，除了查詢功能外，還要有建立索引的功能。

建立索引 123

Document - Document 由一堆 Field 組成，Field 由 name-value 組成，value 就是放索引文字的地方，name 自行定義，Document 可以看成是 Java 的 Map。
Analyzer - 拆字，將 Document 裡的 Field value 拆成關鍵字。
IndexWriter - 將關鍵字寫到索引檔，就是 Directory 裡。

查詢索引 123

IndexSearcher - 用 IndexReader 開啟索引檔 Directory。
Query - 以關鍵字建立 Query 物件。
TopDocs - 將 Query 物件丟給 IndexSearcher 去做查詢，得到結果 TopDocs。

以下範例以 Lucene Demo 做修改而來。

public class HelloLucene {

  private static final String F_ID = "id";
  private static final String F_NAME = "name";
  private static final String F_ADDRESS = "address";
  private static final String F_CITY = "city";
  private static final Version VERSION = Version.LUCENE_36;
  private String indexPath = "index";

  public static void main(String[] args) throws IOException {
    HelloLucene hello = new HelloLucene();
    hello.index();
    hello.search(HelloLucene.F_NAME, "George");
    hello.search(HelloLucene.F_ADDRESS, "College");
    hello.search(HelloLucene.F_CITY, "Oslo");
  }

  /**
   * 建立索引檔
   */
  public void index() {
    IndexWriter writer = null;
    try {
      // 使用標準 Analyzer 來拆字
      Analyzer analyzer = new StandardAnalyzer(HelloLucene.VERSION);
      // 建立設定檔，Lucene 3.1 以後才有的
      IndexWriterConfig config = new IndexWriterConfig(HelloLucene.VERSION,
          analyzer);
      // 新增索引檔
      config.setOpenMode(OpenMode.CREATE);
      // config.setOpenMode(OpenMode.CREATE_OR_APPEND);

      // Optional: for better indexing performance, if you
      // are indexing many documents, increase the RAM
      // buffer.  But if you do this, increase the max heap
      // size to the JVM (eg add -Xmx512m or -Xmx1g):
      // config.setRAMBufferSizeMB(256.0);

      // 建立 IndexWriter
      writer = new IndexWriter(FSDirectory.open(new File(this.indexPath)),
          config);

      // 建立索引
      this.indexAddress(writer);

      // NOTE: if you want to maximize search performance,
      // you can optionally call forceMerge here.  This can be
      // a terribly costly operation, so generally it's only
      // worth it when your index is relatively static (ie
      // you're done adding documents to it):
      // writer.forceMerge(1);
    }
    catch (IOException e) {
      e.printStackTrace();
    }
    finally {
      if (writer != null) {
        try {
          writer.close();
        }
        catch (IOException e) {
          e.printStackTrace();
        }
      }
    }
  }

  /**
   * 建立地址索引檔
   * 
   * @param writer
   * @throws IOException
   */
  private void indexAddress(IndexWriter writer) throws IOException {
    // 讀進地址資料
    List<Map<String, String>> list = this.loadAddress();
    // 逐筆建立索引
    Document doc;
    String name;
    for (Map<String, String> map : list) {
      doc = new Document();
      name = map.get("firstname") + " " + map.get("lastname");
      // 姓名要可以查詢所以要拆字，列表要顯示所以要儲存
      doc.add(new Field(HelloLucene.F_NAME, name, Field.Store.YES,
          Field.Index.ANALYZED));
      // 地址要可以查詢所以要拆字，列表不用顯示所以不要儲存
      doc.add(new Field(HelloLucene.F_ADDRESS, map.get("address"),
          Field.Store.NO, Field.Index.ANALYZED));
      // 城市要可以查詢所以要拆字，列表要顯示所以要儲存
      doc.add(new Field(HelloLucene.F_CITY, map.get("city"), Field.Store.YES,
          Field.Index.ANALYZED));
      // 數字欄位
      NumericField modifiedField = new NumericField(HelloLucene.F_ID,
          Field.Store.YES, true);
      modifiedField.setIntValue(Integer.parseInt(map.get("id")));
      doc.add(modifiedField);
      // 建立或更新索引
      if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
        System.out.println("adding " + name);
        writer.addDocument(doc);
      }
      else {
        System.out.println("updating " + name);
        writer.updateDocument(new Term(HelloLucene.F_ID, map.get("id")), doc);
      }
    }
  }

  /**
   * 讀進地址資料
   * 
   * @return
   * @throws IOException
   */
  private List<Map<String, String>> loadAddress() throws IOException {
    CsvReader reader = new CsvReader("src/address.csv");
    reader.readHeaders();
    List<Map<String, String>> list = new ArrayList<Map<String, String>>();
    Map<String, String> map;
    while (reader.readRecord()) {
      map = new HashMap<String, String>();
      map.put("id", reader.get("id"));
      map.put("firstname", StringUtils.strip(reader.get("firstname"), "'"));
      map.put("lastname", StringUtils.strip(reader.get("lastname"), "'"));
      map.put("address", StringUtils.strip(reader.get("address"), "'"));
      map.put("city", StringUtils.strip(reader.get("city"), "'"));
      list.add(map);
    }
    reader.close();
    return list;
  }

  /**
   * 查詢索引檔
   * 
   * @param field
   * @param q
   */
  public void search(String field, String q) {
    IndexReader reader = null;
    IndexSearcher searcher = null;
    try {
      // 開啟索引檔
      reader = IndexReader.open(FSDirectory.open(new File(this.indexPath)));
      // 建立 IndexSearcher
      searcher = new IndexSearcher(reader);
      // 使用標準 Analyzer，要與建立索引的 Analyzer 一致
      // 不然結果會很怪，就像是用不同語言溝通一樣
      Analyzer analyzer = new StandardAnalyzer(VERSION);
      // 查詢
      QueryParser parser = new QueryParser(VERSION, field, analyzer);
      Query query = parser.parse(q);
      TopDocs results = searcher.search(query, 100);
      ScoreDoc[] hits = results.scoreDocs;
      int totalHits = results.totalHits;
      System.out.println(totalHits + " total matching documents");
      int idx = 1;
      for (ScoreDoc hit : hits) {
        Document doc = searcher.doc(hit.doc);
        String id = doc.get(HelloLucene.F_ID);
        System.out.println(idx++ + ". " + id + "   Name: "
            + doc.get(HelloLucene.F_NAME) + "   Address: "
            + doc.get(HelloLucene.F_ADDRESS) + "   City: "
            + doc.get(HelloLucene.F_CITY));
      }
    }
    catch (IOException e) {
      e.printStackTrace();
    }
    catch (ParseException e) {
      e.printStackTrace();
    }
    finally {
      if (reader != null) {
        try {
          reader.close();
        }
        catch (IOException e) {
          e.printStackTrace();
        }
      }
      if (searcher != null) {
        try {
          searcher.close();
        }
        catch (IOException e) {
          e.printStackTrace();
        }
      }
    }
  }
}

2012-07-13

Hello Lucene 3.6.0

沒有留言:

張貼留言