查詢要快,就要使用索引(Index),就像書本後面常有的索引表:用索引表找一個字,一定比從書本第一頁找起快得多;作為一個 Information Retrieval(IR)Library,除了查詢功能外,還要有建立索引的功能。
建立索引 123
- Document - Document 由一堆 Field 組成,Field 由 name-value 組成,value 就是放索引文字的地方,name 自行定義,Document 可以看成是 Java 的 Map。
- Analyzer - 拆字,將 Document 裡的 Field value 拆成關鍵字。
- IndexWriter - 將關鍵字寫到索引檔,就是 Directory 裡。
查詢索引 123
- IndexSearcher - 用 IndexReader 開啟索引檔 Directory。
- Query - 以關鍵字建立 Query 物件。
- TopDocs - 將 Query 物件丟給 IndexSearcher 去做查詢,得到結果 TopDocs。
以下範例以 Lucene Demo 做修改而來。
public class HelloLucene { private static final String F_ID = "id"; private static final String F_NAME = "name"; private static final String F_ADDRESS = "address"; private static final String F_CITY = "city"; private static final Version VERSION = Version.LUCENE_36; private String indexPath = "index"; public static void main(String[] args) throws IOException { HelloLucene hello = new HelloLucene(); hello.index(); hello.search(HelloLucene.F_NAME, "George"); hello.search(HelloLucene.F_ADDRESS, "College"); hello.search(HelloLucene.F_CITY, "Oslo"); } /** * 建立索引檔 */ public void index() { IndexWriter writer = null; try { // 使用標準 Analyzer 來拆字 Analyzer analyzer = new StandardAnalyzer(HelloLucene.VERSION); // 建立設定檔,Lucene 3.1 以後才有的 IndexWriterConfig config = new IndexWriterConfig(HelloLucene.VERSION, analyzer); // 新增索引檔 config.setOpenMode(OpenMode.CREATE); // config.setOpenMode(OpenMode.CREATE_OR_APPEND); // Optional: for better indexing performance, if you // are indexing many documents, increase the RAM // buffer. But if you do this, increase the max heap // size to the JVM (eg add -Xmx512m or -Xmx1g): // config.setRAMBufferSizeMB(256.0); // 建立 IndexWriter writer = new IndexWriter(FSDirectory.open(new File(this.indexPath)), config); // 建立索引 this.indexAddress(writer); // NOTE: if you want to maximize search performance, // you can optionally call forceMerge here. 
This can be // a terribly costly operation, so generally it's only // worth it when your index is relatively static (ie // you're done adding documents to it): // writer.forceMerge(1); } catch (IOException e) { e.printStackTrace(); } finally { if (writer != null) { try { writer.close(); } catch (IOException e) { e.printStackTrace(); } } } } /** * 建立地址索引檔 * * @param writer * @throws IOException */ private void indexAddress(IndexWriter writer) throws IOException { // 讀進地址資料 List<Map<String, String>> list = this.loadAddress(); // 逐筆建立索引 Document doc; String name; for (Map<String, String> map : list) { doc = new Document(); name = map.get("firstname") + " " + map.get("lastname"); // 姓名要可以查詢所以要拆字,列表要顯示所以要儲存 doc.add(new Field(HelloLucene.F_NAME, name, Field.Store.YES, Field.Index.ANALYZED)); // 地址要可以查詢所以要拆字,列表不用顯示所以不要儲存 doc.add(new Field(HelloLucene.F_ADDRESS, map.get("address"), Field.Store.NO, Field.Index.ANALYZED)); // 城市要可以查詢所以要拆字,列表要顯示所以要儲存 doc.add(new Field(HelloLucene.F_CITY, map.get("city"), Field.Store.YES, Field.Index.ANALYZED)); // 數字欄位 NumericField modifiedField = new NumericField(HelloLucene.F_ID, Field.Store.YES, true); modifiedField.setIntValue(Integer.parseInt(map.get("id"))); doc.add(modifiedField); // 建立或更新索引 if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { System.out.println("adding " + name); writer.addDocument(doc); } else { System.out.println("updating " + name); writer.updateDocument(new Term(HelloLucene.F_ID, map.get("id")), doc); } } } /** * 讀進地址資料 * * @return * @throws IOException */ private List<Map<String, String>> loadAddress() throws IOException { CsvReader reader = new CsvReader("src/address.csv"); reader.readHeaders(); List<Map<String, String>> list = new ArrayList<Map<String, String>>(); Map<String, String> map; while (reader.readRecord()) { map = new HashMap<String, String>(); map.put("id", reader.get("id")); map.put("firstname", StringUtils.strip(reader.get("firstname"), "'")); map.put("lastname", 
StringUtils.strip(reader.get("lastname"), "'")); map.put("address", StringUtils.strip(reader.get("address"), "'")); map.put("city", StringUtils.strip(reader.get("city"), "'")); list.add(map); } reader.close(); return list; } /** * 查詢索引檔 * * @param field * @param q */ public void search(String field, String q) { IndexReader reader = null; IndexSearcher searcher = null; try { // 開啟索引檔 reader = IndexReader.open(FSDirectory.open(new File(this.indexPath))); // 建立 IndexSearcher searcher = new IndexSearcher(reader); // 使用標準 Analyzer,要與建立索引的 Analyzer 一致 // 不然結果會很怪,就像是用不同語言溝通一樣 Analyzer analyzer = new StandardAnalyzer(VERSION); // 查詢 QueryParser parser = new QueryParser(VERSION, field, analyzer); Query query = parser.parse(q); TopDocs results = searcher.search(query, 100); ScoreDoc[] hits = results.scoreDocs; int totalHits = results.totalHits; System.out.println(totalHits + " total matching documents"); int idx = 1; for (ScoreDoc hit : hits) { Document doc = searcher.doc(hit.doc); String id = doc.get(HelloLucene.F_ID); System.out.println(idx++ + ". " + id + " Name: " + doc.get(HelloLucene.F_NAME) + " Address: " + doc.get(HelloLucene.F_ADDRESS) + " City: " + doc.get(HelloLucene.F_CITY)); } } catch (IOException e) { e.printStackTrace(); } catch (ParseException e) { e.printStackTrace(); } finally { if (reader != null) { try { reader.close(); } catch (IOException e) { e.printStackTrace(); } } if (searcher != null) { try { searcher.close(); } catch (IOException e) { e.printStackTrace(); } } } } }
沒有留言:
張貼留言