查詢要快,就要使用索引(Index),就像是書本後面常有的索引表,用索引表來找一個字,一定比從書本第一頁找起快得多;作為一個 Information Retrieval(IR) Library,除了查詢功能外,還要有建立索引的功能。
建立索引 123
- Document - Document 由一堆 Field 組成,Field 由 name-value 組成,value 就是放索引文字的地方,name 自行定義,Document 可以看成是 Java 的 Map。
- Analyzer - 拆字,將 Document 裡的 Field value 拆成關鍵字。
- IndexWriter - 將關鍵字寫到索引檔,就是 Directory 裡。
查詢索引 123
- IndexSearcher - 用 IndexReader 開啟索引檔 Directory。
- Query - 以關鍵字建立 Query 物件。
- TopDocs - 將 Query 物件丟給 IndexSearcher 去做查詢,得到結果 TopDocs。
以下範例以 Lucene Demo 做修改而來。
public class HelloLucene {
private static final String F_ID = "id";
private static final String F_NAME = "name";
private static final String F_ADDRESS = "address";
private static final String F_CITY = "city";
private static final Version VERSION = Version.LUCENE_36;
private String indexPath = "index";
public static void main(String[] args) throws IOException {
HelloLucene hello = new HelloLucene();
hello.index();
hello.search(HelloLucene.F_NAME, "George");
hello.search(HelloLucene.F_ADDRESS, "College");
hello.search(HelloLucene.F_CITY, "Oslo");
}
/**
* 建立索引檔
*/
public void index() {
IndexWriter writer = null;
try {
// 使用標準 Analyzer 來拆字
Analyzer analyzer = new StandardAnalyzer(HelloLucene.VERSION);
// 建立設定檔,Lucene 3.1 以後才有的
IndexWriterConfig config = new IndexWriterConfig(HelloLucene.VERSION,
analyzer);
// 新增索引檔
config.setOpenMode(OpenMode.CREATE);
// config.setOpenMode(OpenMode.CREATE_OR_APPEND);
// Optional: for better indexing performance, if you
// are indexing many documents, increase the RAM
// buffer. But if you do this, increase the max heap
// size to the JVM (eg add -Xmx512m or -Xmx1g):
// config.setRAMBufferSizeMB(256.0);
// 建立 IndexWriter
writer = new IndexWriter(FSDirectory.open(new File(this.indexPath)),
config);
// 建立索引
this.indexAddress(writer);
// NOTE: if you want to maximize search performance,
// you can optionally call forceMerge here. This can be
// a terribly costly operation, so generally it's only
// worth it when your index is relatively static (ie
// you're done adding documents to it):
// writer.forceMerge(1);
}
catch (IOException e) {
e.printStackTrace();
}
finally {
if (writer != null) {
try {
writer.close();
}
catch (IOException e) {
e.printStackTrace();
}
}
}
}
/**
* 建立地址索引檔
*
* @param writer
* @throws IOException
*/
private void indexAddress(IndexWriter writer) throws IOException {
// 讀進地址資料
List<Map<String, String>> list = this.loadAddress();
// 逐筆建立索引
Document doc;
String name;
for (Map<String, String> map : list) {
doc = new Document();
name = map.get("firstname") + " " + map.get("lastname");
// 姓名要可以查詢所以要拆字,列表要顯示所以要儲存
doc.add(new Field(HelloLucene.F_NAME, name, Field.Store.YES,
Field.Index.ANALYZED));
// 地址要可以查詢所以要拆字,列表不用顯示所以不要儲存
doc.add(new Field(HelloLucene.F_ADDRESS, map.get("address"),
Field.Store.NO, Field.Index.ANALYZED));
// 城市要可以查詢所以要拆字,列表要顯示所以要儲存
doc.add(new Field(HelloLucene.F_CITY, map.get("city"), Field.Store.YES,
Field.Index.ANALYZED));
// 數字欄位
NumericField modifiedField = new NumericField(HelloLucene.F_ID,
Field.Store.YES, true);
modifiedField.setIntValue(Integer.parseInt(map.get("id")));
doc.add(modifiedField);
// 建立或更新索引
if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
System.out.println("adding " + name);
writer.addDocument(doc);
}
else {
System.out.println("updating " + name);
writer.updateDocument(new Term(HelloLucene.F_ID, map.get("id")), doc);
}
}
}
/**
* 讀進地址資料
*
* @return
* @throws IOException
*/
private List<Map<String, String>> loadAddress() throws IOException {
CsvReader reader = new CsvReader("src/address.csv");
reader.readHeaders();
List<Map<String, String>> list = new ArrayList<Map<String, String>>();
Map<String, String> map;
while (reader.readRecord()) {
map = new HashMap<String, String>();
map.put("id", reader.get("id"));
map.put("firstname", StringUtils.strip(reader.get("firstname"), "'"));
map.put("lastname", StringUtils.strip(reader.get("lastname"), "'"));
map.put("address", StringUtils.strip(reader.get("address"), "'"));
map.put("city", StringUtils.strip(reader.get("city"), "'"));
list.add(map);
}
reader.close();
return list;
}
/**
* 查詢索引檔
*
* @param field
* @param q
*/
public void search(String field, String q) {
IndexReader reader = null;
IndexSearcher searcher = null;
try {
// 開啟索引檔
reader = IndexReader.open(FSDirectory.open(new File(this.indexPath)));
// 建立 IndexSearcher
searcher = new IndexSearcher(reader);
// 使用標準 Analyzer,要與建立索引的 Analyzer 一致
// 不然結果會很怪,就像是用不同語言溝通一樣
Analyzer analyzer = new StandardAnalyzer(VERSION);
// 查詢
QueryParser parser = new QueryParser(VERSION, field, analyzer);
Query query = parser.parse(q);
TopDocs results = searcher.search(query, 100);
ScoreDoc[] hits = results.scoreDocs;
int totalHits = results.totalHits;
System.out.println(totalHits + " total matching documents");
int idx = 1;
for (ScoreDoc hit : hits) {
Document doc = searcher.doc(hit.doc);
String id = doc.get(HelloLucene.F_ID);
System.out.println(idx++ + ". " + id + " Name: "
+ doc.get(HelloLucene.F_NAME) + " Address: "
+ doc.get(HelloLucene.F_ADDRESS) + " City: "
+ doc.get(HelloLucene.F_CITY));
}
}
catch (IOException e) {
e.printStackTrace();
}
catch (ParseException e) {
e.printStackTrace();
}
finally {
if (reader != null) {
try {
reader.close();
}
catch (IOException e) {
e.printStackTrace();
}
}
if (searcher != null) {
try {
searcher.close();
}
catch (IOException e) {
e.printStackTrace();
}
}
}
}
}
沒有留言:
張貼留言