Lucene 为文本提供索引和搜索功能的应用(2)

System.out.println("Indexing " + numIndexed + " files took "
      + (end - start) + " milliseconds");
  }
  // Writer through which documents are added to the index; created in the
  // constructor and released by close().
  private IndexWriter writer;
  public Indexer(String indexDir) throws IOException {
      Directory dir = FSDirectory.open(new File(indexDir));
      writer = new IndexWriter(dir,indexWriterConfig());
      //在这里进行索引的调试
            }

public void close() throws IOException {
    writer.close();                            //4
  }
  private IndexWriterConfig indexWriterConfig()
{
 Analyzer analyzer = new SmartChineseAnalyzer(Version.LUCENE_47);
 IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_47, analyzer);
 return config;
}
  public int index(String dataDir, FileFilter filter)
    throws Exception {


    File[] files = new File(dataDir).listFiles();


    for (File f: files) {
      if (!f.isDirectory() &&
          !f.isHidden() &&
          f.exists() &&
          f.canRead() &&
          (filter == null || filter.accept(f))) {
        indexFile(f);
      }
    }


    return writer.numDocs();                    //5
  }


  private static class TextFilesFilter implements FileFilter {
    public boolean accept(File path) {
      return path.getName().toLowerCase()        //6
            .endsWith(".txt");                  //6
    }
  }
 
  /**
    * 遍历每一个文件,然后读出文件中的每一行数据,当成一个document来处理
    * @param f
    * @throws Exception
    */
  private void indexFile(File f) throws Exception {
    System.out.println("Indexing " + f.getCanonicalPath());
  // Document doc = getDocument(f);
    List<String> lists = readFileNoDup(f);
    for(String list:lists){
    Document doc = new Document();
    doc.add(new Field("contents",list,TextField.TYPE_STORED));
    writer.addDocument(doc);   
    }
   
                            //10
  }
  //读取一个文件
  private List<String> readFile(File filePathAndName)throws IOException {

FileInputStream fis = new FileInputStream(filePathAndName);
InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
BufferedReader br = new BufferedReader(isr);
LineNumberReader lnr = new LineNumberReader(br);

List<String> returnValue = new ArrayList<String>();
int cnt = 0;
while (true) {
cnt++;
String tempStr = lnr.readLine();
if (tempStr == null)
break;
if (tempStr.length() < 2)
continue;
returnValue.add(tempStr);
}
lnr.close();
br.close();
isr.close();
fis.close();
return returnValue;
}
  //读取一个文件并排重后返回
  public static List<String> readFileNoDup(File filePathAndName)
throws IOException {
 
FileInputStream fis = new FileInputStream(filePathAndName);
InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
BufferedReader br = new BufferedReader(isr);
LineNumberReader lnr = new LineNumberReader(br);

Set<String> set = new HashSet<String>();
while (true) {
String tempStr = lnr.readLine();
if (tempStr == null)
break;
if (tempStr.length() < 2)
continue;
set.add(tempStr.trim());
}
lnr.close();
br.close();
isr.close();
fis.close();
List<String> returnValue = new ArrayList<String>(set.size());
returnValue.addAll(set);
return returnValue;
        }
}

//对刚才已经建好的索引进行搜索

package lucene.home.clq;


 


/**
 * Copyright Manning Publications Co.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/


import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

内容版权声明:除非注明,否则皆为本站原创文章。

转载注明出处:http://www.heiqu.com/c2b8cd7d772c9b7f3bc1164f8f47b3cf.html