installation_lucene.md.vm 6.58 KB

Edit Raw Blame History

#Apache Lucene

Some code snippets to illustrate the use of Mtas directly with [Apache Lucene](https://lucene.apache.org/).

**Create index**

Create an index with three folia files using [configuration](indexing_configuration.html) file `folia.xml`

```java
String configFile = "folia.xml";
HashMap<String,String> files = new HashMap<String,String>();
files.put("title 1","resource1.xml.gz");
files.put("title 2","resource2.xml.gz");
files.put("title 3","resource3.xml.gz");
CreateIndex createIndex = new CreateIndex(configFile, files);
```

**Basic search**

With the created index and for [CQL](search_cql.html) expression `[pos="LID"]`

```java
String cql = "[pos=\"LID\"]";
Directory directory = createIndex.getDirectory();
```

the number of hits in each document can be computed with

```java
IndexReader indexReader = DirectoryReader.open(directory);
MtasSpanQuery q = createQuery(CreateIndex.FIELD_CONTENT, cql, null, null);
ListIterator<LeafReaderContext> iterator = indexReader.leaves()
    .listIterator();
IndexSearcher searcher = new IndexSearcher(indexReader);
SpanWeight spanweight = ((MtasSpanQuery) q.rewrite(indexReader))
    .createWeight(searcher, false);
while (iterator.hasNext()) {
  LeafReaderContext lrc = iterator.next();
  Spans spans = spanweight.getSpans(lrc, SpanWeight.Postings.POSITIONS);
  SegmentReader r = (SegmentReader) lrc.reader();
  if (spans != null) {
    while (spans.nextDoc() != Spans.NO_MORE_DOCS) {
      if (r.numDocs()==r.maxDoc() || r.getLiveDocs().get(spans.docID())) {
        System.out.print("Document "+(lrc.docBase+spans.docID())+": ");
        int hits = 0;
        while (spans.nextStartPosition() != Spans.NO_MORE_POSITIONS) {
          hits++;
        }
        System.out.println(hits+" hits in '"+r.document((lrc.docBase+spans.docID())).get(CreateIndex.FIELD_TITLE)+"'");
      }
    }
  }
}
indexReader.close();
```

**Advanced search**

By using the provided `collect` method, also more advanced
options are available, like computing the [termvector](search_query_termvector.html)

```java
IndexReader indexReader = DirectoryReader.open(directory);
IndexSearcher searcher = new IndexSearcher(indexReader);
ComponentField fieldStats = new ComponentField(CreateIndex.FIELD_CONTENT, CreateIndex.FIELD_ID);
ArrayList<Integer> fullDocSet = new ArrayList<Integer>(Arrays.asList(new Integer[]{0,1,2}));
ArrayList<Integer> fullDocList = new ArrayList<Integer>();
try {
  fieldStats.termVectorList.add(new ComponentTermVector("wordList", "t", null, false, "n,sum", CodecUtil.STATS_TYPE_SUM, CodecUtil.SORT_DESC, null, 10, null, null, null, null, null, null, null, null, null));
  CodecUtil.collect(CreateIndex.FIELD_CONTENT, searcher, indexReader, fullDocList,
      fullDocSet, fieldStats);
  for (ComponentTermVector ct : fieldStats.termVectorList) {
    HashMap<String, Map<String, Object>> tvList = new HashMap<String, Map<String, Object>>();
    Map<String, ?> tcList = ct.subComponentFunction.dataCollector
        .getResult().getList();
    for (String key : tcList.keySet()) {
      tvList.put(key,
          ((MtasDataItem<?, ?>) tcList.get(key)).rewrite(false));
    }
    System.out.println(tvList);
  }
} catch (IllegalAccessException | IllegalArgumentException
    | InvocationTargetException | mtas.parser.function.ParseException e) {
  e.printStackTrace();
}
```


**Appendix**

Code class `CreateIndex`

```java
import java.io.IOException;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;

public class CreateIndex {

  public static String FIELD_ID = "id";
  public static String FIELD_TITLE = "title";
  public static String FIELD_CONTENT = "content";

  private Directory directory;

  public CreateIndex(String configFile, HashMap<String, String> files)
      throws IOException {
    this(null, configFile, files);
  }

  public CreateIndex(String indexPath, String configFile,
      HashMap<String, String> files) throws IOException {
    initialize(null, configFile, files);
  }

  public Directory getDirectory() {
    return directory;
  }

  private void initialize(String indexPath, String configFile,
      HashMap<String, String> files) throws IOException {
    if (indexPath != null) {
      directory = FSDirectory.open(Paths.get(indexPath));
    } else {
      directory = new RAMDirectory();
    }
    Map<String, String> paramsCharFilterMtas = new HashMap<String, String>();
    paramsCharFilterMtas.put("type", "file");
    Map<String, String> paramsTokenizer = new HashMap<String, String>();
    paramsTokenizer.put("configFile", configFile);
    Analyzer mtasAnalyzer = CustomAnalyzer
        .builder(Paths.get("docker").toAbsolutePath())
        .addCharFilter("mtas", paramsCharFilterMtas)
        .withTokenizer("mtas", paramsTokenizer).build();
    Map<String, Analyzer> analyzerPerField = new HashMap<String, Analyzer>();
    analyzerPerField.put(FIELD_CONTENT, mtasAnalyzer);
    PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(
        new StandardAnalyzer(), analyzerPerField);
    IndexWriterConfig config = new IndexWriterConfig(analyzer);
    config.setUseCompoundFile(false);
    config.setCodec(Codec.forName("MtasCodec"));
    IndexWriter w = new IndexWriter(directory, config);
    w.deleteAll();
    int counter = 0;
    for (String title : files.keySet()) {
      Document doc = new Document();
      doc.add(new StringField(FIELD_ID, Integer.valueOf(counter).toString(),
          Field.Store.YES));
      doc.add(new StringField(FIELD_TITLE, title, Field.Store.YES));
      doc.add(new TextField(FIELD_CONTENT, files.get(title), Field.Store.YES));
      w.addDocument(doc);
      counter++;
    }
    w.commit();
    w.close();
  }
}
```

Code method `createQuery`

```java
MtasSpanQuery createQuery(String field, String cql,
      MtasSpanQuery ignore, Integer maximumIgnoreLength) throws ParseException {
  Reader reader = new BufferedReader(new StringReader(cql));
  MtasCQLParser p = new MtasCQLParser(reader);
  return p.parse(field, null, null, ignore, maximumIgnoreLength);
}
```