#Apache Lucene

The code snippets below illustrate how to use Mtas directly with [Apache Lucene](https://lucene.apache.org/).

**Create index**

Create an index for three FoLiA files, using the [configuration](indexing_configuration.html) file `folia.xml`:

```java
String configFile = "folia.xml";
HashMap<String,String> files = new HashMap<String,String>();  
files.put("title 1","resource1.xml.gz");
files.put("title 2","resource2.xml.gz");
files.put("title 3","resource3.xml.gz");
CreateIndex createIndex = new CreateIndex(configFile, files);
```
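
The index above is kept in memory; the `CreateIndex` class in the appendix also provides a constructor that takes an index path and persists the index on disk. A minimal sketch, with a placeholder path:

```java
// sketch: build the same index on disk instead of in memory
// (the path is a placeholder; see the appendix for this constructor)
CreateIndex persistentIndex = new CreateIndex("/path/to/index", configFile, files);
```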

**Basic search**

With the created index, and for the [CQL](search_cql.html) expression `[pos="LID"]`

```java
String cql = "[pos=\"LID\"]";
Directory directory = createIndex.getDirectory();
```

the number of hits in each document can be computed with

```java
IndexReader indexReader = DirectoryReader.open(directory);
MtasSpanQuery q = createQuery(CreateIndex.FIELD_CONTENT, cql, null, null);
ListIterator<LeafReaderContext> iterator = indexReader.leaves()
    .listIterator();
IndexSearcher searcher = new IndexSearcher(indexReader);
SpanWeight spanweight = ((MtasSpanQuery) q.rewrite(indexReader))
    .createWeight(searcher, false);
while (iterator.hasNext()) {
  LeafReaderContext lrc = iterator.next();
  Spans spans = spanweight.getSpans(lrc, SpanWeight.Postings.POSITIONS);
  SegmentReader r = (SegmentReader) lrc.reader();
  if (spans != null) {
    while (spans.nextDoc() != Spans.NO_MORE_DOCS) {
      // skip documents that are marked as deleted in this segment
      if (r.numDocs() == r.maxDoc() || r.getLiveDocs().get(spans.docID())) {
        System.out.print("Document " + (lrc.docBase + spans.docID()) + ": ");
        int hits = 0;
        while (spans.nextStartPosition() != Spans.NO_MORE_POSITIONS) {
          hits++;
        }
        // the segment reader expects a segment-local document id,
        // so use spans.docID() without the docBase offset
        System.out.println(hits + " hits in '"
            + r.document(spans.docID()).get(CreateIndex.FIELD_TITLE) + "'");
      }
    }
  }
}
indexReader.close();
```
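
The `createQuery` helper in the appendix also accepts an ignore query and a maximum ignore length, which allow tokens to be skipped when matching a sequence. A sketch, assuming part-of-speech tags `ADJ`, `N` and `BW` occur in the indexed annotations:

```java
// sketch: match an adjective followed by a noun, ignoring up to two
// intervening adverbs (assumes ADJ, N and BW pos annotations exist)
MtasSpanQuery ignore = createQuery(CreateIndex.FIELD_CONTENT,
    "[pos=\"BW\"]", null, null);
MtasSpanQuery sequence = createQuery(CreateIndex.FIELD_CONTENT,
    "[pos=\"ADJ\"][pos=\"N\"]", ignore, 2);
```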

**Advanced search**

By using the provided `collect` method, more advanced options are available as well, such as computing a [termvector](search_component_termvector.html):

```java
IndexReader indexReader = DirectoryReader.open(directory);
IndexSearcher searcher = new IndexSearcher(indexReader);
ComponentField fieldStats = new ComponentField(CreateIndex.FIELD_CONTENT, CreateIndex.FIELD_ID);
ArrayList<Integer> fullDocSet = new ArrayList<Integer>(Arrays.asList(new Integer[]{0,1,2}));
ArrayList<Integer> fullDocList = new ArrayList<Integer>();
try {
  fieldStats.termVectorList.add(new ComponentTermVector("wordList", "t", null,
      false, "n,sum", CodecUtil.STATS_TYPE_SUM, CodecUtil.SORT_DESC, null, 10,
      null, null, null, null, null, null, null, null, null));
  CodecUtil.collect(CreateIndex.FIELD_CONTENT, searcher, indexReader, fullDocList,
      fullDocSet, fieldStats);
  for (ComponentTermVector ct : fieldStats.termVectorList) {
    HashMap<String, Map<String, Object>> tvList = new HashMap<String, Map<String, Object>>();
    Map<String, ?> tcList = ct.subComponentFunction.dataCollector
        .getResult().getList();
    for (String key : tcList.keySet()) {
      tvList.put(key,
          ((MtasDataItem<?, ?>) tcList.get(key)).rewrite(false));
    }
    System.out.println(tvList);
  }
} catch (IllegalAccessException | IllegalArgumentException
    | InvocationTargetException | mtas.parser.function.ParseException e) {
  e.printStackTrace();
}
indexReader.close();
```
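
Multiple term vectors can be collected in a single pass by adding more `ComponentTermVector` instances before calling `collect`. A sketch with the same constructor, assuming the configured index also contains a `lemma` prefix:

```java
// sketch: a second term vector over the (assumed) lemma prefix, again
// listing the ten most frequent items, sorted by summed frequency
fieldStats.termVectorList.add(new ComponentTermVector("lemmaList", "lemma",
    null, false, "n,sum", CodecUtil.STATS_TYPE_SUM, CodecUtil.SORT_DESC,
    null, 10, null, null, null, null, null, null, null, null, null));
```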


**Appendix**

Code of class `CreateIndex`:

```java
import java.io.IOException;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;

public class CreateIndex {

  public static String FIELD_ID = "id";
  public static String FIELD_TITLE = "title";
  public static String FIELD_CONTENT = "content";

  private Directory directory;

  public CreateIndex(String configFile, HashMap<String, String> files)
      throws IOException {
    this(null, configFile, files);
  }

  public CreateIndex(String indexPath, String configFile,
      HashMap<String, String> files) throws IOException {
    initialize(indexPath, configFile, files);
  }

  public Directory getDirectory() {
    return directory;
  }

  private void initialize(String indexPath, String configFile,
      HashMap<String, String> files) throws IOException {
    if (indexPath != null) {
      directory = FSDirectory.open(Paths.get(indexPath));
    } else {
      directory = new RAMDirectory();
    }
    // char filter and tokenizer parameters for the Mtas analyzer
    Map<String, String> paramsCharFilterMtas = new HashMap<String, String>();
    paramsCharFilterMtas.put("type", "file");
    Map<String, String> paramsTokenizer = new HashMap<String, String>();
    paramsTokenizer.put("configFile", configFile);
    Analyzer mtasAnalyzer = CustomAnalyzer
        .builder(Paths.get("docker").toAbsolutePath())
        .addCharFilter("mtas", paramsCharFilterMtas)
        .withTokenizer("mtas", paramsTokenizer).build();
    // use the Mtas analyzer only for the content field
    Map<String, Analyzer> analyzerPerField = new HashMap<String, Analyzer>();
    analyzerPerField.put(FIELD_CONTENT, mtasAnalyzer);
    PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(
        new StandardAnalyzer(), analyzerPerField);
    IndexWriterConfig config = new IndexWriterConfig(analyzer);
    config.setUseCompoundFile(false);
    // the Mtas codec is required to store and search the annotations
    config.setCodec(Codec.forName("MtasCodec"));
    IndexWriter w = new IndexWriter(directory, config);
    w.deleteAll();
    int counter = 0;
    for (String title : files.keySet()) {
      Document doc = new Document();
      doc.add(new StringField(FIELD_ID, Integer.toString(counter),
          Field.Store.YES));
      doc.add(new StringField(FIELD_TITLE, title, Field.Store.YES));
      doc.add(new TextField(FIELD_CONTENT, files.get(title), Field.Store.YES));
      w.addDocument(doc);
      counter++;
    }
    w.commit();
    w.close();
  }
}
```

Code of method `createQuery`:

```java
MtasSpanQuery createQuery(String field, String cql,
      MtasSpanQuery ignore, Integer maximumIgnoreLength) throws ParseException {
  Reader reader = new BufferedReader(new StringReader(cql));
  MtasCQLParser p = new MtasCQLParser(reader);
  return p.parse(field, null, null, ignore, maximumIgnoreLength);
}
```
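
Since `createQuery` throws a `ParseException` for syntactically invalid CQL, user-supplied expressions should be parsed defensively; a minimal sketch:

```java
// sketch: an unbalanced bracket makes the CQL parser fail
try {
  MtasSpanQuery q = createQuery(CreateIndex.FIELD_CONTENT,
      "[pos=\"LID\"", null, null);
} catch (ParseException e) {
  System.err.println("invalid CQL: " + e.getMessage());
}
```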