# Apache Lucene
The following code snippets illustrate how to use Mtas directly with [Apache Lucene](https://lucene.apache.org/).
**Create index**
Create an index over three FoLiA files, using the [configuration](indexing_configuration.html) file `folia.xml`:
```java
String configFile = "folia.xml";
// map document titles to (gzipped) FoLiA resources
HashMap<String, String> files = new HashMap<String, String>();
files.put("title 1", "resource1.xml.gz");
files.put("title 2", "resource2.xml.gz");
files.put("title 3", "resource3.xml.gz");
// build the index (see the appendix for the CreateIndex class)
CreateIndex createIndex = new CreateIndex(configFile, files);
```
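By default the index is built in memory; the appendix class also provides a constructor that takes an index path, so the same index can be written to disk instead. A minimal sketch (the path below is just an example location):
```java
// persist the index on disk instead of in memory;
// "/tmp/mtas-index" is only an example path
CreateIndex createIndex = new CreateIndex("/tmp/mtas-index", configFile, files);
```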
**Basic search**
Given the created index and the [CQL](search_cql.html) expression `[pos="LID"]`
```java
String cql = "[pos=\"LID\"]";
Directory directory = createIndex.getDirectory();
```
the number of hits in each document can be computed with
```java
IndexReader indexReader = DirectoryReader.open(directory);
// parse the CQL expression into an MtasSpanQuery (see appendix)
MtasSpanQuery q = createQuery(CreateIndex.FIELD_CONTENT, cql, null, null);
ListIterator<LeafReaderContext> iterator = indexReader.leaves()
    .listIterator();
IndexSearcher searcher = new IndexSearcher(indexReader);
SpanWeight spanweight = ((MtasSpanQuery) q.rewrite(indexReader))
    .createWeight(searcher, false);
// walk the index segment by segment
while (iterator.hasNext()) {
  LeafReaderContext lrc = iterator.next();
  Spans spans = spanweight.getSpans(lrc, SpanWeight.Postings.POSITIONS);
  SegmentReader r = (SegmentReader) lrc.reader();
  if (spans != null) {
    while (spans.nextDoc() != Spans.NO_MORE_DOCS) {
      // skip deleted documents
      if (r.numDocs() == r.maxDoc() || r.getLiveDocs().get(spans.docID())) {
        System.out.print("Document " + (lrc.docBase + spans.docID()) + ": ");
        int hits = 0;
        // count the matches within this document
        while (spans.nextStartPosition() != Spans.NO_MORE_POSITIONS) {
          hits++;
        }
        // use the segment-local docID to retrieve the stored title
        System.out.println(hits + " hits in '"
            + r.document(spans.docID()).get(CreateIndex.FIELD_TITLE) + "'");
      }
    }
  }
}
indexReader.close();
```
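Besides counting, the `Spans` object also exposes the token range of each match. A minimal variation of the inner loop above that prints the individual hit positions, using only the standard `Spans` API:
```java
while (spans.nextStartPosition() != Spans.NO_MORE_POSITIONS) {
  hits++;
  // each match covers the token range [startPosition, endPosition)
  System.out.println("  hit at [" + spans.startPosition() + ","
      + spans.endPosition() + ")");
}
```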
**Advanced search**
The provided `collect` method also supports more advanced options, such as computing a [termvector](search_query_termvector.html):
```java
IndexReader indexReader = DirectoryReader.open(directory);
IndexSearcher searcher = new IndexSearcher(indexReader);
ComponentField fieldStats = new ComponentField(CreateIndex.FIELD_CONTENT,
    CreateIndex.FIELD_ID);
// restrict the computation to the three indexed documents
ArrayList<Integer> fullDocSet = new ArrayList<Integer>(
    Arrays.asList(new Integer[] { 0, 1, 2 }));
ArrayList<Integer> fullDocList = new ArrayList<Integer>();
try {
  // top 10 terms with prefix "t", with number and sum, sorted descending
  fieldStats.termVectorList.add(new ComponentTermVector("wordList", "t", null,
      false, "n,sum", CodecUtil.STATS_TYPE_SUM, CodecUtil.SORT_DESC,
      null, 10, null, null, null, null, null, null, null, null, null));
  CodecUtil.collect(CreateIndex.FIELD_CONTENT, searcher, indexReader,
      fullDocList, fullDocSet, fieldStats);
  for (ComponentTermVector ct : fieldStats.termVectorList) {
    HashMap<String, Map<String, Object>> tvList = new HashMap<String, Map<String, Object>>();
    Map<String, ?> tcList = ct.subComponentFunction.dataCollector
        .getResult().getList();
    for (String key : tcList.keySet()) {
      tvList.put(key,
          ((MtasDataItem<?, ?>) tcList.get(key)).rewrite(false));
    }
    System.out.println(tvList);
  }
} catch (IllegalAccessException | IllegalArgumentException
    | InvocationTargetException | mtas.parser.function.ParseException e) {
  e.printStackTrace();
}
```
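The same pattern works for other prefixes defined in the configuration. For example, assuming the configuration provides a `pos` prefix (as used in the CQL example above), a top-10 list of part-of-speech tags could be requested by adding a second `ComponentTermVector` before calling `collect`:
```java
// hypothetical second termvector over the "pos" prefix,
// reusing the constructor arguments from the example above
fieldStats.termVectorList.add(new ComponentTermVector("posList", "pos", null,
    false, "n,sum", CodecUtil.STATS_TYPE_SUM, CodecUtil.SORT_DESC,
    null, 10, null, null, null, null, null, null, null, null, null));
```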
**Appendix**
Code for class `CreateIndex`:
```java
import java.io.IOException;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;

public class CreateIndex {

  public static final String FIELD_ID = "id";
  public static final String FIELD_TITLE = "title";
  public static final String FIELD_CONTENT = "content";

  private Directory directory;

  public CreateIndex(String configFile, HashMap<String, String> files)
      throws IOException {
    this(null, configFile, files);
  }

  public CreateIndex(String indexPath, String configFile,
      HashMap<String, String> files) throws IOException {
    initialize(indexPath, configFile, files);
  }

  public Directory getDirectory() {
    return directory;
  }

  private void initialize(String indexPath, String configFile,
      HashMap<String, String> files) throws IOException {
    // store the index on disk when a path is provided, otherwise in memory
    if (indexPath != null) {
      directory = FSDirectory.open(Paths.get(indexPath));
    } else {
      directory = new RAMDirectory();
    }
    Map<String, String> paramsCharFilterMtas = new HashMap<String, String>();
    paramsCharFilterMtas.put("type", "file");
    Map<String, String> paramsTokenizer = new HashMap<String, String>();
    paramsTokenizer.put("configFile", configFile);
    // Mtas analyzer: the char filter resolves file references, the
    // tokenizer applies the Mtas parser configuration
    Analyzer mtasAnalyzer = CustomAnalyzer
        .builder(Paths.get("docker").toAbsolutePath())
        .addCharFilter("mtas", paramsCharFilterMtas)
        .withTokenizer("mtas", paramsTokenizer).build();
    // use the Mtas analyzer only for the content field
    Map<String, Analyzer> analyzerPerField = new HashMap<String, Analyzer>();
    analyzerPerField.put(FIELD_CONTENT, mtasAnalyzer);
    PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(
        new StandardAnalyzer(), analyzerPerField);
    IndexWriterConfig config = new IndexWriterConfig(analyzer);
    config.setUseCompoundFile(false);
    config.setCodec(Codec.forName("MtasCodec"));
    IndexWriter w = new IndexWriter(directory, config);
    w.deleteAll();
    int counter = 0;
    for (String title : files.keySet()) {
      Document doc = new Document();
      doc.add(new StringField(FIELD_ID, Integer.toString(counter),
          Field.Store.YES));
      doc.add(new StringField(FIELD_TITLE, title, Field.Store.YES));
      doc.add(new TextField(FIELD_CONTENT, files.get(title), Field.Store.YES));
      w.addDocument(doc);
      counter++;
    }
    w.commit();
    w.close();
  }

}
```
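Note that `RAMDirectory` keeps the whole index in heap memory and is only suitable for small examples and tests like these; for anything beyond these snippets, pass an `indexPath` so that an `FSDirectory` on disk is used instead.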
Code for method `createQuery`:
```java
MtasSpanQuery createQuery(String field, String cql,
    MtasSpanQuery ignore, Integer maximumIgnoreLength) throws ParseException {
  // parse the CQL expression with the Mtas CQL parser
  Reader reader = new BufferedReader(new StringReader(cql));
  MtasCQLParser p = new MtasCQLParser(reader);
  return p.parse(field, null, null, ignore, maximumIgnoreLength);
}
```
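The `ignore` argument lets a sequence query skip over certain tokens when matching. A hedged sketch, assuming the configuration defines suitable values for the `pos` prefix (the tags `"LET"` and `"N"` below are hypothetical and depend on the tagset used):
```java
// hypothetical: build an ignore query from CQL and use it when parsing a
// sequence query, allowing up to 10 ignored tokens between the two matches
MtasSpanQuery ignore = createQuery(CreateIndex.FIELD_CONTENT,
    "[pos=\"LET\"]", null, null);
MtasSpanQuery q = createQuery(CreateIndex.FIELD_CONTENT,
    "[pos=\"LID\"][pos=\"N\"]", ignore, 10);
```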