Commit 438ff8bff823e74c232f802939fc9371e00025d2
1 parent
f50d9e14
update
Showing
3 changed files
with
274 additions
and
9 deletions
src/mtas/codec/MtasCodecPostingsFormat.java
... | ... | @@ -285,10 +285,6 @@ public class MtasCodecPostingsFormat extends PostingsFormat { |
285 | 285 | } catch (Exception e) { |
286 | 286 | throw new IOException(e.getMessage()); |
287 | 287 | } |
288 | - Long termRef = inObject.readVLong(); | |
289 | - inTerm.seek(termRef); | |
290 | - token.setTermRef(termRef); | |
291 | - token.setValue(inTerm.readString()); | |
292 | 288 | return token; |
293 | 289 | } |
294 | 290 | |
... | ... |
src/mtas/codec/MtasFieldsConsumer.java
... | ... | @@ -35,15 +35,284 @@ import org.apache.lucene.index.SegmentWriteState; |
35 | 35 | import org.apache.lucene.index.Terms; |
36 | 36 | import org.apache.lucene.index.TermsEnum; |
37 | 37 | import org.apache.lucene.search.DocIdSetIterator; |
38 | -import org.apache.lucene.store.IOContext; | |
39 | 38 | import org.apache.lucene.store.IndexInput; |
40 | 39 | import org.apache.lucene.store.IndexOutput; |
41 | -import org.apache.lucene.store.Lock; | |
42 | 40 | import org.apache.lucene.util.BytesRef; |
43 | 41 | import org.apache.lucene.util.IOUtils; |
44 | 42 | |
45 | 43 | /** |
46 | 44 | * The Class MtasFieldsConsumer. |
45 | + * | |
46 | + * | |
47 | + * The Class MtasFieldsConsumer constructs several temporary and permanent files | |
48 | + * to provide a forward index. | |
49 | + * | |
50 | + * <ul> | |
51 | + * <li><b>Temporary files</b><br> | |
52 | + * <ul> | |
53 | + * <li><b>Temporary file {@link #mtasTmpFieldFileName} with extension | |
54 | + * {@value mtas.codec.MtasCodecPostingsFormat#MTAS_TMP_FIELD_EXTENSION} </b><br> | |
55 | + * Contains for each field a reference to the list of documents. Structure of | |
56 | + * content: | |
57 | + * <ul> | |
58 | + * <li><b>String</b>: field</li> | |
59 | + * <li><b>VLong</b>: reference to {@link #mtasDocFileName}</li> | |
60 | + * <li><b>VInt</b>: number of documents</li> | |
61 | + * <li><b>VLong</b>: reference to {@link #mtasTermFileName}</li> | |
62 | + * <li><b>VInt</b>: number of terms</li> | |
63 | + * <li><b>VLong</b>: reference to {@link #mtasPrefixFileName}</li> | |
64 | + * <li><b>VInt</b>: number of prefixes</li> | |
65 | + * </ul> | |
66 | + * </li> | |
67 | + * <li><b>Temporary file {@link #mtasTmpObjectFileName} with extension | |
68 | + * {@value mtas.codec.MtasCodecPostingsFormat#MTAS_TMP_OBJECT_EXTENSION}</b><br> | |
69 | + * Contains for a specific field all objects constructed by | |
70 | + * {@link #createObjectAndRegisterPrefix}. For all fields, the objects are later | |
71 | + * on copied to {@link #mtasObjectFileName} while statistics are collected. | |
72 | + * Structure of content identical to {@link #mtasObjectFileName}.</li> | |
73 | + * <li><b>Temporary file {@link #mtasTmpDocsFileName} with extension | |
74 | + * {@value mtas.codec.MtasCodecPostingsFormat#MTAS_TMP_DOCS_EXTENSION}</b> <br> | |
75 | + * Contains for a specific field for each doc multiple fragments. Each occurring | |
76 | + * term results in a fragment. Structure of content: | |
77 | + * <ul> | |
78 | + * <li><b>VInt</b>: docId</li> | |
79 | + * <li><b>VInt</b>: number of objects in this fragment</li> | |
80 | + * <li><b>VLong</b>: offset references to {@link #mtasTmpObjectFileName}</li> | |
81 | + * <li><b>VInt</b>,<b>VLong</b>: mtasId object, reference temporary object in | |
82 | + * {@link #mtasTmpObjectFileName} minus offset</li> | |
83 | + * <li><b>VInt</b>,<b>VLong</b>: ...</li> | |
84 | + * </ul> | |
85 | + * </li> | |
86 | + * <li><b>Temporary file {@link #mtasTmpDocsChainedFileName} with extension | |
87 | + * {@value mtas.codec.MtasCodecPostingsFormat#MTAS_TMP_DOCS_CHAINED_EXTENSION} | |
88 | + * </b><br> | |
89 | + * Contains for a specific field for each doc multiple chained fragments. | |
90 | + * Structure of content: | |
91 | + * <ul> | |
92 | + * <li><b>VInt</b>: docId</li> | |
93 | + * <li><b>VInt</b>: number of objects in this fragment</li> | |
94 | + * <li><b>VLong</b>: offset references to {@link #mtasTmpObjectFileName}</li> | |
95 | + * <li><b>VInt</b>,<b>VLong</b>: mtasId object, reference temporary object in | |
96 | + * {@link #mtasTmpObjectFileName} minus offset</li> | |
97 | + * <li><b>VInt</b>,<b>VLong</b>: ...</li> | |
98 | + * <li><b>VLong</b>: reference to next fragment in | |
99 | + * {@link #mtasTmpDocsChainedFileName}, self reference indicates end of chain</li> | |
100 | + * </ul> | |
101 | + * </li> | |
102 | + * <li><b>Temporary file {@link #mtasTmpDocFileName} with extension | |
103 | + * {@value mtas.codec.MtasCodecPostingsFormat#MTAS_TMP_DOC_EXTENSION}</b><br> | |
104 | + * For each document | |
105 | + * <ul> | |
106 | + * <li><b>VInt</b>: docId</li> | |
107 | + * <li><b>VLong</b>: reference to {@link #mtasIndexObjectIdFileName}</li> | |
108 | + * <li><b>VLong</b>: reference to first object, used as offset for tree index</li> | |
109 | + * <li><b>VInt</b>: slope used in approximation reference objects index on id | |
110 | + * </li> | |
111 | + * <li><b>ZLong</b>: offset used in approximation reference objects index on id | |
112 | + * </li> | |
113 | + * <li><b>Byte</b>: flag indicating how corrections on the approximation | |
114 | + * references objects for the index on id are stored: | |
115 | + * {@link MtasCodecPostingsFormat#MTAS_STORAGE_BYTE}, | |
116 | + * {@link MtasCodecPostingsFormat#MTAS_STORAGE_SHORT}, | |
117 | + * {@link MtasCodecPostingsFormat#MTAS_STORAGE_INTEGER} or | |
118 | + * {@link MtasCodecPostingsFormat#MTAS_STORAGE_LONG}</li> | |
119 | + * <li><b>VInt</b>: number of objects in this document</li> | |
120 | + * <li><b>VInt</b>: first position</li> | |
121 | + * <li><b>VInt</b>: last position</li> | |
122 | + * </ul> | |
123 | + * </li> | |
124 | + * </ul> | |
125 | + * </li> | |
126 | + * <li><b>Final files</b><br> | |
127 | + * <ul> | |
128 | + * <li><b>File {@link #mtasIndexFieldFileName} with extension | |
129 | + * {@value mtas.codec.MtasCodecPostingsFormat#MTAS_FIELD_EXTENSION}</b><br> | |
130 | + * Contains for each field a reference to the list of documents and the | |
131 | + * prefixes. Structure of content: | |
132 | + * <ul> | |
133 | + * <li><b>String</b>: field</li> | |
134 | + * <li><b>VLong</b>: reference to {@link #mtasDocFileName}</li> | |
135 | + * <li><b>VLong</b>: reference to {@link #mtasIndexDocIdFileName}</li> | |
136 | + * <li><b>VInt</b>: number of documents</li> | |
137 | + * <li><b>VLong</b>: reference to {@link #mtasTermFileName}</li> | |
138 | + * <li><b>VInt</b>: number of terms</li> | |
139 | + * <li><b>VLong</b>: reference to {@link #mtasPrefixFileName}</li> | |
140 | + * <li><b>VInt</b>: number of prefixes</li> | |
141 | + * </ul> | |
142 | + * </li> | |
143 | + * <li><b>File {@link #mtasTermFileName} with extension | |
144 | + * {@value mtas.codec.MtasCodecPostingsFormat#MTAS_TERM_EXTENSION}</b><br> | |
145 | + * For each field, all unique terms are stored here. Structure of content: | |
146 | + * <ul> | |
147 | + * <li><b>String</b>: term</li> | |
148 | + * </ul> | |
149 | + * </li> | |
150 | + * <li><b>File {@link #mtasPrefixFileName} with extension | |
151 | + * {@value mtas.codec.MtasCodecPostingsFormat#MTAS_PREFIX_EXTENSION}</b><br> | |
152 | + * For each field, all unique prefixes are stored here. Structure of content: | |
153 | + * <ul> | |
154 | + * <li><b>String</b>: prefix</li> | |
155 | + * </ul> | |
156 | + * </li> | |
157 | + * <li><b>File {@link #mtasObjectFileName} with extension | |
158 | + * {@value mtas.codec.MtasCodecPostingsFormat#MTAS_OBJECT_EXTENSION}</b><br> | |
159 | + * Contains all objects for all fields. Structure of content: | |
160 | + * <ul> | |
161 | + * <li><b>VInt</b>: mtasId</li> | |
162 | + * <li><b>VInt</b>: objectFlags | |
163 | + * <ul> | |
164 | + * <li>{@link MtasCodecPostingsFormat#MTAS_OBJECT_HAS_PARENT}</li> | |
165 | + * <li>{@link MtasCodecPostingsFormat#MTAS_OBJECT_HAS_POSITION_RANGE}</li> | |
166 | + * <li>{@link MtasCodecPostingsFormat#MTAS_OBJECT_HAS_POSITION_SET}</li> | |
167 | + * <li>{@link MtasCodecPostingsFormat#MTAS_OBJECT_HAS_OFFSET}</li> | |
168 | + * <li>{@link MtasCodecPostingsFormat#MTAS_OBJECT_HAS_REALOFFSET}</li> | |
169 | + * <li>{@link MtasCodecPostingsFormat#MTAS_OBJECT_HAS_PAYLOAD}</li> | |
170 | + * </ul> | |
171 | + * </li> | |
172 | + * <li>Only if {@link MtasCodecPostingsFormat#MTAS_OBJECT_HAS_PARENT}<br> | |
173 | + * <b>VInt</b>: parentId | |
174 | + * <li>Only if | |
175 | + * {@link MtasCodecPostingsFormat#MTAS_OBJECT_HAS_POSITION_RANGE}<br> | |
176 | + * <b>VInt</b>,<b>VInt</b>: startPosition and (endPosition-startPosition) | |
177 | + * <li>Only if {@link MtasCodecPostingsFormat#MTAS_OBJECT_HAS_POSITION_SET}<br> | |
178 | + * <b>VInt</b>,<b>VInt</b>,<b>VInt</b>,...: number of positions, firstPosition, | |
179 | + * (position-previousPosition),... | |
180 | + * <li>Only if neither {@link MtasCodecPostingsFormat#MTAS_OBJECT_HAS_POSITION_RANGE} | |
181 | + * nor {@link MtasCodecPostingsFormat#MTAS_OBJECT_HAS_POSITION_SET}<br> | |
182 | + * <b>VInt</b>: position | |
183 | + * <li>Only if {@link MtasCodecPostingsFormat#MTAS_OBJECT_HAS_OFFSET}<br> | |
184 | + * <b>VInt</b>,<b>VInt</b>: startOffset, (endOffset-startOffset) | |
185 | + * <li>Only if {@link MtasCodecPostingsFormat#MTAS_OBJECT_HAS_REALOFFSET}<br> | |
186 | + * <b>VInt</b>,<b>VInt</b>: startRealOffset, (endRealOffset-startRealOffset) | |
187 | + * <li>Only if {@link MtasCodecPostingsFormat#MTAS_OBJECT_HAS_PAYLOAD}<br> | |
188 | + * <b>VInt</b>,<b>Bytes</b>: number of bytes, payload | |
189 | + * <li><b>VLong</b>: reference to Term in {@link #mtasTermFileName}</li> | |
190 | + * </ul> | |
191 | + * </li> | |
192 | + * <li><b>File {@link #mtasIndexDocIdFileName} with extension | |
193 | + * {@value mtas.codec.MtasCodecPostingsFormat#MTAS_INDEX_DOC_ID_EXTENSION} | |
194 | + * </b><br> | |
195 | + * Contains for each field a tree structure {@link MtasTree} to search reference | |
196 | + * to {@link #mtasDocFileName} by id. Structure of content for each node: | |
197 | + * <ul> | |
198 | + * <li><b>VLong</b>: offset references to {@link #mtasIndexDocIdFileName}, only | |
199 | + * available in root node</li> | |
200 | + * <li><b>Byte</b>: flag, should be zero for this tree, only available in root | |
201 | + * node</li> | |
202 | + * <li><b>VInt</b>: left</li> | |
203 | + * <li><b>VInt</b>: right</li> | |
204 | + * <li><b>VInt</b>: max</li> | |
205 | + * <li><b>VLong</b>: left reference to {@link #mtasIndexDocIdFileName} minus the | |
206 | + * offset stored in the root node</li> | |
207 | + * <li><b>VLong</b>: right reference to {@link #mtasIndexDocIdFileName} minus | |
208 | + * the offset stored in the root node</li> | |
209 | + * <li><b>VInt</b>: number of objects on this node (always 1 for this tree)</li> | |
210 | + * <li><b>VLong</b>: reference to {@link #mtasDocFileName} minus offset</li> | |
211 | + * </ul> | |
212 | + * </li> | |
213 | + * <li><b>File {@link #mtasDocFileName} with extension | |
214 | + * {@value mtas.codec.MtasCodecPostingsFormat#MTAS_DOC_EXTENSION}</b><br> | |
215 | + * For each document | |
216 | + * <ul> | |
217 | + * <li><b>VInt</b>: docId</li> | |
218 | + * <li><b>VLong</b>: reference to {@link #mtasIndexObjectIdFileName}</li> | |
219 | + * <li><b>VLong</b>: reference to {@link #mtasIndexObjectPositionFileName}</li> | |
220 | + * <li><b>VLong</b>: reference to {@link #mtasIndexObjectParentFileName}</li> | |
221 | + * <li><b>VLong</b>: reference to first object, used as offset for tree index</li> | |
222 | + * <li><b>VInt</b>: slope used in approximation reference objects index on id | |
223 | + * </li> | |
224 | + * <li><b>ZLong</b>: offset used in approximation reference objects index on id | |
225 | + * </li> | |
226 | + * <li><b>Byte</b>: flag indicating how corrections on the approximation | |
227 | + * references objects for the index on id are stored: | |
228 | + * {@link MtasCodecPostingsFormat#MTAS_STORAGE_BYTE}, | |
229 | + * {@link MtasCodecPostingsFormat#MTAS_STORAGE_SHORT}, | |
230 | + * {@link MtasCodecPostingsFormat#MTAS_STORAGE_INTEGER} or | |
231 | + * {@link MtasCodecPostingsFormat#MTAS_STORAGE_LONG}</li> | |
232 | + * <li><b>VInt</b>: number of objects</li> | |
233 | + * <li><b>VInt</b>: first position</li> | |
234 | + * <li><b>VInt</b>: last position</li> | |
235 | + * </ul> | |
236 | + * </li> | |
237 | + * <li><b>File {@link #mtasIndexObjectIdFileName} with extension | |
238 | + * {@value mtas.codec.MtasCodecPostingsFormat#MTAS_INDEX_OBJECT_ID_EXTENSION} | |
239 | + * </b><br> | |
240 | + * Provides for each mtasId the reference to {@link #mtasObjectFileName}. These | |
241 | + * references are grouped by document, sorted by mtasId, and because the | |
242 | + * mtasId's for each document will always start with 0 and are sequential | |
243 | + * without gaps, a reference can be computed if the position of the first | |
244 | + * reference for a document is known from {@link #mtasDocFileName}. The | |
245 | + * reference is approximated by the reference to the first object plus the | |
246 | + * mtasId times a slope. Only a correction to this approximation is stored. | |
247 | + * Structure of content: | |
248 | + * <ul> | |
249 | + * <li><b>Byte</b>/<b>Short</b>/<b>Int</b>/<b>Long</b>: correction reference to | |
250 | + * {@link #mtasObjectFileName}</li> | |
251 | + * </ul> | |
252 | + * </li> | |
253 | + * <li><b>File {@link #mtasIndexObjectPositionFileName} with extension | |
254 | + * {@value mtas.codec.MtasCodecPostingsFormat#MTAS_INDEX_OBJECT_POSITION_EXTENSION} | |
255 | + * </b><br> | |
256 | + * Contains for each document a tree structure {@link MtasTree} to search | |
257 | + * objects by position. Structure of content for each node: | |
258 | + * <ul> | |
259 | + * <li><b>VLong</b>: offset references to | |
260 | + * {@link #mtasIndexObjectPositionFileName}, only available in root node</li> | |
261 | + * <li><b>Byte</b>: flag, should be zero for this tree, only available in root | |
262 | + * node</li> | |
263 | + * <li><b>VInt</b>: left</li> | |
264 | + * <li><b>VInt</b>: right</li> | |
265 | + * <li><b>VInt</b>: max</li> | |
266 | + * <li><b>VLong</b>: left reference to {@link #mtasIndexObjectPositionFileName} | |
267 | + * minus the offset stored in the root node</li> | |
268 | + * <li><b>VLong</b>: right reference to {@link #mtasIndexObjectPositionFileName} | |
269 | + * minus the offset stored in the root node</li> | |
270 | + * <li><b>VInt</b>: number of objects on this node</li> | |
271 | + * <li><b>VLong</b>,<b>VInt</b>,<b>VLong</b>: set of the first reference to | |
272 | + * {@link #mtasObjectFileName} minus offset, the prefixId referring to the | |
273 | + * position of the prefix in {@link #mtasPrefixFileName} and the reference to | |
274 | + * {@link #mtasTermFileName} minus offset</li> | |
275 | + * <li><b>VLong</b>,<b>VInt</b>,<b>VLong</b>,...: for optional other sets of | |
276 | + * reference to {@link #mtasObjectFileName}, position of the prefix in | |
277 | + * {@link #mtasPrefixFileName} and the reference to {@link #mtasTermFileName}; | |
278 | + * for the first item the difference between this reference and the previous | |
279 | + * reference is stored</li> | |
280 | + * </ul> | |
281 | + * </li> | |
282 | + * <li><b>File {@link #mtasIndexObjectParentFileName} with extension | |
283 | + * {@value mtas.codec.MtasCodecPostingsFormat#MTAS_INDEX_OBJECT_PARENT_EXTENSION} | |
284 | + * </b><br> | |
285 | + * Contains for each document a tree structure {@link MtasTree} to search | |
286 | + * objects by parent. Structure of content for each node: | |
287 | + * <ul> | |
288 | + * <li><b>VLong</b>: offset references to {@link #mtasIndexObjectParentFileName} | |
289 | + * , only available in root node</li> | |
290 | + * <li><b>Byte</b>: flag, for this tree equal to | |
291 | + * {@link mtas.codec.tree.MtasTree#SINGLE_POSITION_TREE} indicating a tree with | |
292 | + * exactly one point at each node, only available in root node</li> | |
293 | + * <li><b>VInt</b>: left</li> | |
294 | + * <li><b>VInt</b>: right</li> | |
295 | + * <li><b>VInt</b>: max</li> | |
296 | + * <li><b>VLong</b>: left reference to {@link #mtasIndexObjectParentFileName} | |
297 | + * minus the offset stored in the root node</li> | |
298 | + * <li><b>VLong</b>: right reference to {@link #mtasIndexObjectParentFileName} | |
299 | + * minus the offset stored in the root node</li> | |
300 | + * <li><b>VInt</b>: number of objects on this node</li> | |
301 | + * <li><b>VLong</b>,<b>VInt</b>,<b>VLong</b>: set of the first reference to | |
302 | + * {@link #mtasObjectFileName} minus offset, the prefixId referring to the | |
303 | + * position of the prefix in {@link #mtasPrefixFileName} and the reference to | |
304 | + * {@link #mtasTermFileName} minus offset</li> | |
305 | + * <li><b>VLong</b>,<b>VInt</b>,<b>VLong</b>,...: for optional other sets of | |
306 | + * reference to {@link #mtasObjectFileName}, position of the prefix in | |
307 | + * {@link #mtasPrefixFileName} and the reference to {@link #mtasTermFileName}; | |
308 | + * for the first item the difference between this reference and the previous | |
309 | + * reference is stored</li> | |
310 | + * </ul> | |
311 | + * </li> | |
312 | + * </ul> | |
313 | + * </li> | |
314 | + * </ul> | |
315 | + * | |
47 | 316 | */ |
48 | 317 | |
49 | 318 | public class MtasFieldsConsumer extends FieldsConsumer { |
... | ... |
src/site/markdown/features.md
... | ... | @@ -9,10 +9,10 @@ |
9 | 9 | |
10 | 10 | ### Search |
11 | 11 | * Supports [CQL](search_cql.html) query language. |
12 | -* [Statistics](search_statistics.html) on number of [words](search_query_stats_positions.html), [tokens](search_query_stats_tokens.html) and [spans](search_query_stats_spans.html). | |
12 | +* [Statistics](search_stats.html) on number of [words](search_query_stats_positions.html), [tokens](search_query_stats_tokens.html) and [spans](search_query_stats_spans.html). | |
13 | 13 | * Usage of [functions](search_functions.html) to produce statistics for custom defined relations between multiple spans and/or number of words. |
14 | -* [Facets](search_facet.html) with [statistics](search_statistics.html) on hits. | |
15 | -* [Kwic and lists](search_kwic_and_list.html), [termvectors](search_termvector.html) and [grouping](search_group.html) for spans. | |
14 | +* [Facets](search_query_facet.html) with [statistics](search_stats.html) on hits. | |
15 | +* [Kwic and lists](search_query_kwic_and_list.html), [termvectors](search_query_termvector.html) and [grouping](search_query_group.html) for spans. | |
16 | 16 | |
17 | 17 | |
18 | 18 | ### Solr |
... | ... |