Commit 438ff8bff823e74c232f802939fc9371e00025d2

Authored by Matthijs Brouwer
1 parent f50d9e14

update

src/mtas/codec/MtasCodecPostingsFormat.java
... ... @@ -285,10 +285,6 @@ public class MtasCodecPostingsFormat extends PostingsFormat {
285 285 } catch (Exception e) {
286 286 throw new IOException(e.getMessage());
287 287 }
288   - Long termRef = inObject.readVLong();
289   - inTerm.seek(termRef);
290   - token.setTermRef(termRef);
291   - token.setValue(inTerm.readString());
292 288 return token;
293 289 }
294 290  
... ...
src/mtas/codec/MtasFieldsConsumer.java
... ... @@ -35,15 +35,284 @@ import org.apache.lucene.index.SegmentWriteState;
35 35 import org.apache.lucene.index.Terms;
36 36 import org.apache.lucene.index.TermsEnum;
37 37 import org.apache.lucene.search.DocIdSetIterator;
38   -import org.apache.lucene.store.IOContext;
39 38 import org.apache.lucene.store.IndexInput;
40 39 import org.apache.lucene.store.IndexOutput;
41   -import org.apache.lucene.store.Lock;
42 40 import org.apache.lucene.util.BytesRef;
43 41 import org.apache.lucene.util.IOUtils;
44 42  
45 43 /**
46 44 * The Class MtasFieldsConsumer.
  45 + *
  46 + *
  47 + * The Class MtasFieldsConsumer constructs several temporary and permanent
  48 + * files to provide a forward index.
  49 + *
  50 + * <ul>
  51 + * <li><b>Temporary files</b><br>
  52 + * <ul>
  53 + * <li><b>Temporary file {@link #mtasTmpFieldFileName} with extension
  54 + * {@value mtas.codec.MtasCodecPostingsFormat#MTAS_TMP_FIELD_EXTENSION} </b><br>
  55 + * Contains for each field a reference to the list of documents. Structure of
  56 + * content:
  57 + * <ul>
  58 + * <li><b>String</b>: field</li>
  59 + * <li><b>VLong</b>: reference to {@link #mtasDocFileName}</li>
  60 + * <li><b>VInt</b>: number of documents</li>
  61 + * <li><b>VLong</b>: reference to {@link #mtasTermFileName}</li>
  62 + * <li><b>VInt</b>: number of terms</li>
  63 + * <li><b>VLong</b>: reference to {@link #mtasPrefixFileName}</li>
  64 + * <li><b>VInt</b>: number of prefixes</li>
  65 + * </ul>
  66 + * </li>
  67 + * <li><b>Temporary file {@link #mtasTmpObjectFileName} with extension
  68 + * {@value mtas.codec.MtasCodecPostingsFormat#MTAS_TMP_OBJECT_EXTENSION}</b><br>
  69 + * Contains for a specific field all objects constructed by
  70 + * {@link #createObjectAndRegisterPrefix}. For all fields, the objects are later
  71 + * on copied to {@link #mtasObjectFileName} while statistics are collected.
  72 + * Structure of content identical to {@link #mtasObjectFileName}.</li>
  73 + * <li><b>Temporary file {@link #mtasTmpDocsFileName} with extension
  74 + * {@value mtas.codec.MtasCodecPostingsFormat#MTAS_TMP_DOCS_EXTENSION}</b> <br>
  75 + * Contains for a specific field for each doc multiple fragments. Each occurring
  76 + * term results in a fragment. Structure of content:
  77 + * <ul>
  78 + * <li><b>VInt</b>: docId</li>
  79 + * <li><b>VInt</b>: number of objects in this fragment</li>
  80 + * <li><b>VLong</b>: offset references to {@link #mtasTmpObjectFileName}</li>
  81 + * <li><b>VInt</b>,<b>VLong</b>: mtasId object, reference temporary object in
  82 + * {@link #mtasTmpObjectFileName} minus offset</li>
  83 + * <li><b>VInt</b>,<b>VLong</b>: ...</li>
  84 + * </ul>
  85 + * </li>
  86 + * <li><b>Temporary file {@link #mtasTmpDocsChainedFileName} with extension
  87 + * {@value mtas.codec.MtasCodecPostingsFormat#MTAS_TMP_DOCS_CHAINED_EXTENSION}
  88 + * </b><br>
  89 + * Contains for a specific field for each doc multiple chained fragments.
  90 + * Structure of content:
  91 + * <ul>
  92 + * <li><b>VInt</b>: docId</li>
  93 + * <li><b>VInt</b>: number of objects in this fragment</li>
  94 + * <li><b>VLong</b>: offset references to {@link #mtasTmpObjectFileName}</li>
  95 + * <li><b>VInt</b>,<b>VLong</b>: mtasId object, reference temporary object in
  96 + * {@link #mtasTmpObjectFileName} minus offset</li>
  97 + * <li><b>VInt</b>,<b>VLong</b>: ...</li>
  98 + * <li><b>VLong</b>: reference to next fragment in
  99 + * {@link #mtasTmpDocsChainedFileName}, self reference indicates end of chain</li>
  100 + * </ul>
  101 + * </li>
  102 + * <li><b>Temporary file {@link #mtasTmpDocFileName} with extension
  103 + * {@value mtas.codec.MtasCodecPostingsFormat#MTAS_TMP_DOC_EXTENSION}</b><br>
  104 + * For each document
  105 + * <ul>
  106 + * <li><b>VInt</b>: docId</li>
  107 + * <li><b>VLong</b>: reference to {@link #mtasIndexObjectIdFileName}</li>
  108 + * <li><b>VLong</b>: reference to first object, used as offset for tree index</li>
  109 + * <li><b>VInt</b>: slope used in approximation reference objects index on id
  110 + * </li>
  111 + * <li><b>ZLong</b>: offset used in approximation reference objects index on id
  112 + * </li>
  113 + * <li><b>Byte</b>: flag indicating how corrections on the approximation
  114 + * references objects for the index on id are stored:
  115 + * {@link MtasCodecPostingsFormat#MTAS_STORAGE_BYTE},
  116 + * {@link MtasCodecPostingsFormat#MTAS_STORAGE_SHORT},
  117 + * {@link MtasCodecPostingsFormat#MTAS_STORAGE_INTEGER} or
  118 + * {@link MtasCodecPostingsFormat#MTAS_STORAGE_LONG}</li>
  119 + * <li><b>VInt</b>: number of objects in this document</li>
  120 + * <li><b>VInt</b>: first position</li>
  121 + * <li><b>VInt</b>: last position</li>
  122 + * </ul>
  123 + * </li>
  124 + * </ul>
  125 + * </li>
  126 + * <li><b>Final files</b><br>
  127 + * <ul>
  128 + * <li><b>File {@link #mtasIndexFieldFileName} with extension
  129 + * {@value mtas.codec.MtasCodecPostingsFormat#MTAS_FIELD_EXTENSION}</b><br>
  130 + * Contains for each field a reference to the list of documents and the
  131 + * prefixes. Structure of content:
  132 + * <ul>
  133 + * <li><b>String</b>: field</li>
  134 + * <li><b>VLong</b>: reference to {@link #mtasDocFileName}</li>
  135 + * <li><b>VLong</b>: reference to {@link #mtasIndexDocIdFileName}</li>
  136 + * <li><b>VInt</b>: number of documents</li>
  137 + * <li><b>VLong</b>: reference to {@link #mtasTermFileName}</li>
  138 + * <li><b>VInt</b>: number of terms</li>
  139 + * <li><b>VLong</b>: reference to {@link #mtasPrefixFileName}</li>
  140 + * <li><b>VInt</b>: number of prefixes</li>
  141 + * </ul>
  142 + * </li>
  143 + * <li><b>File {@link #mtasTermFileName} with extension
  144 + * {@value mtas.codec.MtasCodecPostingsFormat#MTAS_TERM_EXTENSION}</b><br>
  145 + * For each field, all unique terms are stored here. Structure of content:
  146 + * <ul>
  147 + * <li><b>String</b>: term</li>
  148 + * </ul>
  149 + * </li>
  150 + * <li><b>File {@link #mtasPrefixFileName} with extension
  151 + * {@value mtas.codec.MtasCodecPostingsFormat#MTAS_PREFIX_EXTENSION}</b><br>
  152 + * For each field, all unique prefixes are stored here. Structure of content:
  153 + * <ul>
  154 + * <li><b>String</b>: prefix</li>
  155 + * </ul>
  156 + * </li>
  157 + * <li><b>File {@link #mtasObjectFileName} with extension
  158 + * {@value mtas.codec.MtasCodecPostingsFormat#MTAS_OBJECT_EXTENSION}</b><br>
  159 + * Contains all objects for all fields. Structure of content:
  160 + * <ul>
  161 + * <li><b>VInt</b>: mtasId</li>
  162 + * <li><b>VInt</b>: objectFlags
  163 + * <ul>
  164 + * <li>{@link MtasCodecPostingsFormat#MTAS_OBJECT_HAS_PARENT}</li>
  165 + * <li>{@link MtasCodecPostingsFormat#MTAS_OBJECT_HAS_POSITION_RANGE}</li>
  166 + * <li>{@link MtasCodecPostingsFormat#MTAS_OBJECT_HAS_POSITION_SET}</li>
  167 + * <li>{@link MtasCodecPostingsFormat#MTAS_OBJECT_HAS_OFFSET}</li>
  168 + * <li>{@link MtasCodecPostingsFormat#MTAS_OBJECT_HAS_REALOFFSET}</li>
  169 + * <li>{@link MtasCodecPostingsFormat#MTAS_OBJECT_HAS_PAYLOAD}</li>
  170 + * </ul>
  171 + * </li>
  172 + * <li>Only if {@link MtasCodecPostingsFormat#MTAS_OBJECT_HAS_PARENT}<br>
  173 + * <b>VInt</b>: parentId</li>
  174 + * <li>Only if
  175 + * {@link MtasCodecPostingsFormat#MTAS_OBJECT_HAS_POSITION_RANGE}<br>
  176 + * <b>VInt</b>,<b>VInt</b>: startPosition and (endPosition-startPosition)</li>
  177 + * <li>Only if {@link MtasCodecPostingsFormat#MTAS_OBJECT_HAS_POSITION_SET}<br>
  178 + * <b>VInt</b>,<b>VInt</b>,<b>VInt</b>,...: number of positions, firstPosition,
  179 + * (position-previousPosition),...</li>
  180 + * <li>Only if neither {@link MtasCodecPostingsFormat#MTAS_OBJECT_HAS_POSITION_RANGE}
  181 + * nor {@link MtasCodecPostingsFormat#MTAS_OBJECT_HAS_POSITION_SET}<br>
  182 + * <b>VInt</b>: position</li>
  183 + * <li>Only if {@link MtasCodecPostingsFormat#MTAS_OBJECT_HAS_OFFSET}<br>
  184 + * <b>VInt</b>,<b>VInt</b>: startOffset, (endOffset-startOffset)</li>
  185 + * <li>Only if {@link MtasCodecPostingsFormat#MTAS_OBJECT_HAS_REALOFFSET}<br>
  186 + * <b>VInt</b>,<b>VInt</b>: startRealOffset, (endRealOffset-startRealOffset)</li>
  187 + * <li>Only if {@link MtasCodecPostingsFormat#MTAS_OBJECT_HAS_PAYLOAD}<br>
  188 + * <b>VInt</b>,<b>Bytes</b>: number of bytes, payload</li>
  189 + * <li><b>VLong</b>: reference to Term in {@link #mtasTermFileName}</li>
  190 + * </ul>
  191 + * </li>
  192 + * <li><b>File {@link #mtasIndexDocIdFileName} with extension
  193 + * {@value mtas.codec.MtasCodecPostingsFormat#MTAS_INDEX_DOC_ID_EXTENSION}
  194 + * </b><br>
  195 + * Contains for each field a tree structure {@link MtasTree} to search reference
  196 + * to {@link #mtasDocFileName} by id. Structure of content for each node:
  197 + * <ul>
  198 + * <li><b>VLong</b>: offset references to {@link #mtasIndexDocIdFileName}, only
  199 + * available in root node</li>
  200 + * <li><b>Byte</b>: flag, should be zero for this tree, only available in root
  201 + * node</li>
  202 + * <li><b>VInt</b>: left</li>
  203 + * <li><b>VInt</b>: right</li>
  204 + * <li><b>VInt</b>: max</li>
  205 + * <li><b>VLong</b>: left reference to {@link #mtasIndexDocIdFileName} minus the
  206 + * offset stored in the root node</li>
  207 + * <li><b>VLong</b>: right reference to {@link #mtasIndexDocIdFileName} minus
  208 + * the offset stored in the root node</li>
  209 + * <li><b>VInt</b>: number of objects on this node (always 1 for this tree)</li>
  210 + * <li><b>VLong</b>: reference to {@link #mtasDocFileName} minus offset</li>
  211 + * </ul>
  212 + * </li>
  213 + * <li><b>File {@link #mtasDocFileName} with extension
  214 + * {@value mtas.codec.MtasCodecPostingsFormat#MTAS_DOC_EXTENSION}</b><br>
  215 + * For each document
  216 + * <ul>
  217 + * <li><b>VInt</b>: docId</li>
  218 + * <li><b>VLong</b>: reference to {@link #mtasIndexObjectIdFileName}</li>
  219 + * <li><b>VLong</b>: reference to {@link #mtasIndexObjectPositionFileName}</li>
  220 + * <li><b>VLong</b>: reference to {@link #mtasIndexObjectParentFileName}</li>
  221 + * <li><b>VLong</b>: reference to first object, used as offset for tree index</li>
  222 + * <li><b>VInt</b>: slope used in approximation reference objects index on id
  223 + * </li>
  224 + * <li><b>ZLong</b>: offset used in approximation reference objects index on id
  225 + * </li>
  226 + * <li><b>Byte</b>: flag indicating how corrections on the approximation
  227 + * references objects for the index on id are stored:
  228 + * {@link MtasCodecPostingsFormat#MTAS_STORAGE_BYTE},
  229 + * {@link MtasCodecPostingsFormat#MTAS_STORAGE_SHORT},
  230 + * {@link MtasCodecPostingsFormat#MTAS_STORAGE_INTEGER} or
  231 + * {@link MtasCodecPostingsFormat#MTAS_STORAGE_LONG}</li>
  232 + * <li><b>VInt</b>: number of objects</li>
  233 + * <li><b>VInt</b>: first position</li>
  234 + * <li><b>VInt</b>: last position</li>
  235 + * </ul>
  236 + * </li>
  237 + * <li><b>File {@link #mtasIndexObjectIdFileName} with extension
  238 + * {@value mtas.codec.MtasCodecPostingsFormat#MTAS_INDEX_OBJECT_ID_EXTENSION}
  239 + * </b><br>
  240 + * Provides for each mtasId the reference to {@link #mtasObjectFileName}. These
  241 + * references are grouped by document, sorted by mtasId, and because the
  242 + * mtasIds for each document will always start with 0 and are sequential
  243 + * without gaps, a reference can be computed if the position of the first
  244 + * reference for a document is known from {@link #mtasDocFileName}. The
  245 + * reference is approximated by the reference to the first object plus the
  246 + * mtasId times a slope. Only a correction to this approximation is stored.
  247 + * Structure of content:
  248 + * <ul>
  249 + * <li><b>Byte</b>/<b>Short</b>/<b>Int</b>/<b>Long</b>: correction reference to
  250 + * {@link #mtasObjectFileName}</li>
  251 + * </ul>
  252 + * </li>
  253 + * <li><b>File {@link #mtasIndexObjectPositionFileName} with extension
  254 + * {@value mtas.codec.MtasCodecPostingsFormat#MTAS_INDEX_OBJECT_POSITION_EXTENSION}
  255 + * </b><br>
  256 + * Contains for each document a tree structure {@link MtasTree} to search
  257 + * objects by position. Structure of content for each node:
  258 + * <ul>
  259 + * <li><b>VLong</b>: offset references to
  260 + * {@link #mtasIndexObjectPositionFileName}, only available in root node</li>
  261 + * <li><b>Byte</b>: flag, should be zero for this tree, only available in root
  262 + * node</li>
  263 + * <li><b>VInt</b>: left</li>
  264 + * <li><b>VInt</b>: right</li>
  265 + * <li><b>VInt</b>: max</li>
  266 + * <li><b>VLong</b>: left reference to {@link #mtasIndexObjectPositionFileName}
  267 + * minus the offset stored in the root node</li>
  268 + * <li><b>VLong</b>: right reference to {@link #mtasIndexObjectPositionFileName}
  269 + * minus the offset stored in the root node</li>
  270 + * <li><b>VInt</b>: number of objects on this node</li>
  271 + * <li><b>VLong</b>,<b>VInt</b>,<b>VLong</b>: set of the first reference to
  272 + * {@link #mtasObjectFileName} minus offset, the prefixId referring to the
  273 + * position of the prefix in {@link #mtasPrefixFileName} and the reference to
  274 + * {@link #mtasTermFileName} minus offset</li>
  275 + * <li><b>VLong</b>,<b>VInt</b>,<b>VLong</b>,...: for optional other sets of
  276 + * reference to {@link #mtasObjectFileName}, position of the prefix in
  277 + * {@link #mtasPrefixFileName} and the reference to {@link #mtasTermFileName};
  278 + * for the first item the difference between this reference and the previous
  279 + * reference is stored</li>
  280 + * </ul>
  281 + * </li>
  282 + * <li><b>File {@link #mtasIndexObjectParentFileName} with extension
  283 + * {@value mtas.codec.MtasCodecPostingsFormat#MTAS_INDEX_OBJECT_PARENT_EXTENSION}
  284 + * </b><br>
  285 + * Contains for each document a tree structure {@link MtasTree} to search
  286 + * objects by parent. Structure of content for each node:
  287 + * <ul>
  288 + * <li><b>VLong</b>: offset references to
  289 + * {@link #mtasIndexObjectParentFileName}, only available in root node</li>
  290 + * <li><b>Byte</b>: flag, for this tree equal to
  291 + * {@link mtas.codec.tree.MtasTree#SINGLE_POSITION_TREE} indicating a tree with
  292 + * exactly one point at each node, only available in root node</li>
  293 + * <li><b>VInt</b>: left</li>
  294 + * <li><b>VInt</b>: right</li>
  295 + * <li><b>VInt</b>: max</li>
  296 + * <li><b>VLong</b>: left reference to {@link #mtasIndexObjectParentFileName}
  297 + * minus the offset stored in the root node</li>
  298 + * <li><b>VLong</b>: right reference to {@link #mtasIndexObjectParentFileName}
  299 + * minus the offset stored in the root node</li>
  300 + * <li><b>VInt</b>: number of objects on this node</li>
  301 + * <li><b>VLong</b>,<b>VInt</b>,<b>VLong</b>: set of the first reference to
  302 + * {@link #mtasObjectFileName} minus offset, the prefixId referring to the
  303 + * position of the prefix in {@link #mtasPrefixFileName} and the reference to
  304 + * {@link #mtasTermFileName} minus offset</li>
  305 + * <li><b>VLong</b>,<b>VInt</b>,<b>VLong</b>,...: for optional other sets of
  306 + * reference to {@link #mtasObjectFileName}, position of the prefix in
  307 + * {@link #mtasPrefixFileName} and the reference to {@link #mtasTermFileName};
  308 + * for the first item the difference between this reference and the previous
  309 + * reference is stored</li>
  310 + * </ul>
  311 + * </li>
  312 + * </ul>
  313 + * </li>
  314 + * </ul>
  315 + *
47 316 */
48 317  
49 318 public class MtasFieldsConsumer extends FieldsConsumer {
... ...
src/site/markdown/features.md
... ... @@ -9,10 +9,10 @@
9 9  
10 10 ### Search
11 11 * Supports [CQL](search_cql.html) query language.
12   -* [Statistics](search_statistics.html) on number of [words](search_query_stats_positions.html), [tokens](search_query_stats_tokens.html) and [spans](search_query_stats_spans.html).
  12 +* [Statistics](search_stats.html) on number of [words](search_query_stats_positions.html), [tokens](search_query_stats_tokens.html) and [spans](search_query_stats_spans.html).
13 13 * Usage of [functions](search_functions.html) to produce statistics for custom defined relations between multiple spans and/or number of words.
14   -* [Facets](search_facet.html) with [statistics](search_statistics.html) on hits.
15   -* [Kwic and lists](search_kwic_and_list.html), [termvectors](search_termvector.html) and [grouping](search_group.html) for spans.
  14 +* [Facets](search_query_facet.html) with [statistics](search_stats.html) on hits.
  15 +* [Kwic and lists](search_query_kwic_and_list.html), [termvectors](search_query_termvector.html) and [grouping](search_query_group.html) for spans.
16 16  
17 17  
18 18 ### Solr
... ...