Commit 1d031eeefd184d632ab5bb8fcb6bba261a798b7c

Authored by Matthijs Brouwer
1 parent 47700394

document/intersecting

docker/Dockerfile
1 1 # Automatically generated Dockerfile
2   -# - Build 2017-01-31 12:02
  2 +# - Build 2017-02-07 07:29
3 3 # - Lucene/Solr version 6.4.0
4 4 # - Mtas release 20170131
5 5 #
... ... @@ -55,7 +55,7 @@ RUN apt-get update && apt-get install -y lsof software-properties-common python-
55 55 && chmod -R 755 /var/www/html \
56 56 && printf "echo\n" >> /start.sh \
57 57 && printf "echo \"================ Mtas -- Multi Tier Annotation Search =================\"\n" >> /start.sh \
58   -&& printf "echo \" Timestamp 2017-01-31 12:02\"\n" >> /start.sh \
  58 +&& printf "echo \" Timestamp 2017-02-07 07:29\"\n" >> /start.sh \
59 59 && printf "echo \" Lucene/Solr version 6.4.0\"\n" >> /start.sh \
60 60 && printf "echo \" Mtas release 20170131\"\n" >> /start.sh \
61 61 && printf "echo \" See https://meertensinstituut.github.io/mtas/ for more information\"\n" >> /start.sh \
... ...
src/mtas/analysis/token/MtasToken.java
... ... @@ -4,7 +4,10 @@ import java.io.IOException;
4 4 import java.io.UnsupportedEncodingException;
5 5 import java.util.ArrayList;
6 6 import java.util.Arrays;
  7 +import java.util.Collection;
  8 +import java.util.HashMap;
7 9 import java.util.List;
  10 +import java.util.Set;
8 11 import java.util.TreeSet;
9 12 import java.util.regex.Matcher;
10 13 import java.util.regex.Pattern;
... ... @@ -13,6 +16,7 @@ import org.apache.commons.lang.ArrayUtils;
13 16 import org.apache.lucene.analysis.payloads.PayloadHelper;
14 17 import org.apache.lucene.util.BytesRef;
15 18 import org.apache.lucene.util.automaton.Automaton;
  19 +import org.apache.lucene.util.automaton.ByteRunAutomaton;
16 20 import org.apache.lucene.util.automaton.CompiledAutomaton;
17 21 import org.apache.lucene.util.automaton.Operations;
18 22 import org.apache.lucene.util.automaton.RegExp;
... ... @@ -21,7 +25,8 @@ import org.apache.lucene.util.automaton.TooComplexToDeterminizeException;
21 25 /**
22 26 * The Class MtasToken.
23 27 *
24   - * @param <GenericType> the generic type
  28 + * @param <GenericType>
  29 + * the generic type
25 30 */
26 31 public abstract class MtasToken<GenericType> {
27 32  
... ... @@ -81,8 +86,10 @@ public abstract class MtasToken&lt;GenericType&gt; {
81 86 /**
82 87 * Instantiates a new mtas token.
83 88 *
84   - * @param tokenId the token id
85   - * @param value the value
  89 + * @param tokenId
  90 + * the token id
  91 + * @param value
  92 + * the value
86 93 */
87 94 protected MtasToken(Integer tokenId, String value) {
88 95 this.tokenId = tokenId;
... ... @@ -93,9 +100,12 @@ public abstract class MtasToken&lt;GenericType&gt; {
93 100 /**
94 101 * Instantiates a new mtas token.
95 102 *
96   - * @param tokenId the token id
97   - * @param value the value
98   - * @param position the position
  103 + * @param tokenId
  104 + * the token id
  105 + * @param value
  106 + * the value
  107 + * @param position
  108 + * the position
99 109 */
100 110 protected MtasToken(Integer tokenId, String value, Integer position) {
101 111 this(tokenId, value);
... ... @@ -105,7 +115,8 @@ public abstract class MtasToken&lt;GenericType&gt; {
105 115 /**
106 116 * Sets the token ref.
107 117 *
108   - * @param ref the new token ref
  118 + * @param ref
  119 + * the new token ref
109 120 */
110 121 final public void setTokenRef(Long ref) {
111 122 tokenRef = ref;
... ... @@ -123,7 +134,8 @@ public abstract class MtasToken&lt;GenericType&gt; {
123 134 /**
124 135 * Sets the term ref.
125 136 *
126   - * @param ref the new term ref
  137 + * @param ref
  138 + * the new term ref
127 139 */
128 140 final public void setTermRef(Long ref) {
129 141 termRef = ref;
... ... @@ -141,7 +153,8 @@ public abstract class MtasToken&lt;GenericType&gt; {
141 153 /**
142 154 * Sets the prefix id.
143 155 *
144   - * @param id the new prefix id
  156 + * @param id
  157 + * the new prefix id
145 158 */
146 159 final public void setPrefixId(int id) {
147 160 prefixId = id;
... ... @@ -151,7 +164,8 @@ public abstract class MtasToken&lt;GenericType&gt; {
151 164 * Gets the prefix id.
152 165 *
153 166 * @return the prefix id
154   - * @throws IOException Signals that an I/O exception has occurred.
  167 + * @throws IOException
  168 + * Signals that an I/O exception has occurred.
155 169 */
156 170 final public int getPrefixId() throws IOException {
157 171 if (prefixId != null) {
... ... @@ -164,7 +178,8 @@ public abstract class MtasToken&lt;GenericType&gt; {
164 178 /**
165 179 * Sets the id.
166 180 *
167   - * @param id the new id
  181 + * @param id
  182 + * the new id
168 183 */
169 184 final public void setId(Integer id) {
170 185 tokenId = id;
... ... @@ -182,7 +197,8 @@ public abstract class MtasToken&lt;GenericType&gt; {
182 197 /**
183 198 * Sets the parent id.
184 199 *
185   - * @param id the new parent id
  200 + * @param id
  201 + * the new parent id
186 202 */
187 203 final public void setParentId(Integer id) {
188 204 tokenParentId = id;
... ... @@ -200,7 +216,8 @@ public abstract class MtasToken&lt;GenericType&gt; {
200 216 /**
201 217 * Sets the provide parent id.
202 218 *
203   - * @param provide the new provide parent id
  219 + * @param provide
  220 + * the new provide parent id
204 221 */
205 222 final public void setProvideParentId(Boolean provide) {
206 223 provideParentId = provide;
... ... @@ -234,7 +251,8 @@ public abstract class MtasToken&lt;GenericType&gt; {
234 251 /**
235 252 * Adds the position.
236 253 *
237   - * @param position the position
  254 + * @param position
  255 + * the position
238 256 */
239 257 final public void addPosition(int position) {
240 258 if (tokenPosition == null) {
... ... @@ -247,8 +265,10 @@ public abstract class MtasToken&lt;GenericType&gt; {
247 265 /**
248 266 * Adds the position range.
249 267 *
250   - * @param start the start
251   - * @param end the end
  268 + * @param start
  269 + * the start
  270 + * @param end
  271 + * the end
252 272 */
253 273 final public void addPositionRange(int start, int end) {
254 274 if (tokenPosition == null) {
... ... @@ -265,7 +285,8 @@ public abstract class MtasToken&lt;GenericType&gt; {
265 285 /**
266 286 * Adds the positions.
267 287 *
268   - * @param positions the positions
  288 + * @param positions
  289 + * the positions
269 290 */
270 291 final public void addPositions(int[] positions) {
271 292 if (positions != null && positions.length > 0) {
... ... @@ -280,7 +301,8 @@ public abstract class MtasToken&lt;GenericType&gt; {
280 301 /**
281 302 * Adds the positions.
282 303 *
283   - * @param list the list
  304 + * @param list
  305 + * the list
284 306 */
285 307 final public void addPositions(TreeSet<Integer> list) {
286 308 int[] positions = ArrayUtils
... ... @@ -291,7 +313,8 @@ public abstract class MtasToken&lt;GenericType&gt; {
291 313 /**
292 314 * Check position type.
293 315 *
294   - * @param type the type
  316 + * @param type
  317 + * the type
295 318 * @return the boolean
296 319 */
297 320 final public Boolean checkPositionType(String type) {
... ... @@ -368,8 +391,10 @@ public abstract class MtasToken&lt;GenericType&gt; {
368 391 /**
369 392 * Sets the offset.
370 393 *
371   - * @param start the start
372   - * @param end the end
  394 + * @param start
  395 + * the start
  396 + * @param end
  397 + * the end
373 398 */
374 399 final public void setOffset(Integer start, Integer end) {
375 400 if ((start == null) || (end == null)) {
... ... @@ -384,8 +409,10 @@ public abstract class MtasToken&lt;GenericType&gt; {
384 409 /**
385 410 * Adds the offset.
386 411 *
387   - * @param start the start
388   - * @param end the end
  412 + * @param start
  413 + * the start
  414 + * @param end
  415 + * the end
389 416 */
390 417 final public void addOffset(Integer start, Integer end) {
391 418 if (tokenOffset == null) {
... ... @@ -402,7 +429,8 @@ public abstract class MtasToken&lt;GenericType&gt; {
402 429 /**
403 430 * Sets the provide offset.
404 431 *
405   - * @param provide the new provide offset
  432 + * @param provide
  433 + * the new provide offset
406 434 */
407 435 final public void setProvideOffset(Boolean provide) {
408 436 provideOffset = provide;
... ... @@ -411,8 +439,10 @@ public abstract class MtasToken&lt;GenericType&gt; {
411 439 /**
412 440 * Sets the real offset.
413 441 *
414   - * @param start the start
415   - * @param end the end
  442 + * @param start
  443 + * the start
  444 + * @param end
  445 + * the end
416 446 */
417 447 final public void setRealOffset(Integer start, Integer end) {
418 448 if ((start == null) || (end == null)) {
... ... @@ -428,7 +458,8 @@ public abstract class MtasToken&lt;GenericType&gt; {
428 458 /**
429 459 * Sets the provide real offset.
430 460 *
431   - * @param provide the new provide real offset
  461 + * @param provide
  462 + * the new provide real offset
432 463 */
433 464 final public void setProvideRealOffset(Boolean provide) {
434 465 provideRealOffset = provide;
... ... @@ -491,7 +522,8 @@ public abstract class MtasToken&lt;GenericType&gt; {
491 522 /**
492 523 * Sets the value.
493 524 *
494   - * @param value the new value
  525 + * @param value
  526 + * the new value
495 527 */
496 528 public void setValue(String value) {
497 529 tokenValue = value;
... ... @@ -500,7 +532,8 @@ public abstract class MtasToken&lt;GenericType&gt; {
500 532 /**
501 533 * Gets the prefix from value.
502 534 *
503   - * @param value the value
  535 + * @param value
  536 + * the value
504 537 * @return the prefix from value
505 538 */
506 539 public static String getPrefixFromValue(String value) {
... ... @@ -521,7 +554,8 @@ public abstract class MtasToken&lt;GenericType&gt; {
521 554 /**
522 555 * Gets the postfix from value.
523 556 *
524   - * @param value the value
  557 + * @param value
  558 + * the value
525 559 * @return the postfix from value
526 560 */
527 561 public static String getPostfixFromValue(String value) {
... ... @@ -537,7 +571,8 @@ public abstract class MtasToken&lt;GenericType&gt; {
537 571 /**
538 572 * Gets the postfix from value.
539 573 *
540   - * @param term the term
  574 + * @param term
  575 + * the term
541 576 * @return the postfix from value
542 577 */
543 578 public static String getPostfixFromValue(BytesRef term) {
... ... @@ -671,7 +706,8 @@ public abstract class MtasToken&lt;GenericType&gt; {
671 706 /**
672 707 * Sets the payload.
673 708 *
674   - * @param payload the new payload
  709 + * @param payload
  710 + * the new payload
675 711 */
676 712 public void setPayload(BytesRef payload) {
677 713 tokenPayload = payload;
... ... @@ -686,17 +722,46 @@ public abstract class MtasToken&lt;GenericType&gt; {
686 722 return tokenPayload;
687 723 }
688 724  
  725 + public static HashMap<String, Automaton> createAutomatonMap(String prefix,
  726 + List<String> valueList, Boolean filter) {
  727 + HashMap<String, Automaton> automatonMap = new HashMap<String, Automaton>();
  728 + if (valueList != null) {
  729 + for (String item : valueList) {
  730 + if (filter) {
  731 + item = item.replaceAll("([\\\"\\)\\(\\<\\>\\.\\@\\#\\]\\[\\{\\}])",
  732 + "\\\\\\1");
  733 + }
  734 + automatonMap.put(item,
  735 + new RegExp(prefix + MtasToken.DELIMITER + item + "\u0000*").toAutomaton());
  736 + }
  737 + }
  738 + return automatonMap;
  739 + }
  740 +
  741 + public static HashMap<String, ByteRunAutomaton> byteRunAutomatonMap(HashMap<String, Automaton> automatonMap) {
  742 + HashMap<String, ByteRunAutomaton> byteRunAutomatonMap = new HashMap<String, ByteRunAutomaton>();
  743 + if(automatonMap!=null) {
  744 + for(String key : automatonMap.keySet()) {
  745 + byteRunAutomatonMap.put(key, new ByteRunAutomaton(automatonMap.get(key)));
  746 + }
  747 + }
  748 + return byteRunAutomatonMap;
  749 + }
689 750 /**
690 751 * Creates the automata.
691 752 *
692   - * @param prefix the prefix
693   - * @param regexp the regexp
694   - * @param valueList the value list
  753 + * @param prefix
  754 + * the prefix
  755 + * @param regexp
  756 + * the regexp
  757 + * @param automatonMap
  758 + * the automata to combine, keyed by value
695 759 * @return the list
696   - * @throws IOException Signals that an I/O exception has occurred.
  760 + * @throws IOException
  761 + * Signals that an I/O exception has occurred.
697 762 */
698 763 public static List<CompiledAutomaton> createAutomata(String prefix,
699   - String regexp, List<String> valueList) throws IOException {
  764 + String regexp, HashMap<String, Automaton> automatonMap) throws IOException {
700 765 List<CompiledAutomaton> list = new ArrayList<CompiledAutomaton>();
701 766 Automaton automatonRegexp = null;
702 767 if (regexp != null) {
... ... @@ -704,26 +769,22 @@ public abstract class MtasToken&lt;GenericType&gt; {
704 769 automatonRegexp = re.toAutomaton();
705 770 }
706 771 int step = 500;
707   - for (int i = 0; i < valueList.size(); i += step) {
  772 + List<String> keyList = new ArrayList<String>(automatonMap.keySet());
  773 + for (int i = 0; i < keyList.size(); i += step) {
708 774 int localStep = step;
709 775 boolean success = false;
710 776 CompiledAutomaton compiledAutomaton = null;
711 777 while (!success) {
712 778 success = true;
713   - int next = Math.min(valueList.size(), i + localStep);
  779 + int next = Math.min(keyList.size(), i + localStep);
714 780 List<Automaton> listAutomaton = new ArrayList<Automaton>();
715 781 for (int j = i; j < next; j++) {
716   - String value = valueList.get(j);
717   - value = value.replaceAll("([\\\"\\)\\(\\<\\>\\.\\@\\#\\]\\[\\{\\}])",
718   - "\\\\\\1");
719   - listAutomaton.add(
720   - (new RegExp(prefix + MtasToken.DELIMITER + value + "\u0000*"))
721   - .toAutomaton());
  782 + listAutomaton.add(automatonMap.get(keyList.get(j)));
722 783 }
723 784 Automaton automatonList = Operations.union(listAutomaton);
724 785 Automaton automaton;
725 786 if (automatonRegexp != null) {
726   - automaton = Operations.intersection(automatonList, automatonRegexp);
  787 + automaton = Operations.intersection(automatonList, automatonRegexp);
727 788 } else {
728 789 automaton = automatonList;
729 790 }
... ...
src/mtas/codec/util/CodecCollector.java
... ... @@ -55,17 +55,24 @@ import org.apache.lucene.index.IndexableField;
55 55 import org.apache.lucene.index.LeafReader;
56 56 import org.apache.lucene.index.LeafReaderContext;
57 57 import org.apache.lucene.index.PostingsEnum;
  58 +import org.apache.lucene.index.SingleTermsEnum;
58 59 import org.apache.lucene.index.Term;
59 60 import org.apache.lucene.index.Terms;
60 61 import org.apache.lucene.index.TermsEnum;
  62 +import org.apache.lucene.search.AutomatonQuery;
61 63 import org.apache.lucene.search.DocIdSetIterator;
62 64 import org.apache.lucene.search.IndexSearcher;
63 65 import org.apache.lucene.search.spans.SpanWeight;
64 66 import org.apache.lucene.search.spans.Spans;
65 67 import org.apache.lucene.util.Bits;
66 68 import org.apache.lucene.util.BytesRef;
  69 +import org.apache.lucene.util.BytesRefBuilder;
67 70 import org.apache.lucene.util.LegacyNumericUtils;
  71 +import org.apache.lucene.util.automaton.Automaton;
  72 +import org.apache.lucene.util.automaton.ByteRunAutomaton;
68 73 import org.apache.lucene.util.automaton.CompiledAutomaton;
  74 +import org.apache.lucene.util.automaton.Operations;
  75 +import org.apache.lucene.util.automaton.RegExp;
69 76  
70 77 /**
71 78 * The Class CodecCollector.
... ... @@ -2012,6 +2019,9 @@ public class CodecCollector {
2012 2019 IndexSearcher searcher, Terms t, LeafReader r, LeafReaderContext lrc)
2013 2020 throws IOException {
2014 2021 if (documentList != null) {
  2022 + TreeSet<String> listStatsItems = CodecUtil.createStatsItems("sum");
  2023 + String listStatsType = CodecUtil.createStatsType(listStatsItems,
  2024 + CodecUtil.STATS_TYPE_SUM, null);
2015 2025 for (ComponentDocument document : documentList) {
2016 2026 // initialize
2017 2027 for (int docId : docList) {
... ... @@ -2019,9 +2029,6 @@ public class CodecCollector {
2019 2029 Document doc = searcher.doc(docId,
2020 2030 new HashSet<String>(Arrays.asList(uniqueKeyField)));
2021 2031 IndexableField indxfld = doc.getField(uniqueKeyField);
2022   - TreeSet<String> listStatsItems = CodecUtil.createStatsItems("sum");
2023   - String listStatsType = CodecUtil.createStatsType(listStatsItems,
2024   - CodecUtil.STATS_TYPE_SUM, null);
2025 2032 // get other doc info
2026 2033 if (indxfld != null) {
2027 2034 document.uniqueKey.put(docId, indxfld.stringValue());
... ... @@ -2031,10 +2038,30 @@ public class CodecCollector {
2031 2038 null, null);
2032 2039 document.statsData.put(docId, stats);
2033 2040 if (document.statsList != null) {
2034   - MtasDataCollector<?, ?> list = DataCollector.getCollector(
2035   - DataCollector.COLLECTOR_TYPE_LIST, CodecUtil.DATA_TYPE_LONG,
2036   - listStatsType, listStatsItems, CodecUtil.STATS_TYPE_SUM,
2037   - CodecUtil.SORT_DESC, 0, document.number, null, null);
  2041 + MtasDataCollector<?, ?> list;
  2042 + if (document.listExpand) {
  2043 + TreeSet<String>[] baseStatsItems = new TreeSet[] {
  2044 + listStatsItems };
  2045 + list = DataCollector.getCollector(
  2046 + DataCollector.COLLECTOR_TYPE_LIST, CodecUtil.DATA_TYPE_LONG,
  2047 + listStatsType, listStatsItems, CodecUtil.STATS_TYPE_SUM,
  2048 + CodecUtil.SORT_DESC, 0, document.listNumber,
  2049 + new String[] { DataCollector.COLLECTOR_TYPE_LIST },
  2050 + new String[] { CodecUtil.DATA_TYPE_LONG },
  2051 + new String[] { listStatsType },
  2052 + Arrays.copyOfRange(baseStatsItems, 0,
  2053 + baseStatsItems.length),
  2054 + new String[] { CodecUtil.STATS_TYPE_SUM },
  2055 + new String[] { CodecUtil.SORT_DESC }, new Integer[] { 0 },
  2056 + new Integer[] { document.listExpandNumber }, null, null);
  2057 + } else {
  2058 + TreeSet<String>[] baseStatsItems = new TreeSet[] {
  2059 + listStatsItems };
  2060 + list = DataCollector.getCollector(
  2061 + DataCollector.COLLECTOR_TYPE_LIST, CodecUtil.DATA_TYPE_LONG,
  2062 + listStatsType, listStatsItems, CodecUtil.STATS_TYPE_SUM,
  2063 + CodecUtil.SORT_DESC, 0, document.listNumber, null, null);
  2064 + }
2038 2065 document.statsList.put(docId, list);
2039 2066 }
2040 2067 }
... ... @@ -2049,55 +2076,142 @@ public class CodecCollector {
2049 2076 for (ComponentDocument document : documentList) {
2050 2077  
2051 2078 List<CompiledAutomaton> listAutomata;
  2079 + HashMap<String, Automaton> automatonMap;
  2080 + HashMap<String, ByteRunAutomaton> byteRunAutomatonMap;
2052 2081 if (document.list == null) {
  2082 + automatonMap = null;
  2083 + byteRunAutomatonMap = null;
2053 2084 listAutomata = new ArrayList<CompiledAutomaton>();
2054   - listAutomata.add(document.compiledAutomaton);
  2085 + CompiledAutomaton compiledAutomaton;
  2086 + Automaton automaton;
  2087 + if ((document.regexp == null) || (document.regexp.isEmpty())) {
  2088 + RegExp re = new RegExp(
  2089 + document.prefix + MtasToken.DELIMITER + ".*");
  2090 + automaton = re.toAutomaton();
  2091 + } else {
  2092 + RegExp re = new RegExp(document.prefix + MtasToken.DELIMITER
  2093 + + document.regexp + "\u0000*");
  2094 + automaton = re.toAutomaton();
  2095 + }
  2096 + compiledAutomaton = new CompiledAutomaton(automaton);
  2097 + listAutomata.add(compiledAutomaton);
2055 2098 } else {
  2099 + automatonMap = MtasToken.createAutomatonMap(document.prefix,
  2100 + new ArrayList<String>(document.list),
  2101 + document.listRegexp ? false : true);
  2102 + byteRunAutomatonMap = MtasToken.byteRunAutomatonMap(automatonMap);
2056 2103 listAutomata = MtasToken.createAutomata(document.prefix,
2057   - document.regexp, new ArrayList<String>(document.list));
  2104 + document.regexp, automatonMap);
  2105 + }
  2106 + List<ByteRunAutomaton> ignoreByteRunAutomatonList = null;
  2107 + if ((document.ignoreRegexp != null)
  2108 + && (!document.ignoreRegexp.isEmpty())) {
  2109 + ignoreByteRunAutomatonList = new ArrayList<ByteRunAutomaton>();
  2110 + RegExp re = new RegExp(document.prefix + MtasToken.DELIMITER
  2111 + + document.ignoreRegexp + "\u0000*");
  2112 + ignoreByteRunAutomatonList
  2113 + .add(new ByteRunAutomaton(re.toAutomaton()));
  2114 + }
  2115 + if (document.ignoreList != null) {
  2116 + if(ignoreByteRunAutomatonList==null) {
  2117 + ignoreByteRunAutomatonList = new ArrayList<ByteRunAutomaton>();
  2118 + }
  2119 + HashMap<String, Automaton> list = MtasToken.createAutomatonMap(
  2120 + document.prefix, new ArrayList<String>(document.ignoreList),
  2121 + document.ignoreListRegexp ? false : true);
  2122 + for (Automaton automaton : list.values()) {
  2123 + ignoreByteRunAutomatonList.add(new ByteRunAutomaton(automaton));
  2124 + }
2058 2125 }
2059 2126  
2060 2127 for (CompiledAutomaton compiledAutomaton : listAutomata) {
2061   -
2062   - termsEnum = t.intersect(compiledAutomaton, null);
2063   - // init
2064   - int initSize = Math.min((int) t.size(), 1000);
2065   - for (int docId : docList) {
2066   - document.statsData.get(docId).initNewList(1);
2067   - if (document.statsList != null) {
2068   - document.statsList.get(docId).initNewList(initSize);
  2128 + if (!compiledAutomaton.type
  2129 + .equals(CompiledAutomaton.AUTOMATON_TYPE.NONE)) {
  2130 + termsEnum = t.intersect(compiledAutomaton, null);
  2131 + // init
  2132 + int initBaseSize = Math.min((int) t.size(), 1000);
  2133 + int initListSize = document.statsList != null
  2134 + ? Math.min(document.statsList.size(), initBaseSize)
  2135 + : initBaseSize;
  2136 + HashSet<MtasDataCollector<?, ?>> initialised = new HashSet<MtasDataCollector<?, ?>>();
  2137 + for (int docId : docList) {
  2138 + document.statsData.get(docId).initNewList(1);
  2139 + initialised.add(document.statsData.get(docId));
  2140 + if (document.statsList != null
  2141 + && document.statsList.size() > 0) {
  2142 + document.statsList.get(docId).initNewList(initListSize);
  2143 + initialised.add(document.statsList.get(docId));
  2144 + }
2069 2145 }
2070   - }
2071   - // fill
2072   - while ((term = termsEnum.next()) != null) {
2073   - Iterator<Integer> docIterator = docList.iterator();
2074   - postingsEnum = termsEnum.postings(postingsEnum,
2075   - PostingsEnum.FREQS);
2076   - int termDocId = -1;
2077   - while (docIterator.hasNext()) {
2078   - int segmentDocId = docIterator.next() - lrc.docBase;
2079   - if (segmentDocId >= termDocId) {
2080   - if ((segmentDocId == termDocId) || ((termDocId = postingsEnum
2081   - .advance(segmentDocId)) == segmentDocId)) {
2082   - // register stats
2083   - document.statsData.get(segmentDocId + lrc.docBase)
2084   - .add(new long[] { postingsEnum.freq() }, 1);
2085   - // register list
2086   - if (document.statsList != null) {
2087   - document.statsList.get(segmentDocId + lrc.docBase).add(
2088   - MtasToken.getPostfixFromValue(term),
2089   - new long[] { postingsEnum.freq() }, 1);
  2146 + // fill
  2147 + int termDocId;
  2148 + boolean acceptedTerm;
  2149 + while ((term = termsEnum.next()) != null) {
  2150 + Iterator<Integer> docIterator = docList.iterator();
  2151 + postingsEnum = termsEnum.postings(postingsEnum,
  2152 + PostingsEnum.FREQS);
  2153 + termDocId = -1;
  2154 + acceptedTerm = true;
  2155 + if(ignoreByteRunAutomatonList!=null) {
  2156 + for(ByteRunAutomaton ignoreByteRunAutomaton : ignoreByteRunAutomatonList) {
  2157 + if(ignoreByteRunAutomaton.run(term.bytes, term.offset, term.length)) {
  2158 + acceptedTerm = false;
  2159 + break;
  2160 + }
  2161 + }
  2162 + }
  2163 + if (acceptedTerm) {
  2164 + while (docIterator.hasNext()) {
  2165 + int segmentDocId = docIterator.next() - lrc.docBase;
  2166 + if (segmentDocId >= termDocId) {
  2167 + if ((segmentDocId == termDocId)
  2168 + || ((termDocId = postingsEnum
  2169 + .advance(segmentDocId)) == segmentDocId)) {
  2170 + // register stats
  2171 + document.statsData.get(segmentDocId + lrc.docBase)
  2172 + .add(new long[] { postingsEnum.freq() }, 1);
  2173 + // register list
  2174 + if (document.statsList != null) {
  2175 + if (automatonMap != null) {
  2176 + MtasDataCollector<?, ?> dataCollector,
  2177 + subSataCollector;
  2178 + for (String key : byteRunAutomatonMap.keySet()) {
  2179 + ByteRunAutomaton bra = byteRunAutomatonMap
  2180 + .get(key);
  2181 + if (bra.run(term.bytes, term.offset,
  2182 + term.length)) {
  2183 + dataCollector = document.statsList
  2184 + .get(segmentDocId + lrc.docBase);
  2185 + subSataCollector = dataCollector.add(key,
  2186 + new long[] { postingsEnum.freq() }, 1);
  2187 + if (document.listExpand
  2188 + && subSataCollector != null) {
  2189 + if (!initialised.contains(subSataCollector)) {
  2190 + subSataCollector.initNewList(initBaseSize);
  2191 + initialised.add(subSataCollector);
  2192 + }
  2193 + subSataCollector.add(
  2194 + MtasToken.getPostfixFromValue(term),
  2195 + new long[] { postingsEnum.freq() }, 1);
  2196 + }
  2197 + }
  2198 + }
  2199 + } else {
  2200 + document.statsList.get(segmentDocId + lrc.docBase)
  2201 + .add(MtasToken.getPostfixFromValue(term),
  2202 + new long[] { postingsEnum.freq() }, 1);
  2203 + }
  2204 + }
  2205 + }
2090 2206 }
2091 2207 }
2092 2208 }
2093 2209 }
2094   - }
2095   - // close
2096   - for (int docId : docList) {
2097   - document.statsData.get(docId).closeNewList();
2098   - if (document.statsList != null) {
2099   - document.statsList.get(docId).closeNewList();
  2210 + // close
  2211 + for (MtasDataCollector<?, ?> item : initialised) {
  2212 + item.closeNewList();
2100 2213 }
  2214 + initialised.clear();
2101 2215 }
2102 2216 }
2103 2217 }
... ... @@ -2702,12 +2816,16 @@ public class CodecCollector {
2702 2816 }
2703 2817  
2704 2818 List<CompiledAutomaton> listAutomata;
  2819 + HashMap<String, Automaton> automatonMap;
2705 2820 if (termVector.list == null) {
  2821 + automatonMap = null;
2706 2822 listAutomata = new ArrayList<CompiledAutomaton>();
2707 2823 listAutomata.add(termVector.compiledAutomaton);
2708 2824 } else {
  2825 + automatonMap = MtasToken.createAutomatonMap(termVector.prefix,
  2826 + new ArrayList<String>(termVector.list), true);
2709 2827 listAutomata = MtasToken.createAutomata(termVector.prefix,
2710   - termVector.regexp, new ArrayList<String>(termVector.list));
  2828 + termVector.regexp, automatonMap);
2711 2829 }
2712 2830  
2713 2831 for (CompiledAutomaton compiledAutomaton : listAutomata) {
... ... @@ -3055,9 +3173,12 @@ public class CodecCollector {
3055 3173 HashSet<String> recomputeKeyList = termVector.subComponentFunction.dataCollector.segmentRecomputeKeyList
3056 3174 .get(segmentName);
3057 3175 if (recomputeKeyList.size() > 0) {
  3176 + HashMap<String, Automaton> automatonMap = MtasToken
  3177 + .createAutomatonMap(termVector.prefix,
  3178 + new ArrayList<String>(termVector.list), true);
3058 3179 List<CompiledAutomaton> listCompiledAutomata = MtasToken
3059 3180 .createAutomata(termVector.prefix, termVector.regexp,
3060   - new ArrayList<String>(recomputeKeyList));
  3181 + automatonMap);
3061 3182 for (CompiledAutomaton compiledAutomaton : listCompiledAutomata) {
3062 3183 termsEnum = t.intersect(compiledAutomaton, null);
3063 3184 termVector.subComponentFunction.dataCollector.initNewList(
... ...
src/mtas/codec/util/CodecComponent.java
... ... @@ -265,10 +265,14 @@ public class CodecComponent {
265 265 public static class ComponentDocument {
266 266  
267 267 /** The regexp. */
268   - public String key, prefix, regexp;
  268 + public String key, prefix, regexp, ignoreRegexp;
269 269  
270 270 /** The list. */
271   - public HashSet<String> list;
  271 + public HashSet<String> list, ignoreList;
  272 +
  273 + public boolean listRegexp, listExpand, ignoreListRegexp;
  274 +
  275 + public int listExpandNumber;
272 276  
273 277 /** The stats type. */
274 278 public String dataType, statsType;
... ... @@ -276,11 +280,8 @@ public class CodecComponent {
276 280 /** The stats items. */
277 281 public TreeSet<String> statsItems;
278 282  
279   - /** The compiled automaton. */
280   - public CompiledAutomaton compiledAutomaton;
281   -
282 283 /** The number. */
283   - public int number;
  284 + public int listNumber;
284 285  
285 286 /** The unique key. */
286 287 public HashMap<Integer, String> uniqueKey;
... ... @@ -290,7 +291,7 @@ public class CodecComponent {
290 291  
291 292 /** The list. */
292 293 public HashMap<Integer, MtasDataCollector<?, ?>> statsList;
293   -
  294 +
294 295 /**
295 296 * Instantiates a new component document.
296 297 *
... ... @@ -302,30 +303,40 @@ public class CodecComponent {
302 303 * @throws IOException Signals that an I/O exception has occurred.
303 304 */
304 305 public ComponentDocument(String key, String prefix, String statsType,
305   - String regexp, String[] list, int number) throws IOException {
  306 + String regexp, String[] list, int listNumber, Boolean listRegexp, Boolean listExpand, int listExpandNumber, String ignoreRegexp, String[] ignoreList, Boolean ignoreListRegexp) throws IOException {
306 307 this.key = key;
307 308 this.prefix = prefix;
308 309 this.regexp = regexp;
309 310 if (list != null && list.length > 0) {
310 311 this.list = new HashSet(Arrays.asList(list));
  312 + this.listRegexp = listRegexp!=null?listRegexp:false;
  313 + this.listExpand = (listExpand!=null && listExpandNumber>0)?listExpand:false;
  314 + if(this.listExpand) {
  315 + this.listExpandNumber = listExpandNumber;
  316 + } else {
  317 + this.listExpandNumber = 0;
  318 + }
311 319 } else {
312 320 this.list = null;
  321 + this.listRegexp = false;
  322 + this.listExpand = false;
  323 + this.listExpandNumber = 0;
  324 + }
  325 + this.ignoreRegexp = ignoreRegexp;
  326 + if (ignoreList != null && ignoreList.length > 0) {
  327 + this.ignoreList = new HashSet(Arrays.asList(ignoreList));
  328 + this.ignoreListRegexp = ignoreListRegexp!=null?ignoreListRegexp:false;
  329 + } else {
  330 + this.ignoreList = null;
  331 + this.ignoreListRegexp = false;
313 332 }
314   - this.number = number;
  333 + this.listNumber = listNumber;
315 334 uniqueKey = new HashMap<Integer, String>();
316 335 dataType = CodecUtil.DATA_TYPE_LONG;
317 336 statsItems = CodecUtil.createStatsItems(statsType);
318   - this.statsType = CodecUtil.createStatsType(statsItems, null, null);
319   - if ((regexp == null) || (regexp.isEmpty())) {
320   - RegExp re = new RegExp(prefix + MtasToken.DELIMITER + ".*");
321   - compiledAutomaton = new CompiledAutomaton(re.toAutomaton());
322   - } else {
323   - RegExp re = new RegExp(
324   - prefix + MtasToken.DELIMITER + regexp + "\u0000*");
325   - compiledAutomaton = new CompiledAutomaton(re.toAutomaton());
326   - }
  337 + this.statsType = CodecUtil.createStatsType(statsItems, null, null);
327 338 this.statsData = new HashMap<Integer, MtasDataCollector<?, ?>>();
328   - if (this.number > 0) {
  339 + if (this.listNumber > 0) {
329 340 this.statsList = new HashMap<Integer, MtasDataCollector<?, ?>>();
330 341 } else {
331 342 this.statsList = null;
... ...
src/mtas/search/spans/MtasSpanIntersectingQuery.java
1 1 package mtas.search.spans;
2 2  
3 3 import java.io.IOException;
  4 +import java.util.ArrayList;
  5 +import java.util.List;
  6 +import java.util.Map;
  7 +import java.util.Set;
4 8  
  9 +import org.apache.lucene.index.LeafReaderContext;
  10 +import org.apache.lucene.index.Term;
  11 +import org.apache.lucene.index.TermContext;
  12 +import org.apache.lucene.index.Terms;
5 13 import org.apache.lucene.search.IndexSearcher;
6   -import org.apache.lucene.search.spans.SpanQuery;
7 14 import org.apache.lucene.search.spans.SpanWeight;
8   -
  15 +import org.apache.lucene.search.spans.Spans;
9 16 import mtas.search.spans.util.MtasSpanQuery;
10 17  
11 18 public class MtasSpanIntersectingQuery extends MtasSpanQuery {
12 19  
13   - public MtasSpanIntersectingQuery(SpanQuery q1, SpanQuery q2) {
14   - super();
  20 + private String field;
  21 +
  22 + private MtasSpanQuery q1, q2;
  23 +
  24 + public MtasSpanIntersectingQuery(MtasSpanQuery q1, MtasSpanQuery q2) {
  25 + if (q1 != null) {
  26 + field = q1.getField();
  27 + if (q2 != null && !q2.getField().equals(field)) {
  28 + throw new IllegalArgumentException("Clauses must have same field.");
  29 + }
  30 + } else if (q2 != null) {
  31 + field = q2.getField();
  32 + } else {
  33 + field = null;
  34 + }
  35 + this.q1 = q1;
  36 + this.q2 = q2;
15 37 }
16   -
  38 +
17 39 @Override
18 40 public String getField() {
19   - // TODO Auto-generated method stub
20   - return null;
  41 + return field;
21 42 }
22 43  
23 44 @Override
24 45 public SpanWeight createWeight(IndexSearcher searcher, boolean needsScores)
25 46 throws IOException {
26   - // TODO Auto-generated method stub
27   - return null;
  47 + if (q1 == null || q2 == null) {
  48 + return null;
  49 + } else {
  50 + MtasSpanIntersectingQueryWeight w1 = new MtasSpanIntersectingQueryWeight(
  51 + q1.createWeight(searcher, needsScores));
  52 + MtasSpanIntersectingQueryWeight w2 = new MtasSpanIntersectingQueryWeight(
  53 + q2.createWeight(searcher, needsScores));
  54 + //subWeights
  55 + List<MtasSpanIntersectingQueryWeight> subWeights = new ArrayList<MtasSpanIntersectingQueryWeight>();
  56 + subWeights.add(w1);
  57 + subWeights.add(w2);
  58 + //return
  59 + return new SpanIntersectingWeight(w1, w2, searcher, needsScores ? getTermContexts(subWeights) : null);
  60 + }
  61 + }
  62 +
  63 + protected Map<Term, TermContext> getTermContexts(
  64 + List<MtasSpanIntersectingQueryWeight> items) {
  65 + List<SpanWeight> weights = new ArrayList<SpanWeight>();
  66 + for (MtasSpanIntersectingQueryWeight item : items) {
  67 + weights.add(item.spanWeight);
  68 + }
  69 + return getTermContexts(weights);
28 70 }
29 71  
30 72 @Override
31 73 public String toString(String field) {
32   - // TODO Auto-generated method stub
33   - return null;
  74 + StringBuilder buffer = new StringBuilder();
  75 + buffer.append(this.getClass().getSimpleName() + "([");
  76 + if (q1 != null) {
  77 + buffer.append(q1.toString(q1.getField()));
  78 + } else {
  79 + buffer.append("null");
  80 + }
  81 + buffer.append(",");
  82 + if (q2 != null) {
  83 + buffer.append(q2.toString(q2.getField()));
  84 + } else {
  85 + buffer.append("null");
  86 + }
  87 + buffer.append("])");
  88 + return buffer.toString();
34 89 }
35 90  
36 91 @Override
37 92 public boolean equals(Object obj) {
38   - // TODO Auto-generated method stub
39   - return false;
  93 + if (this == obj)
  94 + return true;
  95 + if (obj == null)
  96 + return false;
  97 + if (getClass() != obj.getClass())
  98 + return false;
  99 + final MtasSpanIntersectingQuery other = (MtasSpanIntersectingQuery) obj;
  100 + return q1.equals(other.q1) && q2.equals(other.q2);
40 101 }
41 102  
42 103 @Override
43 104 public int hashCode() {
44   - // TODO Auto-generated method stub
45   - return 0;
  105 + int h = this.getClass().getSimpleName().hashCode();
  106 + if (q1 != null) {
  107 + h = (h * 7) ^ q1.hashCode();
  108 + }
  109 + if (q2 != null) {
  110 + h = (h * 11) ^ q2.hashCode();
  111 + }
  112 + return h;
  113 + }
  114 +
  115 + public class SpanIntersectingWeight extends SpanWeight {
  116 +
  117 + MtasSpanIntersectingQueryWeight w1,w2;
  118 +
  119 + public SpanIntersectingWeight(MtasSpanIntersectingQueryWeight w1, MtasSpanIntersectingQueryWeight w2, IndexSearcher searcher,
  120 + Map<Term, TermContext> terms) throws IOException {
  121 + super(MtasSpanIntersectingQuery.this, searcher, terms);
  122 + this.w1=w1;
  123 + this.w2=w2;
  124 + }
  125 +
  126 + @Override
  127 + public void extractTermContexts(Map<Term, TermContext> contexts) {
  128 + w1.spanWeight.extractTermContexts(contexts);
  129 + w2.spanWeight.extractTermContexts(contexts);
  130 + }
  131 +
  132 + @Override
  133 + public Spans getSpans(LeafReaderContext context, Postings requiredPostings)
  134 + throws IOException {
  135 + Terms terms = context.reader().terms(field);
  136 + if (terms == null) {
  137 + return null; // field does not exist
  138 + }
  139 + MtasSpanIntersectingQuerySpans s1 = new MtasSpanIntersectingQuerySpans(w1.spanWeight.getSpans(context, requiredPostings));
  140 + MtasSpanIntersectingQuerySpans s2 = new MtasSpanIntersectingQuerySpans(w2.spanWeight.getSpans(context, requiredPostings));
  141 + return new MtasSpanIntersectingSpans(MtasSpanIntersectingQuery.this,
  142 + s1, s2);
  143 + }
  144 +
  145 + @Override
  146 + public void extractTerms(Set<Term> terms) {
  147 + w1.spanWeight.extractTerms(terms);
  148 + w2.spanWeight.extractTerms(terms);
  149 + }
  150 +
  151 + }
  152 +
  153 + public class MtasSpanIntersectingQuerySpans {
  154 + public Spans spans;
  155 +
  156 + public MtasSpanIntersectingQuerySpans(Spans spans) {
  157 + this.spans = spans;
  158 + }
  159 +
  160 + }
  161 +
  162 + public class MtasSpanIntersectingQueryWeight {
  163 +
  164 + /** The span weight. */
  165 + public SpanWeight spanWeight;
  166 +
  167 + public MtasSpanIntersectingQueryWeight(SpanWeight spanWeight) {
  168 + this.spanWeight = spanWeight;
  169 + }
46 170 }
47 171  
48 172 }
... ...
src/mtas/search/spans/MtasSpanIntersectingSpans.java 0 → 100644
  1 +package mtas.search.spans;
  2 +
  3 +import java.io.IOException;
  4 +import java.util.List;
  5 +
  6 +import org.apache.lucene.search.spans.SpanCollector;
  7 +import org.apache.lucene.search.spans.Spans;
  8 +
  9 +import mtas.search.spans.MtasSpanIntersectingQuery.MtasSpanIntersectingQuerySpans;
  10 +import mtas.search.spans.util.MtasSpans;
  11 +
  12 +public class MtasSpanIntersectingSpans extends Spans implements MtasSpans {
  13 +
  14 + private MtasSpanIntersectingQuerySpans spans1, spans2;
  15 +
  16 + private int docId;
  17 +
  18 + public MtasSpanIntersectingSpans(MtasSpanIntersectingQuery mtasSpanIntersectingQuery,
  19 + MtasSpanIntersectingQuerySpans spans1, MtasSpanIntersectingQuerySpans spans2) {
  20 + super();
  21 + docId = -1;
  22 + this.spans1 = spans1;
  23 + this.spans2 = spans2;
  24 + }
  25 +
  26 + @Override
  27 + public void collect(SpanCollector collector) throws IOException {
  28 + spans1.spans.collect(collector);
  29 + spans2.spans.collect(collector);
  30 + }
  31 +
  32 + @Override
  33 + public int endPosition() {
  34 + return NO_MORE_POSITIONS;
  35 + }
  36 +
  37 + @Override
  38 + public int nextStartPosition() throws IOException {
  39 + return NO_MORE_POSITIONS;
  40 + }
  41 +
  42 + @Override
  43 + public float positionsCost() {
  44 + return 0;
  45 + }
  46 +
  47 + @Override
  48 + public int startPosition() {
  49 + return NO_MORE_POSITIONS;
  50 + }
  51 +
  52 + @Override
  53 + public int width() {
  54 + return 0;
  55 + }
  56 +
  57 + @Override
  58 + public int advance(int target) throws IOException {
  59 + return NO_MORE_POSITIONS;
  60 + }
  61 +
  62 + @Override
  63 + public long cost() {
  64 + return 0;
  65 + }
  66 +
  67 + @Override
  68 + public int docID() {
  69 + return NO_MORE_DOCS;
  70 + }
  71 +
  72 + @Override
  73 + public int nextDoc() throws IOException {
  74 + return NO_MORE_DOCS;
  75 + }
  76 +
  77 +}
... ...
src/mtas/search/spans/MtasSpanMatchAllQuery.java
... ... @@ -2,19 +2,23 @@ package mtas.search.spans;
2 2  
3 3 import java.io.IOException;
4 4 import java.lang.reflect.Method;
  5 +import java.util.Collections;
5 6 import java.util.Map;
6 7 import java.util.Set;
7 8 import mtas.codec.util.CodecInfo;
8 9 import mtas.search.similarities.MtasSimScorer;
9 10 import mtas.search.spans.util.MtasSpanQuery;
  11 +import mtas.search.spans.util.MtasExtendedSpanTermQuery.SpanTermWeight;
10 12  
11 13 import org.apache.lucene.codecs.FieldsProducer;
  14 +import org.apache.lucene.index.IndexReaderContext;
12 15 import org.apache.lucene.index.LeafReader;
13 16 import org.apache.lucene.index.LeafReaderContext;
14 17 import org.apache.lucene.index.Term;
15 18 import org.apache.lucene.index.TermContext;
16 19 import org.apache.lucene.index.Terms;
17 20 import org.apache.lucene.search.IndexSearcher;
  21 +import org.apache.lucene.search.similarities.Similarity;
18 22 import org.apache.lucene.search.similarities.Similarity.SimScorer;
19 23 import org.apache.lucene.search.spans.SpanWeight;
20 24 import org.apache.lucene.search.spans.Spans;
... ... @@ -56,14 +60,17 @@ public class MtasSpanMatchAllQuery extends MtasSpanQuery {
56 60 */
57 61 @Override
58 62 public SpanWeight createWeight(IndexSearcher searcher, boolean needsScores)
59   - throws IOException {
60   - return new SpanAllWeight(searcher, null);
  63 + throws IOException {
  64 + //keep things simple
  65 + return new SpanAllWeight(searcher, null);
61 66 }
62 67  
63 68 /**
64 69 * The Class SpanAllWeight.
65 70 */
66 71 public class SpanAllWeight extends SpanWeight {
  72 +
  73 + IndexSearcher searcher;
67 74  
68 75 /**
69 76 * Instantiates a new span all weight.
... ... @@ -78,6 +85,7 @@ public class MtasSpanMatchAllQuery extends MtasSpanQuery {
78 85 public SpanAllWeight(IndexSearcher searcher,
79 86 Map<Term, TermContext> termContexts) throws IOException {
80 87 super(MtasSpanMatchAllQuery.this, searcher, termContexts);
  88 + this.searcher = searcher;
81 89 }
82 90  
83 91 /*
... ... @@ -89,6 +97,15 @@ public class MtasSpanMatchAllQuery extends MtasSpanQuery {
89 97 */
90 98 @Override
91 99 public void extractTermContexts(Map<Term, TermContext> contexts) {
  100 + Term term = new Term(field);
  101 + if(!contexts.containsKey(term)) {
  102 + IndexReaderContext topContext = searcher.getTopReaderContext();
  103 + try {
  104 + contexts.put(term, TermContext.build(topContext, term));
  105 + } catch (IOException e) {
  106 + //fail
  107 + }
  108 + }
92 109 }
93 110  
94 111 /*
... ... @@ -199,6 +216,6 @@ public class MtasSpanMatchAllQuery extends MtasSpanQuery {
199 216 int h = this.getClass().getSimpleName().hashCode();
200 217 h = (h * 7) ^ field.hashCode();
201 218 return h;
202   - }
  219 + }
203 220  
204 221 }
... ...
src/mtas/search/spans/MtasSpanSequenceSpans.java
... ... @@ -19,7 +19,7 @@ import org.apache.lucene.search.spans.Spans;
19 19 * The Class MtasSpanSequenceSpans.
20 20 */
21 21 public class MtasSpanSequenceSpans extends Spans implements MtasSpans {
22   -
  22 +
23 23 /** The queue spans. */
24 24 private List<QueueItem> queueSpans;
25 25  
... ... @@ -123,6 +123,9 @@ public class MtasSpanSequenceSpans extends Spans implements MtasSpans {
123 123 */
124 124 @Override
125 125 public void collect(SpanCollector collector) throws IOException {
  126 + for(QueueItem item : queueSpans) {
  127 + item.sequenceSpans.spans.collect(collector);
  128 + }
126 129 }
127 130  
128 131 /*
... ...
src/mtas/search/spans/MtasSpanWithinQuery.java
... ... @@ -4,7 +4,6 @@ import java.io.IOException;
4 4  
5 5 import org.apache.lucene.index.IndexReader;
6 6 import org.apache.lucene.search.IndexSearcher;
7   -import org.apache.lucene.search.spans.SpanContainingQuery;
8 7 import org.apache.lucene.search.spans.SpanQuery;
9 8 import org.apache.lucene.search.spans.SpanWeight;
10 9 import org.apache.lucene.search.spans.SpanWithinQuery;
... ... @@ -35,7 +34,9 @@ public class MtasSpanWithinQuery extends MtasSpanQuery {
35 34 @Override
36 35 public SpanWeight createWeight(IndexSearcher searcher, boolean needsScores)
37 36 throws IOException {
38   - return baseQuery.createWeight(searcher, needsScores);
  37 + SpanWeight sw = baseQuery.createWeight(searcher, needsScores);
  38 + return sw;
  39 + //return baseQuery.createWeight(searcher, needsScores);
39 40 }
40 41  
41 42 @Override
... ...
src/mtas/solr/handler/component/util/MtasSolrComponentDocument.java
... ... @@ -2,7 +2,9 @@ package mtas.solr.handler.component.util;
2 2  
3 3 import java.io.IOException;
4 4 import java.util.ArrayList;
  5 +import java.util.HashMap;
5 6 import java.util.Set;
  7 +import java.util.TreeSet;
6 8  
7 9 import org.apache.solr.common.util.NamedList;
8 10 import org.apache.solr.common.util.SimpleOrderedMap;
... ... @@ -45,6 +47,18 @@ public class MtasSolrComponentDocument {
45 47  
46 48 /** The Constant NAME_MTAS_DOCUMENT_REGEXP. */
47 49 public static final String NAME_MTAS_DOCUMENT_LIST = "list";
  50 +
  51 + public static final String NAME_MTAS_DOCUMENT_LIST_REGEXP = "listRegexp";
  52 +
  53 + public static final String NAME_MTAS_DOCUMENT_LIST_EXPAND = "listExpand";
  54 +
  55 + public static final String NAME_MTAS_DOCUMENT_LIST_EXPAND_NUMBER = "listExpandNumber";
  56 +
  57 + public static final String NAME_MTAS_DOCUMENT_IGNORE_REGEXP = "ignoreRegexp";
  58 +
  59 + public static final String NAME_MTAS_DOCUMENT_IGNORE_LIST = "ignoreList";
  60 +
  61 + public static final String NAME_MTAS_DOCUMENT_IGNORE_LIST_REGEXP = "ignoreListRegexp";
48 62  
49 63 /** The Constant NAME_MTAS_DOCUMENT_NUMBER. */
50 64 public static final String NAME_MTAS_DOCUMENT_NUMBER = "number";
... ... @@ -77,7 +91,13 @@ public class MtasSolrComponentDocument {
77 91 String[] types = new String[ids.size()];
78 92 String[] regexps = new String[ids.size()];
79 93 String[] lists = new String[ids.size()];
80   - String[] numbers = new String[ids.size()];
  94 + Boolean[] listRegexps = new Boolean[ids.size()];
  95 + Boolean[] listExpands = new Boolean[ids.size()];
  96 + int[] listExpandNumbers = new int[ids.size()];
  97 + String[] ignoreRegexps = new String[ids.size()];
  98 + String[] ignoreLists = new String[ids.size()];
  99 + Boolean[] ignoreListRegexps = new Boolean[ids.size()];
  100 + String[] listNumbers = new String[ids.size()];
81 101 for (String id : ids) {
82 102 fields[tmpCounter] = rb.req.getParams().get(
83 103 PARAM_MTAS_DOCUMENT + "." + id + "." + NAME_MTAS_DOCUMENT_FIELD,
... ... @@ -98,7 +118,25 @@ public class MtasSolrComponentDocument {
98 118 lists[tmpCounter] = rb.req.getParams().get(
99 119 PARAM_MTAS_DOCUMENT + "." + id + "." + NAME_MTAS_DOCUMENT_LIST,
100 120 null);
101   - numbers[tmpCounter] = rb.req.getParams().get(
  121 + listRegexps[tmpCounter] = rb.req.getParams().getBool(
  122 + PARAM_MTAS_DOCUMENT + "." + id + "." + NAME_MTAS_DOCUMENT_LIST_REGEXP,
  123 + false);
  124 + listExpands[tmpCounter] = rb.req.getParams().getBool(
  125 + PARAM_MTAS_DOCUMENT + "." + id + "." + NAME_MTAS_DOCUMENT_LIST_EXPAND,
  126 + false);
  127 + listExpandNumbers[tmpCounter] = rb.req.getParams().getInt(
  128 + PARAM_MTAS_DOCUMENT + "." + id + "." + NAME_MTAS_DOCUMENT_LIST_EXPAND_NUMBER,
  129 + 10);
  130 + ignoreRegexps[tmpCounter] = rb.req.getParams().get(
  131 + PARAM_MTAS_DOCUMENT + "." + id + "." + NAME_MTAS_DOCUMENT_IGNORE_REGEXP,
  132 + null);
  133 + ignoreLists[tmpCounter] = rb.req.getParams().get(
  134 + PARAM_MTAS_DOCUMENT + "." + id + "." + NAME_MTAS_DOCUMENT_IGNORE_LIST,
  135 + null);
  136 + ignoreListRegexps[tmpCounter] = rb.req.getParams().getBool(
  137 + PARAM_MTAS_DOCUMENT + "." + id + "." + NAME_MTAS_DOCUMENT_IGNORE_LIST_REGEXP,
  138 + false);
  139 + listNumbers[tmpCounter] = rb.req.getParams().get(
102 140 PARAM_MTAS_DOCUMENT + "." + id + "." + NAME_MTAS_DOCUMENT_NUMBER,
103 141 null);
104 142 tmpCounter++;
... ... @@ -123,7 +161,11 @@ public class MtasSolrComponentDocument {
123 161 NAME_MTAS_DOCUMENT_REGEXP, NAME_MTAS_DOCUMENT_FIELD, false);
124 162 MtasSolrResultUtil.compareAndCheck(lists, fields,
125 163 NAME_MTAS_DOCUMENT_LIST, NAME_MTAS_DOCUMENT_FIELD, false);
126   - MtasSolrResultUtil.compareAndCheck(numbers, fields,
  164 + MtasSolrResultUtil.compareAndCheck(ignoreRegexps, fields,
  165 + NAME_MTAS_DOCUMENT_IGNORE_REGEXP, NAME_MTAS_DOCUMENT_FIELD, false);
  166 + MtasSolrResultUtil.compareAndCheck(ignoreLists, fields,
  167 + NAME_MTAS_DOCUMENT_IGNORE_LIST, NAME_MTAS_DOCUMENT_FIELD, false);
  168 + MtasSolrResultUtil.compareAndCheck(listNumbers, fields,
127 169 NAME_MTAS_DOCUMENT_NUMBER, NAME_MTAS_DOCUMENT_FIELD, false);
128 170 for (int i = 0; i < fields.length; i++) {
129 171 String key = (keys[i] == null) || (keys[i].isEmpty())
... ... @@ -133,6 +175,9 @@ public class MtasSolrComponentDocument {
133 175 String type = types[i];
134 176 String regexp = regexps[i];
135 177 String[] list = null;
  178 + Boolean listRegexp = listRegexps[i];
  179 + Boolean listExpand = listExpands[i];
  180 + int listExpandNumber = listExpandNumbers[i];
136 181 if(lists[i]!=null) {
137 182 ArrayList<String> tmpList = new ArrayList<String>();
138 183 String[] subList = lists[i].split("(?<!\\\\),");
... ... @@ -141,10 +186,21 @@ public class MtasSolrComponentDocument {
141 186 }
142 187 list = tmpList.toArray(new String[tmpList.size()]);
143 188 }
144   - int number = Math.max(0, (numbers[i] == null) || (numbers[i].isEmpty())
145   - ? 0 : Integer.parseInt(numbers[i]));
  189 + int listNumber = Math.max(0, (listNumbers[i] == null) || (listNumbers[i].isEmpty())
  190 + ? 0 : Integer.parseInt(listNumbers[i]));
  191 + String ignoreRegexp = ignoreRegexps[i];
  192 + String[] ignoreList = null;
  193 + Boolean ignoreListRegexp = ignoreListRegexps[i];
  194 + if(ignoreLists[i]!=null) {
  195 + ArrayList<String> tmpList = new ArrayList<String>();
  196 + String[] subList = ignoreLists[i].split("(?<!\\\\),");
  197 + for(int j=0; j<subList.length; j++) {
  198 + tmpList.add(subList[j].replace("\\,", ",").replace("\\\\", "\\"));
  199 + }
  200 + ignoreList = tmpList.toArray(new String[tmpList.size()]);
  201 + }
146 202 mtasFields.list.get(fields[i]).documentList
147   - .add(new ComponentDocument(key, prefix, type, regexp, list, number));
  203 + .add(new ComponentDocument(key, prefix, type, regexp, list, listNumber, listRegexp, listExpand, listExpandNumber, ignoreRegexp, ignoreList, ignoreListRegexp));
148 204 }
149 205 }
150 206 }
... ... @@ -165,6 +221,7 @@ public class MtasSolrComponentDocument {
165 221 NamedList<Object> mtasDocumentItemResponse = new SimpleOrderedMap<>();
166 222 MtasDataCollector<?, ?> stats = document.statsData.get(docId);
167 223 MtasDataCollector<?, ?> list = null;
  224 + HashMap<String, MtasDataCollector<?, ?>> expandedList = null;
168 225 if (document.statsList != null) {
169 226 list = document.statsList.get(docId);
170 227 }
... ... @@ -172,9 +229,15 @@ public class MtasSolrComponentDocument {
172 229 stats.getDataType(), stats.getStatsType(), stats.statsItems, null));
173 230 mtasDocumentItemResponse.add("documentKey",
174 231 document.uniqueKey.get(docId));
175   - if (list != null) {
176   - mtasDocumentItemResponse.add("list", new MtasSolrResult(list,
177   - list.getDataType(), list.getStatsType(), list.statsItems, null));
  232 + if (list != null) {
  233 + if(document.listExpand) {
  234 + mtasDocumentItemResponse.add("list", new MtasSolrResult(list,
  235 + new String[] { list.getDataType(), list.getDataType()}, new String[] {list.getStatsType(), list.getStatsType()}, new TreeSet[] {list.statsItems, list.statsItems}, new String[] {null, null}, new String[] {null, null}, new Integer[] { 0 , 0}, new Integer[] { 1 , 1}, null));
  236 + } else {
  237 + mtasDocumentItemResponse.add("list", new MtasSolrResult(list,
  238 + list.getDataType(), list.getStatsType(), list.statsItems, null));
  239 + }
  240 +
178 241 }
179 242 // add
180 243 mtasDocumentItemResponses.add(mtasDocumentItemResponse);
... ...