Commit 1d031eeefd184d632ab5bb8fcb6bba261a798b7c
1 parent
47700394
document/intersecting
Showing
10 changed files
with
620 additions
and
142 deletions
docker/Dockerfile
1 | 1 | # Automatically generated Dockerfile |
2 | -# - Build 2017-01-31 12:02 | |
2 | +# - Build 2017-02-07 07:29 | |
3 | 3 | # - Lucene/Solr version 6.4.0 |
4 | 4 | # - Mtas release 20170131 |
5 | 5 | # |
... | ... | @@ -55,7 +55,7 @@ RUN apt-get update && apt-get install -y lsof software-properties-common python- |
55 | 55 | && chmod -R 755 /var/www/html \ |
56 | 56 | && printf "echo\n" >> /start.sh \ |
57 | 57 | && printf "echo \"================ Mtas -- Multi Tier Annotation Search =================\"\n" >> /start.sh \ |
58 | -&& printf "echo \" Timestamp 2017-01-31 12:02\"\n" >> /start.sh \ | |
58 | +&& printf "echo \" Timestamp 2017-02-07 07:29\"\n" >> /start.sh \ | |
59 | 59 | && printf "echo \" Lucene/Solr version 6.4.0\"\n" >> /start.sh \ |
60 | 60 | && printf "echo \" Mtas release 20170131\"\n" >> /start.sh \ |
61 | 61 | && printf "echo \" See https://meertensinstituut.github.io/mtas/ for more information\"\n" >> /start.sh \ |
... | ... |
src/mtas/analysis/token/MtasToken.java
... | ... | @@ -4,7 +4,10 @@ import java.io.IOException; |
4 | 4 | import java.io.UnsupportedEncodingException; |
5 | 5 | import java.util.ArrayList; |
6 | 6 | import java.util.Arrays; |
7 | +import java.util.Collection; | |
8 | +import java.util.HashMap; | |
7 | 9 | import java.util.List; |
10 | +import java.util.Set; | |
8 | 11 | import java.util.TreeSet; |
9 | 12 | import java.util.regex.Matcher; |
10 | 13 | import java.util.regex.Pattern; |
... | ... | @@ -13,6 +16,7 @@ import org.apache.commons.lang.ArrayUtils; |
13 | 16 | import org.apache.lucene.analysis.payloads.PayloadHelper; |
14 | 17 | import org.apache.lucene.util.BytesRef; |
15 | 18 | import org.apache.lucene.util.automaton.Automaton; |
19 | +import org.apache.lucene.util.automaton.ByteRunAutomaton; | |
16 | 20 | import org.apache.lucene.util.automaton.CompiledAutomaton; |
17 | 21 | import org.apache.lucene.util.automaton.Operations; |
18 | 22 | import org.apache.lucene.util.automaton.RegExp; |
... | ... | @@ -21,7 +25,8 @@ import org.apache.lucene.util.automaton.TooComplexToDeterminizeException; |
21 | 25 | /** |
22 | 26 | * The Class MtasToken. |
23 | 27 | * |
24 | - * @param <GenericType> the generic type | |
28 | + * @param <GenericType> | |
29 | + * the generic type | |
25 | 30 | */ |
26 | 31 | public abstract class MtasToken<GenericType> { |
27 | 32 | |
... | ... | @@ -81,8 +86,10 @@ public abstract class MtasToken<GenericType> { |
81 | 86 | /** |
82 | 87 | * Instantiates a new mtas token. |
83 | 88 | * |
84 | - * @param tokenId the token id | |
85 | - * @param value the value | |
89 | + * @param tokenId | |
90 | + * the token id | |
91 | + * @param value | |
92 | + * the value | |
86 | 93 | */ |
87 | 94 | protected MtasToken(Integer tokenId, String value) { |
88 | 95 | this.tokenId = tokenId; |
... | ... | @@ -93,9 +100,12 @@ public abstract class MtasToken<GenericType> { |
93 | 100 | /** |
94 | 101 | * Instantiates a new mtas token. |
95 | 102 | * |
96 | - * @param tokenId the token id | |
97 | - * @param value the value | |
98 | - * @param position the position | |
103 | + * @param tokenId | |
104 | + * the token id | |
105 | + * @param value | |
106 | + * the value | |
107 | + * @param position | |
108 | + * the position | |
99 | 109 | */ |
100 | 110 | protected MtasToken(Integer tokenId, String value, Integer position) { |
101 | 111 | this(tokenId, value); |
... | ... | @@ -105,7 +115,8 @@ public abstract class MtasToken<GenericType> { |
105 | 115 | /** |
106 | 116 | * Sets the token ref. |
107 | 117 | * |
108 | - * @param ref the new token ref | |
118 | + * @param ref | |
119 | + * the new token ref | |
109 | 120 | */ |
110 | 121 | final public void setTokenRef(Long ref) { |
111 | 122 | tokenRef = ref; |
... | ... | @@ -123,7 +134,8 @@ public abstract class MtasToken<GenericType> { |
123 | 134 | /** |
124 | 135 | * Sets the term ref. |
125 | 136 | * |
126 | - * @param ref the new term ref | |
137 | + * @param ref | |
138 | + * the new term ref | |
127 | 139 | */ |
128 | 140 | final public void setTermRef(Long ref) { |
129 | 141 | termRef = ref; |
... | ... | @@ -141,7 +153,8 @@ public abstract class MtasToken<GenericType> { |
141 | 153 | /** |
142 | 154 | * Sets the prefix id. |
143 | 155 | * |
144 | - * @param id the new prefix id | |
156 | + * @param id | |
157 | + * the new prefix id | |
145 | 158 | */ |
146 | 159 | final public void setPrefixId(int id) { |
147 | 160 | prefixId = id; |
... | ... | @@ -151,7 +164,8 @@ public abstract class MtasToken<GenericType> { |
151 | 164 | * Gets the prefix id. |
152 | 165 | * |
153 | 166 | * @return the prefix id |
154 | - * @throws IOException Signals that an I/O exception has occurred. | |
167 | + * @throws IOException | |
168 | + * Signals that an I/O exception has occurred. | |
155 | 169 | */ |
156 | 170 | final public int getPrefixId() throws IOException { |
157 | 171 | if (prefixId != null) { |
... | ... | @@ -164,7 +178,8 @@ public abstract class MtasToken<GenericType> { |
164 | 178 | /** |
165 | 179 | * Sets the id. |
166 | 180 | * |
167 | - * @param id the new id | |
181 | + * @param id | |
182 | + * the new id | |
168 | 183 | */ |
169 | 184 | final public void setId(Integer id) { |
170 | 185 | tokenId = id; |
... | ... | @@ -182,7 +197,8 @@ public abstract class MtasToken<GenericType> { |
182 | 197 | /** |
183 | 198 | * Sets the parent id. |
184 | 199 | * |
185 | - * @param id the new parent id | |
200 | + * @param id | |
201 | + * the new parent id | |
186 | 202 | */ |
187 | 203 | final public void setParentId(Integer id) { |
188 | 204 | tokenParentId = id; |
... | ... | @@ -200,7 +216,8 @@ public abstract class MtasToken<GenericType> { |
200 | 216 | /** |
201 | 217 | * Sets the provide parent id. |
202 | 218 | * |
203 | - * @param provide the new provide parent id | |
219 | + * @param provide | |
220 | + * the new provide parent id | |
204 | 221 | */ |
205 | 222 | final public void setProvideParentId(Boolean provide) { |
206 | 223 | provideParentId = provide; |
... | ... | @@ -234,7 +251,8 @@ public abstract class MtasToken<GenericType> { |
234 | 251 | /** |
235 | 252 | * Adds the position. |
236 | 253 | * |
237 | - * @param position the position | |
254 | + * @param position | |
255 | + * the position | |
238 | 256 | */ |
239 | 257 | final public void addPosition(int position) { |
240 | 258 | if (tokenPosition == null) { |
... | ... | @@ -247,8 +265,10 @@ public abstract class MtasToken<GenericType> { |
247 | 265 | /** |
248 | 266 | * Adds the position range. |
249 | 267 | * |
250 | - * @param start the start | |
251 | - * @param end the end | |
268 | + * @param start | |
269 | + * the start | |
270 | + * @param end | |
271 | + * the end | |
252 | 272 | */ |
253 | 273 | final public void addPositionRange(int start, int end) { |
254 | 274 | if (tokenPosition == null) { |
... | ... | @@ -265,7 +285,8 @@ public abstract class MtasToken<GenericType> { |
265 | 285 | /** |
266 | 286 | * Adds the positions. |
267 | 287 | * |
268 | - * @param positions the positions | |
288 | + * @param positions | |
289 | + * the positions | |
269 | 290 | */ |
270 | 291 | final public void addPositions(int[] positions) { |
271 | 292 | if (positions != null && positions.length > 0) { |
... | ... | @@ -280,7 +301,8 @@ public abstract class MtasToken<GenericType> { |
280 | 301 | /** |
281 | 302 | * Adds the positions. |
282 | 303 | * |
283 | - * @param list the list | |
304 | + * @param list | |
305 | + * the list | |
284 | 306 | */ |
285 | 307 | final public void addPositions(TreeSet<Integer> list) { |
286 | 308 | int[] positions = ArrayUtils |
... | ... | @@ -291,7 +313,8 @@ public abstract class MtasToken<GenericType> { |
291 | 313 | /** |
292 | 314 | * Check position type. |
293 | 315 | * |
294 | - * @param type the type | |
316 | + * @param type | |
317 | + * the type | |
295 | 318 | * @return the boolean |
296 | 319 | */ |
297 | 320 | final public Boolean checkPositionType(String type) { |
... | ... | @@ -368,8 +391,10 @@ public abstract class MtasToken<GenericType> { |
368 | 391 | /** |
369 | 392 | * Sets the offset. |
370 | 393 | * |
371 | - * @param start the start | |
372 | - * @param end the end | |
394 | + * @param start | |
395 | + * the start | |
396 | + * @param end | |
397 | + * the end | |
373 | 398 | */ |
374 | 399 | final public void setOffset(Integer start, Integer end) { |
375 | 400 | if ((start == null) || (end == null)) { |
... | ... | @@ -384,8 +409,10 @@ public abstract class MtasToken<GenericType> { |
384 | 409 | /** |
385 | 410 | * Adds the offset. |
386 | 411 | * |
387 | - * @param start the start | |
388 | - * @param end the end | |
412 | + * @param start | |
413 | + * the start | |
414 | + * @param end | |
415 | + * the end | |
389 | 416 | */ |
390 | 417 | final public void addOffset(Integer start, Integer end) { |
391 | 418 | if (tokenOffset == null) { |
... | ... | @@ -402,7 +429,8 @@ public abstract class MtasToken<GenericType> { |
402 | 429 | /** |
403 | 430 | * Sets the provide offset. |
404 | 431 | * |
405 | - * @param provide the new provide offset | |
432 | + * @param provide | |
433 | + * the new provide offset | |
406 | 434 | */ |
407 | 435 | final public void setProvideOffset(Boolean provide) { |
408 | 436 | provideOffset = provide; |
... | ... | @@ -411,8 +439,10 @@ public abstract class MtasToken<GenericType> { |
411 | 439 | /** |
412 | 440 | * Sets the real offset. |
413 | 441 | * |
414 | - * @param start the start | |
415 | - * @param end the end | |
442 | + * @param start | |
443 | + * the start | |
444 | + * @param end | |
445 | + * the end | |
416 | 446 | */ |
417 | 447 | final public void setRealOffset(Integer start, Integer end) { |
418 | 448 | if ((start == null) || (end == null)) { |
... | ... | @@ -428,7 +458,8 @@ public abstract class MtasToken<GenericType> { |
428 | 458 | /** |
429 | 459 | * Sets the provide real offset. |
430 | 460 | * |
431 | - * @param provide the new provide real offset | |
461 | + * @param provide | |
462 | + * the new provide real offset | |
432 | 463 | */ |
433 | 464 | final public void setProvideRealOffset(Boolean provide) { |
434 | 465 | provideRealOffset = provide; |
... | ... | @@ -491,7 +522,8 @@ public abstract class MtasToken<GenericType> { |
491 | 522 | /** |
492 | 523 | * Sets the value. |
493 | 524 | * |
494 | - * @param value the new value | |
525 | + * @param value | |
526 | + * the new value | |
495 | 527 | */ |
496 | 528 | public void setValue(String value) { |
497 | 529 | tokenValue = value; |
... | ... | @@ -500,7 +532,8 @@ public abstract class MtasToken<GenericType> { |
500 | 532 | /** |
501 | 533 | * Gets the prefix from value. |
502 | 534 | * |
503 | - * @param value the value | |
535 | + * @param value | |
536 | + * the value | |
504 | 537 | * @return the prefix from value |
505 | 538 | */ |
506 | 539 | public static String getPrefixFromValue(String value) { |
... | ... | @@ -521,7 +554,8 @@ public abstract class MtasToken<GenericType> { |
521 | 554 | /** |
522 | 555 | * Gets the postfix from value. |
523 | 556 | * |
524 | - * @param value the value | |
557 | + * @param value | |
558 | + * the value | |
525 | 559 | * @return the postfix from value |
526 | 560 | */ |
527 | 561 | public static String getPostfixFromValue(String value) { |
... | ... | @@ -537,7 +571,8 @@ public abstract class MtasToken<GenericType> { |
537 | 571 | /** |
538 | 572 | * Gets the postfix from value. |
539 | 573 | * |
540 | - * @param term the term | |
574 | + * @param term | |
575 | + * the term | |
541 | 576 | * @return the postfix from value |
542 | 577 | */ |
543 | 578 | public static String getPostfixFromValue(BytesRef term) { |
... | ... | @@ -671,7 +706,8 @@ public abstract class MtasToken<GenericType> { |
671 | 706 | /** |
672 | 707 | * Sets the payload. |
673 | 708 | * |
674 | - * @param payload the new payload | |
709 | + * @param payload | |
710 | + * the new payload | |
675 | 711 | */ |
676 | 712 | public void setPayload(BytesRef payload) { |
677 | 713 | tokenPayload = payload; |
... | ... | @@ -686,17 +722,46 @@ public abstract class MtasToken<GenericType> { |
686 | 722 | return tokenPayload; |
687 | 723 | } |
688 | 724 | |
725 | + public static HashMap<String, Automaton> createAutomatonMap(String prefix, | |
726 | + List<String> valueList, Boolean filter) { | |
727 | + HashMap<String, Automaton> automatonMap = new HashMap<String, Automaton>(); | |
728 | + if (valueList != null) { | |
729 | + for (String item : valueList) { | |
730 | + if (filter) { | |
731 | + item = item.replaceAll("([\\\"\\)\\(\\<\\>\\.\\@\\#\\]\\[\\{\\}])", | |
732 | + "\\\\\\1"); | |
733 | + } | |
734 | + automatonMap.put(item, | |
735 | + new RegExp(prefix + MtasToken.DELIMITER + item + "\u0000*").toAutomaton()); | |
736 | + } | |
737 | + } | |
738 | + return automatonMap; | |
739 | + } | |
740 | + | |
741 | + public static HashMap<String, ByteRunAutomaton> byteRunAutomatonMap(HashMap<String, Automaton> automatonMap) { | |
742 | + HashMap<String, ByteRunAutomaton> byteRunAutomatonMap = new HashMap<String, ByteRunAutomaton>(); | |
743 | + if(automatonMap!=null) { | |
744 | + for(String key : automatonMap.keySet()) { | |
745 | + byteRunAutomatonMap.put(key, new ByteRunAutomaton(automatonMap.get(key))); | |
746 | + } | |
747 | + } | |
748 | + return byteRunAutomatonMap; | |
749 | + } | |
689 | 750 | /** |
690 | 751 | * Creates the automata. |
691 | 752 | * |
692 | - * @param prefix the prefix | |
693 | - * @param regexp the regexp | |
694 | - * @param valueList the value list | |
753 | + * @param prefix | |
754 | + * the prefix | |
755 | + * @param regexp | |
756 | + * the regexp | |
757 | + * @param valueList | |
758 | + * the value list | |
695 | 759 | * @return the list |
696 | - * @throws IOException Signals that an I/O exception has occurred. | |
760 | + * @throws IOException | |
761 | + * Signals that an I/O exception has occurred. | |
697 | 762 | */ |
698 | 763 | public static List<CompiledAutomaton> createAutomata(String prefix, |
699 | - String regexp, List<String> valueList) throws IOException { | |
764 | + String regexp, HashMap<String, Automaton> automatonMap) throws IOException { | |
700 | 765 | List<CompiledAutomaton> list = new ArrayList<CompiledAutomaton>(); |
701 | 766 | Automaton automatonRegexp = null; |
702 | 767 | if (regexp != null) { |
... | ... | @@ -704,26 +769,22 @@ public abstract class MtasToken<GenericType> { |
704 | 769 | automatonRegexp = re.toAutomaton(); |
705 | 770 | } |
706 | 771 | int step = 500; |
707 | - for (int i = 0; i < valueList.size(); i += step) { | |
772 | + List<String> keyList = new ArrayList<String>(automatonMap.keySet()); | |
773 | + for (int i = 0; i < keyList.size(); i += step) { | |
708 | 774 | int localStep = step; |
709 | 775 | boolean success = false; |
710 | 776 | CompiledAutomaton compiledAutomaton = null; |
711 | 777 | while (!success) { |
712 | 778 | success = true; |
713 | - int next = Math.min(valueList.size(), i + localStep); | |
779 | + int next = Math.min(keyList.size(), i + localStep); | |
714 | 780 | List<Automaton> listAutomaton = new ArrayList<Automaton>(); |
715 | 781 | for (int j = i; j < next; j++) { |
716 | - String value = valueList.get(j); | |
717 | - value = value.replaceAll("([\\\"\\)\\(\\<\\>\\.\\@\\#\\]\\[\\{\\}])", | |
718 | - "\\\\\\1"); | |
719 | - listAutomaton.add( | |
720 | - (new RegExp(prefix + MtasToken.DELIMITER + value + "\u0000*")) | |
721 | - .toAutomaton()); | |
782 | + listAutomaton.add(automatonMap.get(keyList.get(j))); | |
722 | 783 | } |
723 | 784 | Automaton automatonList = Operations.union(listAutomaton); |
724 | 785 | Automaton automaton; |
725 | 786 | if (automatonRegexp != null) { |
726 | - automaton = Operations.intersection(automatonList, automatonRegexp); | |
787 | + automaton = Operations.intersection(automatonList, automatonRegexp); | |
727 | 788 | } else { |
728 | 789 | automaton = automatonList; |
729 | 790 | } |
... | ... |
src/mtas/codec/util/CodecCollector.java
... | ... | @@ -55,17 +55,24 @@ import org.apache.lucene.index.IndexableField; |
55 | 55 | import org.apache.lucene.index.LeafReader; |
56 | 56 | import org.apache.lucene.index.LeafReaderContext; |
57 | 57 | import org.apache.lucene.index.PostingsEnum; |
58 | +import org.apache.lucene.index.SingleTermsEnum; | |
58 | 59 | import org.apache.lucene.index.Term; |
59 | 60 | import org.apache.lucene.index.Terms; |
60 | 61 | import org.apache.lucene.index.TermsEnum; |
62 | +import org.apache.lucene.search.AutomatonQuery; | |
61 | 63 | import org.apache.lucene.search.DocIdSetIterator; |
62 | 64 | import org.apache.lucene.search.IndexSearcher; |
63 | 65 | import org.apache.lucene.search.spans.SpanWeight; |
64 | 66 | import org.apache.lucene.search.spans.Spans; |
65 | 67 | import org.apache.lucene.util.Bits; |
66 | 68 | import org.apache.lucene.util.BytesRef; |
69 | +import org.apache.lucene.util.BytesRefBuilder; | |
67 | 70 | import org.apache.lucene.util.LegacyNumericUtils; |
71 | +import org.apache.lucene.util.automaton.Automaton; | |
72 | +import org.apache.lucene.util.automaton.ByteRunAutomaton; | |
68 | 73 | import org.apache.lucene.util.automaton.CompiledAutomaton; |
74 | +import org.apache.lucene.util.automaton.Operations; | |
75 | +import org.apache.lucene.util.automaton.RegExp; | |
69 | 76 | |
70 | 77 | /** |
71 | 78 | * The Class CodecCollector. |
... | ... | @@ -2012,6 +2019,9 @@ public class CodecCollector { |
2012 | 2019 | IndexSearcher searcher, Terms t, LeafReader r, LeafReaderContext lrc) |
2013 | 2020 | throws IOException { |
2014 | 2021 | if (documentList != null) { |
2022 | + TreeSet<String> listStatsItems = CodecUtil.createStatsItems("sum"); | |
2023 | + String listStatsType = CodecUtil.createStatsType(listStatsItems, | |
2024 | + CodecUtil.STATS_TYPE_SUM, null); | |
2015 | 2025 | for (ComponentDocument document : documentList) { |
2016 | 2026 | // initialize |
2017 | 2027 | for (int docId : docList) { |
... | ... | @@ -2019,9 +2029,6 @@ public class CodecCollector { |
2019 | 2029 | Document doc = searcher.doc(docId, |
2020 | 2030 | new HashSet<String>(Arrays.asList(uniqueKeyField))); |
2021 | 2031 | IndexableField indxfld = doc.getField(uniqueKeyField); |
2022 | - TreeSet<String> listStatsItems = CodecUtil.createStatsItems("sum"); | |
2023 | - String listStatsType = CodecUtil.createStatsType(listStatsItems, | |
2024 | - CodecUtil.STATS_TYPE_SUM, null); | |
2025 | 2032 | // get other doc info |
2026 | 2033 | if (indxfld != null) { |
2027 | 2034 | document.uniqueKey.put(docId, indxfld.stringValue()); |
... | ... | @@ -2031,10 +2038,30 @@ public class CodecCollector { |
2031 | 2038 | null, null); |
2032 | 2039 | document.statsData.put(docId, stats); |
2033 | 2040 | if (document.statsList != null) { |
2034 | - MtasDataCollector<?, ?> list = DataCollector.getCollector( | |
2035 | - DataCollector.COLLECTOR_TYPE_LIST, CodecUtil.DATA_TYPE_LONG, | |
2036 | - listStatsType, listStatsItems, CodecUtil.STATS_TYPE_SUM, | |
2037 | - CodecUtil.SORT_DESC, 0, document.number, null, null); | |
2041 | + MtasDataCollector<?, ?> list; | |
2042 | + if (document.listExpand) { | |
2043 | + TreeSet<String>[] baseStatsItems = new TreeSet[] { | |
2044 | + listStatsItems }; | |
2045 | + list = DataCollector.getCollector( | |
2046 | + DataCollector.COLLECTOR_TYPE_LIST, CodecUtil.DATA_TYPE_LONG, | |
2047 | + listStatsType, listStatsItems, CodecUtil.STATS_TYPE_SUM, | |
2048 | + CodecUtil.SORT_DESC, 0, document.listNumber, | |
2049 | + new String[] { DataCollector.COLLECTOR_TYPE_LIST }, | |
2050 | + new String[] { CodecUtil.DATA_TYPE_LONG }, | |
2051 | + new String[] { listStatsType }, | |
2052 | + Arrays.copyOfRange(baseStatsItems, 0, | |
2053 | + baseStatsItems.length), | |
2054 | + new String[] { CodecUtil.STATS_TYPE_SUM }, | |
2055 | + new String[] { CodecUtil.SORT_DESC }, new Integer[] { 0 }, | |
2056 | + new Integer[] { document.listExpandNumber }, null, null); | |
2057 | + } else { | |
2058 | + TreeSet<String>[] baseStatsItems = new TreeSet[] { | |
2059 | + listStatsItems }; | |
2060 | + list = DataCollector.getCollector( | |
2061 | + DataCollector.COLLECTOR_TYPE_LIST, CodecUtil.DATA_TYPE_LONG, | |
2062 | + listStatsType, listStatsItems, CodecUtil.STATS_TYPE_SUM, | |
2063 | + CodecUtil.SORT_DESC, 0, document.listNumber, null, null); | |
2064 | + } | |
2038 | 2065 | document.statsList.put(docId, list); |
2039 | 2066 | } |
2040 | 2067 | } |
... | ... | @@ -2049,55 +2076,142 @@ public class CodecCollector { |
2049 | 2076 | for (ComponentDocument document : documentList) { |
2050 | 2077 | |
2051 | 2078 | List<CompiledAutomaton> listAutomata; |
2079 | + HashMap<String, Automaton> automatonMap; | |
2080 | + HashMap<String, ByteRunAutomaton> byteRunAutomatonMap; | |
2052 | 2081 | if (document.list == null) { |
2082 | + automatonMap = null; | |
2083 | + byteRunAutomatonMap = null; | |
2053 | 2084 | listAutomata = new ArrayList<CompiledAutomaton>(); |
2054 | - listAutomata.add(document.compiledAutomaton); | |
2085 | + CompiledAutomaton compiledAutomaton; | |
2086 | + Automaton automaton; | |
2087 | + if ((document.regexp == null) || (document.regexp.isEmpty())) { | |
2088 | + RegExp re = new RegExp( | |
2089 | + document.prefix + MtasToken.DELIMITER + ".*"); | |
2090 | + automaton = re.toAutomaton(); | |
2091 | + } else { | |
2092 | + RegExp re = new RegExp(document.prefix + MtasToken.DELIMITER | |
2093 | + + document.regexp + "\u0000*"); | |
2094 | + automaton = re.toAutomaton(); | |
2095 | + } | |
2096 | + compiledAutomaton = new CompiledAutomaton(automaton); | |
2097 | + listAutomata.add(compiledAutomaton); | |
2055 | 2098 | } else { |
2099 | + automatonMap = MtasToken.createAutomatonMap(document.prefix, | |
2100 | + new ArrayList<String>(document.list), | |
2101 | + document.listRegexp ? false : true); | |
2102 | + byteRunAutomatonMap = MtasToken.byteRunAutomatonMap(automatonMap); | |
2056 | 2103 | listAutomata = MtasToken.createAutomata(document.prefix, |
2057 | - document.regexp, new ArrayList<String>(document.list)); | |
2104 | + document.regexp, automatonMap); | |
2105 | + } | |
2106 | + List<ByteRunAutomaton> ignoreByteRunAutomatonList = null; | |
2107 | + if ((document.ignoreRegexp != null) | |
2108 | + && (!document.ignoreRegexp.isEmpty())) { | |
2109 | + ignoreByteRunAutomatonList = new ArrayList<ByteRunAutomaton>(); | |
2110 | + RegExp re = new RegExp(document.prefix + MtasToken.DELIMITER | |
2111 | + + document.ignoreRegexp + "\u0000*"); | |
2112 | + ignoreByteRunAutomatonList | |
2113 | + .add(new ByteRunAutomaton(re.toAutomaton())); | |
2114 | + } | |
2115 | + if (document.ignoreList != null) { | |
2116 | + if(ignoreByteRunAutomatonList==null) { | |
2117 | + ignoreByteRunAutomatonList = new ArrayList<ByteRunAutomaton>(); | |
2118 | + } | |
2119 | + HashMap<String, Automaton> list = MtasToken.createAutomatonMap( | |
2120 | + document.prefix, new ArrayList<String>(document.ignoreList), | |
2121 | + document.ignoreListRegexp ? false : true); | |
2122 | + for (Automaton automaton : list.values()) { | |
2123 | + ignoreByteRunAutomatonList.add(new ByteRunAutomaton(automaton)); | |
2124 | + } | |
2058 | 2125 | } |
2059 | 2126 | |
2060 | 2127 | for (CompiledAutomaton compiledAutomaton : listAutomata) { |
2061 | - | |
2062 | - termsEnum = t.intersect(compiledAutomaton, null); | |
2063 | - // init | |
2064 | - int initSize = Math.min((int) t.size(), 1000); | |
2065 | - for (int docId : docList) { | |
2066 | - document.statsData.get(docId).initNewList(1); | |
2067 | - if (document.statsList != null) { | |
2068 | - document.statsList.get(docId).initNewList(initSize); | |
2128 | + if (!compiledAutomaton.type | |
2129 | + .equals(CompiledAutomaton.AUTOMATON_TYPE.NONE)) { | |
2130 | + termsEnum = t.intersect(compiledAutomaton, null); | |
2131 | + // init | |
2132 | + int initBaseSize = Math.min((int) t.size(), 1000); | |
2133 | + int initListSize = document.statsList != null | |
2134 | + ? Math.min(document.statsList.size(), initBaseSize) | |
2135 | + : initBaseSize; | |
2136 | + HashSet<MtasDataCollector<?, ?>> initialised = new HashSet<MtasDataCollector<?, ?>>(); | |
2137 | + for (int docId : docList) { | |
2138 | + document.statsData.get(docId).initNewList(1); | |
2139 | + initialised.add(document.statsData.get(docId)); | |
2140 | + if (document.statsList != null | |
2141 | + && document.statsList.size() > 0) { | |
2142 | + document.statsList.get(docId).initNewList(initListSize); | |
2143 | + initialised.add(document.statsList.get(docId)); | |
2144 | + } | |
2069 | 2145 | } |
2070 | - } | |
2071 | - // fill | |
2072 | - while ((term = termsEnum.next()) != null) { | |
2073 | - Iterator<Integer> docIterator = docList.iterator(); | |
2074 | - postingsEnum = termsEnum.postings(postingsEnum, | |
2075 | - PostingsEnum.FREQS); | |
2076 | - int termDocId = -1; | |
2077 | - while (docIterator.hasNext()) { | |
2078 | - int segmentDocId = docIterator.next() - lrc.docBase; | |
2079 | - if (segmentDocId >= termDocId) { | |
2080 | - if ((segmentDocId == termDocId) || ((termDocId = postingsEnum | |
2081 | - .advance(segmentDocId)) == segmentDocId)) { | |
2082 | - // register stats | |
2083 | - document.statsData.get(segmentDocId + lrc.docBase) | |
2084 | - .add(new long[] { postingsEnum.freq() }, 1); | |
2085 | - // register list | |
2086 | - if (document.statsList != null) { | |
2087 | - document.statsList.get(segmentDocId + lrc.docBase).add( | |
2088 | - MtasToken.getPostfixFromValue(term), | |
2089 | - new long[] { postingsEnum.freq() }, 1); | |
2146 | + // fill | |
2147 | + int termDocId; | |
2148 | + boolean acceptedTerm; | |
2149 | + while ((term = termsEnum.next()) != null) { | |
2150 | + Iterator<Integer> docIterator = docList.iterator(); | |
2151 | + postingsEnum = termsEnum.postings(postingsEnum, | |
2152 | + PostingsEnum.FREQS); | |
2153 | + termDocId = -1; | |
2154 | + acceptedTerm = true; | |
2155 | + if(ignoreByteRunAutomatonList!=null) { | |
2156 | + for(ByteRunAutomaton ignoreByteRunAutomaton : ignoreByteRunAutomatonList) { | |
2157 | + if(ignoreByteRunAutomaton.run(term.bytes, term.offset, term.length)) { | |
2158 | + acceptedTerm = false; | |
2159 | + break; | |
2160 | + } | |
2161 | + } | |
2162 | + } | |
2163 | + if (acceptedTerm) { | |
2164 | + while (docIterator.hasNext()) { | |
2165 | + int segmentDocId = docIterator.next() - lrc.docBase; | |
2166 | + if (segmentDocId >= termDocId) { | |
2167 | + if ((segmentDocId == termDocId) | |
2168 | + || ((termDocId = postingsEnum | |
2169 | + .advance(segmentDocId)) == segmentDocId)) { | |
2170 | + // register stats | |
2171 | + document.statsData.get(segmentDocId + lrc.docBase) | |
2172 | + .add(new long[] { postingsEnum.freq() }, 1); | |
2173 | + // register list | |
2174 | + if (document.statsList != null) { | |
2175 | + if (automatonMap != null) { | |
2176 | + MtasDataCollector<?, ?> dataCollector, | |
2177 | + subSataCollector; | |
2178 | + for (String key : byteRunAutomatonMap.keySet()) { | |
2179 | + ByteRunAutomaton bra = byteRunAutomatonMap | |
2180 | + .get(key); | |
2181 | + if (bra.run(term.bytes, term.offset, | |
2182 | + term.length)) { | |
2183 | + dataCollector = document.statsList | |
2184 | + .get(segmentDocId + lrc.docBase); | |
2185 | + subSataCollector = dataCollector.add(key, | |
2186 | + new long[] { postingsEnum.freq() }, 1); | |
2187 | + if (document.listExpand | |
2188 | + && subSataCollector != null) { | |
2189 | + if (!initialised.contains(subSataCollector)) { | |
2190 | + subSataCollector.initNewList(initBaseSize); | |
2191 | + initialised.add(subSataCollector); | |
2192 | + } | |
2193 | + subSataCollector.add( | |
2194 | + MtasToken.getPostfixFromValue(term), | |
2195 | + new long[] { postingsEnum.freq() }, 1); | |
2196 | + } | |
2197 | + } | |
2198 | + } | |
2199 | + } else { | |
2200 | + document.statsList.get(segmentDocId + lrc.docBase) | |
2201 | + .add(MtasToken.getPostfixFromValue(term), | |
2202 | + new long[] { postingsEnum.freq() }, 1); | |
2203 | + } | |
2204 | + } | |
2205 | + } | |
2090 | 2206 | } |
2091 | 2207 | } |
2092 | 2208 | } |
2093 | 2209 | } |
2094 | - } | |
2095 | - // close | |
2096 | - for (int docId : docList) { | |
2097 | - document.statsData.get(docId).closeNewList(); | |
2098 | - if (document.statsList != null) { | |
2099 | - document.statsList.get(docId).closeNewList(); | |
2210 | + // close | |
2211 | + for (MtasDataCollector<?, ?> item : initialised) { | |
2212 | + item.closeNewList(); | |
2100 | 2213 | } |
2214 | + initialised.clear(); | |
2101 | 2215 | } |
2102 | 2216 | } |
2103 | 2217 | } |
... | ... | @@ -2702,12 +2816,16 @@ public class CodecCollector { |
2702 | 2816 | } |
2703 | 2817 | |
2704 | 2818 | List<CompiledAutomaton> listAutomata; |
2819 | + HashMap<String, Automaton> automatonMap; | |
2705 | 2820 | if (termVector.list == null) { |
2821 | + automatonMap = null; | |
2706 | 2822 | listAutomata = new ArrayList<CompiledAutomaton>(); |
2707 | 2823 | listAutomata.add(termVector.compiledAutomaton); |
2708 | 2824 | } else { |
2825 | + automatonMap = MtasToken.createAutomatonMap(termVector.prefix, | |
2826 | + new ArrayList<String>(termVector.list), true); | |
2709 | 2827 | listAutomata = MtasToken.createAutomata(termVector.prefix, |
2710 | - termVector.regexp, new ArrayList<String>(termVector.list)); | |
2828 | + termVector.regexp, automatonMap); | |
2711 | 2829 | } |
2712 | 2830 | |
2713 | 2831 | for (CompiledAutomaton compiledAutomaton : listAutomata) { |
... | ... | @@ -3055,9 +3173,12 @@ public class CodecCollector { |
3055 | 3173 | HashSet<String> recomputeKeyList = termVector.subComponentFunction.dataCollector.segmentRecomputeKeyList |
3056 | 3174 | .get(segmentName); |
3057 | 3175 | if (recomputeKeyList.size() > 0) { |
3176 | + HashMap<String, Automaton> automatonMap = MtasToken | |
3177 | + .createAutomatonMap(termVector.prefix, | |
3178 | + new ArrayList<String>(termVector.list), true); | |
3058 | 3179 | List<CompiledAutomaton> listCompiledAutomata = MtasToken |
3059 | 3180 | .createAutomata(termVector.prefix, termVector.regexp, |
3060 | - new ArrayList<String>(recomputeKeyList)); | |
3181 | + automatonMap); | |
3061 | 3182 | for (CompiledAutomaton compiledAutomaton : listCompiledAutomata) { |
3062 | 3183 | termsEnum = t.intersect(compiledAutomaton, null); |
3063 | 3184 | termVector.subComponentFunction.dataCollector.initNewList( |
... | ... |
src/mtas/codec/util/CodecComponent.java
... | ... | @@ -265,10 +265,14 @@ public class CodecComponent { |
265 | 265 | public static class ComponentDocument { |
266 | 266 | |
267 | 267 | /** The regexp. */ |
268 | - public String key, prefix, regexp; | |
268 | + public String key, prefix, regexp, ignoreRegexp; | |
269 | 269 | |
270 | 270 | /** The list. */ |
271 | - public HashSet<String> list; | |
271 | + public HashSet<String> list, ignoreList; | |
272 | + | |
273 | + public boolean listRegexp, listExpand, ignoreListRegexp; | |
274 | + | |
275 | + public int listExpandNumber; | |
272 | 276 | |
273 | 277 | /** The stats type. */ |
274 | 278 | public String dataType, statsType; |
... | ... | @@ -276,11 +280,8 @@ public class CodecComponent { |
276 | 280 | /** The stats items. */ |
277 | 281 | public TreeSet<String> statsItems; |
278 | 282 | |
279 | - /** The compiled automaton. */ | |
280 | - public CompiledAutomaton compiledAutomaton; | |
281 | - | |
282 | 283 | /** The number. */ |
283 | - public int number; | |
284 | + public int listNumber; | |
284 | 285 | |
285 | 286 | /** The unique key. */ |
286 | 287 | public HashMap<Integer, String> uniqueKey; |
... | ... | @@ -290,7 +291,7 @@ public class CodecComponent { |
290 | 291 | |
291 | 292 | /** The list. */ |
292 | 293 | public HashMap<Integer, MtasDataCollector<?, ?>> statsList; |
293 | - | |
294 | + | |
294 | 295 | /** |
295 | 296 | * Instantiates a new component document. |
296 | 297 | * |
... | ... | @@ -302,30 +303,40 @@ public class CodecComponent { |
302 | 303 | * @throws IOException Signals that an I/O exception has occurred. |
303 | 304 | */ |
304 | 305 | public ComponentDocument(String key, String prefix, String statsType, |
305 | - String regexp, String[] list, int number) throws IOException { | |
306 | + String regexp, String[] list, int listNumber, Boolean listRegexp, Boolean listExpand, int listExpandNumber, String ignoreRegexp, String[] ignoreList, Boolean ignoreListRegexp) throws IOException { | |
306 | 307 | this.key = key; |
307 | 308 | this.prefix = prefix; |
308 | 309 | this.regexp = regexp; |
309 | 310 | if (list != null && list.length > 0) { |
310 | 311 | this.list = new HashSet(Arrays.asList(list)); |
312 | + this.listRegexp = listRegexp!=null?listRegexp:false; | |
313 | + this.listExpand = (listExpand!=null && listExpandNumber>0)?listExpand:false; | |
314 | + if(this.listExpand) { | |
315 | + this.listExpandNumber = listExpandNumber; | |
316 | + } else { | |
317 | + this.listExpandNumber = 0; | |
318 | + } | |
311 | 319 | } else { |
312 | 320 | this.list = null; |
321 | + this.listRegexp = false; | |
322 | + this.listExpand = false; | |
323 | + this.listExpandNumber = 0; | |
324 | + } | |
325 | + this.ignoreRegexp = ignoreRegexp; | |
326 | + if (ignoreList != null && ignoreList.length > 0) { | |
327 | + this.ignoreList = new HashSet(Arrays.asList(ignoreList)); | |
328 | + this.ignoreListRegexp = ignoreListRegexp!=null?ignoreListRegexp:false; | |
329 | + } else { | |
330 | + this.ignoreList = null; | |
331 | + this.ignoreListRegexp = false; | |
313 | 332 | } |
314 | - this.number = number; | |
333 | + this.listNumber = listNumber; | |
315 | 334 | uniqueKey = new HashMap<Integer, String>(); |
316 | 335 | dataType = CodecUtil.DATA_TYPE_LONG; |
317 | 336 | statsItems = CodecUtil.createStatsItems(statsType); |
318 | - this.statsType = CodecUtil.createStatsType(statsItems, null, null); | |
319 | - if ((regexp == null) || (regexp.isEmpty())) { | |
320 | - RegExp re = new RegExp(prefix + MtasToken.DELIMITER + ".*"); | |
321 | - compiledAutomaton = new CompiledAutomaton(re.toAutomaton()); | |
322 | - } else { | |
323 | - RegExp re = new RegExp( | |
324 | - prefix + MtasToken.DELIMITER + regexp + "\u0000*"); | |
325 | - compiledAutomaton = new CompiledAutomaton(re.toAutomaton()); | |
326 | - } | |
337 | + this.statsType = CodecUtil.createStatsType(statsItems, null, null); | |
327 | 338 | this.statsData = new HashMap<Integer, MtasDataCollector<?, ?>>(); |
328 | - if (this.number > 0) { | |
339 | + if (this.listNumber > 0) { | |
329 | 340 | this.statsList = new HashMap<Integer, MtasDataCollector<?, ?>>(); |
330 | 341 | } else { |
331 | 342 | this.statsList = null; |
... | ... |
src/mtas/search/spans/MtasSpanIntersectingQuery.java
1 | 1 | package mtas.search.spans; |
2 | 2 | |
3 | 3 | import java.io.IOException; |
4 | +import java.util.ArrayList; | |
5 | +import java.util.List; | |
6 | +import java.util.Map; | |
7 | +import java.util.Set; | |
4 | 8 | |
9 | +import org.apache.lucene.index.LeafReaderContext; | |
10 | +import org.apache.lucene.index.Term; | |
11 | +import org.apache.lucene.index.TermContext; | |
12 | +import org.apache.lucene.index.Terms; | |
5 | 13 | import org.apache.lucene.search.IndexSearcher; |
6 | -import org.apache.lucene.search.spans.SpanQuery; | |
7 | 14 | import org.apache.lucene.search.spans.SpanWeight; |
8 | - | |
15 | +import org.apache.lucene.search.spans.Spans; | |
9 | 16 | import mtas.search.spans.util.MtasSpanQuery; |
10 | 17 | |
11 | 18 | public class MtasSpanIntersectingQuery extends MtasSpanQuery { |
12 | 19 | |
13 | - public MtasSpanIntersectingQuery(SpanQuery q1, SpanQuery q2) { | |
14 | - super(); | |
20 | + private String field; | |
21 | + | |
22 | + private MtasSpanQuery q1, q2; | |
23 | + | |
24 | + public MtasSpanIntersectingQuery(MtasSpanQuery q1, MtasSpanQuery q2) { | |
25 | + if (q1 != null) { | |
26 | + field = q1.getField(); | |
27 | + if (q2 != null && !q2.getField().equals(field)) { | |
28 | + throw new IllegalArgumentException("Clauses must have same field."); | |
29 | + } | |
30 | + } else if (q2 != null) { | |
31 | + field = q2.getField(); | |
32 | + } else { | |
33 | + field = null; | |
34 | + } | |
35 | + this.q1 = q1; | |
36 | + this.q2 = q2; | |
15 | 37 | } |
16 | - | |
38 | + | |
17 | 39 | @Override |
18 | 40 | public String getField() { |
19 | - // TODO Auto-generated method stub | |
20 | - return null; | |
41 | + return field; | |
21 | 42 | } |
22 | 43 | |
23 | 44 | @Override |
24 | 45 | public SpanWeight createWeight(IndexSearcher searcher, boolean needsScores) |
25 | 46 | throws IOException { |
26 | - // TODO Auto-generated method stub | |
27 | - return null; | |
47 | + if (q1 == null || q2 == null) { | |
48 | + return null; | |
49 | + } else { | |
50 | + MtasSpanIntersectingQueryWeight w1 = new MtasSpanIntersectingQueryWeight( | |
51 | + q1.createWeight(searcher, needsScores)); | |
52 | + MtasSpanIntersectingQueryWeight w2 = new MtasSpanIntersectingQueryWeight( | |
53 | + q2.createWeight(searcher, needsScores)); | |
54 | + //subWeights | |
55 | + List<MtasSpanIntersectingQueryWeight> subWeights = new ArrayList<MtasSpanIntersectingQueryWeight>(); | |
56 | + subWeights.add(w1); | |
57 | + subWeights.add(w2); | |
58 | + //return | |
59 | + return new SpanIntersectingWeight(w1, w2, searcher, needsScores ? getTermContexts(subWeights) : null); | |
60 | + } | |
61 | + } | |
62 | + | |
63 | + protected Map<Term, TermContext> getTermContexts( | |
64 | + List<MtasSpanIntersectingQueryWeight> items) { | |
65 | + List<SpanWeight> weights = new ArrayList<SpanWeight>(); | |
66 | + for (MtasSpanIntersectingQueryWeight item : items) { | |
67 | + weights.add(item.spanWeight); | |
68 | + } | |
69 | + return getTermContexts(weights); | |
28 | 70 | } |
29 | 71 | |
30 | 72 | @Override |
31 | 73 | public String toString(String field) { |
32 | - // TODO Auto-generated method stub | |
33 | - return null; | |
74 | + StringBuilder buffer = new StringBuilder(); | |
75 | + buffer.append(this.getClass().getSimpleName() + "(["); | |
76 | + if (q1 != null) { | |
77 | + buffer.append(q1.toString(q1.getField())); | |
78 | + } else { | |
79 | + buffer.append("null"); | |
80 | + } | |
81 | + buffer.append(","); | |
82 | + if (q2 != null) { | |
83 | + buffer.append(q2.toString(q2.getField())); | |
84 | + } else { | |
85 | + buffer.append("null"); | |
86 | + } | |
87 | + buffer.append("])"); | |
88 | + return buffer.toString(); | |
34 | 89 | } |
35 | 90 | |
36 | 91 | @Override |
37 | 92 | public boolean equals(Object obj) { |
38 | - // TODO Auto-generated method stub | |
39 | - return false; | |
93 | + if (this == obj) | |
94 | + return true; | |
95 | + if (obj == null) | |
96 | + return false; | |
97 | + if (getClass() != obj.getClass()) | |
98 | + return false; | |
99 | + final MtasSpanIntersectingQuery other = (MtasSpanIntersectingQuery) obj; | |
100 | + return q1.equals(other.q1) && q2.equals(other.q2); | |
40 | 101 | } |
41 | 102 | |
42 | 103 | @Override |
43 | 104 | public int hashCode() { |
44 | - // TODO Auto-generated method stub | |
45 | - return 0; | |
105 | + int h = this.getClass().getSimpleName().hashCode(); | |
106 | + if (q1 != null) { | |
107 | + h = (h * 7) ^ q1.hashCode(); | |
108 | + } | |
109 | + if (q2 != null) { | |
110 | + h = (h * 11) ^ q2.hashCode(); | |
111 | + } | |
112 | + return h; | |
113 | + } | |
114 | + | |
115 | + public class SpanIntersectingWeight extends SpanWeight { | |
116 | + | |
117 | + MtasSpanIntersectingQueryWeight w1,w2; | |
118 | + | |
119 | + public SpanIntersectingWeight(MtasSpanIntersectingQueryWeight w1, MtasSpanIntersectingQueryWeight w2, IndexSearcher searcher, | |
120 | + Map<Term, TermContext> terms) throws IOException { | |
121 | + super(MtasSpanIntersectingQuery.this, searcher, terms); | |
122 | + this.w1=w1; | |
123 | + this.w2=w2; | |
124 | + } | |
125 | + | |
126 | + @Override | |
127 | + public void extractTermContexts(Map<Term, TermContext> contexts) { | |
128 | + w1.spanWeight.extractTermContexts(contexts); | |
129 | + w2.spanWeight.extractTermContexts(contexts); | |
130 | + } | |
131 | + | |
132 | + @Override | |
133 | + public Spans getSpans(LeafReaderContext context, Postings requiredPostings) | |
134 | + throws IOException { | |
135 | + Terms terms = context.reader().terms(field); | |
136 | + if (terms == null) { | |
137 | + return null; // field does not exist | |
138 | + } | |
139 | + MtasSpanIntersectingQuerySpans s1 = new MtasSpanIntersectingQuerySpans(w1.spanWeight.getSpans(context, requiredPostings)); | |
140 | + MtasSpanIntersectingQuerySpans s2 = new MtasSpanIntersectingQuerySpans(w2.spanWeight.getSpans(context, requiredPostings)); | |
141 | + return new MtasSpanIntersectingSpans(MtasSpanIntersectingQuery.this, | |
142 | + s1, s2); | |
143 | + } | |
144 | + | |
145 | + @Override | |
146 | + public void extractTerms(Set<Term> terms) { | |
147 | + w1.spanWeight.extractTerms(terms); | |
148 | + w2.spanWeight.extractTerms(terms); | |
149 | + } | |
150 | + | |
151 | + } | |
152 | + | |
153 | + public class MtasSpanIntersectingQuerySpans { | |
154 | + public Spans spans; | |
155 | + | |
156 | + public MtasSpanIntersectingQuerySpans(Spans spans) { | |
157 | + this.spans = spans; | |
158 | + } | |
159 | + | |
160 | + } | |
161 | + | |
162 | + public class MtasSpanIntersectingQueryWeight { | |
163 | + | |
164 | + /** The span weight. */ | |
165 | + public SpanWeight spanWeight; | |
166 | + | |
167 | + public MtasSpanIntersectingQueryWeight(SpanWeight spanWeight) { | |
168 | + this.spanWeight = spanWeight; | |
169 | + } | |
46 | 170 | } |
47 | 171 | |
48 | 172 | } |
... | ... |
src/mtas/search/spans/MtasSpanIntersectingSpans.java
0 → 100644
1 | +package mtas.search.spans; | |
2 | + | |
3 | +import java.io.IOException; | |
4 | +import java.util.List; | |
5 | + | |
6 | +import org.apache.lucene.search.spans.SpanCollector; | |
7 | +import org.apache.lucene.search.spans.Spans; | |
8 | + | |
9 | +import mtas.search.spans.MtasSpanIntersectingQuery.MtasSpanIntersectingQuerySpans; | |
10 | +import mtas.search.spans.util.MtasSpans; | |
11 | + | |
12 | +public class MtasSpanIntersectingSpans extends Spans implements MtasSpans { | |
13 | + | |
14 | + private MtasSpanIntersectingQuerySpans spans1, spans2; | |
15 | + | |
16 | + private int docId; | |
17 | + | |
18 | + public MtasSpanIntersectingSpans(MtasSpanIntersectingQuery mtasSpanIntersectingQuery, | |
19 | + MtasSpanIntersectingQuerySpans spans1, MtasSpanIntersectingQuerySpans spans2) { | |
20 | + super(); | |
21 | + docId = -1; | |
22 | + this.spans1 = spans1; | |
23 | + this.spans2 = spans2; | |
24 | + } | |
25 | + | |
26 | + @Override | |
27 | + public void collect(SpanCollector collector) throws IOException { | |
28 | + spans1.spans.collect(collector); | |
29 | + spans2.spans.collect(collector); | |
30 | + } | |
31 | + | |
32 | + @Override | |
33 | + public int endPosition() { | |
34 | + return NO_MORE_POSITIONS; | |
35 | + } | |
36 | + | |
37 | + @Override | |
38 | + public int nextStartPosition() throws IOException { | |
39 | + return NO_MORE_POSITIONS; | |
40 | + } | |
41 | + | |
42 | + @Override | |
43 | + public float positionsCost() { | |
44 | + return 0; | |
45 | + } | |
46 | + | |
47 | + @Override | |
48 | + public int startPosition() { | |
49 | + return NO_MORE_POSITIONS; | |
50 | + } | |
51 | + | |
52 | + @Override | |
53 | + public int width() { | |
54 | + return 0; | |
55 | + } | |
56 | + | |
57 | + @Override | |
58 | + public int advance(int target) throws IOException { | |
59 | + return NO_MORE_POSITIONS; | |
60 | + } | |
61 | + | |
62 | + @Override | |
63 | + public long cost() { | |
64 | + return 0; | |
65 | + } | |
66 | + | |
67 | + @Override | |
68 | + public int docID() { | |
69 | + return NO_MORE_DOCS; | |
70 | + } | |
71 | + | |
72 | + @Override | |
73 | + public int nextDoc() throws IOException { | |
74 | + return NO_MORE_DOCS; | |
75 | + } | |
76 | + | |
77 | +} | |
... | ... |
src/mtas/search/spans/MtasSpanMatchAllQuery.java
... | ... | @@ -2,19 +2,23 @@ package mtas.search.spans; |
2 | 2 | |
3 | 3 | import java.io.IOException; |
4 | 4 | import java.lang.reflect.Method; |
5 | +import java.util.Collections; | |
5 | 6 | import java.util.Map; |
6 | 7 | import java.util.Set; |
7 | 8 | import mtas.codec.util.CodecInfo; |
8 | 9 | import mtas.search.similarities.MtasSimScorer; |
9 | 10 | import mtas.search.spans.util.MtasSpanQuery; |
11 | +import mtas.search.spans.util.MtasExtendedSpanTermQuery.SpanTermWeight; | |
10 | 12 | |
11 | 13 | import org.apache.lucene.codecs.FieldsProducer; |
14 | +import org.apache.lucene.index.IndexReaderContext; | |
12 | 15 | import org.apache.lucene.index.LeafReader; |
13 | 16 | import org.apache.lucene.index.LeafReaderContext; |
14 | 17 | import org.apache.lucene.index.Term; |
15 | 18 | import org.apache.lucene.index.TermContext; |
16 | 19 | import org.apache.lucene.index.Terms; |
17 | 20 | import org.apache.lucene.search.IndexSearcher; |
21 | +import org.apache.lucene.search.similarities.Similarity; | |
18 | 22 | import org.apache.lucene.search.similarities.Similarity.SimScorer; |
19 | 23 | import org.apache.lucene.search.spans.SpanWeight; |
20 | 24 | import org.apache.lucene.search.spans.Spans; |
... | ... | @@ -56,14 +60,17 @@ public class MtasSpanMatchAllQuery extends MtasSpanQuery { |
56 | 60 | */ |
57 | 61 | @Override |
58 | 62 | public SpanWeight createWeight(IndexSearcher searcher, boolean needsScores) |
59 | - throws IOException { | |
60 | - return new SpanAllWeight(searcher, null); | |
63 | + throws IOException { | |
64 | + //keep things simple | |
65 | + return new SpanAllWeight(searcher, null); | |
61 | 66 | } |
62 | 67 | |
63 | 68 | /** |
64 | 69 | * The Class SpanAllWeight. |
65 | 70 | */ |
66 | 71 | public class SpanAllWeight extends SpanWeight { |
72 | + | |
73 | + IndexSearcher searcher; | |
67 | 74 | |
68 | 75 | /** |
69 | 76 | * Instantiates a new span all weight. |
... | ... | @@ -78,6 +85,7 @@ public class MtasSpanMatchAllQuery extends MtasSpanQuery { |
78 | 85 | public SpanAllWeight(IndexSearcher searcher, |
79 | 86 | Map<Term, TermContext> termContexts) throws IOException { |
80 | 87 | super(MtasSpanMatchAllQuery.this, searcher, termContexts); |
88 | + this.searcher = searcher; | |
81 | 89 | } |
82 | 90 | |
83 | 91 | /* |
... | ... | @@ -89,6 +97,15 @@ public class MtasSpanMatchAllQuery extends MtasSpanQuery { |
89 | 97 | */ |
90 | 98 | @Override |
91 | 99 | public void extractTermContexts(Map<Term, TermContext> contexts) { |
100 | + Term term = new Term(field); | |
101 | + if(!contexts.containsKey(term)) { | |
102 | + IndexReaderContext topContext = searcher.getTopReaderContext(); | |
103 | + try { | |
104 | + contexts.put(term, TermContext.build(topContext, term)); | |
105 | + } catch (IOException e) { | |
106 | + //fail | |
107 | + } | |
108 | + } | |
92 | 109 | } |
93 | 110 | |
94 | 111 | /* |
... | ... | @@ -199,6 +216,6 @@ public class MtasSpanMatchAllQuery extends MtasSpanQuery { |
199 | 216 | int h = this.getClass().getSimpleName().hashCode(); |
200 | 217 | h = (h * 7) ^ field.hashCode(); |
201 | 218 | return h; |
202 | - } | |
219 | + } | |
203 | 220 | |
204 | 221 | } |
... | ... |
src/mtas/search/spans/MtasSpanSequenceSpans.java
... | ... | @@ -19,7 +19,7 @@ import org.apache.lucene.search.spans.Spans; |
19 | 19 | * The Class MtasSpanSequenceSpans. |
20 | 20 | */ |
21 | 21 | public class MtasSpanSequenceSpans extends Spans implements MtasSpans { |
22 | - | |
22 | + | |
23 | 23 | /** The queue spans. */ |
24 | 24 | private List<QueueItem> queueSpans; |
25 | 25 | |
... | ... | @@ -123,6 +123,9 @@ public class MtasSpanSequenceSpans extends Spans implements MtasSpans { |
123 | 123 | */ |
124 | 124 | @Override |
125 | 125 | public void collect(SpanCollector collector) throws IOException { |
126 | + for(QueueItem item : queueSpans) { | |
127 | + item.sequenceSpans.spans.collect(collector); | |
128 | + } | |
126 | 129 | } |
127 | 130 | |
128 | 131 | /* |
... | ... |
src/mtas/search/spans/MtasSpanWithinQuery.java
... | ... | @@ -4,7 +4,6 @@ import java.io.IOException; |
4 | 4 | |
5 | 5 | import org.apache.lucene.index.IndexReader; |
6 | 6 | import org.apache.lucene.search.IndexSearcher; |
7 | -import org.apache.lucene.search.spans.SpanContainingQuery; | |
8 | 7 | import org.apache.lucene.search.spans.SpanQuery; |
9 | 8 | import org.apache.lucene.search.spans.SpanWeight; |
10 | 9 | import org.apache.lucene.search.spans.SpanWithinQuery; |
... | ... | @@ -35,7 +34,9 @@ public class MtasSpanWithinQuery extends MtasSpanQuery { |
35 | 34 | @Override |
36 | 35 | public SpanWeight createWeight(IndexSearcher searcher, boolean needsScores) |
37 | 36 | throws IOException { |
38 | - return baseQuery.createWeight(searcher, needsScores); | |
37 | + SpanWeight sw = baseQuery.createWeight(searcher, needsScores); | |
38 | + return sw; | |
39 | + //return baseQuery.createWeight(searcher, needsScores); | |
39 | 40 | } |
40 | 41 | |
41 | 42 | @Override |
... | ... |
src/mtas/solr/handler/component/util/MtasSolrComponentDocument.java
... | ... | @@ -2,7 +2,9 @@ package mtas.solr.handler.component.util; |
2 | 2 | |
3 | 3 | import java.io.IOException; |
4 | 4 | import java.util.ArrayList; |
5 | +import java.util.HashMap; | |
5 | 6 | import java.util.Set; |
7 | +import java.util.TreeSet; | |
6 | 8 | |
7 | 9 | import org.apache.solr.common.util.NamedList; |
8 | 10 | import org.apache.solr.common.util.SimpleOrderedMap; |
... | ... | @@ -45,6 +47,18 @@ public class MtasSolrComponentDocument { |
45 | 47 | |
46 | 48 | /** The Constant NAME_MTAS_DOCUMENT_REGEXP. */ |
47 | 49 | public static final String NAME_MTAS_DOCUMENT_LIST = "list"; |
50 | + | |
51 | + public static final String NAME_MTAS_DOCUMENT_LIST_REGEXP = "listRegexp"; | |
52 | + | |
53 | + public static final String NAME_MTAS_DOCUMENT_LIST_EXPAND = "listExpand"; | |
54 | + | |
55 | + public static final String NAME_MTAS_DOCUMENT_LIST_EXPAND_NUMBER = "listExpandNumber"; | |
56 | + | |
57 | + public static final String NAME_MTAS_DOCUMENT_IGNORE_REGEXP = "ignoreRegexp"; | |
58 | + | |
59 | + public static final String NAME_MTAS_DOCUMENT_IGNORE_LIST = "ignoreList"; | |
60 | + | |
61 | + public static final String NAME_MTAS_DOCUMENT_IGNORE_LIST_REGEXP = "ignoreListRegexp"; | |
48 | 62 | |
49 | 63 | /** The Constant NAME_MTAS_DOCUMENT_NUMBER. */ |
50 | 64 | public static final String NAME_MTAS_DOCUMENT_NUMBER = "number"; |
... | ... | @@ -77,7 +91,13 @@ public class MtasSolrComponentDocument { |
77 | 91 | String[] types = new String[ids.size()]; |
78 | 92 | String[] regexps = new String[ids.size()]; |
79 | 93 | String[] lists = new String[ids.size()]; |
80 | - String[] numbers = new String[ids.size()]; | |
94 | + Boolean[] listRegexps = new Boolean[ids.size()]; | |
95 | + Boolean[] listExpands = new Boolean[ids.size()]; | |
96 | + int[] listExpandNumbers = new int[ids.size()]; | |
97 | + String[] ignoreRegexps = new String[ids.size()]; | |
98 | + String[] ignoreLists = new String[ids.size()]; | |
99 | + Boolean[] ignoreListRegexps = new Boolean[ids.size()]; | |
100 | + String[] listNumbers = new String[ids.size()]; | |
81 | 101 | for (String id : ids) { |
82 | 102 | fields[tmpCounter] = rb.req.getParams().get( |
83 | 103 | PARAM_MTAS_DOCUMENT + "." + id + "." + NAME_MTAS_DOCUMENT_FIELD, |
... | ... | @@ -98,7 +118,25 @@ public class MtasSolrComponentDocument { |
98 | 118 | lists[tmpCounter] = rb.req.getParams().get( |
99 | 119 | PARAM_MTAS_DOCUMENT + "." + id + "." + NAME_MTAS_DOCUMENT_LIST, |
100 | 120 | null); |
101 | - numbers[tmpCounter] = rb.req.getParams().get( | |
121 | + listRegexps[tmpCounter] = rb.req.getParams().getBool( | |
122 | + PARAM_MTAS_DOCUMENT + "." + id + "." + NAME_MTAS_DOCUMENT_LIST_REGEXP, | |
123 | + false); | |
124 | + listExpands[tmpCounter] = rb.req.getParams().getBool( | |
125 | + PARAM_MTAS_DOCUMENT + "." + id + "." + NAME_MTAS_DOCUMENT_LIST_EXPAND, | |
126 | + false); | |
127 | + listExpandNumbers[tmpCounter] = rb.req.getParams().getInt( | |
128 | + PARAM_MTAS_DOCUMENT + "." + id + "." + NAME_MTAS_DOCUMENT_LIST_EXPAND_NUMBER, | |
129 | + 10); | |
130 | + ignoreRegexps[tmpCounter] = rb.req.getParams().get( | |
131 | + PARAM_MTAS_DOCUMENT + "." + id + "." + NAME_MTAS_DOCUMENT_IGNORE_REGEXP, | |
132 | + null); | |
133 | + ignoreLists[tmpCounter] = rb.req.getParams().get( | |
134 | + PARAM_MTAS_DOCUMENT + "." + id + "." + NAME_MTAS_DOCUMENT_IGNORE_LIST, | |
135 | + null); | |
136 | + ignoreListRegexps[tmpCounter] = rb.req.getParams().getBool( | |
137 | + PARAM_MTAS_DOCUMENT + "." + id + "." + NAME_MTAS_DOCUMENT_IGNORE_LIST_REGEXP, | |
138 | + false); | |
139 | + listNumbers[tmpCounter] = rb.req.getParams().get( | |
102 | 140 | PARAM_MTAS_DOCUMENT + "." + id + "." + NAME_MTAS_DOCUMENT_NUMBER, |
103 | 141 | null); |
104 | 142 | tmpCounter++; |
... | ... | @@ -123,7 +161,11 @@ public class MtasSolrComponentDocument { |
123 | 161 | NAME_MTAS_DOCUMENT_REGEXP, NAME_MTAS_DOCUMENT_FIELD, false); |
124 | 162 | MtasSolrResultUtil.compareAndCheck(lists, fields, |
125 | 163 | NAME_MTAS_DOCUMENT_LIST, NAME_MTAS_DOCUMENT_FIELD, false); |
126 | - MtasSolrResultUtil.compareAndCheck(numbers, fields, | |
164 | + MtasSolrResultUtil.compareAndCheck(ignoreRegexps, fields, | |
165 | + NAME_MTAS_DOCUMENT_IGNORE_REGEXP, NAME_MTAS_DOCUMENT_FIELD, false); | |
166 | + MtasSolrResultUtil.compareAndCheck(ignoreLists, fields, | |
167 | + NAME_MTAS_DOCUMENT_IGNORE_LIST, NAME_MTAS_DOCUMENT_FIELD, false); | |
168 | + MtasSolrResultUtil.compareAndCheck(listNumbers, fields, | |
127 | 169 | NAME_MTAS_DOCUMENT_NUMBER, NAME_MTAS_DOCUMENT_FIELD, false); |
128 | 170 | for (int i = 0; i < fields.length; i++) { |
129 | 171 | String key = (keys[i] == null) || (keys[i].isEmpty()) |
... | ... | @@ -133,6 +175,9 @@ public class MtasSolrComponentDocument { |
133 | 175 | String type = types[i]; |
134 | 176 | String regexp = regexps[i]; |
135 | 177 | String[] list = null; |
178 | + Boolean listRegexp = listRegexps[i]; | |
179 | + Boolean listExpand = listExpands[i]; | |
180 | + int listExpandNumber = listExpandNumbers[i]; | |
136 | 181 | if(lists[i]!=null) { |
137 | 182 | ArrayList<String> tmpList = new ArrayList<String>(); |
138 | 183 | String[] subList = lists[i].split("(?<!\\\\),"); |
... | ... | @@ -141,10 +186,21 @@ public class MtasSolrComponentDocument { |
141 | 186 | } |
142 | 187 | list = tmpList.toArray(new String[tmpList.size()]); |
143 | 188 | } |
144 | - int number = Math.max(0, (numbers[i] == null) || (numbers[i].isEmpty()) | |
145 | - ? 0 : Integer.parseInt(numbers[i])); | |
189 | + int listNumber = Math.max(0, (listNumbers[i] == null) || (listNumbers[i].isEmpty()) | |
190 | + ? 0 : Integer.parseInt(listNumbers[i])); | |
191 | + String ignoreRegexp = ignoreRegexps[i]; | |
192 | + String[] ignoreList = null; | |
193 | + Boolean ignoreListRegexp = ignoreListRegexps[i]; | |
194 | + if(ignoreLists[i]!=null) { | |
195 | + ArrayList<String> tmpList = new ArrayList<String>(); | |
196 | + String[] subList = ignoreLists[i].split("(?<!\\\\),"); | |
197 | + for(int j=0; j<subList.length; j++) { | |
198 | + tmpList.add(subList[j].replace("\\,", ",").replace("\\\\", "\\")); | |
199 | + } | |
200 | + ignoreList = tmpList.toArray(new String[tmpList.size()]); | |
201 | + } | |
146 | 202 | mtasFields.list.get(fields[i]).documentList |
147 | - .add(new ComponentDocument(key, prefix, type, regexp, list, number)); | |
203 | + .add(new ComponentDocument(key, prefix, type, regexp, list, listNumber, listRegexp, listExpand, listExpandNumber, ignoreRegexp, ignoreList, ignoreListRegexp)); | |
148 | 204 | } |
149 | 205 | } |
150 | 206 | } |
... | ... | @@ -165,6 +221,7 @@ public class MtasSolrComponentDocument { |
165 | 221 | NamedList<Object> mtasDocumentItemResponse = new SimpleOrderedMap<>(); |
166 | 222 | MtasDataCollector<?, ?> stats = document.statsData.get(docId); |
167 | 223 | MtasDataCollector<?, ?> list = null; |
224 | + HashMap<String, MtasDataCollector<?, ?>> expandedList = null; | |
168 | 225 | if (document.statsList != null) { |
169 | 226 | list = document.statsList.get(docId); |
170 | 227 | } |
... | ... | @@ -172,9 +229,15 @@ public class MtasSolrComponentDocument { |
172 | 229 | stats.getDataType(), stats.getStatsType(), stats.statsItems, null)); |
173 | 230 | mtasDocumentItemResponse.add("documentKey", |
174 | 231 | document.uniqueKey.get(docId)); |
175 | - if (list != null) { | |
176 | - mtasDocumentItemResponse.add("list", new MtasSolrResult(list, | |
177 | - list.getDataType(), list.getStatsType(), list.statsItems, null)); | |
232 | + if (list != null) { | |
233 | + if(document.listExpand) { | |
234 | + mtasDocumentItemResponse.add("list", new MtasSolrResult(list, | |
235 | + new String[] { list.getDataType(), list.getDataType()}, new String[] {list.getStatsType(), list.getStatsType()}, new TreeSet[] {list.statsItems, list.statsItems}, new String[] {null, null}, new String[] {null, null}, new Integer[] { 0 , 0}, new Integer[] { 1 , 1}, null)); | |
236 | + } else { | |
237 | + mtasDocumentItemResponse.add("list", new MtasSolrResult(list, | |
238 | + list.getDataType(), list.getStatsType(), list.statsItems, null)); | |
239 | + } | |
240 | + | |
178 | 241 | } |
179 | 242 | // add |
180 | 243 | mtasDocumentItemResponses.add(mtasDocumentItemResponse); |
... | ... |