Commit c3c710901d1acc3379e8456bca1ef23492a95cb4
1 parent
6ec38b89
add list to document
Showing 4 changed files with 133 additions and 81 deletions
docker/Dockerfile
1 | 1 | # Automatically generated Dockerfile |
2 | -# - Build 2017-01-10 08:13 | |
2 | +# - Build 2017-01-13 14:47 | |
3 | 3 | # - Lucene/Solr version 6.3.0 |
4 | 4 | # - Mtas release 20170110 |
5 | 5 | # |
... | ... | @@ -55,7 +55,7 @@ RUN apt-get update && apt-get install -y lsof software-properties-common python- |
55 | 55 | && chmod -R 755 /var/www/html \ |
56 | 56 | && printf "echo\n" >> /start.sh \ |
57 | 57 | && printf "echo \"================ Mtas -- Multi Tier Annotation Search =================\"\n" >> /start.sh \ |
58 | -&& printf "echo \" Timestamp 2017-01-10 08:13\"\n" >> /start.sh \ | |
58 | +&& printf "echo \" Timestamp 2017-01-13 14:47\"\n" >> /start.sh \ | |
59 | 59 | && printf "echo \" Lucene/Solr version 6.3.0\"\n" >> /start.sh \ |
60 | 60 | && printf "echo \" Mtas release 20170110\"\n" >> /start.sh \ |
61 | 61 | && printf "echo \" See https://meertensinstituut.github.io/mtas/ for more information\"\n" >> /start.sh \ |
... | ... |
src/mtas/codec/util/CodecCollector.java
... | ... | @@ -2028,13 +2028,13 @@ public class CodecCollector { |
2028 | 2028 | DataCollector.COLLECTOR_TYPE_DATA, document.dataType, |
2029 | 2029 | document.statsType, document.statsItems, null, null, null, null, |
2030 | 2030 | null, null); |
2031 | - document.stats.put(docId, stats); | |
2032 | - if (document.list != null) { | |
2031 | + document.statsData.put(docId, stats); | |
2032 | + if (document.statsList != null) { | |
2033 | 2033 | MtasDataCollector<?, ?> list = DataCollector.getCollector( |
2034 | 2034 | DataCollector.COLLECTOR_TYPE_LIST, CodecUtil.DATA_TYPE_LONG, |
2035 | 2035 | listStatsType, listStatsItems, CodecUtil.STATS_TYPE_SUM, |
2036 | 2036 | CodecUtil.SORT_DESC, 0, document.number, null, null); |
2037 | - document.list.put(docId, list); | |
2037 | + document.statsList.put(docId, list); | |
2038 | 2038 | } |
2039 | 2039 | } |
2040 | 2040 | } |
... | ... | @@ -2046,44 +2046,57 @@ public class CodecCollector { |
2046 | 2046 | PostingsEnum postingsEnum = null; |
2047 | 2047 | // loop over termvectors |
2048 | 2048 | for (ComponentDocument document : documentList) { |
2049 | - termsEnum = t.intersect(document.compiledAutomaton, null); | |
2050 | - // init | |
2051 | - int initSize = Math.min((int) t.size(), 1000); | |
2052 | - for (int docId : docList) { | |
2053 | - document.stats.get(docId).initNewList(1); | |
2054 | - if (document.list != null) { | |
2055 | - document.list.get(docId).initNewList(initSize); | |
2056 | - } | |
2049 | + | |
2050 | + List<CompiledAutomaton> listAutomata; | |
2051 | + if (document.list == null) { | |
2052 | + listAutomata = new ArrayList<CompiledAutomaton>(); | |
2053 | + listAutomata.add(document.compiledAutomaton); | |
2054 | + } else { | |
2055 | + listAutomata = MtasToken.createAutomata(document.prefix, | |
2056 | + document.regexp, new ArrayList<String>(document.list)); | |
2057 | 2057 | } |
2058 | - // fill | |
2059 | - while ((term = termsEnum.next()) != null) { | |
2060 | - Iterator<Integer> docIterator = docList.iterator(); | |
2061 | - postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.FREQS); | |
2062 | - int termDocId = -1; | |
2063 | - while (docIterator.hasNext()) { | |
2064 | - int segmentDocId = docIterator.next() - lrc.docBase; | |
2065 | - if (segmentDocId >= termDocId) { | |
2066 | - if ((segmentDocId == termDocId) || ((termDocId = postingsEnum | |
2067 | - .advance(segmentDocId)) == segmentDocId)) { | |
2068 | - // register stats | |
2069 | - document.stats.get(segmentDocId + lrc.docBase) | |
2070 | - .add(new long[] { postingsEnum.freq() }, 1); | |
2071 | - // register list | |
2072 | - if (document.list != null) { | |
2073 | - document.list.get(segmentDocId + lrc.docBase).add( | |
2074 | - MtasToken.getPostfixFromValue(term), | |
2075 | - new long[] { postingsEnum.freq() }, 1); | |
2058 | + | |
2059 | + for (CompiledAutomaton compiledAutomaton : listAutomata) { | |
2060 | + | |
2061 | + termsEnum = t.intersect(compiledAutomaton, null); | |
2062 | + // init | |
2063 | + int initSize = Math.min((int) t.size(), 1000); | |
2064 | + for (int docId : docList) { | |
2065 | + document.statsData.get(docId).initNewList(1); | |
2066 | + if (document.statsList != null) { | |
2067 | + document.statsList.get(docId).initNewList(initSize); | |
2068 | + } | |
2069 | + } | |
2070 | + // fill | |
2071 | + while ((term = termsEnum.next()) != null) { | |
2072 | + Iterator<Integer> docIterator = docList.iterator(); | |
2073 | + postingsEnum = termsEnum.postings(postingsEnum, | |
2074 | + PostingsEnum.FREQS); | |
2075 | + int termDocId = -1; | |
2076 | + while (docIterator.hasNext()) { | |
2077 | + int segmentDocId = docIterator.next() - lrc.docBase; | |
2078 | + if (segmentDocId >= termDocId) { | |
2079 | + if ((segmentDocId == termDocId) || ((termDocId = postingsEnum | |
2080 | + .advance(segmentDocId)) == segmentDocId)) { | |
2081 | + // register stats | |
2082 | + document.statsData.get(segmentDocId + lrc.docBase) | |
2083 | + .add(new long[] { postingsEnum.freq() }, 1); | |
2084 | + // register list | |
2085 | + if (document.statsList != null) { | |
2086 | + document.statsList.get(segmentDocId + lrc.docBase).add( | |
2087 | + MtasToken.getPostfixFromValue(term), | |
2088 | + new long[] { postingsEnum.freq() }, 1); | |
2089 | + } | |
2076 | 2090 | } |
2077 | 2091 | } |
2078 | 2092 | } |
2079 | 2093 | } |
2080 | - } | |
2081 | - | |
2082 | - // close | |
2083 | - for (int docId : docList) { | |
2084 | - document.stats.get(docId).closeNewList(); | |
2085 | - if (document.list != null) { | |
2086 | - document.list.get(docId).closeNewList(); | |
2094 | + // close | |
2095 | + for (int docId : docList) { | |
2096 | + document.statsData.get(docId).closeNewList(); | |
2097 | + if (document.statsList != null) { | |
2098 | + document.statsList.get(docId).closeNewList(); | |
2099 | + } | |
2087 | 2100 | } |
2088 | 2101 | } |
2089 | 2102 | } |
... | ... | @@ -2231,14 +2244,22 @@ public class CodecCollector { |
2231 | 2244 | /** |
2232 | 2245 | * Creates the facet base. |
2233 | 2246 | * |
2234 | - * @param cf the cf | |
2235 | - * @param level the level | |
2236 | - * @param dataCollector the data collector | |
2237 | - * @param positionsData the positions data | |
2238 | - * @param spansNumberData the spans number data | |
2239 | - * @param facetData the facet data | |
2240 | - * @param docSet the doc set | |
2241 | - * @throws IOException Signals that an I/O exception has occurred. | |
2247 | + * @param cf | |
2248 | + * the cf | |
2249 | + * @param level | |
2250 | + * the level | |
2251 | + * @param dataCollector | |
2252 | + * the data collector | |
2253 | + * @param positionsData | |
2254 | + * the positions data | |
2255 | + * @param spansNumberData | |
2256 | + * the spans number data | |
2257 | + * @param facetData | |
2258 | + * the facet data | |
2259 | + * @param docSet | |
2260 | + * the doc set | |
2261 | + * @throws IOException | |
2262 | + * Signals that an I/O exception has occurred. | |
2242 | 2263 | */ |
2243 | 2264 | private static void createFacetBase(ComponentFacet cf, int level, |
2244 | 2265 | MtasDataCollector<?, ?> dataCollector, |
... | ... | @@ -2287,22 +2308,24 @@ public class CodecCollector { |
2287 | 2308 | // only if documents and facets |
2288 | 2309 | if (docSet.length > 0 && list.size() > 0) { |
2289 | 2310 | HashMap<String, Integer[]> docLists = new HashMap<String, Integer[]>(); |
2290 | - HashMap<String, String> groupedKeys = new HashMap<String,String>(); | |
2311 | + HashMap<String, String> groupedKeys = new HashMap<String, String>(); | |
2291 | 2312 | boolean documentsInFacets = false; |
2292 | 2313 | // compute intersections |
2293 | 2314 | for (String key : list.keySet()) { |
2294 | - //fill grouped keys | |
2295 | - if(!groupedKeys.containsKey(key)) { | |
2296 | - groupedKeys.put(key, groupedKeyName(key, cf.baseRangeSizes[level], cf.baseRangeBases[level])); | |
2315 | + // fill grouped keys | |
2316 | + if (!groupedKeys.containsKey(key)) { | |
2317 | + groupedKeys.put(key, groupedKeyName(key, cf.baseRangeSizes[level], | |
2318 | + cf.baseRangeBases[level])); | |
2297 | 2319 | } |
2298 | 2320 | // intersect docSet with docList |
2299 | 2321 | Integer[] docList = intersectedDocList(list.get(key), docSet); |
2300 | 2322 | if (docList.length > 0) { |
2301 | 2323 | documentsInFacets = true; |
2302 | 2324 | } |
2303 | - //update docLists | |
2304 | - if(docLists.containsKey(groupedKeys.get(key))) { | |
2305 | - docLists.put(groupedKeys.get(key), mergeDocLists(docLists.get(groupedKeys.get(key)), docList)); | |
2325 | + // update docLists | |
2326 | + if (docLists.containsKey(groupedKeys.get(key))) { | |
2327 | + docLists.put(groupedKeys.get(key), | |
2328 | + mergeDocLists(docLists.get(groupedKeys.get(key)), docList)); | |
2306 | 2329 | } else { |
2307 | 2330 | docLists.put(groupedKeys.get(key), docList); |
2308 | 2331 | } |
... | ... | @@ -2515,39 +2538,42 @@ public class CodecCollector { |
2515 | 2538 | function.dataCollector.closeNewList(); |
2516 | 2539 | } |
2517 | 2540 | } |
2518 | - } | |
2541 | + } | |
2519 | 2542 | |
2520 | 2543 | } |
2521 | - | |
2522 | - private static String groupedKeyName(String key, Double baseRangeSize, Double baseRangeBase) { | |
2523 | - if(baseRangeSize==null || baseRangeSize<=0) { | |
2544 | + | |
2545 | + private static String groupedKeyName(String key, Double baseRangeSize, | |
2546 | + Double baseRangeBase) { | |
2547 | + if (baseRangeSize == null || baseRangeSize <= 0) { | |
2524 | 2548 | return key; |
2525 | - } else { | |
2549 | + } else { | |
2526 | 2550 | Double doubleKey, doubleBase, doubleNumber, doubleStart, doubleEnd; |
2527 | 2551 | try { |
2528 | - doubleKey = Double.parseDouble(key); | |
2529 | - doubleBase = baseRangeBase==null?0:baseRangeBase; | |
2530 | - doubleNumber = Math.floor((doubleKey - doubleBase) / baseRangeSize); | |
2552 | + doubleKey = Double.parseDouble(key); | |
2553 | + doubleBase = baseRangeBase == null ? 0 : baseRangeBase; | |
2554 | + doubleNumber = Math.floor((doubleKey - doubleBase) / baseRangeSize); | |
2531 | 2555 | doubleStart = doubleBase + doubleNumber * baseRangeSize; |
2532 | - doubleEnd = doubleStart+baseRangeSize; | |
2556 | + doubleEnd = doubleStart + baseRangeSize; | |
2533 | 2557 | } catch (NumberFormatException e) { |
2534 | 2558 | return key; |
2535 | 2559 | } |
2536 | - //integer | |
2537 | - if(Math.floor(baseRangeSize) == baseRangeSize && Math.floor(doubleBase)==doubleBase) { | |
2560 | + // integer | |
2561 | + if (Math.floor(baseRangeSize) == baseRangeSize | |
2562 | + && Math.floor(doubleBase) == doubleBase) { | |
2538 | 2563 | try { |
2539 | - if(baseRangeSize>1) { | |
2540 | - return String.format("%.0f", doubleStart)+"-"+String.format("%.0f", doubleEnd-1); | |
2564 | + if (baseRangeSize > 1) { | |
2565 | + return String.format("%.0f", doubleStart) + "-" | |
2566 | + + String.format("%.0f", doubleEnd - 1); | |
2541 | 2567 | } else { |
2542 | 2568 | return String.format("%.0f", doubleStart); |
2543 | 2569 | } |
2544 | 2570 | } catch (NumberFormatException e) { |
2545 | 2571 | return key; |
2546 | - } | |
2572 | + } | |
2547 | 2573 | } else { |
2548 | - return "["+doubleStart+","+doubleEnd+")"; | |
2549 | - } | |
2550 | - } | |
2574 | + return "[" + doubleStart + "," + doubleEnd + ")"; | |
2575 | + } | |
2576 | + } | |
2551 | 2577 | } |
2552 | 2578 | |
2553 | 2579 | private static Integer[] mergeDocLists(Integer[] a, Integer[] b) { |
... | ... |
src/mtas/codec/util/CodecComponent.java
... | ... | @@ -266,6 +266,9 @@ public class CodecComponent { |
266 | 266 | |
267 | 267 | /** The regexp. */ |
268 | 268 | public String key, prefix, regexp; |
269 | + | |
270 | + /** The list. */ | |
271 | + public HashSet<String> list; | |
269 | 272 | |
270 | 273 | /** The stats type. */ |
271 | 274 | public String dataType, statsType; |
... | ... | @@ -283,10 +286,10 @@ public class CodecComponent { |
283 | 286 | public HashMap<Integer, String> uniqueKey; |
284 | 287 | |
285 | 288 | /** The stats. */ |
286 | - public HashMap<Integer, MtasDataCollector<?, ?>> stats; | |
289 | + public HashMap<Integer, MtasDataCollector<?, ?>> statsData; | |
287 | 290 | |
288 | 291 | /** The list. */ |
289 | - public HashMap<Integer, MtasDataCollector<?, ?>> list; | |
292 | + public HashMap<Integer, MtasDataCollector<?, ?>> statsList; | |
290 | 293 | |
291 | 294 | /** |
292 | 295 | * Instantiates a new component document. |
... | ... | @@ -299,10 +302,15 @@ public class CodecComponent { |
299 | 302 | * @throws IOException Signals that an I/O exception has occurred. |
300 | 303 | */ |
301 | 304 | public ComponentDocument(String key, String prefix, String statsType, |
302 | - String regexp, int number) throws IOException { | |
305 | + String regexp, String[] list, int number) throws IOException { | |
303 | 306 | this.key = key; |
304 | 307 | this.prefix = prefix; |
305 | 308 | this.regexp = regexp; |
309 | + if (list != null && list.length > 0) { | |
310 | + this.list = new HashSet(Arrays.asList(list)); | |
311 | + } else { | |
312 | + this.list = null; | |
313 | + } | |
306 | 314 | this.number = number; |
307 | 315 | uniqueKey = new HashMap<Integer, String>(); |
308 | 316 | dataType = CodecUtil.DATA_TYPE_LONG; |
... | ... | @@ -316,11 +324,11 @@ public class CodecComponent { |
316 | 324 | prefix + MtasToken.DELIMITER + regexp + "\u0000*"); |
317 | 325 | compiledAutomaton = new CompiledAutomaton(re.toAutomaton()); |
318 | 326 | } |
319 | - this.stats = new HashMap<Integer, MtasDataCollector<?, ?>>(); | |
327 | + this.statsData = new HashMap<Integer, MtasDataCollector<?, ?>>(); | |
320 | 328 | if (this.number > 0) { |
321 | - this.list = new HashMap<Integer, MtasDataCollector<?, ?>>(); | |
329 | + this.statsList = new HashMap<Integer, MtasDataCollector<?, ?>>(); | |
322 | 330 | } else { |
323 | - this.list = null; | |
331 | + this.statsList = null; | |
324 | 332 | } |
325 | 333 | } |
326 | 334 | } |
... | ... |
src/mtas/solr/handler/component/util/MtasSolrComponentDocument.java
... | ... | @@ -43,6 +43,9 @@ public class MtasSolrComponentDocument { |
43 | 43 | /** The Constant NAME_MTAS_DOCUMENT_REGEXP. */ |
44 | 44 | public static final String NAME_MTAS_DOCUMENT_REGEXP = "regexp"; |
45 | 45 | |
46 | + /** The Constant NAME_MTAS_DOCUMENT_LIST. */ | |
47 | + public static final String NAME_MTAS_DOCUMENT_LIST = "list"; | |
48 | + | |
46 | 49 | /** The Constant NAME_MTAS_DOCUMENT_NUMBER. */ |
47 | 50 | public static final String NAME_MTAS_DOCUMENT_NUMBER = "number"; |
48 | 51 | |
... | ... | @@ -73,6 +76,7 @@ public class MtasSolrComponentDocument { |
73 | 76 | String[] prefixes = new String[ids.size()]; |
74 | 77 | String[] types = new String[ids.size()]; |
75 | 78 | String[] regexps = new String[ids.size()]; |
79 | + String[] lists = new String[ids.size()]; | |
76 | 80 | String[] numbers = new String[ids.size()]; |
77 | 81 | for (String id : ids) { |
78 | 82 | fields[tmpCounter] = rb.req.getParams().get( |
... | ... | @@ -91,6 +95,9 @@ public class MtasSolrComponentDocument { |
91 | 95 | regexps[tmpCounter] = rb.req.getParams().get( |
92 | 96 | PARAM_MTAS_DOCUMENT + "." + id + "." + NAME_MTAS_DOCUMENT_REGEXP, |
93 | 97 | null); |
98 | + lists[tmpCounter] = rb.req.getParams().get( | |
99 | + PARAM_MTAS_DOCUMENT + "." + id + "." + NAME_MTAS_DOCUMENT_LIST, | |
100 | + null); | |
94 | 101 | numbers[tmpCounter] = rb.req.getParams().get( |
95 | 102 | PARAM_MTAS_DOCUMENT + "." + id + "." + NAME_MTAS_DOCUMENT_NUMBER, |
96 | 103 | null); |
... | ... | @@ -114,6 +121,8 @@ public class MtasSolrComponentDocument { |
114 | 121 | NAME_MTAS_DOCUMENT_FIELD, false); |
115 | 122 | MtasSolrResultUtil.compareAndCheck(regexps, fields, |
116 | 123 | NAME_MTAS_DOCUMENT_REGEXP, NAME_MTAS_DOCUMENT_FIELD, false); |
124 | + MtasSolrResultUtil.compareAndCheck(lists, fields, | |
125 | + NAME_MTAS_DOCUMENT_LIST, NAME_MTAS_DOCUMENT_FIELD, false); | |
117 | 126 | MtasSolrResultUtil.compareAndCheck(numbers, fields, |
118 | 127 | NAME_MTAS_DOCUMENT_NUMBER, NAME_MTAS_DOCUMENT_FIELD, false); |
119 | 128 | for (int i = 0; i < fields.length; i++) { |
... | ... | @@ -123,10 +132,19 @@ public class MtasSolrComponentDocument { |
123 | 132 | String prefix = prefixes[i]; |
124 | 133 | String type = types[i]; |
125 | 134 | String regexp = regexps[i]; |
135 | + String[] list = null; | |
136 | + if(lists[i]!=null) { | |
137 | + ArrayList<String> tmpList = new ArrayList<String>(); | |
138 | + String[] subList = lists[i].split("(?<!\\\\),"); | |
139 | + for(int j=0; j<subList.length; j++) { | |
140 | + tmpList.add(subList[j].replace("\\,", ",").replace("\\\\", "\\")); | |
141 | + } | |
142 | + list = tmpList.toArray(new String[tmpList.size()]); | |
143 | + } | |
126 | 144 | int number = Math.max(0, (numbers[i] == null) || (numbers[i].isEmpty()) |
127 | 145 | ? 0 : Integer.parseInt(numbers[i])); |
128 | 146 | mtasFields.list.get(fields[i]).documentList |
129 | - .add(new ComponentDocument(key, prefix, type, regexp, number)); | |
147 | + .add(new ComponentDocument(key, prefix, type, regexp, list, number)); | |
130 | 148 | } |
131 | 149 | } |
132 | 150 | } |
... | ... | @@ -143,12 +161,12 @@ public class MtasSolrComponentDocument { |
143 | 161 | SimpleOrderedMap<Object> mtasDocumentResponse = new SimpleOrderedMap<>(); |
144 | 162 | mtasDocumentResponse.add("key", document.key); |
145 | 163 | ArrayList<NamedList<Object>> mtasDocumentItemResponses = new ArrayList<NamedList<Object>>(); |
146 | - for (int docId : document.stats.keySet()) { | |
164 | + for (int docId : document.statsData.keySet()) { | |
147 | 165 | NamedList<Object> mtasDocumentItemResponse = new SimpleOrderedMap<>(); |
148 | - MtasDataCollector<?, ?> stats = document.stats.get(docId); | |
166 | + MtasDataCollector<?, ?> stats = document.statsData.get(docId); | |
149 | 167 | MtasDataCollector<?, ?> list = null; |
150 | - if (document.list != null) { | |
151 | - list = document.list.get(docId); | |
168 | + if (document.statsList != null) { | |
169 | + list = document.statsList.get(docId); | |
152 | 170 | } |
153 | 171 | mtasDocumentItemResponse.add("stats", new MtasSolrResult(stats, |
154 | 172 | stats.getDataType(), stats.getStatsType(), stats.statsItems, null)); |
... | ... |