Commit c3c710901d1acc3379e8456bca1ef23492a95cb4
1 parent
6ec38b89
add list to document
Showing
4 changed files
with
133 additions
and
81 deletions
docker/Dockerfile
| 1 | 1 | # Automatically generated Dockerfile |
| 2 | -# - Build 2017-01-10 08:13 | |
| 2 | +# - Build 2017-01-13 14:47 | |
| 3 | 3 | # - Lucene/Solr version 6.3.0 |
| 4 | 4 | # - Mtas release 20170110 |
| 5 | 5 | # |
| ... | ... | @@ -55,7 +55,7 @@ RUN apt-get update && apt-get install -y lsof software-properties-common python- |
| 55 | 55 | && chmod -R 755 /var/www/html \ |
| 56 | 56 | && printf "echo\n" >> /start.sh \ |
| 57 | 57 | && printf "echo \"================ Mtas -- Multi Tier Annotation Search =================\"\n" >> /start.sh \ |
| 58 | -&& printf "echo \" Timestamp 2017-01-10 08:13\"\n" >> /start.sh \ | |
| 58 | +&& printf "echo \" Timestamp 2017-01-13 14:47\"\n" >> /start.sh \ | |
| 59 | 59 | && printf "echo \" Lucene/Solr version 6.3.0\"\n" >> /start.sh \ |
| 60 | 60 | && printf "echo \" Mtas release 20170110\"\n" >> /start.sh \ |
| 61 | 61 | && printf "echo \" See https://meertensinstituut.github.io/mtas/ for more information\"\n" >> /start.sh \ |
| ... | ... |
src/mtas/codec/util/CodecCollector.java
| ... | ... | @@ -2028,13 +2028,13 @@ public class CodecCollector { |
| 2028 | 2028 | DataCollector.COLLECTOR_TYPE_DATA, document.dataType, |
| 2029 | 2029 | document.statsType, document.statsItems, null, null, null, null, |
| 2030 | 2030 | null, null); |
| 2031 | - document.stats.put(docId, stats); | |
| 2032 | - if (document.list != null) { | |
| 2031 | + document.statsData.put(docId, stats); | |
| 2032 | + if (document.statsList != null) { | |
| 2033 | 2033 | MtasDataCollector<?, ?> list = DataCollector.getCollector( |
| 2034 | 2034 | DataCollector.COLLECTOR_TYPE_LIST, CodecUtil.DATA_TYPE_LONG, |
| 2035 | 2035 | listStatsType, listStatsItems, CodecUtil.STATS_TYPE_SUM, |
| 2036 | 2036 | CodecUtil.SORT_DESC, 0, document.number, null, null); |
| 2037 | - document.list.put(docId, list); | |
| 2037 | + document.statsList.put(docId, list); | |
| 2038 | 2038 | } |
| 2039 | 2039 | } |
| 2040 | 2040 | } |
| ... | ... | @@ -2046,44 +2046,57 @@ public class CodecCollector { |
| 2046 | 2046 | PostingsEnum postingsEnum = null; |
| 2047 | 2047 | // loop over termvectors |
| 2048 | 2048 | for (ComponentDocument document : documentList) { |
| 2049 | - termsEnum = t.intersect(document.compiledAutomaton, null); | |
| 2050 | - // init | |
| 2051 | - int initSize = Math.min((int) t.size(), 1000); | |
| 2052 | - for (int docId : docList) { | |
| 2053 | - document.stats.get(docId).initNewList(1); | |
| 2054 | - if (document.list != null) { | |
| 2055 | - document.list.get(docId).initNewList(initSize); | |
| 2056 | - } | |
| 2049 | + | |
| 2050 | + List<CompiledAutomaton> listAutomata; | |
| 2051 | + if (document.list == null) { | |
| 2052 | + listAutomata = new ArrayList<CompiledAutomaton>(); | |
| 2053 | + listAutomata.add(document.compiledAutomaton); | |
| 2054 | + } else { | |
| 2055 | + listAutomata = MtasToken.createAutomata(document.prefix, | |
| 2056 | + document.regexp, new ArrayList<String>(document.list)); | |
| 2057 | 2057 | } |
| 2058 | - // fill | |
| 2059 | - while ((term = termsEnum.next()) != null) { | |
| 2060 | - Iterator<Integer> docIterator = docList.iterator(); | |
| 2061 | - postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.FREQS); | |
| 2062 | - int termDocId = -1; | |
| 2063 | - while (docIterator.hasNext()) { | |
| 2064 | - int segmentDocId = docIterator.next() - lrc.docBase; | |
| 2065 | - if (segmentDocId >= termDocId) { | |
| 2066 | - if ((segmentDocId == termDocId) || ((termDocId = postingsEnum | |
| 2067 | - .advance(segmentDocId)) == segmentDocId)) { | |
| 2068 | - // register stats | |
| 2069 | - document.stats.get(segmentDocId + lrc.docBase) | |
| 2070 | - .add(new long[] { postingsEnum.freq() }, 1); | |
| 2071 | - // register list | |
| 2072 | - if (document.list != null) { | |
| 2073 | - document.list.get(segmentDocId + lrc.docBase).add( | |
| 2074 | - MtasToken.getPostfixFromValue(term), | |
| 2075 | - new long[] { postingsEnum.freq() }, 1); | |
| 2058 | + | |
| 2059 | + for (CompiledAutomaton compiledAutomaton : listAutomata) { | |
| 2060 | + | |
| 2061 | + termsEnum = t.intersect(compiledAutomaton, null); | |
| 2062 | + // init | |
| 2063 | + int initSize = Math.min((int) t.size(), 1000); | |
| 2064 | + for (int docId : docList) { | |
| 2065 | + document.statsData.get(docId).initNewList(1); | |
| 2066 | + if (document.statsList != null) { | |
| 2067 | + document.statsList.get(docId).initNewList(initSize); | |
| 2068 | + } | |
| 2069 | + } | |
| 2070 | + // fill | |
| 2071 | + while ((term = termsEnum.next()) != null) { | |
| 2072 | + Iterator<Integer> docIterator = docList.iterator(); | |
| 2073 | + postingsEnum = termsEnum.postings(postingsEnum, | |
| 2074 | + PostingsEnum.FREQS); | |
| 2075 | + int termDocId = -1; | |
| 2076 | + while (docIterator.hasNext()) { | |
| 2077 | + int segmentDocId = docIterator.next() - lrc.docBase; | |
| 2078 | + if (segmentDocId >= termDocId) { | |
| 2079 | + if ((segmentDocId == termDocId) || ((termDocId = postingsEnum | |
| 2080 | + .advance(segmentDocId)) == segmentDocId)) { | |
| 2081 | + // register stats | |
| 2082 | + document.statsData.get(segmentDocId + lrc.docBase) | |
| 2083 | + .add(new long[] { postingsEnum.freq() }, 1); | |
| 2084 | + // register list | |
| 2085 | + if (document.statsList != null) { | |
| 2086 | + document.statsList.get(segmentDocId + lrc.docBase).add( | |
| 2087 | + MtasToken.getPostfixFromValue(term), | |
| 2088 | + new long[] { postingsEnum.freq() }, 1); | |
| 2089 | + } | |
| 2076 | 2090 | } |
| 2077 | 2091 | } |
| 2078 | 2092 | } |
| 2079 | 2093 | } |
| 2080 | - } | |
| 2081 | - | |
| 2082 | - // close | |
| 2083 | - for (int docId : docList) { | |
| 2084 | - document.stats.get(docId).closeNewList(); | |
| 2085 | - if (document.list != null) { | |
| 2086 | - document.list.get(docId).closeNewList(); | |
| 2094 | + // close | |
| 2095 | + for (int docId : docList) { | |
| 2096 | + document.statsData.get(docId).closeNewList(); | |
| 2097 | + if (document.statsList != null) { | |
| 2098 | + document.statsList.get(docId).closeNewList(); | |
| 2099 | + } | |
| 2087 | 2100 | } |
| 2088 | 2101 | } |
| 2089 | 2102 | } |
| ... | ... | @@ -2231,14 +2244,22 @@ public class CodecCollector { |
| 2231 | 2244 | /** |
| 2232 | 2245 | * Creates the facet base. |
| 2233 | 2246 | * |
| 2234 | - * @param cf the cf | |
| 2235 | - * @param level the level | |
| 2236 | - * @param dataCollector the data collector | |
| 2237 | - * @param positionsData the positions data | |
| 2238 | - * @param spansNumberData the spans number data | |
| 2239 | - * @param facetData the facet data | |
| 2240 | - * @param docSet the doc set | |
| 2241 | - * @throws IOException Signals that an I/O exception has occurred. | |
| 2247 | + * @param cf | |
| 2248 | + * the cf | |
| 2249 | + * @param level | |
| 2250 | + * the level | |
| 2251 | + * @param dataCollector | |
| 2252 | + * the data collector | |
| 2253 | + * @param positionsData | |
| 2254 | + * the positions data | |
| 2255 | + * @param spansNumberData | |
| 2256 | + * the spans number data | |
| 2257 | + * @param facetData | |
| 2258 | + * the facet data | |
| 2259 | + * @param docSet | |
| 2260 | + * the doc set | |
| 2261 | + * @throws IOException | |
| 2262 | + * Signals that an I/O exception has occurred. | |
| 2242 | 2263 | */ |
| 2243 | 2264 | private static void createFacetBase(ComponentFacet cf, int level, |
| 2244 | 2265 | MtasDataCollector<?, ?> dataCollector, |
| ... | ... | @@ -2287,22 +2308,24 @@ public class CodecCollector { |
| 2287 | 2308 | // only if documents and facets |
| 2288 | 2309 | if (docSet.length > 0 && list.size() > 0) { |
| 2289 | 2310 | HashMap<String, Integer[]> docLists = new HashMap<String, Integer[]>(); |
| 2290 | - HashMap<String, String> groupedKeys = new HashMap<String,String>(); | |
| 2311 | + HashMap<String, String> groupedKeys = new HashMap<String, String>(); | |
| 2291 | 2312 | boolean documentsInFacets = false; |
| 2292 | 2313 | // compute intersections |
| 2293 | 2314 | for (String key : list.keySet()) { |
| 2294 | - //fill grouped keys | |
| 2295 | - if(!groupedKeys.containsKey(key)) { | |
| 2296 | - groupedKeys.put(key, groupedKeyName(key, cf.baseRangeSizes[level], cf.baseRangeBases[level])); | |
| 2315 | + // fill grouped keys | |
| 2316 | + if (!groupedKeys.containsKey(key)) { | |
| 2317 | + groupedKeys.put(key, groupedKeyName(key, cf.baseRangeSizes[level], | |
| 2318 | + cf.baseRangeBases[level])); | |
| 2297 | 2319 | } |
| 2298 | 2320 | // intersect docSet with docList |
| 2299 | 2321 | Integer[] docList = intersectedDocList(list.get(key), docSet); |
| 2300 | 2322 | if (docList.length > 0) { |
| 2301 | 2323 | documentsInFacets = true; |
| 2302 | 2324 | } |
| 2303 | - //update docLists | |
| 2304 | - if(docLists.containsKey(groupedKeys.get(key))) { | |
| 2305 | - docLists.put(groupedKeys.get(key), mergeDocLists(docLists.get(groupedKeys.get(key)), docList)); | |
| 2325 | + // update docLists | |
| 2326 | + if (docLists.containsKey(groupedKeys.get(key))) { | |
| 2327 | + docLists.put(groupedKeys.get(key), | |
| 2328 | + mergeDocLists(docLists.get(groupedKeys.get(key)), docList)); | |
| 2306 | 2329 | } else { |
| 2307 | 2330 | docLists.put(groupedKeys.get(key), docList); |
| 2308 | 2331 | } |
| ... | ... | @@ -2515,39 +2538,42 @@ public class CodecCollector { |
| 2515 | 2538 | function.dataCollector.closeNewList(); |
| 2516 | 2539 | } |
| 2517 | 2540 | } |
| 2518 | - } | |
| 2541 | + } | |
| 2519 | 2542 | |
| 2520 | 2543 | } |
| 2521 | - | |
| 2522 | - private static String groupedKeyName(String key, Double baseRangeSize, Double baseRangeBase) { | |
| 2523 | - if(baseRangeSize==null || baseRangeSize<=0) { | |
| 2544 | + | |
| 2545 | + private static String groupedKeyName(String key, Double baseRangeSize, | |
| 2546 | + Double baseRangeBase) { | |
| 2547 | + if (baseRangeSize == null || baseRangeSize <= 0) { | |
| 2524 | 2548 | return key; |
| 2525 | - } else { | |
| 2549 | + } else { | |
| 2526 | 2550 | Double doubleKey, doubleBase, doubleNumber, doubleStart, doubleEnd; |
| 2527 | 2551 | try { |
| 2528 | - doubleKey = Double.parseDouble(key); | |
| 2529 | - doubleBase = baseRangeBase==null?0:baseRangeBase; | |
| 2530 | - doubleNumber = Math.floor((doubleKey - doubleBase) / baseRangeSize); | |
| 2552 | + doubleKey = Double.parseDouble(key); | |
| 2553 | + doubleBase = baseRangeBase == null ? 0 : baseRangeBase; | |
| 2554 | + doubleNumber = Math.floor((doubleKey - doubleBase) / baseRangeSize); | |
| 2531 | 2555 | doubleStart = doubleBase + doubleNumber * baseRangeSize; |
| 2532 | - doubleEnd = doubleStart+baseRangeSize; | |
| 2556 | + doubleEnd = doubleStart + baseRangeSize; | |
| 2533 | 2557 | } catch (NumberFormatException e) { |
| 2534 | 2558 | return key; |
| 2535 | 2559 | } |
| 2536 | - //integer | |
| 2537 | - if(Math.floor(baseRangeSize) == baseRangeSize && Math.floor(doubleBase)==doubleBase) { | |
| 2560 | + // integer | |
| 2561 | + if (Math.floor(baseRangeSize) == baseRangeSize | |
| 2562 | + && Math.floor(doubleBase) == doubleBase) { | |
| 2538 | 2563 | try { |
| 2539 | - if(baseRangeSize>1) { | |
| 2540 | - return String.format("%.0f", doubleStart)+"-"+String.format("%.0f", doubleEnd-1); | |
| 2564 | + if (baseRangeSize > 1) { | |
| 2565 | + return String.format("%.0f", doubleStart) + "-" | |
| 2566 | + + String.format("%.0f", doubleEnd - 1); | |
| 2541 | 2567 | } else { |
| 2542 | 2568 | return String.format("%.0f", doubleStart); |
| 2543 | 2569 | } |
| 2544 | 2570 | } catch (NumberFormatException e) { |
| 2545 | 2571 | return key; |
| 2546 | - } | |
| 2572 | + } | |
| 2547 | 2573 | } else { |
| 2548 | - return "["+doubleStart+","+doubleEnd+")"; | |
| 2549 | - } | |
| 2550 | - } | |
| 2574 | + return "[" + doubleStart + "," + doubleEnd + ")"; | |
| 2575 | + } | |
| 2576 | + } | |
| 2551 | 2577 | } |
| 2552 | 2578 | |
| 2553 | 2579 | private static Integer[] mergeDocLists(Integer[] a, Integer[] b) { |
| ... | ... |
src/mtas/codec/util/CodecComponent.java
| ... | ... | @@ -266,6 +266,9 @@ public class CodecComponent { |
| 266 | 266 | |
| 267 | 267 | /** The regexp. */ |
| 268 | 268 | public String key, prefix, regexp; |
| 269 | + | |
| 270 | + /** The list of strings passed to the constructor, or null if none were given. */ | |
| 271 | + public HashSet<String> list; | |
| 269 | 272 | |
| 270 | 273 | /** The stats type. */ |
| 271 | 274 | public String dataType, statsType; |
| ... | ... | @@ -283,10 +286,10 @@ public class CodecComponent { |
| 283 | 286 | public HashMap<Integer, String> uniqueKey; |
| 284 | 287 | |
| 285 | 288 | /** The stats. */ |
| 286 | - public HashMap<Integer, MtasDataCollector<?, ?>> stats; | |
| 289 | + public HashMap<Integer, MtasDataCollector<?, ?>> statsData; | |
| 287 | 290 | |
| 288 | 291 | /** The list. */ |
| 289 | - public HashMap<Integer, MtasDataCollector<?, ?>> list; | |
| 292 | + public HashMap<Integer, MtasDataCollector<?, ?>> statsList; | |
| 290 | 293 | |
| 291 | 294 | /** |
| 292 | 295 | * Instantiates a new component document. |
| ... | ... | @@ -299,10 +302,15 @@ public class CodecComponent { |
| 299 | 302 | * @throws IOException Signals that an I/O exception has occurred. |
| 300 | 303 | */ |
| 301 | 304 | public ComponentDocument(String key, String prefix, String statsType, |
| 302 | - String regexp, int number) throws IOException { | |
| 305 | + String regexp, String[] list, int number) throws IOException { | |
| 303 | 306 | this.key = key; |
| 304 | 307 | this.prefix = prefix; |
| 305 | 308 | this.regexp = regexp; |
| 309 | + if (list != null && list.length > 0) { | |
| 310 | + this.list = new HashSet(Arrays.asList(list)); | |
| 311 | + } else { | |
| 312 | + this.list = null; | |
| 313 | + } | |
| 306 | 314 | this.number = number; |
| 307 | 315 | uniqueKey = new HashMap<Integer, String>(); |
| 308 | 316 | dataType = CodecUtil.DATA_TYPE_LONG; |
| ... | ... | @@ -316,11 +324,11 @@ public class CodecComponent { |
| 316 | 324 | prefix + MtasToken.DELIMITER + regexp + "\u0000*"); |
| 317 | 325 | compiledAutomaton = new CompiledAutomaton(re.toAutomaton()); |
| 318 | 326 | } |
| 319 | - this.stats = new HashMap<Integer, MtasDataCollector<?, ?>>(); | |
| 327 | + this.statsData = new HashMap<Integer, MtasDataCollector<?, ?>>(); | |
| 320 | 328 | if (this.number > 0) { |
| 321 | - this.list = new HashMap<Integer, MtasDataCollector<?, ?>>(); | |
| 329 | + this.statsList = new HashMap<Integer, MtasDataCollector<?, ?>>(); | |
| 322 | 330 | } else { |
| 323 | - this.list = null; | |
| 331 | + this.statsList = null; | |
| 324 | 332 | } |
| 325 | 333 | } |
| 326 | 334 | } |
| ... | ... |
src/mtas/solr/handler/component/util/MtasSolrComponentDocument.java
| ... | ... | @@ -43,6 +43,9 @@ public class MtasSolrComponentDocument { |
| 43 | 43 | /** The Constant NAME_MTAS_DOCUMENT_REGEXP. */ |
| 44 | 44 | public static final String NAME_MTAS_DOCUMENT_REGEXP = "regexp"; |
| 45 | 45 | |
| 46 | + /** The Constant NAME_MTAS_DOCUMENT_LIST. */ | |
| 47 | + public static final String NAME_MTAS_DOCUMENT_LIST = "list"; | |
| 48 | + | |
| 46 | 49 | /** The Constant NAME_MTAS_DOCUMENT_NUMBER. */ |
| 47 | 50 | public static final String NAME_MTAS_DOCUMENT_NUMBER = "number"; |
| 48 | 51 | |
| ... | ... | @@ -73,6 +76,7 @@ public class MtasSolrComponentDocument { |
| 73 | 76 | String[] prefixes = new String[ids.size()]; |
| 74 | 77 | String[] types = new String[ids.size()]; |
| 75 | 78 | String[] regexps = new String[ids.size()]; |
| 79 | + String[] lists = new String[ids.size()]; | |
| 76 | 80 | String[] numbers = new String[ids.size()]; |
| 77 | 81 | for (String id : ids) { |
| 78 | 82 | fields[tmpCounter] = rb.req.getParams().get( |
| ... | ... | @@ -91,6 +95,9 @@ public class MtasSolrComponentDocument { |
| 91 | 95 | regexps[tmpCounter] = rb.req.getParams().get( |
| 92 | 96 | PARAM_MTAS_DOCUMENT + "." + id + "." + NAME_MTAS_DOCUMENT_REGEXP, |
| 93 | 97 | null); |
| 98 | + lists[tmpCounter] = rb.req.getParams().get( | |
| 99 | + PARAM_MTAS_DOCUMENT + "." + id + "." + NAME_MTAS_DOCUMENT_LIST, | |
| 100 | + null); | |
| 94 | 101 | numbers[tmpCounter] = rb.req.getParams().get( |
| 95 | 102 | PARAM_MTAS_DOCUMENT + "." + id + "." + NAME_MTAS_DOCUMENT_NUMBER, |
| 96 | 103 | null); |
| ... | ... | @@ -114,6 +121,8 @@ public class MtasSolrComponentDocument { |
| 114 | 121 | NAME_MTAS_DOCUMENT_FIELD, false); |
| 115 | 122 | MtasSolrResultUtil.compareAndCheck(regexps, fields, |
| 116 | 123 | NAME_MTAS_DOCUMENT_REGEXP, NAME_MTAS_DOCUMENT_FIELD, false); |
| 124 | + MtasSolrResultUtil.compareAndCheck(lists, fields, | |
| 125 | + NAME_MTAS_DOCUMENT_LIST, NAME_MTAS_DOCUMENT_FIELD, false); | |
| 117 | 126 | MtasSolrResultUtil.compareAndCheck(numbers, fields, |
| 118 | 127 | NAME_MTAS_DOCUMENT_NUMBER, NAME_MTAS_DOCUMENT_FIELD, false); |
| 119 | 128 | for (int i = 0; i < fields.length; i++) { |
| ... | ... | @@ -123,10 +132,19 @@ public class MtasSolrComponentDocument { |
| 123 | 132 | String prefix = prefixes[i]; |
| 124 | 133 | String type = types[i]; |
| 125 | 134 | String regexp = regexps[i]; |
| 135 | + String[] list = null; | |
| 136 | + if(lists[i]!=null) { | |
| 137 | + ArrayList<String> tmpList = new ArrayList<String>(); | |
| 138 | + String[] subList = lists[i].split("(?<!\\\\),"); | |
| 139 | + for(int j=0; j<subList.length; j++) { | |
| 140 | + tmpList.add(subList[j].replace("\\,", ",").replace("\\\\", "\\")); | |
| 141 | + } | |
| 142 | + list = tmpList.toArray(new String[tmpList.size()]); | |
| 143 | + } | |
| 126 | 144 | int number = Math.max(0, (numbers[i] == null) || (numbers[i].isEmpty()) |
| 127 | 145 | ? 0 : Integer.parseInt(numbers[i])); |
| 128 | 146 | mtasFields.list.get(fields[i]).documentList |
| 129 | - .add(new ComponentDocument(key, prefix, type, regexp, number)); | |
| 147 | + .add(new ComponentDocument(key, prefix, type, regexp, list, number)); | |
| 130 | 148 | } |
| 131 | 149 | } |
| 132 | 150 | } |
| ... | ... | @@ -143,12 +161,12 @@ public class MtasSolrComponentDocument { |
| 143 | 161 | SimpleOrderedMap<Object> mtasDocumentResponse = new SimpleOrderedMap<>(); |
| 144 | 162 | mtasDocumentResponse.add("key", document.key); |
| 145 | 163 | ArrayList<NamedList<Object>> mtasDocumentItemResponses = new ArrayList<NamedList<Object>>(); |
| 146 | - for (int docId : document.stats.keySet()) { | |
| 164 | + for (int docId : document.statsData.keySet()) { | |
| 147 | 165 | NamedList<Object> mtasDocumentItemResponse = new SimpleOrderedMap<>(); |
| 148 | - MtasDataCollector<?, ?> stats = document.stats.get(docId); | |
| 166 | + MtasDataCollector<?, ?> stats = document.statsData.get(docId); | |
| 149 | 167 | MtasDataCollector<?, ?> list = null; |
| 150 | - if (document.list != null) { | |
| 151 | - list = document.list.get(docId); | |
| 168 | + if (document.statsList != null) { | |
| 169 | + list = document.statsList.get(docId); | |
| 152 | 170 | } |
| 153 | 171 | mtasDocumentItemResponse.add("stats", new MtasSolrResult(stats, |
| 154 | 172 | stats.getDataType(), stats.getStatsType(), stats.statsItems, null)); |
| ... | ... |