Commit c3c710901d1acc3379e8456bca1ef23492a95cb4

Authored by Matthijs Brouwer
1 parent 6ec38b89

Add list parameter to document component; rename ComponentDocument stats/list collectors to statsData/statsList

docker/Dockerfile
1 1 # Automatically generated Dockerfile
2   -# - Build 2017-01-10 08:13
  2 +# - Build 2017-01-13 14:47
3 3 # - Lucene/Solr version 6.3.0
4 4 # - Mtas release 20170110
5 5 #
... ... @@ -55,7 +55,7 @@ RUN apt-get update && apt-get install -y lsof software-properties-common python-
55 55 && chmod -R 755 /var/www/html \
56 56 && printf "echo\n" >> /start.sh \
57 57 && printf "echo \"================ Mtas -- Multi Tier Annotation Search =================\"\n" >> /start.sh \
58   -&& printf "echo \" Timestamp 2017-01-10 08:13\"\n" >> /start.sh \
  58 +&& printf "echo \" Timestamp 2017-01-13 14:47\"\n" >> /start.sh \
59 59 && printf "echo \" Lucene/Solr version 6.3.0\"\n" >> /start.sh \
60 60 && printf "echo \" Mtas release 20170110\"\n" >> /start.sh \
61 61 && printf "echo \" See https://meertensinstituut.github.io/mtas/ for more information\"\n" >> /start.sh \
... ...
src/mtas/codec/util/CodecCollector.java
... ... @@ -2028,13 +2028,13 @@ public class CodecCollector {
2028 2028 DataCollector.COLLECTOR_TYPE_DATA, document.dataType,
2029 2029 document.statsType, document.statsItems, null, null, null, null,
2030 2030 null, null);
2031   - document.stats.put(docId, stats);
2032   - if (document.list != null) {
  2031 + document.statsData.put(docId, stats);
  2032 + if (document.statsList != null) {
2033 2033 MtasDataCollector<?, ?> list = DataCollector.getCollector(
2034 2034 DataCollector.COLLECTOR_TYPE_LIST, CodecUtil.DATA_TYPE_LONG,
2035 2035 listStatsType, listStatsItems, CodecUtil.STATS_TYPE_SUM,
2036 2036 CodecUtil.SORT_DESC, 0, document.number, null, null);
2037   - document.list.put(docId, list);
  2037 + document.statsList.put(docId, list);
2038 2038 }
2039 2039 }
2040 2040 }
... ... @@ -2046,44 +2046,57 @@ public class CodecCollector {
2046 2046 PostingsEnum postingsEnum = null;
2047 2047 // loop over termvectors
2048 2048 for (ComponentDocument document : documentList) {
2049   - termsEnum = t.intersect(document.compiledAutomaton, null);
2050   - // init
2051   - int initSize = Math.min((int) t.size(), 1000);
2052   - for (int docId : docList) {
2053   - document.stats.get(docId).initNewList(1);
2054   - if (document.list != null) {
2055   - document.list.get(docId).initNewList(initSize);
2056   - }
  2049 +
  2050 + List<CompiledAutomaton> listAutomata;
  2051 + if (document.list == null) {
  2052 + listAutomata = new ArrayList<CompiledAutomaton>();
  2053 + listAutomata.add(document.compiledAutomaton);
  2054 + } else {
  2055 + listAutomata = MtasToken.createAutomata(document.prefix,
  2056 + document.regexp, new ArrayList<String>(document.list));
2057 2057 }
2058   - // fill
2059   - while ((term = termsEnum.next()) != null) {
2060   - Iterator<Integer> docIterator = docList.iterator();
2061   - postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.FREQS);
2062   - int termDocId = -1;
2063   - while (docIterator.hasNext()) {
2064   - int segmentDocId = docIterator.next() - lrc.docBase;
2065   - if (segmentDocId >= termDocId) {
2066   - if ((segmentDocId == termDocId) || ((termDocId = postingsEnum
2067   - .advance(segmentDocId)) == segmentDocId)) {
2068   - // register stats
2069   - document.stats.get(segmentDocId + lrc.docBase)
2070   - .add(new long[] { postingsEnum.freq() }, 1);
2071   - // register list
2072   - if (document.list != null) {
2073   - document.list.get(segmentDocId + lrc.docBase).add(
2074   - MtasToken.getPostfixFromValue(term),
2075   - new long[] { postingsEnum.freq() }, 1);
  2058 +
  2059 + for (CompiledAutomaton compiledAutomaton : listAutomata) {
  2060 +
  2061 + termsEnum = t.intersect(compiledAutomaton, null);
  2062 + // init
  2063 + int initSize = Math.min((int) t.size(), 1000);
  2064 + for (int docId : docList) {
  2065 + document.statsData.get(docId).initNewList(1);
  2066 + if (document.statsList != null) {
  2067 + document.statsList.get(docId).initNewList(initSize);
  2068 + }
  2069 + }
  2070 + // fill
  2071 + while ((term = termsEnum.next()) != null) {
  2072 + Iterator<Integer> docIterator = docList.iterator();
  2073 + postingsEnum = termsEnum.postings(postingsEnum,
  2074 + PostingsEnum.FREQS);
  2075 + int termDocId = -1;
  2076 + while (docIterator.hasNext()) {
  2077 + int segmentDocId = docIterator.next() - lrc.docBase;
  2078 + if (segmentDocId >= termDocId) {
  2079 + if ((segmentDocId == termDocId) || ((termDocId = postingsEnum
  2080 + .advance(segmentDocId)) == segmentDocId)) {
  2081 + // register stats
  2082 + document.statsData.get(segmentDocId + lrc.docBase)
  2083 + .add(new long[] { postingsEnum.freq() }, 1);
  2084 + // register list
  2085 + if (document.statsList != null) {
  2086 + document.statsList.get(segmentDocId + lrc.docBase).add(
  2087 + MtasToken.getPostfixFromValue(term),
  2088 + new long[] { postingsEnum.freq() }, 1);
  2089 + }
2076 2090 }
2077 2091 }
2078 2092 }
2079 2093 }
2080   - }
2081   -
2082   - // close
2083   - for (int docId : docList) {
2084   - document.stats.get(docId).closeNewList();
2085   - if (document.list != null) {
2086   - document.list.get(docId).closeNewList();
  2094 + // close
  2095 + for (int docId : docList) {
  2096 + document.statsData.get(docId).closeNewList();
  2097 + if (document.statsList != null) {
  2098 + document.statsList.get(docId).closeNewList();
  2099 + }
2087 2100 }
2088 2101 }
2089 2102 }
... ... @@ -2231,14 +2244,22 @@ public class CodecCollector {
2231 2244 /**
2232 2245 * Creates the facet base.
2233 2246 *
2234   - * @param cf the cf
2235   - * @param level the level
2236   - * @param dataCollector the data collector
2237   - * @param positionsData the positions data
2238   - * @param spansNumberData the spans number data
2239   - * @param facetData the facet data
2240   - * @param docSet the doc set
2241   - * @throws IOException Signals that an I/O exception has occurred.
  2247 + * @param cf
  2248 + * the cf
  2249 + * @param level
  2250 + * the level
  2251 + * @param dataCollector
  2252 + * the data collector
  2253 + * @param positionsData
  2254 + * the positions data
  2255 + * @param spansNumberData
  2256 + * the spans number data
  2257 + * @param facetData
  2258 + * the facet data
  2259 + * @param docSet
  2260 + * the doc set
  2261 + * @throws IOException
  2262 + * Signals that an I/O exception has occurred.
2242 2263 */
2243 2264 private static void createFacetBase(ComponentFacet cf, int level,
2244 2265 MtasDataCollector<?, ?> dataCollector,
... ... @@ -2287,22 +2308,24 @@ public class CodecCollector {
2287 2308 // only if documents and facets
2288 2309 if (docSet.length > 0 && list.size() > 0) {
2289 2310 HashMap<String, Integer[]> docLists = new HashMap<String, Integer[]>();
2290   - HashMap<String, String> groupedKeys = new HashMap<String,String>();
  2311 + HashMap<String, String> groupedKeys = new HashMap<String, String>();
2291 2312 boolean documentsInFacets = false;
2292 2313 // compute intersections
2293 2314 for (String key : list.keySet()) {
2294   - //fill grouped keys
2295   - if(!groupedKeys.containsKey(key)) {
2296   - groupedKeys.put(key, groupedKeyName(key, cf.baseRangeSizes[level], cf.baseRangeBases[level]));
  2315 + // fill grouped keys
  2316 + if (!groupedKeys.containsKey(key)) {
  2317 + groupedKeys.put(key, groupedKeyName(key, cf.baseRangeSizes[level],
  2318 + cf.baseRangeBases[level]));
2297 2319 }
2298 2320 // intersect docSet with docList
2299 2321 Integer[] docList = intersectedDocList(list.get(key), docSet);
2300 2322 if (docList.length > 0) {
2301 2323 documentsInFacets = true;
2302 2324 }
2303   - //update docLists
2304   - if(docLists.containsKey(groupedKeys.get(key))) {
2305   - docLists.put(groupedKeys.get(key), mergeDocLists(docLists.get(groupedKeys.get(key)), docList));
  2325 + // update docLists
  2326 + if (docLists.containsKey(groupedKeys.get(key))) {
  2327 + docLists.put(groupedKeys.get(key),
  2328 + mergeDocLists(docLists.get(groupedKeys.get(key)), docList));
2306 2329 } else {
2307 2330 docLists.put(groupedKeys.get(key), docList);
2308 2331 }
... ... @@ -2515,39 +2538,42 @@ public class CodecCollector {
2515 2538 function.dataCollector.closeNewList();
2516 2539 }
2517 2540 }
2518   - }
  2541 + }
2519 2542  
2520 2543 }
2521   -
2522   - private static String groupedKeyName(String key, Double baseRangeSize, Double baseRangeBase) {
2523   - if(baseRangeSize==null || baseRangeSize<=0) {
  2544 +
  2545 + private static String groupedKeyName(String key, Double baseRangeSize,
  2546 + Double baseRangeBase) {
  2547 + if (baseRangeSize == null || baseRangeSize <= 0) {
2524 2548 return key;
2525   - } else {
  2549 + } else {
2526 2550 Double doubleKey, doubleBase, doubleNumber, doubleStart, doubleEnd;
2527 2551 try {
2528   - doubleKey = Double.parseDouble(key);
2529   - doubleBase = baseRangeBase==null?0:baseRangeBase;
2530   - doubleNumber = Math.floor((doubleKey - doubleBase) / baseRangeSize);
  2552 + doubleKey = Double.parseDouble(key);
  2553 + doubleBase = baseRangeBase == null ? 0 : baseRangeBase;
  2554 + doubleNumber = Math.floor((doubleKey - doubleBase) / baseRangeSize);
2531 2555 doubleStart = doubleBase + doubleNumber * baseRangeSize;
2532   - doubleEnd = doubleStart+baseRangeSize;
  2556 + doubleEnd = doubleStart + baseRangeSize;
2533 2557 } catch (NumberFormatException e) {
2534 2558 return key;
2535 2559 }
2536   - //integer
2537   - if(Math.floor(baseRangeSize) == baseRangeSize && Math.floor(doubleBase)==doubleBase) {
  2560 + // integer
  2561 + if (Math.floor(baseRangeSize) == baseRangeSize
  2562 + && Math.floor(doubleBase) == doubleBase) {
2538 2563 try {
2539   - if(baseRangeSize>1) {
2540   - return String.format("%.0f", doubleStart)+"-"+String.format("%.0f", doubleEnd-1);
  2564 + if (baseRangeSize > 1) {
  2565 + return String.format("%.0f", doubleStart) + "-"
  2566 + + String.format("%.0f", doubleEnd - 1);
2541 2567 } else {
2542 2568 return String.format("%.0f", doubleStart);
2543 2569 }
2544 2570 } catch (NumberFormatException e) {
2545 2571 return key;
2546   - }
  2572 + }
2547 2573 } else {
2548   - return "["+doubleStart+","+doubleEnd+")";
2549   - }
2550   - }
  2574 + return "[" + doubleStart + "," + doubleEnd + ")";
  2575 + }
  2576 + }
2551 2577 }
2552 2578  
2553 2579 private static Integer[] mergeDocLists(Integer[] a, Integer[] b) {
... ...
src/mtas/codec/util/CodecComponent.java
... ... @@ -266,6 +266,9 @@ public class CodecComponent {
266 266  
267 267 /** The regexp. */
268 268 public String key, prefix, regexp;
  269 +
  270 + /** The list. */
  271 + public HashSet<String> list;
269 272  
270 273 /** The stats type. */
271 274 public String dataType, statsType;
... ... @@ -283,10 +286,10 @@ public class CodecComponent {
283 286 public HashMap<Integer, String> uniqueKey;
284 287  
285 288 /** The stats. */
286   - public HashMap<Integer, MtasDataCollector<?, ?>> stats;
  289 + public HashMap<Integer, MtasDataCollector<?, ?>> statsData;
287 290  
288 291 /** The stats list. */
289   - public HashMap<Integer, MtasDataCollector<?, ?>> list;
  292 + public HashMap<Integer, MtasDataCollector<?, ?>> statsList;
290 293  
291 294 /**
292 295 * Instantiates a new component document.
... ... @@ -299,10 +302,15 @@ public class CodecComponent {
299 302 * @throws IOException Signals that an I/O exception has occurred.
300 303 */
301 304 public ComponentDocument(String key, String prefix, String statsType,
302   - String regexp, int number) throws IOException {
  305 + String regexp, String[] list, int number) throws IOException {
303 306 this.key = key;
304 307 this.prefix = prefix;
305 308 this.regexp = regexp;
  309 + if (list != null && list.length > 0) {
  310 + this.list = new HashSet(Arrays.asList(list));
  311 + } else {
  312 + this.list = null;
  313 + }
306 314 this.number = number;
307 315 uniqueKey = new HashMap<Integer, String>();
308 316 dataType = CodecUtil.DATA_TYPE_LONG;
... ... @@ -316,11 +324,11 @@ public class CodecComponent {
316 324 prefix + MtasToken.DELIMITER + regexp + "\u0000*");
317 325 compiledAutomaton = new CompiledAutomaton(re.toAutomaton());
318 326 }
319   - this.stats = new HashMap<Integer, MtasDataCollector<?, ?>>();
  327 + this.statsData = new HashMap<Integer, MtasDataCollector<?, ?>>();
320 328 if (this.number > 0) {
321   - this.list = new HashMap<Integer, MtasDataCollector<?, ?>>();
  329 + this.statsList = new HashMap<Integer, MtasDataCollector<?, ?>>();
322 330 } else {
323   - this.list = null;
  331 + this.statsList = null;
324 332 }
325 333 }
326 334 }
... ...
src/mtas/solr/handler/component/util/MtasSolrComponentDocument.java
... ... @@ -43,6 +43,9 @@ public class MtasSolrComponentDocument {
43 43 /** The Constant NAME_MTAS_DOCUMENT_REGEXP. */
44 44 public static final String NAME_MTAS_DOCUMENT_REGEXP = "regexp";
45 45  
  46 + /** The Constant NAME_MTAS_DOCUMENT_LIST. */
  47 + public static final String NAME_MTAS_DOCUMENT_LIST = "list";
  48 +
46 49 /** The Constant NAME_MTAS_DOCUMENT_NUMBER. */
47 50 public static final String NAME_MTAS_DOCUMENT_NUMBER = "number";
48 51  
... ... @@ -73,6 +76,7 @@ public class MtasSolrComponentDocument {
73 76 String[] prefixes = new String[ids.size()];
74 77 String[] types = new String[ids.size()];
75 78 String[] regexps = new String[ids.size()];
  79 + String[] lists = new String[ids.size()];
76 80 String[] numbers = new String[ids.size()];
77 81 for (String id : ids) {
78 82 fields[tmpCounter] = rb.req.getParams().get(
... ... @@ -91,6 +95,9 @@ public class MtasSolrComponentDocument {
91 95 regexps[tmpCounter] = rb.req.getParams().get(
92 96 PARAM_MTAS_DOCUMENT + "." + id + "." + NAME_MTAS_DOCUMENT_REGEXP,
93 97 null);
  98 + lists[tmpCounter] = rb.req.getParams().get(
  99 + PARAM_MTAS_DOCUMENT + "." + id + "." + NAME_MTAS_DOCUMENT_LIST,
  100 + null);
94 101 numbers[tmpCounter] = rb.req.getParams().get(
95 102 PARAM_MTAS_DOCUMENT + "." + id + "." + NAME_MTAS_DOCUMENT_NUMBER,
96 103 null);
... ... @@ -114,6 +121,8 @@ public class MtasSolrComponentDocument {
114 121 NAME_MTAS_DOCUMENT_FIELD, false);
115 122 MtasSolrResultUtil.compareAndCheck(regexps, fields,
116 123 NAME_MTAS_DOCUMENT_REGEXP, NAME_MTAS_DOCUMENT_FIELD, false);
  124 + MtasSolrResultUtil.compareAndCheck(lists, fields,
  125 + NAME_MTAS_DOCUMENT_LIST, NAME_MTAS_DOCUMENT_FIELD, false);
117 126 MtasSolrResultUtil.compareAndCheck(numbers, fields,
118 127 NAME_MTAS_DOCUMENT_NUMBER, NAME_MTAS_DOCUMENT_FIELD, false);
119 128 for (int i = 0; i < fields.length; i++) {
... ... @@ -123,10 +132,19 @@ public class MtasSolrComponentDocument {
123 132 String prefix = prefixes[i];
124 133 String type = types[i];
125 134 String regexp = regexps[i];
  135 + String[] list = null;
  136 + if(lists[i]!=null) {
  137 + ArrayList<String> tmpList = new ArrayList<String>();
  138 + String[] subList = lists[i].split("(?<!\\\\),");
  139 + for(int j=0; j<subList.length; j++) {
  140 + tmpList.add(subList[j].replace("\\,", ",").replace("\\\\", "\\"));
  141 + }
  142 + list = tmpList.toArray(new String[tmpList.size()]);
  143 + }
126 144 int number = Math.max(0, (numbers[i] == null) || (numbers[i].isEmpty())
127 145 ? 0 : Integer.parseInt(numbers[i]));
128 146 mtasFields.list.get(fields[i]).documentList
129   - .add(new ComponentDocument(key, prefix, type, regexp, number));
  147 + .add(new ComponentDocument(key, prefix, type, regexp, list, number));
130 148 }
131 149 }
132 150 }
... ... @@ -143,12 +161,12 @@ public class MtasSolrComponentDocument {
143 161 SimpleOrderedMap<Object> mtasDocumentResponse = new SimpleOrderedMap<>();
144 162 mtasDocumentResponse.add("key", document.key);
145 163 ArrayList<NamedList<Object>> mtasDocumentItemResponses = new ArrayList<NamedList<Object>>();
146   - for (int docId : document.stats.keySet()) {
  164 + for (int docId : document.statsData.keySet()) {
147 165 NamedList<Object> mtasDocumentItemResponse = new SimpleOrderedMap<>();
148   - MtasDataCollector<?, ?> stats = document.stats.get(docId);
  166 + MtasDataCollector<?, ?> stats = document.statsData.get(docId);
149 167 MtasDataCollector<?, ?> list = null;
150   - if (document.list != null) {
151   - list = document.list.get(docId);
  168 + if (document.statsList != null) {
  169 + list = document.statsList.get(docId);
152 170 }
153 171 mtasDocumentItemResponse.add("stats", new MtasSolrResult(stats,
154 172 stats.getDataType(), stats.getStatsType(), stats.statsItems, null));
... ...