Commit 3e66a04a2eab4cfe1403a004f6e0e160134c9b49
1 parent c3c71090
fix rewrite
Showing 6 changed files with 163 additions and 112 deletions
docker/Dockerfile
1 | 1 | # Automatically generated Dockerfile |
2 | -# - Build 2017-01-13 14:47 | |
2 | +# - Build 2017-01-23 14:37 | |
3 | 3 | # - Lucene/Solr version 6.3.0 |
4 | 4 | # - Mtas release 20170110 |
5 | 5 | # |
... | ... | @@ -55,7 +55,7 @@ RUN apt-get update && apt-get install -y lsof software-properties-common python- |
55 | 55 | && chmod -R 755 /var/www/html \ |
56 | 56 | && printf "echo\n" >> /start.sh \ |
57 | 57 | && printf "echo \"================ Mtas -- Multi Tier Annotation Search =================\"\n" >> /start.sh \ |
58 | -&& printf "echo \" Timestamp 2017-01-13 14:47\"\n" >> /start.sh \ | |
58 | +&& printf "echo \" Timestamp 2017-01-23 14:37\"\n" >> /start.sh \ | |
59 | 59 | && printf "echo \" Lucene/Solr version 6.3.0\"\n" >> /start.sh \ |
60 | 60 | && printf "echo \" Mtas release 20170110\"\n" >> /start.sh \ |
61 | 61 | && printf "echo \" See https://meertensinstituut.github.io/mtas/ for more information\"\n" >> /start.sh \ |
... | ... |
src/mtas/codec/util/CodecCollector.java
... | ... | @@ -1111,8 +1111,9 @@ public class CodecCollector { |
1111 | 1111 | // try to call functionParser as little as possible |
1112 | 1112 | if (span.statsType.equals(CodecUtil.STATS_BASIC) |
1113 | 1113 | && (span.minimumLong == null) && (span.maximumLong == null) |
1114 | - && (span.functions == null || (span.functionSumRule() | |
1115 | - && !span.functionNeedPositions()))) { | |
1114 | + && (span.functions == null | |
1115 | + || (span.functionBasic() && span.functionSumRule() | |
1116 | + && !span.functionNeedPositions()))) { | |
1116 | 1117 | // initialise |
1117 | 1118 | int length = span.parser.needArgumentsNumber(); |
1118 | 1119 | long[] valueSum = new long[length]; |
... | ... | @@ -2335,11 +2336,26 @@ public class CodecCollector { |
2335 | 2336 | HashMap<Integer, long[]> args = computeArguments(spansNumberData, |
2336 | 2337 | cf.spanQueries, docSet); |
2337 | 2338 | if (cf.baseDataTypes[level].equals(CodecUtil.DATA_TYPE_LONG)) { |
2338 | - // sumrule | |
2339 | + // check functions | |
2340 | + boolean applySumRule = false; | |
2339 | 2341 | if (cf.baseStatsTypes[level].equals(CodecUtil.STATS_BASIC) |
2340 | 2342 | && cf.baseParsers[level].sumRule() |
2341 | 2343 | && (cf.baseMinimumLongs[level] == null) |
2342 | 2344 | && (cf.baseMaximumLongs[level] == null)) { |
2345 | + applySumRule = true; | |
2346 | + if (cf.baseFunctionList[level].get(dataCollector) != null) { | |
2347 | + for (SubComponentFunction function : cf.baseFunctionList[level] | |
2348 | + .get(dataCollector)) { | |
2349 | + if (!function.statsType.equals(CodecUtil.STATS_BASIC) | |
2350 | + || !function.parserFunction.sumRule() | |
2351 | + || function.parserFunction.needPositions()) { | |
2352 | + applySumRule = false; | |
2353 | + break; | |
2354 | + } | |
2355 | + } | |
2356 | + } | |
2357 | + } | |
2358 | + if (applySumRule) { | |
2343 | 2359 | for (String key : groupedKeys.values()) { |
2344 | 2360 | if (docLists.get(key).length > 0) { |
2345 | 2361 | // initialise |
... | ... | @@ -2695,127 +2711,142 @@ public class CodecCollector { |
2695 | 2711 | } |
2696 | 2712 | |
2697 | 2713 | for (CompiledAutomaton compiledAutomaton : listAutomata) { |
2698 | - termsEnum = t.intersect(compiledAutomaton, null); | |
2699 | - int initSize = Math.min((int) t.size(), 1000); | |
2700 | - termVector.subComponentFunction.dataCollector.initNewList(initSize, | |
2701 | - segmentName, segmentNumber, termVector.boundary); | |
2702 | - boolean doBasic = termVector.subComponentFunction.dataCollector | |
2703 | - .getStatsType().equals(CodecUtil.STATS_BASIC); | |
2704 | - if (termVector.functions != null) { | |
2705 | - for (SubComponentFunction function : termVector.functions) { | |
2706 | - function.dataCollector.initNewList(initSize); | |
2707 | - doBasic = doBasic ? (function.parserFunction.sumRule() | |
2708 | - && !function.parserFunction.needPositions() | |
2709 | - && function.dataCollector.getStatsType() | |
2710 | - .equals(CodecUtil.STATS_BASIC)) | |
2711 | - : doBasic; | |
2714 | + if (!compiledAutomaton.type | |
2715 | + .equals(CompiledAutomaton.AUTOMATON_TYPE.NORMAL)) { | |
2716 | + if (compiledAutomaton.type | |
2717 | + .equals(CompiledAutomaton.AUTOMATON_TYPE.NONE)) { | |
2718 | + // do nothing | |
2719 | + } else { | |
2720 | + throw new IOException( | |
2721 | + "compiledAutomaton is " + compiledAutomaton.type); | |
2712 | 2722 | } |
2713 | - } | |
2714 | - // only if documents | |
2715 | - if (docSet.size() > 0) { | |
2716 | - int termDocId; | |
2717 | - // loop over terms | |
2718 | - while ((term = termsEnum.next()) != null) { | |
2719 | - termDocId = -1; | |
2720 | - if (doBasic) { | |
2721 | - // compute numbers; | |
2722 | - TermvectorNumberBasic numberBasic = computeTermvectorNumberBasic( | |
2723 | - docSet, termDocId, termsEnum, r, lrc, postingsEnum); | |
2724 | - // register | |
2725 | - if (numberBasic.docNumber > 0) { | |
2726 | - long valueLong = 0; | |
2727 | - try { | |
2728 | - valueLong = termVector.subComponentFunction.parserFunction | |
2729 | - .getValueLong(numberBasic.valueSum, 1); | |
2730 | - } catch (IOException e) { | |
2731 | - termVector.subComponentFunction.dataCollector.error( | |
2732 | - MtasToken.getPostfixFromValue(term), e.getMessage()); | |
2733 | - } | |
2734 | - String key = MtasToken.getPostfixFromValue(term); | |
2735 | - termVector.subComponentFunction.dataCollector.add(key, | |
2736 | - valueLong, numberBasic.docNumber); | |
2737 | - if (termVector.functions != null) { | |
2738 | - for (SubComponentFunction function : termVector.functions) { | |
2739 | - if (function.dataType | |
2740 | - .equals(CodecUtil.DATA_TYPE_LONG)) { | |
2741 | - long valueFunction = function.parserFunction | |
2742 | - .getValueLong(numberBasic.valueSum, 0); | |
2743 | - function.dataCollector.add(key, valueFunction, | |
2744 | - numberBasic.docNumber); | |
2745 | - } else if (function.dataType | |
2746 | - .equals(CodecUtil.DATA_TYPE_DOUBLE)) { | |
2747 | - double valueFunction = function.parserFunction | |
2748 | - .getValueDouble(numberBasic.valueSum, 0); | |
2749 | - function.dataCollector.add(key, valueFunction, | |
2750 | - numberBasic.docNumber); | |
2751 | - } | |
2752 | - } | |
2753 | - } | |
2754 | - | |
2755 | - } | |
2756 | - } else { | |
2757 | - TermvectorNumberFull numberFull = computeTermvectorNumberFull( | |
2758 | - docSet, termDocId, termsEnum, r, lrc, postingsEnum, | |
2759 | - positionsData); | |
2760 | - if (numberFull.docNumber > 0) { | |
2761 | - long[] valuesLong = new long[numberFull.docNumber]; | |
2762 | - String key = MtasToken.getPostfixFromValue(term); | |
2763 | - for (int i = 0; i < numberFull.docNumber; i++) { | |
2723 | + } else { | |
2724 | + termsEnum = t.intersect(compiledAutomaton, null); | |
2725 | + | |
2726 | + int initSize = Math.min((int) t.size(), 1000); | |
2727 | + termVector.subComponentFunction.dataCollector.initNewList( | |
2728 | + initSize, segmentName, segmentNumber, termVector.boundary); | |
2729 | + boolean doBasic = termVector.subComponentFunction.dataCollector | |
2730 | + .getStatsType().equals(CodecUtil.STATS_BASIC); | |
2731 | + if (termVector.functions != null) { | |
2732 | + for (SubComponentFunction function : termVector.functions) { | |
2733 | + function.dataCollector.initNewList(initSize); | |
2734 | + doBasic = doBasic ? (function.parserFunction.sumRule() | |
2735 | + && !function.parserFunction.needPositions() | |
2736 | + && function.dataCollector.getStatsType() | |
2737 | + .equals(CodecUtil.STATS_BASIC)) | |
2738 | + : doBasic; | |
2739 | + } | |
2740 | + } | |
2741 | + // only if documents | |
2742 | + if (docSet.size() > 0) { | |
2743 | + int termDocId; | |
2744 | + // loop over terms | |
2745 | + while ((term = termsEnum.next()) != null) { | |
2746 | + termDocId = -1; | |
2747 | + if (doBasic) { | |
2748 | + // compute numbers; | |
2749 | + TermvectorNumberBasic numberBasic = computeTermvectorNumberBasic( | |
2750 | + docSet, termDocId, termsEnum, r, lrc, postingsEnum); | |
2751 | + // register | |
2752 | + if (numberBasic.docNumber > 0) { | |
2753 | + long valueLong = 0; | |
2764 | 2754 | try { |
2765 | - valuesLong[i] = termVector.subComponentFunction.parserFunction | |
2766 | - .getValueLong(new long[] { numberFull.args[i] }, | |
2767 | - numberFull.positions[i]); | |
2755 | + valueLong = termVector.subComponentFunction.parserFunction | |
2756 | + .getValueLong(numberBasic.valueSum, 1); | |
2768 | 2757 | } catch (IOException e) { |
2769 | - termVector.subComponentFunction.dataCollector.error(key, | |
2758 | + termVector.subComponentFunction.dataCollector.error( | |
2759 | + MtasToken.getPostfixFromValue(term), | |
2770 | 2760 | e.getMessage()); |
2771 | 2761 | } |
2762 | + String key = MtasToken.getPostfixFromValue(term); | |
2763 | + termVector.subComponentFunction.dataCollector.add(key, | |
2764 | + valueLong, numberBasic.docNumber); | |
2765 | + if (termVector.functions != null) { | |
2766 | + for (SubComponentFunction function : termVector.functions) { | |
2767 | + if (function.dataType | |
2768 | + .equals(CodecUtil.DATA_TYPE_LONG)) { | |
2769 | + long valueFunction = function.parserFunction | |
2770 | + .getValueLong(numberBasic.valueSum, 0); | |
2771 | + function.dataCollector.add(key, valueFunction, | |
2772 | + numberBasic.docNumber); | |
2773 | + } else if (function.dataType | |
2774 | + .equals(CodecUtil.DATA_TYPE_DOUBLE)) { | |
2775 | + double valueFunction = function.parserFunction | |
2776 | + .getValueDouble(numberBasic.valueSum, 0); | |
2777 | + function.dataCollector.add(key, valueFunction, | |
2778 | + numberBasic.docNumber); | |
2779 | + } | |
2780 | + } | |
2781 | + } | |
2782 | + | |
2772 | 2783 | } |
2773 | - termVector.subComponentFunction.dataCollector.add(key, | |
2774 | - valuesLong, valuesLong.length); | |
2775 | - if (termVector.functions != null) { | |
2776 | - for (SubComponentFunction function : termVector.functions) { | |
2777 | - if (function.dataType | |
2778 | - .equals(CodecUtil.DATA_TYPE_LONG)) { | |
2779 | - valuesLong = new long[numberFull.docNumber]; | |
2780 | - for (int i = 0; i < numberFull.docNumber; i++) { | |
2781 | - try { | |
2782 | - valuesLong[i] = function.parserFunction | |
2783 | - .getValueLong( | |
2784 | - new long[] { numberFull.args[i] }, | |
2785 | - numberFull.positions[i]); | |
2786 | - } catch (IOException e) { | |
2787 | - function.dataCollector.error(key, e.getMessage()); | |
2784 | + } else { | |
2785 | + TermvectorNumberFull numberFull = computeTermvectorNumberFull( | |
2786 | + docSet, termDocId, termsEnum, r, lrc, postingsEnum, | |
2787 | + positionsData); | |
2788 | + if (numberFull.docNumber > 0) { | |
2789 | + long[] valuesLong = new long[numberFull.docNumber]; | |
2790 | + String key = MtasToken.getPostfixFromValue(term); | |
2791 | + for (int i = 0; i < numberFull.docNumber; i++) { | |
2792 | + try { | |
2793 | + valuesLong[i] = termVector.subComponentFunction.parserFunction | |
2794 | + .getValueLong(new long[] { numberFull.args[i] }, | |
2795 | + numberFull.positions[i]); | |
2796 | + } catch (IOException e) { | |
2797 | + termVector.subComponentFunction.dataCollector | |
2798 | + .error(key, e.getMessage()); | |
2799 | + } | |
2800 | + } | |
2801 | + termVector.subComponentFunction.dataCollector.add(key, | |
2802 | + valuesLong, valuesLong.length); | |
2803 | + if (termVector.functions != null) { | |
2804 | + for (SubComponentFunction function : termVector.functions) { | |
2805 | + if (function.dataType | |
2806 | + .equals(CodecUtil.DATA_TYPE_LONG)) { | |
2807 | + valuesLong = new long[numberFull.docNumber]; | |
2808 | + for (int i = 0; i < numberFull.docNumber; i++) { | |
2809 | + try { | |
2810 | + valuesLong[i] = function.parserFunction | |
2811 | + .getValueLong( | |
2812 | + new long[] { numberFull.args[i] }, | |
2813 | + numberFull.positions[i]); | |
2814 | + } catch (IOException e) { | |
2815 | + function.dataCollector.error(key, | |
2816 | + e.getMessage()); | |
2817 | + } | |
2788 | 2818 | } |
2789 | - } | |
2790 | - function.dataCollector.add(key, valuesLong, | |
2791 | - valuesLong.length); | |
2792 | - } else if (function.dataType | |
2793 | - .equals(CodecUtil.DATA_TYPE_DOUBLE)) { | |
2794 | - double[] valuesDouble = new double[numberFull.docNumber]; | |
2795 | - for (int i = 0; i < numberFull.docNumber; i++) { | |
2796 | - try { | |
2797 | - valuesDouble[i] = function.parserFunction | |
2798 | - .getValueDouble( | |
2799 | - new long[] { numberFull.args[i] }, | |
2800 | - numberFull.positions[i]); | |
2801 | - } catch (IOException e) { | |
2802 | - function.dataCollector.error(key, e.getMessage()); | |
2819 | + function.dataCollector.add(key, valuesLong, | |
2820 | + valuesLong.length); | |
2821 | + } else if (function.dataType | |
2822 | + .equals(CodecUtil.DATA_TYPE_DOUBLE)) { | |
2823 | + double[] valuesDouble = new double[numberFull.docNumber]; | |
2824 | + for (int i = 0; i < numberFull.docNumber; i++) { | |
2825 | + try { | |
2826 | + valuesDouble[i] = function.parserFunction | |
2827 | + .getValueDouble( | |
2828 | + new long[] { numberFull.args[i] }, | |
2829 | + numberFull.positions[i]); | |
2830 | + } catch (IOException e) { | |
2831 | + function.dataCollector.error(key, | |
2832 | + e.getMessage()); | |
2833 | + } | |
2803 | 2834 | } |
2835 | + function.dataCollector.add(key, valuesDouble, | |
2836 | + valuesDouble.length); | |
2804 | 2837 | } |
2805 | - function.dataCollector.add(key, valuesDouble, | |
2806 | - valuesDouble.length); | |
2807 | 2838 | } |
2808 | 2839 | } |
2809 | 2840 | } |
2810 | - } | |
2811 | 2841 | |
2842 | + } | |
2812 | 2843 | } |
2813 | 2844 | } |
2814 | - } | |
2815 | - termVector.subComponentFunction.dataCollector.closeNewList(); | |
2816 | - if (termVector.functions != null) { | |
2817 | - for (SubComponentFunction function : termVector.functions) { | |
2818 | - function.dataCollector.closeNewList(); | |
2845 | + termVector.subComponentFunction.dataCollector.closeNewList(); | |
2846 | + if (termVector.functions != null) { | |
2847 | + for (SubComponentFunction function : termVector.functions) { | |
2848 | + function.dataCollector.closeNewList(); | |
2849 | + } | |
2819 | 2850 | } |
2820 | 2851 | } |
2821 | 2852 | } |
... | ... |
src/mtas/codec/util/CodecComponent.java
... | ... | @@ -1333,6 +1333,17 @@ public class CodecComponent { |
1333 | 1333 | } |
1334 | 1334 | return true; |
1335 | 1335 | } |
1336 | + | |
1337 | + public boolean functionBasic() { | |
1338 | + if (functions != null) { | |
1339 | + for (SubComponentFunction function : functions) { | |
1340 | + if (!function.statsType.equals(CodecUtil.STATS_BASIC)) { | |
1341 | + return false; | |
1342 | + } | |
1343 | + } | |
1344 | + } | |
1345 | + return true; | |
1346 | + } | |
1336 | 1347 | |
1337 | 1348 | /** |
1338 | 1349 | * Function need positions. |
... | ... |
src/mtas/search/spans/MtasSpanNotQuery.java
... | ... | @@ -2,7 +2,9 @@ package mtas.search.spans; |
2 | 2 | |
3 | 3 | import java.io.IOException; |
4 | 4 | |
5 | +import org.apache.lucene.index.IndexReader; | |
5 | 6 | import org.apache.lucene.search.IndexSearcher; |
7 | +import org.apache.lucene.search.spans.SpanContainingQuery; | |
6 | 8 | import org.apache.lucene.search.spans.SpanNotQuery; |
7 | 9 | import org.apache.lucene.search.spans.SpanQuery; |
8 | 10 | import org.apache.lucene.search.spans.SpanWeight; |
... | ... | @@ -30,6 +32,12 @@ public class MtasSpanNotQuery extends MtasSpanQuery { |
30 | 32 | } |
31 | 33 | |
32 | 34 | @Override |
35 | + public MtasSpanQuery rewrite(IndexReader reader) throws IOException { | |
36 | + baseQuery = (SpanNotQuery) baseQuery.rewrite(reader); | |
37 | + return this; | |
38 | + } | |
39 | + | |
40 | + @Override | |
33 | 41 | public String toString(String field) { |
34 | 42 | return baseQuery.toString(field); |
35 | 43 | } |
... | ... |
src/mtas/solr/handler/component/MtasSolrSearchComponent.java
... | ... | @@ -244,6 +244,7 @@ public class MtasSolrSearchComponent extends SearchComponent { |
244 | 244 | docListList, docSetList, mtasFields.list.get(field)); |
245 | 245 | } catch (IllegalAccessException | IllegalArgumentException |
246 | 246 | | InvocationTargetException e) { |
247 | + e.printStackTrace(); | |
247 | 248 | throw new IOException(e.getMessage()); |
248 | 249 | } |
249 | 250 | } |
... | ... |
src/mtas/solr/handler/component/util/MtasSolrComponentTermvector.java
... | ... | @@ -1017,7 +1017,7 @@ public class MtasSolrComponentTermvector { |
1017 | 1017 | } |
1018 | 1018 | paramsNewRequest.add(PARAM_MTAS_TERMVECTOR + "." |
1019 | 1019 | + termvectorCounter + "." + NAME_MTAS_TERMVECTOR_FULL, |
1020 | - "true"); | |
1020 | + tv.full?"true":"false"); | |
1021 | 1021 | paramsNewRequest.add(PARAM_MTAS_TERMVECTOR + "." |
1022 | 1022 | + termvectorCounter + "." + NAME_MTAS_TERMVECTOR_LIST, |
1023 | 1023 | listValue); |
... | ... |