Commit 6402c5ec518afc5531e77bc97aef17d074895033

Authored by Matthijs Brouwer
1 parent 2fa472f5

preparations dockerfile

conf/parser/mtas.xml
... ... @@ -2,6 +2,7 @@
2 2 <mtas>
3 3 <configurations type="mtas.analysis.util.MtasTokenizerFactory">
4 4 <configuration name="test" file="mtas/folia_test.xml" />
  5 + <configuration name="TEST" file="mtas/folia_dbnl.xml" />
5 6 <configuration name="CRM" file="mtas/crm_test.xml" />
6 7 <configuration name="DBNL" file="mtas/folia_dbnl.xml" />
7 8 <configuration name="DDD" file="mtas/folia_ddd.xml" />
... ... @@ -10,6 +11,7 @@
10 11 </configurations>
11 12 <configurations type="mtas.analysis.util.MtasCharFilterFactory">
12 13 <configuration name="test" type="file" />
  14 + <configuration name="TEST" type="file" prefix="/Users/matthijs/Software/Mtas/demo/data/" />
13 15 <configuration name="CRM" type="file" prefix="/Users/matthijs/Software/Mtas/data/CRM/data/files/" postfix=".txt" />
14 16 <configuration name="DBNL" type="url" prefix="https://openskos.meertens.knaw.nl/nederlab/archief/get/" />
15 17 <configuration name="DDD" type="url" prefix="https://openskos.meertens.knaw.nl/nederlab/archief/get/" />
... ...
conf/parser/mtas/tei_test.xml
... ... @@ -14,13 +14,13 @@
14 14  
15 15  
16 16  
17   - <!-- START CONFIGURATION MTAS FOLIA PARSER -->
  17 + <!-- START CONFIGURATION MTAS TEI PARSER -->
18 18 <parser name="mtas.analysis.parser.MtasTEIParser">
19 19  
20   - <!-- START GENERAL SETTINGS MTAS FOLIA PARSER -->
  20 + <!-- START GENERAL SETTINGS MTAS TEI PARSER -->
21 21 <autorepair value="true" />
22 22 <makeunique value="true" />
23   - <!-- END GENERAL SETTINGS MTAS FOLIA PARSER -->
  23 + <!-- END GENERAL SETTINGS MTAS TEI PARSER -->
24 24  
25 25 <!-- START REFERENCES -->
26 26 <references>
... ... @@ -141,6 +141,6 @@
141 141 <!-- END MAPPINGS -->
142 142  
143 143 </parser>
144   - <!-- END CONFIGURATION MTAS FOLIA PARSER -->
  144 + <!-- END CONFIGURATION MTAS TEI PARSER -->
145 145  
146 146 </mtas>
147 147 \ No newline at end of file
... ...
demo/index/post.json 0 → 100644
  1 +{
  2 + "NLProfile_name": "nederlabTitleProfile",
  3 + "NLCore_NLIdentification_nederlabID": "text1",
  4 + "NLCore_NLIdentification_versionID": "text1",
  5 + "NLCore_NLAdministrative_ingestTime": "2016-10-28T08:00:00Z",
  6 + "NLCore_NLAdministrative_sourceCollection": "TEST",
  7 + "NLCore_NLAdministrative_isThesaurusElement": true,
  8 + "NLTitle_title": "This is a test",
  9 + "NLTitle_genre": [
  10 + "testgenre 1",
  11 + "testgenre 2"
  12 + ],
  13 + "NLTitle_yearOfPublicationMin": 2016,
  14 + "NLTitle_yearOfPublicationMax": 2016,
  15 + "NLTitle_yearOfPublicationApprox": false,
  16 + "NLTitle_yearOfPublicationLabel": "2016",
  17 + "NLTitle_edition": "1ste druk",
  18 + "NLTitle_primaryLanguage": "nld",
  19 + "NLTitle_isTranslation": false,
  20 + "NLTitle_autopsyPerformed": false,
  21 + "NLContent_mtas": "text1.xml"
  22 +}
... ...
demo/index/resetOnderzoeksportaal.sh 0 → 100755
  1 +
  2 +rm -rf /Users/matthijs/www/www.meertens.dev/html/onderzoeksportaaltest/smarty/cache/*
  3 +rm -rf /Users/matthijs/www/www.meertens.dev/html/onderzoeksportaaltest/smarty/templates_c/*
  4 +rm -rf /Users/matthijs/www/www.meertens.dev/html/onderzoeksportaaltest/sites/nederlab/smarty/cache/*
  5 +rm -rf /Users/matthijs/www/www.meertens.dev/html/onderzoeksportaaltest/sites/nederlab/smarty/templates_c/*
  6 +
... ...
docker/mtas.xml 0 → 100644
  1 +<?xml version="1.0" encoding="UTF-8" ?>
  2 +<mtas> <!-- Docker demo: named Mtas configurations, one group per factory class. -->
  3 + <configurations type="mtas.analysis.util.MtasTokenizerFactory"> <!-- Tokenizer configurations: each name points to a parser configuration file. -->
  4 + <configuration name="folia" file="mtas/demo_folia.xml" />
  5 + <configuration name="tei" file="mtas/demo_tei.xml" />
  6 + </configurations>
  7 + <configurations type="mtas.analysis.util.MtasCharFilterFactory"> <!-- Char-filter configurations, using the same names as the tokenizer configurations above. -->
  8 + <configuration name="folia" type="url" prefix="http://localhost/demo/" postfix="" /> <!-- NOTE(review): presumably the field value is placed between prefix and postfix to build the URL that is fetched; confirm against MtasCharFilterFactory. -->
  9 + <configuration name="tei" type="url" prefix="http://localhost/demo/" postfix="" />
  10 + </configurations>
  11 +</mtas>
  12 +
... ...
docker/mtas/demo_folia.xml 0 → 100644
  1 +<?xml version="1.0" encoding="UTF-8" ?>
  2 +<mtas>
  3 +
  4 + <!-- START MTAS INDEX CONFIGURATION -->
  5 + <index>
  6 + <!-- START GENERAL SETTINGS MTAS INDEX PROCESS -->
  7 + <payload index="false" />
  8 + <offset index="false" />
  9 + <realoffset index="false" />
  10 + <parent index="true" />
  11 + <!-- END GENERAL SETTINGS MTAS INDEX PROCESS -->
  12 + </index>
  13 + <!-- END MTAS INDEX CONFIGURATION -->
  14 +
  15 +
  16 +
  17 + <!-- START CONFIGURATION MTAS FOLIA PARSER -->
  18 + <parser name="mtas.analysis.parser.MtasFoliaParser">
  19 +
  20 + <!-- START GENERAL SETTINGS MTAS FOLIA PARSER -->
  21 + <autorepair value="true" />
  22 + <makeunique value="true" />
  23 + <!-- END GENERAL SETTINGS MTAS FOLIA PARSER -->
  24 +
  25 + <!-- START REFERENCES -->
  26 + <references>
  27 + <reference name="wref" ref="id" />
  28 + </references>
  29 + <!-- END REFERENCES -->
  30 +
  31 + <!-- START MAPPINGS -->
  32 + <mappings>
  33 +
  34 + <!-- START WORDS -->
  35 + <mapping type="word" name="w">
  36 + </mapping>
  37 + <mapping type="word" name="w">
  38 + <token type="string" offset="false" realoffset="false" parent="false">
  39 + <pre>
  40 + <item type="name" />
  41 + </pre>
  42 + <post>
  43 + <item type="attribute" name="class" />
  44 + </post>
  45 + </token>
  46 + <condition>
  47 + <item type="attribute" name="class" />
  48 + <item type="attribute" name="class" not="true" condition="WORD" />
  49 + </condition>
  50 + </mapping>
  51 + <!-- END WORDS -->
  52 +
  53 + <!-- START WORD ANNOTATIONS -->
  54 + <mapping type="wordAnnotation" name="t">
  55 + <token type="string" offset="false">
  56 + <pre>
  57 + <item type="name" />
  58 + </pre>
  59 + <post>
  60 + <item type="text" />
  61 + </post>
  62 + </token>
  63 + <token type="string" offset="false" realoffset="false" parent="false">
  64 + <pre>
  65 + <item type="name" />
  66 + <item type="string" value="_lc" />
  67 + </pre>
  68 + <post>
  69 + <item type="text" filter="ascii,lowercase" />
  70 + </post>
  71 + </token>
  72 + <condition>
  73 + <item type="ancestor" number="0" />
  74 + <item type="ancestorWord" number="1" />
  75 + <item type="unknownAncestor" number="0" />
  76 + </condition>
  77 + </mapping>
  78 + <mapping type="wordAnnotation" name="lemma">
  79 + <token type="string" offset="false" realoffset="false" parent="false">
  80 + <pre>
  81 + <item type="name" />
  82 + </pre>
  83 + <post>
  84 + <item type="attribute" name="class" />
  85 + </post>
  86 + </token>
  87 + <condition>
  88 + <item type="attribute" name="class" />
  89 + <item type="ancestor" number="0" />
  90 + <item type="unknownAncestor" number="0" />
  91 + </condition>
  92 + </mapping>
  93 + <mapping type="wordAnnotation" name="morphology">
  94 + </mapping>
  95 + <mapping type="wordAnnotation" name="morpheme">
  96 + <condition>
  97 + <item type="ancestor" number="1" />
  98 + <item type="ancestorName" condition="morphology" />
  99 + </condition>
  100 + </mapping>
  101 + <mapping type="wordAnnotation" name="t">
  102 + <token type="string" offset="false" realoffset="false" parent="false">
  103 + <pre>
  104 + <item type="ancestorName" />
  105 + </pre>
  106 + <post>
  107 + <item type="text" />
  108 + </post>
  109 + </token>
  110 + <condition>
  111 + <item type="ancestorName" distance="0" condition="morpheme" />
  112 + </condition>
  113 + </mapping>
  114 + <mapping type="wordAnnotation" name="pos">
  115 + <token type="string" offset="false" realoffset="false" parent="false">
  116 + <pre>
  117 + <item type="name" />
  118 + </pre>
  119 + <post>
  120 + <item type="attribute" name="head" />
  121 + </post>
  122 + <payload>
  123 + <item type="attribute" name="confidence" />
  124 + </payload>
  125 + </token>
  126 + <condition>
  127 + <item type="ancestor" number="0" />
  128 + <item type="unknownAncestor" number="0" />
  129 + <item type="attribute" name="class" />
  130 + </condition>
  131 + </mapping>
  132 + <mapping type="wordAnnotation" name="feat">
  133 + <token type="string" offset="false" realoffset="false" parent="false">
  134 + <pre>
  135 + <item type="name" />
  136 + <item type="attribute" name="subset" prefix="." />
  137 + </pre>
  138 + <post>
  139 + <item type="attribute" name="class" />
  140 + </post>
  141 + <payload>
  142 + <item type="ancestorAttribute" distance="0" name="confidence" />
  143 + </payload>
  144 + </token>
  145 + <condition>
  146 + <item type="ancestor" number="1" />
  147 + <item type="unknownAncestor" number="0" />
  148 + <item type="attribute" name="class" />
  149 + <item type="attribute" name="subset" />
  150 + </condition>
  151 + </mapping>
  152 + <!-- END WORD ANNOTATIONS -->
  153 +
  154 + <!-- START RELATIONS -->
  155 + <mapping type="relation" name="chunk">
  156 + <token type="string" offset="false" realoffset="false" parent="false">
  157 + <pre>
  158 + <item type="name" />
  159 + </pre>
  160 + <post>
  161 + <item type="attribute" name="class" />
  162 + </post>
  163 + <payload>
  164 + <item type="attribute" name="confidence" />
  165 + </payload>
  166 + </token>
  167 + <condition>
  168 + <item type="attribute" name="class" />
  169 + </condition>
  170 + </mapping>
  171 + <mapping type="relation" name="dependency">
  172 + <token type="string" offset="false" realoffset="false" parent="false">
  173 + <pre>
  174 + <item type="name" />
  175 + </pre>
  176 + <post>
  177 + <item type="attribute" name="class" />
  178 + </post>
  179 + </token>
  180 + <condition>
  181 + <item type="attribute" name="class" />
  182 + </condition>
  183 + </mapping>
  184 + <mapping type="relation" name="hd">
  185 + <token type="string" offset="false" realoffset="false" parent="false">
  186 + <pre>
  187 + <item type="ancestorName" distance="0" />
  188 + <item type="name" prefix="." />
  189 + </pre>
  190 + </token>
  191 + <condition>
  192 + <item type="ancestorName" condition="dependency" />
  193 + <item type="ancestor" number="1" />
  194 + </condition>
  195 + </mapping>
  196 + <mapping type="relation" name="dep">
  197 + <token type="string" offset="false" realoffset="false" parent="false">
  198 + <pre>
  199 + <item type="ancestorName" distance="0" />
  200 + <item type="name" prefix="." />
  201 + </pre>
  202 + </token>
  203 + <condition>
  204 + <item type="ancestor" number="1" />
  205 + <item type="ancestorName" condition="dependency" />
  206 + </condition>
  207 + </mapping>
  208 + <mapping type="relation" name="entities">
  209 + </mapping>
  210 + <mapping type="relation" name="entity">
  211 + <token type="string" offset="false" realoffset="false" parent="false">
  212 + <pre>
  213 + <item type="name" />
  214 + </pre>
  215 + <post>
  216 + <item type="attribute" name="class" />
  217 + </post>
  218 + <payload>
  219 + <item type="attribute" name="confidence" />
  220 + </payload>
  221 + </token>
  222 + <condition>
  223 + <item type="ancestor" number="1" />
  224 + <item type="ancestorName" condition="entities" />
  225 + </condition>
  226 + </mapping>
  227 + <!-- END RELATIONS -->
  228 +
  229 + <!-- START GROUPS -->
  230 + <mapping type="group" name="s">
  231 + <token type="string" offset="false">
  232 + <pre>
  233 + <item type="name" />
  234 + </pre>
  235 + <post>
  236 + <item type="attribute" name="class" />
  237 + </post>
  238 + </token>
  239 + </mapping>
  240 + <mapping type="group" name="p">
  241 + <token type="string" offset="false">
  242 + <pre>
  243 + <item type="name" />
  244 + </pre>
  245 + <post>
  246 + <item type="attribute" name="class" />
  247 + </post>
  248 + </token>
  249 + </mapping>
  250 + <mapping type="group" name="div">
  251 + <token type="string" offset="false">
  252 + <pre>
  253 + <item type="name" />
  254 + </pre>
  255 + <post>
  256 + <item type="attribute" name="class" />
  257 + </post>
  258 + </token>
  259 + </mapping>
  260 + <mapping type="group" name="head">
  261 + <token type="string" offset="false">
  262 + <pre>
  263 + <item type="name" />
  264 + </pre>
  265 + <post>
  266 + <item type="attribute" name="class" />
  267 + </post>
  268 + </token>
  269 + </mapping>
  270 + <!-- END GROUPS -->
  271 +
  272 + <!-- START GROUP ANNOTATIONS -->
  273 + <mapping type="groupAnnotation" name="lang">
  274 + <token type="string" offset="false" realoffset="false" parent="false">
  275 + <pre>
  276 + <item type="name" />
  277 + </pre>
  278 + <post>
  279 + <item type="attribute" name="class" />
  280 + </post>
  281 + </token>
  282 + </mapping>
  283 + <!-- END GROUP ANNOTATIONS -->
  284 +
  285 + </mappings>
  286 + <!-- END MAPPINGS -->
  287 +
  288 + </parser>
  289 + <!-- END CONFIGURATION MTAS FOLIA PARSER -->
  290 +
  291 +
  292 +</mtas>
... ...
docker/mtas/demo_tei.xml 0 → 100644
  1 +<?xml version="1.0" encoding="UTF-8" ?>
  2 +<mtas>
  3 +
  4 + <!-- START MTAS INDEX CONFIGURATION -->
  5 + <index>
  6 + <!-- START GENERAL SETTINGS MTAS INDEX PROCESS -->
  7 + <payload index="true" />
  8 + <offset index="true" />
  9 + <realoffset index="true" />
  10 + <parent index="true" />
  11 + <!-- END GENERAL SETTINGS MTAS INDEX PROCESS -->
  12 + </index>
  13 + <!-- END MTAS INDEX CONFIGURATION -->
  14 +
  15 +
  16 +
  17 + <!-- START CONFIGURATION MTAS TEI PARSER -->
  18 + <parser name="mtas.analysis.parser.MtasTEIParser">
  19 +
  20 + <!-- START GENERAL SETTINGS MTAS TEI PARSER -->
  21 + <autorepair value="true" />
  22 + <makeunique value="true" />
  23 + <!-- END GENERAL SETTINGS MTAS TEI PARSER -->
  24 +
  25 + <!-- START REFERENCES -->
  26 + <references>
  27 + </references>
  28 + <!-- END REFERENCES -->
  29 +
  30 + <!-- START MAPPINGS -->
  31 + <mappings>
  32 +
  33 + <!-- START WORDS -->
  34 + <mapping type="word" name="w">
  35 + <token type="string" offset="false" realoffset="false" parent="true">
  36 + <pre>
  37 + <item type="string" value="t" />
  38 + </pre>
  39 + <post>
  40 + <item type="text" />
  41 + </post>
  42 + </token>
  43 + <token type="string" offset="false" realoffset="false" parent="false">
  44 + <pre>
  45 + <item type="string" value="t_lc" />
  46 + </pre>
  47 + <post>
  48 + <item type="text" filter="ascii,lowercase" />
  49 + </post>
  50 + </token>
  51 + </mapping>
  52 + <mapping type="word" name="w">
  53 + <token type="string" offset="false" realoffset="false" parent="false">
  54 + <pre>
  55 + <item type="string" value="lemma" />
  56 + </pre>
  57 + <post>
  58 + <item type="attribute" name="lemma" />
  59 + </post>
  60 + </token>
  61 + <condition>
  62 + <item type="attribute" name="lemma" />
  63 + </condition>
  64 + </mapping>
  65 + <mapping type="word" name="w">
  66 + <token type="string" offset="false" realoffset="false" parent="false">
  67 + <pre>
  68 + <item type="string" value="type" />
  69 + </pre>
  70 + <post>
  71 + <item type="attribute" name="type" />
  72 + </post>
  73 + </token>
  74 + <condition>
  75 + <item type="attribute" name="type" />
  76 + </condition>
  77 + </mapping>
  78 + <mapping type="word" name="pc">
  79 + <token type="string" offset="false" realoffset="false" parent="false">
  80 + <pre>
  81 + <item type="string" value="t" />
  82 + </pre>
  83 + <post>
  84 + <item type="text" />
  85 + </post>
  86 + </token>
  87 + <token type="string" offset="false" realoffset="false" parent="false">
  88 + <pre>
  89 + <item type="string" value="t_lc" />
  90 + </pre>
  91 + <post>
  92 + <item type="text" filter="ascii,lowercase" />
  93 + </post>
  94 + </token>
  95 + </mapping>
  96 + <!-- END WORDS -->
  97 +
  98 + <!-- START WORD ANNOTATIONS -->
  99 + <!-- END WORD ANNOTATIONS -->
  100 +
  101 + <!-- START RELATIONS -->
  102 + <!-- END RELATIONS -->
  103 +
  104 + <!-- START GROUPS -->
  105 + <mapping type="group" name="p">
  106 + <token type="string" offset="false">
  107 + <pre>
  108 + <item type="name" />
  109 + </pre>
  110 + <post>
  111 + <item type="attribute" name="type" />
  112 + </post>
  113 + </token>
  114 + </mapping>
  115 + <mapping type="group" name="div">
  116 + <token type="string" offset="false">
  117 + <pre>
  118 + <item type="name" />
  119 + </pre>
  120 + <post>
  121 + <item type="attribute" name="type" />
  122 + </post>
  123 + </token>
  124 + </mapping>
  125 + <mapping type="group" name="rs">
  126 + <token type="string" offset="false">
  127 + <pre>
  128 + <item type="name" />
  129 + </pre>
  130 + <post>
  131 + <item type="attribute" name="type" />
  132 + </post>
  133 + </token>
  134 + </mapping>
  135 + <!-- END GROUPS -->
  136 +
  137 + <!-- START GROUP ANNOTATIONS -->
  138 + <!-- END GROUP ANNOTATIONS -->
  139 +
  140 + </mappings>
  141 + <!-- END MAPPINGS -->
  142 +
  143 + </parser>
  144 + <!-- END CONFIGURATION MTAS TEI PARSER -->
  145 +
  146 +</mtas>
... ...
docker/schemaBasic.xml 0 → 100644
  1 +<?xml version="1.0" encoding="UTF-8" ?>
  2 +
  3 +<schema name="demo" version="1.6">
  4 +
  5 + <field name="_version_" type="long" indexed="true" multiValued="false" stored="false" docValues="true" />
  6 + <field name="id" type="string" indexed="true" stored="true" required="true" multiValued="false" />
  7 + <field name="type" type="text" required="false" multiValued="false" indexed="true" stored="true" />
  8 + <field name="title" type="text" required="false" multiValued="false" indexed="true" stored="true" />
  9 + <field name="genre" type="string" required="false" multiValued="true" indexed="true" stored="true" />
  10 + <field name="text" type="mtas" required="false" multiValued="false" indexed="true" stored="true" />
  11 +
  12 + <uniqueKey>id</uniqueKey>
  13 +
  14 + <fieldType name="string" class="solr.StrField" sortMissingLast="true" docValues="true" />
  15 + <fieldType name="int" class="solr.TrieIntField" precisionStep="8" positionIncrementGap="0" />
  16 + <fieldType name="long" class="solr.TrieLongField" precisionStep="0" positionIncrementGap="0" />
  17 + <fieldType name="text" class="solr.TextField" positionIncrementGap="100">
  18 + <analyzer type="index">
  19 + <tokenizer class="solr.StandardTokenizerFactory" />
  20 + <filter class="solr.LowerCaseFilterFactory" />
  21 + </analyzer>
  22 + <analyzer type="query">
  23 + <tokenizer class="solr.StandardTokenizerFactory" />
  24 + <filter class="solr.LowerCaseFilterFactory" />
  25 + </analyzer>
  26 + </fieldType>
  27 + <fieldType name="mtas" class="solr.TextField" postingsFormat="MtasCodec"> <!-- Text field type for Mtas content, using the MtasCodec postings format. -->
  28 + <analyzer type="index"> <!-- Index time: Mtas char filter followed by the Mtas tokenizer. -->
  29 + <charFilter class="mtas.analysis.util.MtasCharFilterFactory" type="url" prefix="http://localhost/demo/" postfix="" /> <!-- NOTE(review): presumably the field value is combined with prefix and postfix into the URL to fetch; confirm. -->
  30 + <tokenizer class="mtas.analysis.util.MtasTokenizerFactory" configFile="mtas/demo_folia.xml" /> <!-- Tokenizer is fixed to the demo FoLiA parser configuration. -->
  31 + </analyzer>
  32 + <analyzer type="query"> <!-- Query time: whitespace tokenizer plus the Mtas prefix token filter with prefix "t". -->
  33 + <tokenizer class="solr.WhitespaceTokenizerFactory" />
  34 + <filter class="mtas.analysis.util.MtasPrefixTokenFilterFactory" prefix="t" />
  35 + </analyzer>
  36 + </fieldType>
  37 +
  38 +</schema>
  39 +
... ...
docker/schemaFull.xml 0 → 100644
  1 +<?xml version="1.0" encoding="UTF-8" ?>
  2 +
  3 +<schema name="demo" version="1.6">
  4 +
  5 + <field name="_version_" type="long" indexed="true" multiValued="false" stored="false" docValues="true" />
  6 + <field name="id" type="string" indexed="true" stored="true" required="true" multiValued="false" />
  7 + <field name="type" type="text" required="true" multiValued="false" indexed="true" stored="true" />
  8 + <field name="title" type="text" required="false" multiValued="false" indexed="true" stored="true" />
  9 + <field name="genre" type="string" required="false" multiValued="true" indexed="true" stored="true" />
  10 + <field name="text" type="mtas" required="false" multiValued="false" indexed="true" stored="true" />
  11 + <field name="error" type="string" indexed="true" stored="true" />
  12 + <field name="numberOfTokens" type="int" indexed="true" stored="true" />
  13 + <field name="numberOfPositions" type="int" indexed="true" stored="true" />
  14 + <field name="size" type="int" indexed="true" stored="true" />
  15 +
  16 + <uniqueKey>id</uniqueKey>
  17 +
  18 + <fieldType name="string" class="solr.StrField" sortMissingLast="true" docValues="true" />
  19 + <fieldType name="int" class="solr.TrieIntField" precisionStep="8" positionIncrementGap="0" />
  20 + <fieldType name="long" class="solr.TrieLongField" precisionStep="0" positionIncrementGap="0" />
  21 + <fieldType name="text" class="solr.TextField" positionIncrementGap="100">
  22 + <analyzer type="index">
  23 + <tokenizer class="solr.StandardTokenizerFactory" />
  24 + <filter class="solr.LowerCaseFilterFactory" />
  25 + </analyzer>
  26 + <analyzer type="query">
  27 + <tokenizer class="solr.StandardTokenizerFactory" />
  28 + <filter class="solr.LowerCaseFilterFactory" />
  29 + </analyzer>
  30 + </fieldType>
  31 + <fieldType name="mtas_config" class="solr.TextField" postingsFormat="MtasCodec"> <!-- Index-only analyzer chain; named by followIndexAnalyzer on the "mtas" field type in this schema. -->
  32 + <analyzer type="index">
  33 + <charFilter class="mtas.analysis.util.MtasCharFilterFactory" config="mtas.xml" /> <!-- Both factories take their settings from mtas.xml via the config attribute. -->
  34 + <tokenizer class="mtas.analysis.util.MtasTokenizerFactory" config="mtas.xml" />
  35 + </analyzer>
  36 + </fieldType>
  37 + <fieldType name="mtas" class="mtas.solr.schema.MtasPreAnalyzedField"
  38 + followIndexAnalyzer="mtas_config" defaultConfiguration="default"
  39 + configurationFromField="type" setNumberOfTokens="numberOfTokens"
  40 + setNumberOfPositions="numberOfPositions" setSize="size"
  41 + setError="error" postingsFormat="MtasCodec"> <!-- Pre-analyzed Mtas field type. configurationFromField and the set* attributes name fields declared in this schema (type, numberOfTokens, numberOfPositions, size, error). NOTE(review): defaultConfiguration is "default", but docker/mtas.xml in this commit only defines "folia" and "tei"; confirm that a "default" configuration is not needed. -->
  42 + <analyzer type="query"> <!-- Query time: whitespace tokenizer plus the Mtas prefix token filter with prefix "t". -->
  43 + <tokenizer class="solr.WhitespaceTokenizerFactory" />
  44 + <filter class="mtas.analysis.util.MtasPrefixTokenFilterFactory" prefix="t" />
  45 + </analyzer>
  46 + </fieldType>
  47 +
  48 +</schema>
  49 +
... ...
docker/solrconfig.xml 0 → 100644
  1 +<?xml version="1.0" encoding="UTF-8" ?>
  2 +<!--
  3 + Licensed to the Apache Software Foundation (ASF) under one or more
  4 + contributor license agreements. See the NOTICE file distributed with
  5 + this work for additional information regarding copyright ownership.
  6 + The ASF licenses this file to You under the Apache License, Version 2.0
  7 + (the "License"); you may not use this file except in compliance with
  8 + the License. You may obtain a copy of the License at
  9 +
  10 + http://www.apache.org/licenses/LICENSE-2.0
  11 +
  12 + Unless required by applicable law or agreed to in writing, software
  13 + distributed under the License is distributed on an "AS IS" BASIS,
  14 + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15 + See the License for the specific language governing permissions and
  16 + limitations under the License.
  17 +-->
  18 +
  19 +<!--
  20 + For more details about configurations options that may appear in
  21 + this file, see http://wiki.apache.org/solr/SolrConfigXml.
  22 +-->
  23 +<config>
  24 + <!-- In all configuration below, a prefix of "solr." for class names
  25 + is an alias that causes solr to search appropriate packages,
  26 + including org.apache.solr.(search|update|request|core|analysis)
  27 +
  28 + You may also specify a fully qualified Java classname if you
  29 + have your own custom plugins.
  30 + -->
  31 +
  32 + <!-- Controls what version of Lucene various components of Solr
  33 + adhere to. Generally, you want to use the latest version to
  34 + get all bug fixes and improvements. It is highly recommended
  35 + that you fully re-index after changing this setting as it can
  36 + affect both how text is indexed and queried.
  37 + -->
  38 + <luceneMatchVersion>6.2.0</luceneMatchVersion>
  39 +
  40 + <!-- <lib/> directives can be used to instruct Solr to load any Jars
  41 + identified and use them to resolve any "plugins" specified in
  42 + your solrconfig.xml or schema.xml (ie: Analyzers, Request
  43 + Handlers, etc...).
  44 +
  45 + All directories and paths are resolved relative to the
  46 + instanceDir.
  47 +
  48 + Please note that <lib/> directives are processed in the order
  49 + that they appear in your solrconfig.xml file, and are "stacked"
  50 + on top of each other when building a ClassLoader - so if you have
  51 + plugin jars with dependencies on other jars, the "lower level"
  52 + dependency jars should be loaded first.
  53 +
  54 + If a "./lib" directory exists in your instanceDir, all files
  55 + found in it are included as if you had used the following
  56 + syntax...
  57 +
  58 + <lib dir="./lib" />
  59 + -->
  60 +
  61 + <!-- A 'dir' option by itself adds any files found in the directory
  62 + to the classpath, this is useful for including all jars in a
  63 + directory.
  64 +
  65 + When a 'regex' is specified in addition to a 'dir', only the
  66 + files in that directory which completely match the regex
  67 + (anchored on both ends) will be included.
  68 +
  69 + If a 'dir' option (with or without a regex) is used and nothing
  70 + is found that matches, a warning will be logged.
  71 +
  72 + The examples below can be used to load some solr-contribs along
  73 + with their external dependencies.
  74 + -->
  75 + <lib dir="${solr.install.dir:../../../..}/contrib/extraction/lib" regex=".*\.jar" />
  76 + <lib dir="${solr.install.dir:../../../..}/dist/" regex="solr-cell-\d.*\.jar" />
  77 +
  78 + <lib dir="${solr.install.dir:../../../..}/contrib/clustering/lib/" regex=".*\.jar" />
  79 + <lib dir="${solr.install.dir:../../../..}/dist/" regex="solr-clustering-\d.*\.jar" />
  80 +
  81 + <lib dir="${solr.install.dir:../../../..}/contrib/langid/lib/" regex=".*\.jar" />
  82 + <lib dir="${solr.install.dir:../../../..}/dist/" regex="solr-langid-\d.*\.jar" />
  83 +
  84 + <lib dir="${solr.install.dir:../../../..}/contrib/velocity/lib" regex=".*\.jar" />
  85 + <lib dir="${solr.install.dir:../../../..}/dist/" regex="solr-velocity-\d.*\.jar" />
  86 + <!-- an exact 'path' can be used instead of a 'dir' to specify a
  87 + specific jar file. This will cause a serious error to be logged
  88 + if it can't be loaded.
  89 + -->
  90 + <!--
  91 + <lib path="../a-jar-that-does-not-exist.jar" />
  92 + -->
  93 +
  94 + <!-- Data Directory
  95 +
  96 + Used to specify an alternate directory to hold all index data
  97 + other than the default ./data under the Solr home. If
  98 + replication is in use, this should match the replication
  99 + configuration.
  100 + -->
  101 + <dataDir>${solr.data.dir:}</dataDir>
  102 +
  103 +
  104 + <!-- The DirectoryFactory to use for indexes.
  105 +
  106 + solr.StandardDirectoryFactory is filesystem
  107 + based and tries to pick the best implementation for the current
  108 + JVM and platform. solr.NRTCachingDirectoryFactory, the default,
  109 + wraps solr.StandardDirectoryFactory and caches small files in memory
  110 + for better NRT performance.
  111 +
  112 + One can force a particular implementation via solr.MMapDirectoryFactory,
  113 + solr.NIOFSDirectoryFactory, or solr.SimpleFSDirectoryFactory.
  114 +
  115 + solr.RAMDirectoryFactory is memory based, not
  116 + persistent, and doesn't work with replication.
  117 + -->
  118 + <directoryFactory name="DirectoryFactory"
  119 + class="${solr.directoryFactory:solr.NRTCachingDirectoryFactory}"/>
  120 +
  121 + <!-- The CodecFactory for defining the format of the inverted index.
  122 + The default implementation is SchemaCodecFactory, which is the official Lucene
  123 + index format, but hooks into the schema to provide per-field customization of
  124 + the postings lists and per-document values in the fieldType element
  125 + (postingsFormat/docValuesFormat). Note that most of the alternative implementations
  126 + are experimental, so if you choose to customize the index format, it's a good
  127 + idea to convert back to the official format e.g. via IndexWriter.addIndexes(IndexReader)
  128 + before upgrading to a newer version to avoid unnecessary reindexing.
  129 + A "compressionMode" string element can be added to <codecFactory> to choose
  130 + between the existing compression modes in the default codec: "BEST_SPEED" (default)
  131 + or "BEST_COMPRESSION".
  132 + -->
  133 + <codecFactory class="solr.SchemaCodecFactory"/>
  134 +
  135 + <!-- MTAS: use classic schema for demo -->
  136 + <schemaFactory class="ClassicIndexSchemaFactory"/>
  137 +
  138 + <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  139 + Index Config - These settings control low-level behavior of indexing
  140 + Most example settings here show the default value, but are commented
  141 + out, to more easily see where customizations have been made.
  142 +
  143 + Note: This replaces <indexDefaults> and <mainIndex> from older versions
  144 + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->
  145 + <indexConfig>
  146 + <!-- maxFieldLength was removed in 4.0. To get similar behavior, include a
  147 + LimitTokenCountFilterFactory in your fieldType definition. E.g.
  148 + <filter class="solr.LimitTokenCountFilterFactory" maxTokenCount="10000"/>
  149 + -->
  150 + <!-- Maximum time to wait for a write lock (ms) for an IndexWriter. Default: 1000 -->
  151 + <!-- <writeLockTimeout>1000</writeLockTimeout> -->
  152 +
  153 + <!-- Expert: Enabling compound file will use fewer files for the index,
  154 + and thus fewer file descriptors, at the expense of decreased performance.
  155 + Default in Lucene is "true". Default in Solr is "false" (since 3.6) -->
  156 + <!-- <useCompoundFile>false</useCompoundFile> -->
  157 +
  158 + <!-- ramBufferSizeMB sets the amount of RAM that may be used by Lucene
  159 + indexing for buffering added documents and deletions before they are
  160 + flushed to the Directory.
  161 + maxBufferedDocs sets a limit on the number of documents buffered
  162 + before flushing.
  163 + If both ramBufferSizeMB and maxBufferedDocs are set, then
  164 + Lucene will flush based on whichever limit is hit first. -->
  165 + <!-- <ramBufferSizeMB>100</ramBufferSizeMB> -->
  166 + <!-- <maxBufferedDocs>1000</maxBufferedDocs> -->
  167 +
  168 + <!-- Expert: Merge Policy
  169 + The Merge Policy in Lucene controls how merging of segments is done.
  170 + The default since Solr/Lucene 3.3 is TieredMergePolicy.
  171 + The default since Lucene 2.3 was the LogByteSizeMergePolicy,
  172 + Even older versions of Lucene used LogDocMergePolicy.
  173 + -->
  174 + <!--
  175 + <mergePolicyFactory class="org.apache.solr.index.TieredMergePolicyFactory">
  176 + <int name="maxMergeAtOnce">10</int>
  177 + <int name="segmentsPerTier">10</int>
  178 + <double name="noCFSRatio">0.1</double>
  179 + </mergePolicyFactory>
  180 + -->
  181 +
  182 + <!-- Expert: Merge Scheduler
  183 + The Merge Scheduler in Lucene controls how merges are
  184 + performed. The ConcurrentMergeScheduler (Lucene 2.3 default)
  185 + can perform merges in the background using separate threads.
  186 + The SerialMergeScheduler (Lucene 2.2 default) does not.
  187 + -->
  188 + <!--
  189 + <mergeScheduler class="org.apache.lucene.index.ConcurrentMergeScheduler"/>
  190 + -->
  191 +
  192 + <!-- LockFactory
  193 +
  194 + This option specifies which Lucene LockFactory implementation
  195 + to use.
  196 +
  197 + single = SingleInstanceLockFactory - suggested for a
  198 + read-only index or when there is no possibility of
  199 + another process trying to modify the index.
  200 + native = NativeFSLockFactory - uses OS native file locking.
  201 + Do not use when multiple solr webapps in the same
  202 + JVM are attempting to share a single index.
  203 + simple = SimpleFSLockFactory - uses a plain file for locking
  204 +
  205 + Defaults: 'native' is default for Solr3.6 and later, otherwise
  206 + 'simple' is the default
  207 +
  208 + More details on the nuances of each LockFactory...
  209 + http://wiki.apache.org/lucene-java/AvailableLockFactories
  210 + -->
  211 + <lockType>${solr.lock.type:native}</lockType>
  212 +
  213 + <!-- Commit Deletion Policy
  214 + Custom deletion policies can be specified here. The class must
  215 + implement org.apache.lucene.index.IndexDeletionPolicy.
  216 +
  217 + The default Solr IndexDeletionPolicy implementation supports
  218 + deleting index commit points on number of commits, age of
  219 + commit point and optimized status.
  220 +
  221 + The latest commit point should always be preserved regardless
  222 + of the criteria.
  223 + -->
  224 + <!--
  225 + <deletionPolicy class="solr.SolrDeletionPolicy">
  226 + -->
  227 + <!-- The number of commit points to be kept -->
  228 + <!-- <str name="maxCommitsToKeep">1</str> -->
  229 + <!-- The number of optimized commit points to be kept -->
  230 + <!-- <str name="maxOptimizedCommitsToKeep">0</str> -->
  231 + <!--
  232 + Delete all commit points once they have reached the given age.
  233 + Supports DateMathParser syntax e.g.
  234 + -->
  235 + <!--
  236 + <str name="maxCommitAge">30MINUTES</str>
  237 + <str name="maxCommitAge">1DAY</str>
  238 + -->
  239 + <!--
  240 + </deletionPolicy>
  241 + -->
  242 +
  243 + <!-- Lucene Infostream
  244 +
  245 + To aid in advanced debugging, Lucene provides an "InfoStream"
  246 + of detailed information when indexing.
  247 +
  248 + Setting the value to true will instruct the underlying Lucene
  249 + IndexWriter to write its debugging info to the specified file.
  250 + -->
  251 + <!-- <infoStream file="INFOSTREAM.txt">false</infoStream> -->
  252 + </indexConfig>
  253 +
  254 +
  255 + <!-- JMX
  256 +
  257 + This example enables JMX if and only if an existing MBeanServer
  258 + is found, use this if you want to configure JMX through JVM
  259 + parameters. Remove this to disable exposing Solr configuration
  260 + and statistics to JMX.
  261 +
  262 + For more details see http://wiki.apache.org/solr/SolrJmx
  263 + -->
  264 + <jmx />
  265 + <!-- If you want to connect to a particular server, specify the
  266 + agentId
  267 + -->
  268 + <!-- <jmx agentId="myAgent" /> -->
  269 + <!-- If you want to start a new MBeanServer, specify the serviceUrl -->
  270 + <!-- <jmx serviceUrl="service:jmx:rmi:///jndi/rmi://localhost:9999/solr"/>
  271 + -->
  272 +
  273 + <!-- The default high-performance update handler -->
  274 + <updateHandler class="solr.DirectUpdateHandler2">
  275 +
  276 + <!-- Enables a transaction log, used for real-time get, durability,
  277 + and solr cloud replica recovery. The log can grow as big as
  278 + uncommitted changes to the index, so use of a hard autoCommit
  279 + is recommended (see below).
  280 + "dir" - the target directory for transaction logs, defaults to the
  281 + solr data directory.
  282 + "numVersionBuckets" - sets the number of buckets used to keep
  283 + track of max version values when checking for re-ordered
  284 + updates; increase this value to reduce the cost of
  285 + synchronizing access to version buckets during high-volume
  286 + indexing, this requires 8 bytes (long) * numVersionBuckets
  287 + of heap space per Solr core.
  288 + -->
  289 + <updateLog>
  290 + <str name="dir">${solr.ulog.dir:}</str>
  291 + <int name="numVersionBuckets">${solr.ulog.numVersionBuckets:65536}</int>
  292 + </updateLog>
  293 +
  294 + <!-- AutoCommit
  295 +
  296 + Perform a hard commit automatically under certain conditions.
  297 + Instead of enabling autoCommit, consider using "commitWithin"
  298 + when adding documents.
  299 +
  300 + http://wiki.apache.org/solr/UpdateXmlMessages
  301 +
  302 + maxDocs - Maximum number of documents to add since the last
  303 + commit before automatically triggering a new commit.
  304 +
  305 + maxTime - Maximum amount of time in ms that is allowed to pass
  306 + since a document was added before automatically
  307 + triggering a new commit.
  308 + openSearcher - if false, the commit causes recent index changes
  309 + to be flushed to stable storage, but does not cause a new
  310 + searcher to be opened to make those changes visible.
  311 +
  312 + If the updateLog is enabled, then it's highly recommended to
  313 + have some sort of hard autoCommit to limit the log size.
  314 + -->
  315 + <autoCommit>
  316 + <maxTime>${solr.autoCommit.maxTime:15000}</maxTime>
  317 + <openSearcher>false</openSearcher>
  318 + </autoCommit>
  319 +
  320 + <!-- softAutoCommit is like autoCommit except it causes a
  321 + 'soft' commit which only ensures that changes are visible
  322 + but does not ensure that data is synced to disk. This is
  323 + faster and more near-realtime friendly than a hard commit.
  324 + -->
  325 +
  326 + <autoSoftCommit>
  327 + <maxTime>${solr.autoSoftCommit.maxTime:-1}</maxTime>
  328 + </autoSoftCommit>
  329 +
  330 + <!-- Update Related Event Listeners
  331 +
  332 + Various IndexWriter related events can trigger Listeners to
  333 + take actions.
  334 +
  335 + postCommit - fired after every commit or optimize command
  336 + postOptimize - fired after every optimize command
  337 + -->
  338 + <!-- The RunExecutableListener executes an external command from a
  339 + hook such as postCommit or postOptimize.
  340 +
  341 + exe - the name of the executable to run
  342 + dir - dir to use as the current working directory. (default=".")
  343 + wait - the calling thread waits until the executable returns.
  344 + (default="true")
  345 + args - the arguments to pass to the program. (default is none)
  346 + env - environment variables to set. (default is none)
  347 + -->
  348 + <!-- This example shows how RunExecutableListener could be used
  349 + with the script based replication...
  350 + http://wiki.apache.org/solr/CollectionDistribution
  351 + -->
  352 + <!--
  353 + <listener event="postCommit" class="solr.RunExecutableListener">
  354 + <str name="exe">solr/bin/snapshooter</str>
  355 + <str name="dir">.</str>
  356 + <bool name="wait">true</bool>
  357 + <arr name="args"> <str>arg1</str> <str>arg2</str> </arr>
  358 + <arr name="env"> <str>MYVAR=val1</str> </arr>
  359 + </listener>
  360 + -->
  361 +
  362 + </updateHandler>
  363 +
  364 + <!-- IndexReaderFactory
  365 +
  366 + Use the following format to specify a custom IndexReaderFactory,
  367 + which allows for alternate IndexReader implementations.
  368 +
  369 + ** Experimental Feature **
  370 +
  371 + Please note - Using a custom IndexReaderFactory may prevent
  372 + certain other features from working. The API to
  373 + IndexReaderFactory may change without warning or may even be
  374 + removed from future releases if the problems cannot be
  375 + resolved.
  376 +
  377 +
  378 + ** Features that may not work with custom IndexReaderFactory **
  379 +
  380 + The ReplicationHandler assumes a disk-resident index. Using a
  381 + custom IndexReader implementation may cause incompatibility
  382 + with ReplicationHandler and may cause replication to not work
  383 + correctly. See SOLR-1366 for details.
  384 +
  385 + -->
  386 + <!--
  387 + <indexReaderFactory name="IndexReaderFactory" class="package.class">
  388 + <str name="someArg">Some Value</str>
  389 + </indexReaderFactory >
  390 + -->
  391 +
  392 + <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  393 + Query section - these settings control query time things like caches
  394 + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->
  395 + <query>
  396 + <!-- Max Boolean Clauses
  397 +
  398 + Maximum number of clauses in each BooleanQuery, an exception
  399 + is thrown if exceeded.
  400 +
  401 + ** WARNING **
  402 +
  403 + This option actually modifies a global Lucene property that
  404 + will affect all SolrCores. If multiple solrconfig.xml files
  405 + disagree on this property, the value at any given moment will
  406 + be based on the last SolrCore to be initialized.
  407 +
  408 + -->
  409 + <maxBooleanClauses>1024</maxBooleanClauses>
  410 +
  411 +
  412 + <!-- Solr Internal Query Caches
  413 +
  414 + There are two implementations of cache available for Solr,
  415 + LRUCache, based on a synchronized LinkedHashMap, and
  416 + FastLRUCache, based on a ConcurrentHashMap.
  417 +
  418 + FastLRUCache has faster gets and slower puts in single
  419 + threaded operation and thus is generally faster than LRUCache
  420 + when the hit ratio of the cache is high (> 75%), and may be
  421 + faster under other scenarios on multi-cpu systems.
  422 + -->
  423 +
  424 + <!-- Filter Cache
  425 +
  426 + Cache used by SolrIndexSearcher for filters (DocSets),
  427 + unordered sets of *all* documents that match a query. When a
  428 + new searcher is opened, its caches may be prepopulated or
  429 + "autowarmed" using data from caches in the old searcher.
  430 + autowarmCount is the number of items to prepopulate. For
  431 + LRUCache, the autowarmed items will be the most recently
  432 + accessed items.
  433 +
  434 + Parameters:
  435 + class - the SolrCache implementation to use
  436 + (LRUCache or FastLRUCache)
  437 + size - the maximum number of entries in the cache
  438 + initialSize - the initial capacity (number of entries) of
  439 + the cache. (see java.util.HashMap)
  440 + autowarmCount - the number of entries to prepopulate from
  441 + an old cache.
  442 + -->
  443 + <filterCache class="solr.FastLRUCache"
  444 + size="512"
  445 + initialSize="512"
  446 + autowarmCount="0"/>
  447 +
  448 + <!-- Query Result Cache
  449 +
  450 + Caches results of searches - ordered lists of document ids
  451 + (DocList) based on a query, a sort, and the range of documents requested.
  452 + Additional supported parameter by LRUCache:
  453 + maxRamMB - the maximum amount of RAM (in MB) that this cache is allowed
  454 + to occupy
  455 + -->
  456 + <queryResultCache class="solr.LRUCache"
  457 + size="512"
  458 + initialSize="512"
  459 + autowarmCount="0"/>
  460 +
  461 + <!-- Document Cache
  462 +
  463 + Caches Lucene Document objects (the stored fields for each
  464 + document). Since Lucene internal document ids are transient,
  465 + this cache will not be autowarmed.
  466 + -->
  467 + <documentCache class="solr.LRUCache"
  468 + size="512"
  469 + initialSize="512"
  470 + autowarmCount="0"/>
  471 +
  472 + <!-- custom cache currently used by block join -->
  473 + <cache name="perSegFilter"
  474 + class="solr.search.LRUCache"
  475 + size="10"
  476 + initialSize="0"
  477 + autowarmCount="10"
  478 + regenerator="solr.NoOpRegenerator" />
  479 +
  480 + <!-- Field Value Cache
  481 +
  482 + Cache used to hold field values that are quickly accessible
  483 + by document id. The fieldValueCache is created by default
  484 + even if not configured here.
  485 + -->
  486 + <!--
  487 + <fieldValueCache class="solr.FastLRUCache"
  488 + size="512"
  489 + autowarmCount="128"
  490 + showItems="32" />
  491 + -->
  492 +
  493 + <!-- Custom Cache
  494 +
  495 + Example of a generic cache. These caches may be accessed by
  496 + name through SolrIndexSearcher.getCache(),cacheLookup(), and
  497 + cacheInsert(). The purpose is to enable easy caching of
  498 + user/application level data. The regenerator argument should
  499 + be specified as an implementation of solr.CacheRegenerator
  500 + if autowarming is desired.
  501 + -->
  502 + <!--
  503 + <cache name="myUserCache"
  504 + class="solr.LRUCache"
  505 + size="4096"
  506 + initialSize="1024"
  507 + autowarmCount="1024"
  508 + regenerator="com.mycompany.MyRegenerator"
  509 + />
  510 + -->
  511 +
  512 +
  513 + <!-- Lazy Field Loading
  514 +
  515 + If true, stored fields that are not requested will be loaded
  516 + lazily. This can result in a significant speed improvement
  517 + if the usual case is to not load all stored fields,
  518 + especially if the skipped fields are large compressed text
  519 + fields.
  520 + -->
  521 + <enableLazyFieldLoading>true</enableLazyFieldLoading>
  522 +
  523 + <!-- Use Filter For Sorted Query
  524 +
  525 + A possible optimization that attempts to use a filter to
  526 + satisfy a search. If the requested sort does not include
  527 + score, then the filterCache will be checked for a filter
  528 + matching the query. If found, the filter will be used as the
  529 + source of document ids, and then the sort will be applied to
  530 + that.
  531 +
  532 + For most situations, this will not be useful unless you
  533 + frequently get the same search repeatedly with different sort
  534 + options, and none of them ever use "score"
  535 + -->
  536 + <!--
  537 + <useFilterForSortedQuery>true</useFilterForSortedQuery>
  538 + -->
  539 +
  540 + <!-- Result Window Size
  541 +
  542 + An optimization for use with the queryResultCache. When a search
  543 + is requested, a superset of the requested number of document ids
  544 + are collected. For example, if a search for a particular query
  545 + requests matching documents 10 through 19, and queryResultWindowSize is 50,
  546 + then documents 0 through 49 will be collected and cached. Any further
  547 + requests in that range can be satisfied via the cache.
  548 + -->
  549 + <queryResultWindowSize>20</queryResultWindowSize>
  550 +
  551 + <!-- Maximum number of documents to cache for any entry in the
  552 + queryResultCache.
  553 + -->
  554 + <queryResultMaxDocsCached>200</queryResultMaxDocsCached>
  555 +
  556 + <!-- Query Related Event Listeners
  557 +
  558 + Various IndexSearcher related events can trigger Listeners to
  559 + take actions.
  560 +
  561 + newSearcher - fired whenever a new searcher is being prepared
  562 + and there is a current searcher handling requests (aka
  563 + registered). It can be used to prime certain caches to
  564 + prevent long request times for certain requests.
  565 +
  566 + firstSearcher - fired whenever a new searcher is being
  567 + prepared but there is no current registered searcher to handle
  568 + requests or to gain autowarming data from.
  569 +
  570 +
  571 + -->
  572 + <!-- QuerySenderListener takes an array of NamedList and executes a
  573 + local query request for each NamedList in sequence.
  574 + -->
  575 + <listener event="newSearcher" class="solr.QuerySenderListener">
  576 + <arr name="queries">
  577 + <!--
  578 + <lst><str name="q">solr</str><str name="sort">price asc</str></lst>
  579 + <lst><str name="q">rocks</str><str name="sort">weight asc</str></lst>
  580 + -->
  581 + </arr>
  582 + </listener>
  583 + <listener event="firstSearcher" class="solr.QuerySenderListener">
  584 + <arr name="queries">
  585 + <!--
  586 + <lst>
  587 + <str name="q">static firstSearcher warming in solrconfig.xml</str>
  588 + </lst>
  589 + -->
  590 + </arr>
  591 + </listener>
  592 +
  593 + <!-- Use Cold Searcher
  594 +
  595 + If a search request comes in and there is no current
  596 + registered searcher, then immediately register the still
  597 + warming searcher and use it. If "false" then all requests
  598 + will block until the first searcher is done warming.
  599 + -->
  600 + <useColdSearcher>false</useColdSearcher>
  601 +
  602 + <!-- Max Warming Searchers
  603 +
  604 + Maximum number of searchers that may be warming in the
  605 + background concurrently. An error is returned if this limit
  606 + is exceeded.
  607 +
  608 + Recommend values of 1-2 for read-only slaves, higher for
  609 + masters w/o cache warming.
  610 + -->
  611 + <maxWarmingSearchers>2</maxWarmingSearchers>
  612 +
  613 + </query>
  614 +
  615 +
  616 + <!-- Request Dispatcher
  617 +
  618 + This section contains instructions for how the SolrDispatchFilter
  619 + should behave when processing requests for this SolrCore.
  620 +
  621 + handleSelect is a legacy option that affects the behavior of requests
  622 + such as /select?qt=XXX
  623 +
  624 + handleSelect="true" will cause the SolrDispatchFilter to process
  625 + the request and dispatch the query to a handler specified by the
  626 + "qt" param, assuming "/select" isn't already registered.
  627 +
  628 + handleSelect="false" will cause the SolrDispatchFilter to
  629 + ignore "/select" requests, resulting in a 404 unless a handler
  630 + is explicitly registered with the name "/select"
  631 +
  632 + handleSelect="true" is not recommended for new users, but is the default
  633 + for backwards compatibility
  634 + -->
  635 + <requestDispatcher handleSelect="false" >
  636 + <!-- Request Parsing
  637 +
  638 + These settings indicate how Solr Requests may be parsed, and
  639 + what restrictions may be placed on the ContentStreams from
  640 + those requests
  641 +
  642 + enableRemoteStreaming - enables use of the stream.file
  643 + and stream.url parameters for specifying remote streams.
  644 +
  645 + multipartUploadLimitInKB - specifies the max size (in KiB) of
  646 + Multipart File Uploads that Solr will allow in a Request.
  647 +
  648 + formdataUploadLimitInKB - specifies the max size (in KiB) of
  649 + form data (application/x-www-form-urlencoded) sent via
  650 + POST. You can use POST to pass request parameters not
  651 + fitting into the URL.
  652 +
  653 + addHttpRequestToContext - if set to true, it will instruct
  654 + the requestParsers to include the original HttpServletRequest
  655 + object in the context map of the SolrQueryRequest under the
  656 + key "httpRequest". It will not be used by any of the existing
  657 + Solr components, but may be useful when developing custom
  658 + plugins.
  659 +
  660 + *** WARNING ***
  661 + The settings below authorize Solr to fetch remote files. You
  662 + should make sure your system has some authentication before
  663 + using enableRemoteStreaming="true"
  664 +
  665 + -->
  666 + <requestParsers enableRemoteStreaming="true"
  667 + multipartUploadLimitInKB="2048000"
  668 + formdataUploadLimitInKB="2048"
  669 + addHttpRequestToContext="false"/>
  670 +
  671 + <!-- HTTP Caching
  672 +
  673 + Set HTTP caching related parameters (for proxy caches and clients).
  674 +
  675 + The options below instruct Solr not to output any HTTP Caching
  676 + related headers
  677 + -->
  678 + <httpCaching never304="true" />
  679 + <!-- If you include a <cacheControl> directive, it will be used to
  680 + generate a Cache-Control header (as well as an Expires header
  681 + if the value contains "max-age=")
  682 +
  683 + By default, no Cache-Control header is generated.
  684 +
  685 + You can use the <cacheControl> option even if you have set
  686 + never304="true"
  687 + -->
  688 + <!--
  689 + <httpCaching never304="true" >
  690 + <cacheControl>max-age=30, public</cacheControl>
  691 + </httpCaching>
  692 + -->
  693 + <!-- To enable Solr to respond with automatically generated HTTP
  694 + Caching headers, and to respond to Cache Validation requests
  695 + correctly, set the value of never304="false"
  696 +
  697 + This will cause Solr to generate Last-Modified and ETag
  698 + headers based on the properties of the Index.
  699 +
  700 + The following options can also be specified to affect the
  701 + values of these headers...
  702 +
  703 + lastModFrom - the default value is "openTime" which means the
  704 + Last-Modified value (and validation against If-Modified-Since
  705 + requests) will all be relative to when the current Searcher
  706 + was opened. You can change it to lastModFrom="dirLastMod" if
  707 + you want the value to exactly correspond to when the physical
  708 + index was last modified.
  709 +
  710 + etagSeed="..." is an option you can change to force the ETag
  711 + header (and validation against If-None-Match requests) to be
  712 + different even if the index has not changed (ie: when making
  713 + significant changes to your config file)
  714 +
  715 + (lastModFrom and etagSeed are both ignored if you use
  716 + the never304="true" option)
  717 + -->
  718 + <!--
  719 + <httpCaching lastModFrom="openTime"
  720 + etagSeed="Solr">
  721 + <cacheControl>max-age=30, public</cacheControl>
  722 + </httpCaching>
  723 + -->
  724 + </requestDispatcher>
  725 +
  726 + <!-- Request Handlers
  727 +
  728 + http://wiki.apache.org/solr/SolrRequestHandler
  729 +
  730 + Incoming queries will be dispatched to a specific handler by name
  731 + based on the path specified in the request.
  732 +
  733 + Legacy behavior: If the request path uses "/select" but no Request
  734 + Handler has that name, and if handleSelect="true" has been specified in
  735 + the requestDispatcher, then the Request Handler is dispatched based on
  736 + the qt parameter. Handlers without a leading '/' are accessed this way
  737 + like so: http://host/app/[core/]select?qt=name If no qt is
  738 + given, then the requestHandler that declares default="true" will be
  739 + used or the one named "standard".
  740 +
  741 + If a Request Handler is declared with startup="lazy", then it will
  742 + not be initialized until the first request that uses it.
  743 +
  744 + -->
  745 + <!-- SearchHandler
  746 +
  747 + http://wiki.apache.org/solr/SearchHandler
  748 +
  749 + For processing Search Queries, the primary Request Handler
  750 + provided with Solr is "SearchHandler". It delegates to a sequence
  751 + of SearchComponents (see below) and supports distributed
  752 + queries across multiple shards
  753 + -->
  754 + <requestHandler name="/select" class="solr.SearchHandler">
  755 + <!-- default values for query parameters can be specified, these
  756 + will be overridden by parameters in the request
  757 + -->
  758 + <!-- MTAS : add mtas component -->
  759 + <arr name="last-components">
  760 + <str>mtas</str>
  761 + </arr>
  762 + <lst name="defaults">
  763 + <str name="echoParams">explicit</str>
  764 + <int name="rows">10</int>
  765 + <!-- <str name="df">text</str> -->
  766 + </lst>
  767 + <!-- In addition to defaults, "appends" params can be specified
  768 + to identify values which should be appended to the list of
  769 + multi-val params from the query (or the existing "defaults").
  770 + -->
  771 + <!-- In this example, the param "fq=instock:true" would be appended to
  772 + any query time fq params the user may specify, as a mechanism for
  773 + partitioning the index, independent of any user selected filtering
  774 + that may also be desired (perhaps as a result of faceted searching).
  775 +
  776 + NOTE: there is *absolutely* nothing a client can do to prevent these
  777 + "appends" values from being used, so don't use this mechanism
  778 + unless you are sure you always want it.
  779 + -->
  780 + <!--
  781 + <lst name="appends">
  782 + <str name="fq">inStock:true</str>
  783 + </lst>
  784 + -->
  785 + <!-- "invariants" are a way of letting the Solr maintainer lock down
  786 + the options available to Solr clients. Any params values
  787 + specified here are used regardless of what values may be specified
  788 + in either the query, the "defaults", or the "appends" params.
  789 +
  790 + In this example, the facet.field and facet.query params would
  791 + be fixed, limiting the facets clients can use. Faceting is
  792 + not turned on by default - but if the client does specify
  793 + facet=true in the request, these are the only facets they
  794 + will be able to see counts for; regardless of what other
  795 + facet.field or facet.query params they may specify.
  796 +
  797 + NOTE: there is *absolutely* nothing a client can do to prevent these
  798 + "invariants" values from being used, so don't use this mechanism
  799 + unless you are sure you always want it.
  800 + -->
  801 + <!--
  802 + <lst name="invariants">
  803 + <str name="facet.field">cat</str>
  804 + <str name="facet.field">manu_exact</str>
  805 + <str name="facet.query">price:[* TO 500]</str>
  806 + <str name="facet.query">price:[500 TO *]</str>
  807 + </lst>
  808 + -->
  809 + <!-- If the default list of SearchComponents is not desired, that
  810 + list can either be overridden completely, or components can be
  811 + prepended or appended to the default list. (see below)
  812 + -->
  813 + <!--
  814 + <arr name="components">
  815 + <str>nameOfCustomComponent1</str>
  816 + <str>nameOfCustomComponent2</str>
  817 + </arr>
  818 + -->
  819 + </requestHandler>
  820 +
  821 + <!-- A request handler that returns indented JSON by default -->
  822 + <requestHandler name="/query" class="solr.SearchHandler">
  823 + <lst name="defaults">
  824 + <str name="echoParams">explicit</str>
  825 + <str name="wt">json</str>
  826 + <str name="indent">true</str>
  827 + </lst>
  828 + </requestHandler>
  829 +
  830 +
  831 + <!-- A Robust Example
  832 +
  833 + This example SearchHandler declaration shows off usage of the
  834 + SearchHandler with many defaults declared
  835 +
  836 + Note that multiple instances of the same Request Handler
  837 + (SearchHandler) can be registered multiple times with different
  838 + names (and different init parameters)
  839 + -->
  840 + <requestHandler name="/browse" class="solr.SearchHandler" useParams="query,facets,velocity,browse">
  841 + <lst name="defaults">
  842 + <str name="echoParams">explicit</str>
  843 + </lst>
  844 + </requestHandler>
  845 +
  846 + <initParams path="/update/**,/query,/select,/tvrh,/elevate,/spell,/browse">
  847 + <lst name="defaults">
  848 + <str name="df">_text_</str>
  849 + </lst>
  850 + </initParams>
  851 +
  852 + <initParams path="/update/**">
  853 + <lst name="defaults">
  854 + <str name="update.chain">add-unknown-fields-to-the-schema</str>
  855 + </lst>
  856 + </initParams>
  857 +
  858 + <!-- Solr Cell Update Request Handler
  859 +
  860 + http://wiki.apache.org/solr/ExtractingRequestHandler
  861 +
  862 + -->
  863 + <requestHandler name="/update/extract"
  864 + startup="lazy"
  865 + class="solr.extraction.ExtractingRequestHandler" >
  866 + <lst name="defaults">
  867 + <str name="lowernames">true</str>
  868 + <str name="fmap.meta">ignored_</str>
  869 + <str name="fmap.content">_text_</str>
  870 + </lst>
  871 + </requestHandler>
  872 +
  873 + <!-- Field Analysis Request Handler
  874 +
  875 + RequestHandler that provides much the same functionality as
  876 + analysis.jsp. Provides the ability to specify multiple field
  877 + types and field names in the same request and outputs
  878 + index-time and query-time analysis for each of them.
  879 +
  880 + Request parameters are:
  881 + analysis.fieldname - field name whose analyzers are to be used
  882 +
  883 + analysis.fieldtype - field type whose analyzers are to be used
  884 + analysis.fieldvalue - text for index-time analysis
  885 + q (or analysis.q) - text for query time analysis
  886 + analysis.showmatch (true|false) - When set to true and when
  887 + query analysis is performed, the produced tokens of the
  888 + field value analysis will be marked as "matched" for every
  889 + token that is produced by the query analysis
  890 + -->
  891 + <requestHandler name="/analysis/field"
  892 + startup="lazy"
  893 + class="solr.FieldAnalysisRequestHandler" />
  894 +
  895 +
  896 + <!-- Document Analysis Handler
  897 +
  898 + http://wiki.apache.org/solr/AnalysisRequestHandler
  899 +
  900 + An analysis handler that provides a breakdown of the analysis
  901 + process of provided documents. This handler expects a (single)
  902 + content stream with the following format:
  903 +
  904 + <docs>
  905 + <doc>
  906 + <field name="id">1</field>
  907 + <field name="name">The Name</field>
  908 + <field name="text">The Text Value</field>
  909 + </doc>
  910 + <doc>...</doc>
  911 + <doc>...</doc>
  912 + ...
  913 + </docs>
  914 +
  915 + Note: Each document must contain a field which serves as the
  916 + unique key. This key is used in the returned response to associate
  917 + an analysis breakdown to the analyzed document.
  918 +
  919 + Like the FieldAnalysisRequestHandler, this handler also supports
  920 + query analysis by sending either an "analysis.query" or "q"
  921 + request parameter that holds the query text to be analyzed. It
  922 + also supports the "analysis.showmatch" parameter which when set to
  923 + true, all field tokens that match the query tokens will be marked
  924 + as a "match".
  925 + -->
  926 + <requestHandler name="/analysis/document"
  927 + class="solr.DocumentAnalysisRequestHandler"
  928 + startup="lazy" />
  929 +
  930 + <!-- Echo the request contents back to the client -->
  931 + <requestHandler name="/debug/dump" class="solr.DumpRequestHandler" >
  932 + <lst name="defaults">
  933 + <str name="echoParams">explicit</str>
  934 + <str name="echoHandler">true</str>
  935 + </lst>
  936 + </requestHandler>
  937 +
  938 + <!-- Search Components
  939 +
  940 + Search components are registered to SolrCore and used by
  941 + instances of SearchHandler (which can access them by name)
  942 +
  943 + By default, the following components are available:
  944 +
  945 + <searchComponent name="query" class="solr.QueryComponent" />
  946 + <searchComponent name="facet" class="solr.FacetComponent" />
  947 + <searchComponent name="mlt" class="solr.MoreLikeThisComponent" />
  948 + <searchComponent name="highlight" class="solr.HighlightComponent" />
  949 + <searchComponent name="stats" class="solr.StatsComponent" />
  950 + <searchComponent name="debug" class="solr.DebugComponent" />
  951 +
  952 + Default configuration in a requestHandler would look like:
  953 +
  954 + <arr name="components">
  955 + <str>query</str>
  956 + <str>facet</str>
  957 + <str>mlt</str>
  958 + <str>highlight</str>
  959 + <str>stats</str>
  960 + <str>debug</str>
  961 + </arr>
  962 +
  963 + If you register a searchComponent to one of the standard names,
  964 + that will be used instead of the default.
  965 +
  966 + To insert components before or after the 'standard' components, use:
  967 +
  968 + <arr name="first-components">
  969 + <str>myFirstComponentName</str>
  970 + </arr>
  971 +
  972 + <arr name="last-components">
  973 + <str>myLastComponentName</str>
  974 + </arr>
  975 +
  976 + NOTE: The component registered with the name "debug" will
  977 + always be executed after the "last-components"
  978 +
  979 + -->
  980 +
  981 + <!-- Spell Check
  982 +
  983 + The spell check component can return a list of alternative spelling
  984 + suggestions.
  985 +
  986 + http://wiki.apache.org/solr/SpellCheckComponent
  987 + -->
  988 + <searchComponent name="spellcheck" class="solr.SpellCheckComponent">
  989 +
  990 + <str name="queryAnalyzerFieldType">text_general</str>
  991 +
  992 + <!-- Multiple "Spell Checkers" can be declared and used by this
  993 + component
  994 + -->
  995 +
  996 + <!-- a spellchecker built from a field of the main index -->
  997 + <lst name="spellchecker">
  998 + <str name="name">default</str>
  999 + <str name="field">_text_</str>
  1000 + <str name="classname">solr.DirectSolrSpellChecker</str>
  1001 + <!-- the spellcheck distance measure used, the default is the internal levenshtein -->
  1002 + <str name="distanceMeasure">internal</str>
  1003 + <!-- minimum accuracy needed to be considered a valid spellcheck suggestion -->
  1004 + <float name="accuracy">0.5</float>
  1005 + <!-- the maximum #edits we consider when enumerating terms: can be 1 or 2 -->
  1006 + <int name="maxEdits">2</int>
  1007 + <!-- the minimum shared prefix when enumerating terms -->
  1008 + <int name="minPrefix">1</int>
  1009 + <!-- maximum number of inspections per result. -->
  1010 + <int name="maxInspections">5</int>
  1011 + <!-- minimum length of a query term to be considered for correction -->
  1012 + <int name="minQueryLength">4</int>
  1013 + <!-- maximum threshold of documents a query term can appear in to be considered for correction -->
  1014 + <float name="maxQueryFrequency">0.01</float>
  1015 + <!-- uncomment this to require suggestions to occur in 1% of the documents
  1016 + <float name="thresholdTokenFrequency">.01</float>
  1017 + -->
  1018 + </lst>
  1019 +
  1020 + <!-- a spellchecker that can break or combine words. See "/spell" handler below for usage -->
  1021 + <!--
  1022 + <lst name="spellchecker">
  1023 + <str name="name">wordbreak</str>
  1024 + <str name="classname">solr.WordBreakSolrSpellChecker</str>
  1025 + <str name="field">name</str>
  1026 + <str name="combineWords">true</str>
  1027 + <str name="breakWords">true</str>
  1028 + <int name="maxChanges">10</int>
  1029 + </lst>
  1030 + -->
  1031 + </searchComponent>
  1032 +
  1033 + <!-- A request handler for demonstrating the spellcheck component.
  1034 +
  1035 + NOTE: This is purely as an example. The whole purpose of the
  1036 + SpellCheckComponent is to hook it into the request handler that
  1037 + handles your normal user queries so that a separate request is
  1038 + not needed to get suggestions.
  1039 +
  1040 + IN OTHER WORDS, THERE IS REALLY GOOD CHANCE THE SETUP BELOW IS
  1041 + NOT WHAT YOU WANT FOR YOUR PRODUCTION SYSTEM!
  1042 +
  1043 + See http://wiki.apache.org/solr/SpellCheckComponent for details
  1044 + on the request parameters.
  1045 + -->
  1046 + <requestHandler name="/spell" class="solr.SearchHandler" startup="lazy">
  1047 + <lst name="defaults">
  1048 + <!-- Only the 'default' spellchecker is used here. To have Solr also
  1049 + combine suggestions from the 'wordbreak' spellchecker, enable that
  1050 + spellchecker in the spellcheck component and add a second
  1051 + spellcheck.dictionary entry with the value wordbreak -->
  1052 + <str name="spellcheck.dictionary">default</str>
  1053 + <str name="spellcheck">on</str>
  1054 + <str name="spellcheck.extendedResults">true</str>
  1055 + <str name="spellcheck.count">10</str>
  1056 + <str name="spellcheck.alternativeTermCount">5</str>
  1057 + <str name="spellcheck.maxResultsForSuggest">5</str>
  1058 + <str name="spellcheck.collate">true</str>
  1059 + <str name="spellcheck.collateExtendedResults">true</str>
  1060 + <str name="spellcheck.maxCollationTries">10</str>
  1061 + <str name="spellcheck.maxCollations">5</str>
  1062 + </lst>
  1063 + <arr name="last-components">
  1064 + <str>spellcheck</str>
  1065 + </arr>
  1066 + </requestHandler>
  1067 +
  1068 + <!-- Term Vector Component
  1069 +
  1070 + http://wiki.apache.org/solr/TermVectorComponent
  1071 + -->
  1072 + <searchComponent name="tvComponent" class="solr.TermVectorComponent"/>
  1073 +
  1074 + <!-- A request handler for demonstrating the term vector component
  1075 +
  1076 + This is purely as an example.
  1077 +
  1078 + In reality you will likely want to add the component to your
  1079 + already specified request handlers.
  1080 + -->
  1081 + <requestHandler name="/tvrh" class="solr.SearchHandler" startup="lazy">
  1082 + <lst name="defaults">
  1083 + <bool name="tv">true</bool>
  1084 + </lst>
  1085 + <arr name="last-components">
  1086 + <str>tvComponent</str>
  1087 + </arr>
  1088 + </requestHandler>
  1089 +
  1090 + <!-- Clustering Component. (Omitted here. See the default Solr example for a typical configuration.) -->
  1091 +
  1092 + <!-- Terms Component
  1093 +
  1094 + http://wiki.apache.org/solr/TermsComponent
  1095 +
  1096 + A component to return terms and document frequency of those
  1097 + terms
  1098 + -->
  1099 + <searchComponent name="terms" class="solr.TermsComponent"/>
  1100 +
  1101 + <!-- A request handler for demonstrating the terms component -->
  1102 + <requestHandler name="/terms" class="solr.SearchHandler" startup="lazy">
  1103 + <lst name="defaults">
  1104 + <bool name="terms">true</bool>
  1105 + <bool name="distrib">false</bool>
  1106 + </lst>
  1107 + <arr name="components">
  1108 + <str>terms</str>
  1109 + </arr>
  1110 + </requestHandler>
  1111 +
  1112 +
  1113 + <!-- Query Elevation Component
  1114 +
  1115 + http://wiki.apache.org/solr/QueryElevationComponent
  1116 +
  1117 + a search component that enables you to configure the top
  1118 + results for a given query regardless of the normal lucene
  1119 + scoring.
  1120 + -->
  1121 + <!--
  1122 + <searchComponent name="elevator" class="solr.QueryElevationComponent" >
  1123 + <str name="queryFieldType">string</str>
  1124 + <str name="config-file">elevate.xml</str>
  1125 + </searchComponent>
  1126 + -->
  1127 +
  1128 + <!-- A request handler for demonstrating the elevator component -->
  1129 + <requestHandler name="/elevate" class="solr.SearchHandler" startup="lazy">
  1130 + <lst name="defaults">
  1131 + <str name="echoParams">explicit</str>
  1132 + </lst>
  1133 + <arr name="last-components">
  1134 + <str>elevator</str>
  1135 + </arr>
  1136 + </requestHandler>
  1137 +
  1138 + <!-- MTAS: requestHandler -->
  1139 + <requestHandler name="/mtas" class="mtas.solr.handler.MtasRequestHandler" />
  1140 +
  1141 + <!-- Highlighting Component
  1142 +
  1143 + http://wiki.apache.org/solr/HighlightingParameters
  1144 + -->
  1145 + <searchComponent class="solr.HighlightComponent" name="highlight">
  1146 + <highlighting>
  1147 + <!-- Configure the standard fragmenter -->
  1148 + <!-- This could most likely be commented out in the "default" case -->
  1149 + <fragmenter name="gap"
  1150 + default="true"
  1151 + class="solr.highlight.GapFragmenter">
  1152 + <lst name="defaults">
  1153 + <int name="hl.fragsize">100</int>
  1154 + </lst>
  1155 + </fragmenter>
  1156 +
  1157 + <!-- A regular-expression-based fragmenter
  1158 + (for sentence extraction)
  1159 + -->
  1160 + <fragmenter name="regex"
  1161 + class="solr.highlight.RegexFragmenter">
  1162 + <lst name="defaults">
  1163 + <!-- slightly smaller fragsizes work better because of slop -->
  1164 + <int name="hl.fragsize">70</int>
  1165 + <!-- allow 50% slop on fragment sizes -->
  1166 + <float name="hl.regex.slop">0.5</float>
  1167 + <!-- a basic sentence pattern -->
  1168 + <str name="hl.regex.pattern">[-\w ,/\n\&quot;&apos;]{20,200}</str>
  1169 + </lst>
  1170 + </fragmenter>
  1171 +
  1172 + <!-- Configure the standard formatter -->
  1173 + <formatter name="html"
  1174 + default="true"
  1175 + class="solr.highlight.HtmlFormatter">
  1176 + <lst name="defaults">
  1177 + <str name="hl.simple.pre"><![CDATA[<em>]]></str>
  1178 + <str name="hl.simple.post"><![CDATA[</em>]]></str>
  1179 + </lst>
  1180 + </formatter>
  1181 +
  1182 + <!-- Configure the standard encoder -->
  1183 + <encoder name="html"
  1184 + class="solr.highlight.HtmlEncoder" />
  1185 +
  1186 + <!-- Configure the standard fragListBuilder -->
  1187 + <fragListBuilder name="simple"
  1188 + class="solr.highlight.SimpleFragListBuilder"/>
  1189 +
  1190 + <!-- Configure the single fragListBuilder -->
  1191 + <fragListBuilder name="single"
  1192 + class="solr.highlight.SingleFragListBuilder"/>
  1193 +
  1194 + <!-- Configure the weighted fragListBuilder -->
  1195 + <fragListBuilder name="weighted"
  1196 + default="true"
  1197 + class="solr.highlight.WeightedFragListBuilder"/>
  1198 +
  1199 + <!-- default tag FragmentsBuilder -->
  1200 + <fragmentsBuilder name="default"
  1201 + default="true"
  1202 + class="solr.highlight.ScoreOrderFragmentsBuilder">
  1203 + <!--
  1204 + <lst name="defaults">
  1205 + <str name="hl.multiValuedSeparatorChar">/</str>
  1206 + </lst>
  1207 + -->
  1208 + </fragmentsBuilder>
  1209 +
  1210 + <!-- multi-colored tag FragmentsBuilder -->
  1211 + <fragmentsBuilder name="colored"
  1212 + class="solr.highlight.ScoreOrderFragmentsBuilder">
  1213 + <lst name="defaults">
  1214 + <str name="hl.tag.pre"><![CDATA[
  1215 + <b style="background:yellow">,<b style="background:lawgreen">,
  1216 + <b style="background:aquamarine">,<b style="background:magenta">,
  1217 + <b style="background:palegreen">,<b style="background:coral">,
  1218 + <b style="background:wheat">,<b style="background:khaki">,
  1219 + <b style="background:lime">,<b style="background:deepskyblue">]]></str>
  1220 + <str name="hl.tag.post"><![CDATA[</b>]]></str>
  1221 + </lst>
  1222 + </fragmentsBuilder>
  1223 +
  1224 + <boundaryScanner name="default"
  1225 + default="true"
  1226 + class="solr.highlight.SimpleBoundaryScanner">
  1227 + <lst name="defaults">
  1228 + <str name="hl.bs.maxScan">10</str>
  1229 + <str name="hl.bs.chars">.,!? &#9;&#10;&#13;</str>
  1230 + </lst>
  1231 + </boundaryScanner>
  1232 +
  1233 + <boundaryScanner name="breakIterator"
  1234 + class="solr.highlight.BreakIteratorBoundaryScanner">
  1235 + <lst name="defaults">
  1236 + <!-- type should be one of CHARACTER, WORD(default), LINE and SENTENCE -->
  1237 + <str name="hl.bs.type">WORD</str>
  1238 + <!-- language and country are used when constructing Locale object. -->
  1239 + <!-- And the Locale object will be used when getting instance of BreakIterator -->
  1240 + <str name="hl.bs.language">en</str>
  1241 + <str name="hl.bs.country">US</str>
  1242 + </lst>
  1243 + </boundaryScanner>
  1244 + </highlighting>
  1245 + </searchComponent>
  1246 +
  1247 + <!-- MTAS: searchComponent -->
  1248 + <searchComponent name="mtas" class="mtas.solr.handler.component.MtasSolrSearchComponent"/>
  1249 +
  1250 +
  1251 + <!-- Update Processors
  1252 +
  1253 + Chains of Update Processor Factories for dealing with Update
  1254 + Requests can be declared, and then used by name in Update
  1255 + Request Processors
  1256 +
  1257 + http://wiki.apache.org/solr/UpdateRequestProcessor
  1258 +
  1259 + -->
  1260 +
  1261 + <!-- Add unknown fields to the schema
  1262 +
  1263 + An example field type guessing update processor that will
  1264 + attempt to parse string-typed field values as Booleans, Longs,
  1265 + Doubles, or Dates, and then add schema fields with the guessed
  1266 + field types.
  1267 +
  1268 + This requires that the schema is both managed and mutable, by
  1269 + declaring schemaFactory as ManagedIndexSchemaFactory, with
  1270 + mutable specified as true.
  1271 +
  1272 + See http://wiki.apache.org/solr/GuessingFieldTypes
  1273 + -->
  1274 + <!--
  1275 + <updateRequestProcessorChain name="add-unknown-fields-to-the-schema">
  1276 + <processor class="solr.UUIDUpdateProcessorFactory" />
  1277 +
  1278 + <processor class="solr.LogUpdateProcessorFactory"/>
  1279 + <processor class="solr.DistributedUpdateProcessorFactory"/>
  1280 + <processor class="solr.RemoveBlankFieldUpdateProcessorFactory"/>
  1281 + <processor class="solr.FieldNameMutatingUpdateProcessorFactory">
  1282 + <str name="pattern">[^\w-\.]</str>
  1283 + <str name="replacement">_</str>
  1284 + </processor>
  1285 + <processor class="solr.ParseBooleanFieldUpdateProcessorFactory"/>
  1286 + <processor class="solr.ParseLongFieldUpdateProcessorFactory"/>
  1287 + <processor class="solr.ParseDoubleFieldUpdateProcessorFactory"/>
  1288 + <processor class="solr.ParseDateFieldUpdateProcessorFactory">
  1289 + <arr name="format">
  1290 + <str>yyyy-MM-dd'T'HH:mm:ss.SSSZ</str>
  1291 + <str>yyyy-MM-dd'T'HH:mm:ss,SSSZ</str>
  1292 + <str>yyyy-MM-dd'T'HH:mm:ss.SSS</str>
  1293 + <str>yyyy-MM-dd'T'HH:mm:ss,SSS</str>
  1294 + <str>yyyy-MM-dd'T'HH:mm:ssZ</str>
  1295 + <str>yyyy-MM-dd'T'HH:mm:ss</str>
  1296 + <str>yyyy-MM-dd'T'HH:mmZ</str>
  1297 + <str>yyyy-MM-dd'T'HH:mm</str>
  1298 + <str>yyyy-MM-dd HH:mm:ss.SSSZ</str>
  1299 + <str>yyyy-MM-dd HH:mm:ss,SSSZ</str>
  1300 + <str>yyyy-MM-dd HH:mm:ss.SSS</str>
  1301 + <str>yyyy-MM-dd HH:mm:ss,SSS</str>
  1302 + <str>yyyy-MM-dd HH:mm:ssZ</str>
  1303 + <str>yyyy-MM-dd HH:mm:ss</str>
  1304 + <str>yyyy-MM-dd HH:mmZ</str>
  1305 + <str>yyyy-MM-dd HH:mm</str>
  1306 + <str>yyyy-MM-dd</str>
  1307 + </arr>
  1308 + </processor>
  1309 + <processor class="solr.AddSchemaFieldsUpdateProcessorFactory">
  1310 + <str name="defaultFieldType">strings</str>
  1311 + <lst name="typeMapping">
  1312 + <str name="valueClass">java.lang.Boolean</str>
  1313 + <str name="fieldType">booleans</str>
  1314 + </lst>
  1315 + <lst name="typeMapping">
  1316 + <str name="valueClass">java.util.Date</str>
  1317 + <str name="fieldType">tdates</str>
  1318 + </lst>
  1319 + <lst name="typeMapping">
  1320 + <str name="valueClass">java.lang.Long</str>
  1321 + <str name="valueClass">java.lang.Integer</str>
  1322 + <str name="fieldType">tlongs</str>
  1323 + </lst>
  1324 + <lst name="typeMapping">
  1325 + <str name="valueClass">java.lang.Number</str>
  1326 + <str name="fieldType">tdoubles</str>
  1327 + </lst>
  1328 + </processor>
  1329 + <processor class="solr.RunUpdateProcessorFactory"/>
  1330 + </updateRequestProcessorChain>
  1331 + -->
  1332 +
  1333 + <!-- Deduplication
  1334 +
  1335 + An example dedup update processor that creates the "id" field
  1336 + on the fly based on the hash code of some other fields. This
  1337 + example has overwriteDupes set to false since we are using the
  1338 + id field as the signatureField and Solr will maintain
  1339 + uniqueness based on that anyway.
  1340 +
  1341 + -->
  1342 + <!--
  1343 + <updateRequestProcessorChain name="dedupe">
  1344 + <processor class="solr.processor.SignatureUpdateProcessorFactory">
  1345 + <bool name="enabled">true</bool>
  1346 + <str name="signatureField">id</str>
  1347 + <bool name="overwriteDupes">false</bool>
  1348 + <str name="fields">name,features,cat</str>
  1349 + <str name="signatureClass">solr.processor.Lookup3Signature</str>
  1350 + </processor>
  1351 + <processor class="solr.LogUpdateProcessorFactory" />
  1352 + <processor class="solr.RunUpdateProcessorFactory" />
  1353 + </updateRequestProcessorChain>
  1354 + -->
  1355 +
  1356 + <!-- Language identification
  1357 +
  1358 + This example update chain identifies the language of the incoming
  1359 + documents using the langid contrib. The detected language is
  1360 + written to field language_s. No field name mapping is done.
  1361 + The fields used for detection are text, title, subject and description,
  1362 + making this example suitable for detecting languages from full-text
  1363 + rich documents injected via ExtractingRequestHandler.
  1364 + See more about langId at http://wiki.apache.org/solr/LanguageDetection
  1365 + -->
  1366 + <!--
  1367 + <updateRequestProcessorChain name="langid">
  1368 + <processor class="org.apache.solr.update.processor.TikaLanguageIdentifierUpdateProcessorFactory">
  1369 + <str name="langid.fl">text,title,subject,description</str>
  1370 + <str name="langid.langField">language_s</str>
  1371 + <str name="langid.fallback">en</str>
  1372 + </processor>
  1373 + <processor class="solr.LogUpdateProcessorFactory" />
  1374 + <processor class="solr.RunUpdateProcessorFactory" />
  1375 + </updateRequestProcessorChain>
  1376 + -->
  1377 +
  1378 + <!-- Script update processor
  1379 +
  1380 + This example hooks in an update processor implemented using JavaScript.
  1381 +
  1382 + See more about the script update processor at http://wiki.apache.org/solr/ScriptUpdateProcessor
  1383 + -->
  1384 + <!--
  1385 + <updateRequestProcessorChain name="script">
  1386 + <processor class="solr.StatelessScriptUpdateProcessorFactory">
  1387 + <str name="script">update-script.js</str>
  1388 + <lst name="params">
  1389 + <str name="config_param">example config parameter</str>
  1390 + </lst>
  1391 + </processor>
  1392 + <processor class="solr.RunUpdateProcessorFactory" />
  1393 + </updateRequestProcessorChain>
  1394 + -->
  1395 +
  1396 +
  1397 + <!-- MTAS: updates -->
  1398 + <requestHandler name="/update" class="solr.UpdateRequestHandler">
  1399 + <lst name="defaults">
  1400 + <str name="update.chain">mtasUpdateProcessor</str>
  1401 + </lst>
  1402 + </requestHandler>
  1403 +
  1404 + <updateRequestProcessorChain name="mtasUpdateProcessor">
  1405 + <processor class="mtas.solr.update.processor.MtasUpdateRequestProcessorFactory" />
  1406 + <processor class="solr.LogUpdateProcessorFactory" />
  1407 + <processor class="solr.RunUpdateProcessorFactory" />
  1408 + </updateRequestProcessorChain>
  1409 +
  1410 + <!-- Response Writers
  1411 +
  1412 + http://wiki.apache.org/solr/QueryResponseWriter
  1413 +
  1414 + Request responses will be written using the writer specified by
  1415 + the 'wt' request parameter matching the name of a registered
  1416 + writer.
  1417 +
  1418 + The "default" writer is the default and will be used if 'wt' is
  1419 + not specified in the request.
  1420 + -->
  1421 + <!-- The following response writers are implicitly configured unless
  1422 + overridden...
  1423 + -->
  1424 + <!--
  1425 + <queryResponseWriter name="xml"
  1426 + default="true"
  1427 + class="solr.XMLResponseWriter" />
  1428 + <queryResponseWriter name="json" class="solr.JSONResponseWriter"/>
  1429 + <queryResponseWriter name="python" class="solr.PythonResponseWriter"/>
  1430 + <queryResponseWriter name="ruby" class="solr.RubyResponseWriter"/>
  1431 + <queryResponseWriter name="php" class="solr.PHPResponseWriter"/>
  1432 + <queryResponseWriter name="phps" class="solr.PHPSerializedResponseWriter"/>
  1433 + <queryResponseWriter name="csv" class="solr.CSVResponseWriter"/>
  1434 + <queryResponseWriter name="schema.xml" class="solr.SchemaXmlResponseWriter"/>
  1435 + -->
  1436 +
  1437 + <queryResponseWriter name="json" class="solr.JSONResponseWriter">
  1438 + <!-- For the purposes of the tutorial, JSON responses are written as
  1439 + plain text so that they are easy to read in *any* browser.
  1440 + If you expect a MIME type of "application/json" just remove this override.
  1441 + -->
  1442 + <str name="content-type">text/plain; charset=UTF-8</str>
  1443 + </queryResponseWriter>
  1444 +
  1445 + <!--
  1446 + Custom response writers can be declared as needed...
  1447 + -->
  1448 + <queryResponseWriter name="velocity" class="solr.VelocityResponseWriter" startup="lazy">
  1449 + <str name="template.base.dir">${velocity.template.base.dir:}</str>
  1450 + <str name="solr.resource.loader.enabled">${velocity.solr.resource.loader.enabled:true}</str>
  1451 + <str name="params.resource.loader.enabled">${velocity.params.resource.loader.enabled:false}</str>
  1452 + </queryResponseWriter>
  1453 +
  1454 + <!-- XSLT response writer transforms the XML output by any xslt file found
  1455 + in Solr's conf/xslt directory. Changes to xslt files are checked for
  1456 + every xsltCacheLifetimeSeconds.
  1457 + -->
  1458 + <queryResponseWriter name="xslt" class="solr.XSLTResponseWriter">
  1459 + <int name="xsltCacheLifetimeSeconds">5</int>
  1460 + </queryResponseWriter>
  1461 +
  1462 + <!-- Query Parsers
  1463 +
  1464 + https://cwiki.apache.org/confluence/display/solr/Query+Syntax+and+Parsing
  1465 +
  1466 + Multiple QParserPlugins can be registered by name, and then
  1467 + used in either the "defType" param for the QueryComponent (used
  1468 + by SearchHandler) or in LocalParams
  1469 + -->
  1470 + <!-- example of registering a query parser -->
  1471 + <!--
  1472 + <queryParser name="myparser" class="com.mycompany.MyQParserPlugin"/>
  1473 + -->
  1474 + <queryParser name="mtas_cql" class="mtas.solr.search.MtasSolrCQLQParserPlugin"/>
  1475 +
  1476 + <!-- Function Parsers
  1477 +
  1478 + http://wiki.apache.org/solr/FunctionQuery
  1479 +
  1480 + Multiple ValueSourceParsers can be registered by name, and then
  1481 + used as function names when using the "func" QParser.
  1482 + -->
  1483 + <!-- example of registering a custom function parser -->
  1484 + <!--
  1485 + <valueSourceParser name="myfunc"
  1486 + class="com.mycompany.MyValueSourceParser" />
  1487 + -->
  1488 +
  1489 +
  1490 + <!-- Document Transformers
  1491 + http://wiki.apache.org/solr/DocTransformers
  1492 + -->
  1493 + <!--
  1494 + Could be something like:
  1495 + <transformer name="db" class="com.mycompany.LoadFromDatabaseTransformer" >
  1496 + <str name="connection">jdbc://....</str>
  1497 + </transformer>
  1498 +
  1499 + To add a constant value to all docs, use:
  1500 + <transformer name="mytrans2" class="org.apache.solr.response.transform.ValueAugmenterFactory" >
  1501 + <int name="value">5</int>
  1502 + </transformer>
  1503 +
  1504 + If you want the user to still be able to change it with _value:something_ use this:
  1505 + <transformer name="mytrans3" class="org.apache.solr.response.transform.ValueAugmenterFactory" >
  1506 + <double name="defaultValue">5</double>
  1507 + </transformer>
  1508 +
  1509 + If you are using the QueryElevationComponent, you may wish to mark documents that get boosted. The
  1510 + EditorialMarkerFactory will do exactly that:
  1511 + <transformer name="qecBooster" class="org.apache.solr.response.transform.EditorialMarkerFactory" />
  1512 + -->
  1513 +</config>
... ...
1 1 <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
2 2 xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
3 3 <properties>
4   - <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  4 + <timestamp>${maven.build.timestamp}</timestamp>
  5 + <maven.build.timestamp.format>yyyy-MM-dd HH:mm</maven.build.timestamp.format>
  6 + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
5 7 <currentDevelopmentVersion>6.2.0</currentDevelopmentVersion>
6 8 <currentDevelopmentRelease>20161027</currentDevelopmentRelease>
7 9 </properties>
... ... @@ -36,6 +38,11 @@
36 38 <sourceDirectory>src</sourceDirectory>
37 39 <resources>
38 40 <resource>
  41 + <directory>src/docker</directory>
  42 + <filtering>true</filtering>
  43 + <targetPath>${basedir}/docker</targetPath>
  44 + </resource>
  45 + <resource>
39 46 <directory>resources</directory>
40 47 </resource>
41 48 </resources>
... ... @@ -51,8 +58,8 @@
51 58 <include>**/*</include>
52 59 </includes>
53 60 <excludes>
54   - <exclude>**/.git/</exclude>
55   - </excludes>
  61 + <exclude>**/.git/</exclude>
  62 + </excludes>
56 63 <followSymlinks>false</followSymlinks>
57 64 </fileset>
58 65 </filesets>
... ... @@ -84,7 +91,7 @@
84 91 <addMavenDescriptor>false</addMavenDescriptor>
85 92 </archive>
86 93 </configuration>
87   - </plugin>
  94 + </plugin>
88 95 </plugins>
89 96 </build>
90 97 <reporting>
... ...
src/docker/Dockerfile 0 → 100644
# Automatically generated Dockerfile
# - Build ${timestamp}
# - Lucene/Solr version ${currentDevelopmentVersion}
# - Mtas release ${currentDevelopmentRelease}
#
# NOTE: the placeholders in this file (timestamp, currentDevelopmentVersion,
# currentDevelopmentRelease) are filled in by Maven resource filtering of
# src/docker at build time, not by Docker or by the shell.
#
# To build and run this image after installing Docker, use for example:
#
#   docker build -t mtas .
#   docker run -t -i -p 8080:80 --name mtas mtas
#
# Apache (port 80) proxies /solr to the Solr instance on port 8983.

FROM ubuntu:16.04

# The MAINTAINER instruction is deprecated; a maintainer label replaces it.
LABEL maintainer="Matthijs Brouwer"
LABEL mtas.timestamp="${timestamp}"
LABEL mtas.lucene="${currentDevelopmentVersion}"
LABEL mtas.release="${currentDevelopmentRelease}"

EXPOSE 8983 80

USER root

WORKDIR "/root"

RUN mkdir lib && mkdir data && mkdir data/mtas

# Configuration files that are later copied into the two demo cores.
COPY solrconfig.xml /root/data/
COPY mtas.xml /root/data/
COPY mtas/demo_folia.xml /root/data/mtas/
COPY mtas/demo_tei.xml /root/data/mtas/
COPY schemaBasic.xml /root/data/
COPY schemaFull.xml /root/data/

# Remote downloads. Both Apache artifacts come from archive.apache.org, which
# keeps old releases available (regular mirrors only carry current ones).
ADD http://archive.apache.org/dist/lucene/solr/${currentDevelopmentVersion}/solr-${currentDevelopmentVersion}.tgz /root/
ADD http://archive.apache.org/dist/commons/math/binaries/commons-math3-3.6.1-bin.tar.gz /root/lib/
ADD https://github.com/meertensinstituut/mtas/releases/download/${currentDevelopmentRelease}/mtas-${currentDevelopmentVersion}.jar /root/lib/
ADD https://code.jquery.com/jquery-3.1.1.min.js /root/lib/

# Extract only the commons-math3 jar from the binary distribution.
RUN tar xzf lib/commons-math3-3.6.1-bin.tar.gz -C lib commons-math3-3.6.1/commons-math3-3.6.1.jar --strip-components=1 && rm lib/commons-math3-3.6.1-bin.tar.gz

# Install system packages and Oracle Java 8 in a single layer, and remove the
# apt package lists in that same layer: deleting them in a later RUN would not
# shrink the image, because the earlier layers would still contain them.
RUN apt-get update && apt-get install -y \
    lsof \
    software-properties-common \
    python-software-properties \
    apache2 \
 && add-apt-repository -y ppa:webupd8team/java \
 && apt-get update \
 && echo oracle-java8-installer shared/accepted-oracle-license-v1-1 select true | /usr/bin/debconf-set-selections \
 && apt-get install -y oracle-java8-installer \
 && rm -rf /var/lib/apt/lists/*

# Install Solr as a service using the installer script shipped in the archive.
RUN tar xzf solr-${currentDevelopmentVersion}.tgz solr-${currentDevelopmentVersion}/bin/install_solr_service.sh --strip-components=2 \
 && bash ./install_solr_service.sh solr-${currentDevelopmentVersion}.tgz && rm install_solr_service.sh && rm -rf solr-${currentDevelopmentVersion}.tgz

# Enable the Apache proxy modules and forward /solr to the local Solr instance.
RUN service apache2 stop \
 && a2enmod proxy \
 && a2enmod proxy_http \
 && a2enmod proxy_ajp \
 && a2enmod rewrite \
 && a2enmod deflate \
 && a2enmod headers \
 && a2enmod proxy_balancer \
 && a2enmod proxy_connect \
 && a2enmod proxy_html \
 && sed -i '/<\/VirtualHost>/ i ProxyPass /solr http://localhost:8983/solr\nProxyPassReverse /solr http://localhost:8983/solr' /etc/apache2/sites-enabled/000-default.conf

# Startup script: launches Solr and Apache as services.
RUN printf "service solr start\nservice apache2 start\n" > /start.sh && chmod 755 /start.sh

# Create two demo cores that share the same configuration: demo1 uses the
# basic schema, demo2 is a copy of demo1 with the full schema.
RUN mkdir demo1 && mkdir demo1/lib && mkdir demo1/conf && echo "name=demo1" > demo1/core.properties \
 && cp lib/commons-math3-3.6.1.jar demo1/lib/ && cp lib/mtas-${currentDevelopmentVersion}.jar demo1/lib/ \
 && cp data/solrconfig.xml demo1/conf/ && cp data/schemaBasic.xml demo1/conf/schema.xml \
 && cp -r data/mtas demo1/conf/ && cp data/mtas.xml demo1/conf/ \
 && chmod -R 777 demo1 && cp -rp demo1 demo2 \
 && cp data/schemaFull.xml demo2/conf/schema.xml && echo "name=demo2" > demo2/core.properties \
 && mv demo1 /var/solr/data/ && mv demo2 /var/solr/data/

# Start the services, then hand over to an interactive shell as the foreground
# process (run the container with -t -i).
CMD bash /start.sh; bash
  77 +
  78 +
  79 +
  80 +
  81 +
  82 +
  83 +
... ...
src/mtas/codec/util/CodecCollector.java
... ... @@ -75,30 +75,18 @@ public class CodecCollector {
75 75 /**
76 76 * Collect.
77 77 *
78   - * @param field
79   - * the field
80   - * @param searcher
81   - * the searcher
82   - * @param reader
83   - * the reader
84   - * @param rawReader
85   - * the raw reader
86   - * @param fullDocList
87   - * the full doc list
88   - * @param fullDocSet
89   - * the full doc set
90   - * @param fieldInfo
91   - * the field info
92   - * @param spansQueryWeight
93   - * the spans query weight
94   - * @throws IllegalAccessException
95   - * the illegal access exception
96   - * @throws IllegalArgumentException
97   - * the illegal argument exception
98   - * @throws InvocationTargetException
99   - * the invocation target exception
100   - * @throws IOException
101   - * Signals that an I/O exception has occurred.
  78 + * @param field the field
  79 + * @param searcher the searcher
  80 + * @param reader the reader
  81 + * @param rawReader the raw reader
  82 + * @param fullDocList the full doc list
  83 + * @param fullDocSet the full doc set
  84 + * @param fieldInfo the field info
  85 + * @param spansQueryWeight the spans query weight
  86 + * @throws IllegalAccessException the illegal access exception
  87 + * @throws IllegalArgumentException the illegal argument exception
  88 + * @throws InvocationTargetException the invocation target exception
  89 + * @throws IOException Signals that an I/O exception has occurred.
102 90 */
103 91 public static void collect(String field, IndexSearcher searcher,
104 92 IndexReader reader, IndexReader rawReader, ArrayList<Integer> fullDocList,
... ... @@ -197,30 +185,18 @@ public class CodecCollector {
197 185 /**
198 186 * Collect spans positions and tokens.
199 187 *
200   - * @param spansQueryWeight
201   - * the spans query weight
202   - * @param searcher
203   - * the searcher
204   - * @param mtasCodecInfo
205   - * the mtas codec info
206   - * @param r
207   - * the r
208   - * @param lrc
209   - * the lrc
210   - * @param field
211   - * the field
212   - * @param t
213   - * the t
214   - * @param docSet
215   - * the doc set
216   - * @param docList
217   - * the doc list
218   - * @param fieldInfo
219   - * the field info
220   - * @param fieldInfos
221   - * the field infos
222   - * @throws IOException
223   - * Signals that an I/O exception has occurred.
  188 + * @param spansQueryWeight the spans query weight
  189 + * @param searcher the searcher
  190 + * @param mtasCodecInfo the mtas codec info
  191 + * @param r the r
  192 + * @param lrc the lrc
  193 + * @param field the field
  194 + * @param t the t
  195 + * @param docSet the doc set
  196 + * @param docList the doc list
  197 + * @param fieldInfo the field info
  198 + * @param fieldInfos the field infos
  199 + * @throws IOException Signals that an I/O exception has occurred.
224 200 */
225 201 private static void collectSpansPositionsAndTokens(
226 202 HashMap<SpanQuery, SpanWeight> spansQueryWeight, IndexSearcher searcher,
... ... @@ -628,11 +604,9 @@ public class CodecCollector {
628 604 /**
629 605 * Collect known prefixes.
630 606 *
631   - * @param fi
632   - * the fi
  607 + * @param fi the fi
633 608 * @return the hash set
634   - * @throws IOException
635   - * Signals that an I/O exception has occurred.
  609 + * @throws IOException Signals that an I/O exception has occurred.
636 610 */
637 611 private static HashSet<String> collectKnownPrefixes(FieldInfo fi)
638 612 throws IOException {
... ... @@ -683,11 +657,9 @@ public class CodecCollector {
683 657 /**
684 658 * Collect intersection prefixes.
685 659 *
686   - * @param fi
687   - * the fi
  660 + * @param fi the fi
688 661 * @return the hash set
689   - * @throws IOException
690   - * Signals that an I/O exception has occurred.
  662 + * @throws IOException Signals that an I/O exception has occurred.
691 663 */
692 664 private static HashSet<String> collectIntersectionPrefixes(FieldInfo fi)
693 665 throws IOException {
... ... @@ -714,14 +686,10 @@ public class CodecCollector {
714 686 /**
715 687 * Collect prefixes.
716 688 *
717   - * @param fieldInfos
718   - * the field infos
719   - * @param field
720   - * the field
721   - * @param fieldInfo
722   - * the field info
723   - * @throws IOException
724   - * Signals that an I/O exception has occurred.
  689 + * @param fieldInfos the field infos
  690 + * @param field the field
  691 + * @param fieldInfo the field info
  692 + * @throws IOException Signals that an I/O exception has occurred.
725 693 */
726 694 private static void collectPrefixes(FieldInfos fieldInfos, String field,
727 695 ComponentField fieldInfo) throws IOException {
... ... @@ -771,21 +739,14 @@ public class CodecCollector {
771 739 /**
773 741 * Collect spans for occurrences.
773 741 *
774   - * @param occurences
775   - * the occurences
776   - * @param prefixes
777   - * the prefixes
778   - * @param field
779   - * the field
780   - * @param mtasCodecInfo
781   - * the mtas codec info
782   - * @param searcher
783   - * the searcher
784   - * @param lrc
785   - * the lrc
  742 + * @param occurences the occurrences
  743 + * @param prefixes the prefixes
  744 + * @param field the field
  745 + * @param mtasCodecInfo the mtas codec info
  746 + * @param searcher the searcher
  747 + * @param lrc the lrc
786 748 * @return the hash map
787   - * @throws IOException
788   - * Signals that an I/O exception has occurred.
  749 + * @throws IOException Signals that an I/O exception has occurred.
789 750 */
790 751 private static HashMap<GroupHit, Spans> collectSpansForOccurences(
791 752 HashSet<GroupHit> occurences, HashSet<String> prefixes, String field,
... ... @@ -810,12 +771,9 @@ public class CodecCollector {
810 771 /**
811 772 * Creates the query from group hit.
812 773 *
813   - * @param prefixes
814   - * the prefixes
815   - * @param field
816   - * the field
817   - * @param hit
818   - * the hit
  774 + * @param prefixes the prefixes
  775 + * @param field the field
  776 + * @param hit the hit
819 777 * @return the span query
820 778 */
821 779 private static SpanQuery createQueryFromGroupHit(HashSet<String> prefixes,
... ... @@ -880,21 +838,14 @@ public class CodecCollector {
880 838 /**
881 839 * Compute positions.
882 840 *
883   - * @param mtasCodecInfo
884   - * the mtas codec info
885   - * @param r
886   - * the r
887   - * @param lrc
888   - * the lrc
889   - * @param field
890   - * the field
891   - * @param t
892   - * the t
893   - * @param docSet
894   - * the doc set
  841 + * @param mtasCodecInfo the mtas codec info
  842 + * @param r the r
  843 + * @param lrc the lrc
  844 + * @param field the field
  845 + * @param t the t
  846 + * @param docSet the doc set
895 847 * @return the hash map
896   - * @throws IOException
897   - * Signals that an I/O exception has occurred.
  848 + * @throws IOException Signals that an I/O exception has occurred.
898 849 */
899 850 private static HashMap<Integer, Integer> computePositions(
900 851 CodecInfo mtasCodecInfo, LeafReader r, LeafReaderContext lrc,
... ... @@ -930,12 +881,9 @@ public class CodecCollector {
930 881 /**
931 882 * Compute arguments.
932 883 *
933   - * @param spansNumberData
934   - * the spans number data
935   - * @param queries
936   - * the queries
937   - * @param docSet
938   - * the doc set
  884 + * @param spansNumberData the spans number data
  885 + * @param queries the queries
  886 + * @param docSet the doc set
939 887 * @return the hash map
940 888 */
941 889 private static HashMap<Integer, long[]> computeArguments(
... ... @@ -966,10 +914,8 @@ public class CodecCollector {
966 914 /**
967 915 * Intersected doc list.
968 916 *
969   - * @param facetDocList
970   - * the facet doc list
971   - * @param docSet
972   - * the doc set
  917 + * @param facetDocList the facet doc list
  918 + * @param docSet the doc set
973 919 * @return the integer[]
974 920 */
975 921 private static Integer[] intersectedDocList(int[] facetDocList,
... ... @@ -1000,14 +946,10 @@ public class CodecCollector {
1000 946 /**
1001 947 * Creates the positions.
1002 948 *
1003   - * @param statsPositionList
1004   - * the stats position list
1005   - * @param positionsData
1006   - * the positions data
1007   - * @param docSet
1008   - * the doc set
1009   - * @throws IOException
1010   - * Signals that an I/O exception has occurred.
  949 + * @param statsPositionList the stats position list
  950 + * @param positionsData the positions data
  951 + * @param docSet the doc set
  952 + * @throws IOException Signals that an I/O exception has occurred.
1011 953 */
1012 954 private static void createPositions(List<ComponentPosition> statsPositionList,
1013 955 HashMap<Integer, Integer> positionsData, List<Integer> docSet)
... ... @@ -1040,14 +982,10 @@ public class CodecCollector {
1040 982 /**
1041 983 * Creates the tokens.
1042 984 *
1043   - * @param statsTokenList
1044   - * the stats token list
1045   - * @param tokensData
1046   - * the tokens data
1047   - * @param docSet
1048   - * the doc set
1049   - * @throws IOException
1050   - * Signals that an I/O exception has occurred.
  985 + * @param statsTokenList the stats token list
  986 + * @param tokensData the tokens data
  987 + * @param docSet the doc set
  988 + * @throws IOException Signals that an I/O exception has occurred.
1051 989 */
1052 990 private static void createTokens(List<ComponentToken> statsTokenList,
1053 991 HashMap<Integer, Integer> tokensData, List<Integer> docSet)
... ... @@ -1081,16 +1019,11 @@ public class CodecCollector {
1081 1019 /**
1082 1020 * Creates the stats.
1083 1021 *
1084   - * @param statsSpanList
1085   - * the stats span list
1086   - * @param positionsData
1087   - * the positions data
1088   - * @param spansNumberData
1089   - * the spans number data
1090   - * @param docSet
1091   - * the doc set
1092   - * @throws IOException
1093   - * Signals that an I/O exception has occurred.
  1022 + * @param statsSpanList the stats span list
  1023 + * @param positionsData the positions data
  1024 + * @param spansNumberData the spans number data
  1025 + * @param docSet the doc set
  1026 + * @throws IOException Signals that an I/O exception has occurred.
1094 1027 */
1095 1028 private static void createStats(List<ComponentSpan> statsSpanList,
1096 1029 HashMap<Integer, Integer> positionsData,
... ... @@ -1263,26 +1196,16 @@ public class CodecCollector {
1263 1196 /**
1264 1197 * Creates the list.
1265 1198 *
1266   - * @param listList
1267   - * the list list
1268   - * @param spansNumberData
1269   - * the spans number data
1270   - * @param spansMatchData
1271   - * the spans match data
1272   - * @param docSet
1273   - * the doc set
1274   - * @param field
1275   - * the field
1276   - * @param docBase
1277   - * the doc base
1278   - * @param uniqueKeyField
1279   - * the unique key field
1280   - * @param mtasCodecInfo
1281   - * the mtas codec info
1282   - * @param searcher
1283   - * the searcher
1284   - * @throws IOException
1285   - * Signals that an I/O exception has occurred.
  1199 + * @param listList the list list
  1200 + * @param spansNumberData the spans number data
  1201 + * @param spansMatchData the spans match data
  1202 + * @param docSet the doc set
  1203 + * @param field the field
  1204 + * @param docBase the doc base
  1205 + * @param uniqueKeyField the unique key field
  1206 + * @param mtasCodecInfo the mtas codec info
  1207 + * @param searcher the searcher
  1208 + * @throws IOException Signals that an I/O exception has occurred.
1286 1209 */
1287 1210 private static void createList(List<ComponentList> listList,
1288 1211 HashMap<SpanQuery, HashMap<Integer, Integer>> spansNumberData,
... ... @@ -1445,26 +1368,16 @@ public class CodecCollector {
1445 1368 /**
1446 1369 * Creates the group.
1447 1370 *
1448   - * @param groupList
1449   - * the group list
1450   - * @param spansMatchData
1451   - * the spans match data
1452   - * @param docSet
1453   - * the doc set
1454   - * @param fieldInfo
1455   - * the field info
1456   - * @param field
1457   - * the field
1458   - * @param docBase
1459   - * the doc base
1460   - * @param mtasCodecInfo
1461   - * the mtas codec info
1462   - * @param searcher
1463   - * the searcher
1464   - * @param lrc
1465   - * the lrc
1466   - * @throws IOException
1467   - * Signals that an I/O exception has occurred.
  1371 + * @param groupList the group list
  1372 + * @param spansMatchData the spans match data
  1373 + * @param docSet the doc set
  1374 + * @param fieldInfo the field info
  1375 + * @param field the field
  1376 + * @param docBase the doc base
  1377 + * @param mtasCodecInfo the mtas codec info
  1378 + * @param searcher the searcher
  1379 + * @param lrc the lrc
  1380 + * @throws IOException Signals that an I/O exception has occurred.
1468 1381 */
1469 1382 private static void createGroup(List<ComponentGroup> groupList,
1470 1383 HashMap<SpanQuery, HashMap<Integer, ArrayList<Match>>> spansMatchData,
... ... @@ -1651,10 +1564,8 @@ public class CodecCollector {
1651 1564 /**
1652 1565 * Available prefixes.
1653 1566 *
1654   - * @param group
1655   - * the group
1656   - * @param knownPrefixes
1657   - * the known prefixes
  1567 + * @param group the group
  1568 + * @param knownPrefixes the known prefixes
1658 1569 * @return true, if successful
1659 1570 */
1660 1571 private static boolean availablePrefixes(ComponentGroup group,
... ... @@ -1670,10 +1581,8 @@ public class CodecCollector {
1670 1581 /**
1671 1582 * Intersection prefixes.
1672 1583 *
1673   - * @param group
1674   - * the group
1675   - * @param intersectionPrefixes
1676   - * the intersection prefixes
  1584 + * @param group the group
  1585 + * @param intersectionPrefixes the intersection prefixes
1677 1586 * @return true, if successful
1678 1587 */
1679 1588 private static boolean intersectionPrefixes(ComponentGroup group,
... ... @@ -1689,10 +1598,8 @@ public class CodecCollector {
1689 1598 /**
1690 1599 * Creates the position hit.
1691 1600 *
1692   - * @param m
1693   - * the m
1694   - * @param group
1695   - * the group
  1601 + * @param m the m
  1602 + * @param group the group
1696 1603 * @return the interval tree node data
1697 1604 */
1698 1605 private static IntervalTreeNodeData<String> createPositionHit(Match m,
... ... @@ -1734,23 +1641,15 @@ public class CodecCollector {
1734 1641 /**
1735 1642 * Collect group using spans.
1736 1643 *
1737   - * @param list
1738   - * the list
1739   - * @param docSet
1740   - * the doc set
1741   - * @param docBase
1742   - * the doc base
1743   - * @param docCounter
1744   - * the doc counter
1745   - * @param matchData
1746   - * the match data
1747   - * @param occurencesSum
1748   - * the occurences sum
1749   - * @param occurencesN
1750   - * the occurences n
  1644 + * @param list the list
  1645 + * @param docSet the doc set
  1646 + * @param docBase the doc base
  1647 + * @param docCounter the doc counter
  1648 + * @param matchData the match data
  1649 + * @param occurencesSum the occurrences sum
  1650 + * @param occurencesN the occurrences n
1751 1651 * @return the int
1752   - * @throws IOException
1753   - * Signals that an I/O exception has occurred.
  1652 + * @throws IOException Signals that an I/O exception has occurred.
1754 1653 */
1755 1654 private static int collectGroupUsingSpans(HashMap<GroupHit, Spans> list,
1756 1655 List<Integer> docSet, int docBase, int docCounter,
... ... @@ -1960,8 +1859,7 @@ public class CodecCollector {
1960 1859 /**
1961 1860 * Sort match list.
1962 1861 *
1963   - * @param list
1964   - * the list
  1862 + * @param list the list
1965 1863 */
1966 1864 private static void sortMatchList(ArrayList<Match> list) {
1967 1865 if (list != null) {
... ... @@ -1981,6 +1879,20 @@ public class CodecCollector {
1981 1879 }
1982 1880 }
1983 1881  
  1882 + /**
  1883 + * Creates the distinct.
  1884 + *
  1885 + * @param distinctList the distinct list
  1886 + * @param docList the doc list
  1887 + * @param field the field
  1888 + * @param docBase the doc base
  1889 + * @param uniqueKeyField the unique key field
  1890 + * @param searcher the searcher
  1891 + * @param t the t
  1892 + * @param r the r
  1893 + * @param lrc the lrc
  1894 + * @throws IOException Signals that an I/O exception has occurred.
  1895 + */
1984 1896 private static void createDistinct(List<ComponentDistinct> distinctList,
1985 1897 List<Integer> docList, String field, int docBase, String uniqueKeyField,
1986 1898 IndexSearcher searcher, Terms t, LeafReader r, LeafReaderContext lrc)
... ... @@ -2069,24 +1981,15 @@ public class CodecCollector {
2069 1981 /**
2070 1982 * Creates the kwic.
2071 1983 *
2072   - * @param kwicList
2073   - * the kwic list
2074   - * @param spansMatchData
2075   - * the spans match data
2076   - * @param docList
2077   - * the doc list
2078   - * @param field
2079   - * the field
2080   - * @param docBase
2081   - * the doc base
2082   - * @param uniqueKeyField
2083   - * the unique key field
2084   - * @param mtasCodecInfo
2085   - * the mtas codec info
2086   - * @param searcher
2087   - * the searcher
2088   - * @throws IOException
2089   - * Signals that an I/O exception has occurred.
  1984 + * @param kwicList the kwic list
  1985 + * @param spansMatchData the spans match data
  1986 + * @param docList the doc list
  1987 + * @param field the field
  1988 + * @param docBase the doc base
  1989 + * @param uniqueKeyField the unique key field
  1990 + * @param mtasCodecInfo the mtas codec info
  1991 + * @param searcher the searcher
  1992 + * @throws IOException Signals that an I/O exception has occurred.
2090 1993 */
2091 1994 private static void createKwic(List<ComponentKwic> kwicList,
2092 1995 HashMap<SpanQuery, HashMap<Integer, ArrayList<Match>>> spansMatchData,
... ... @@ -2204,22 +2107,14 @@ public class CodecCollector {
2204 2107 /**
2205 2108 * Creates the facet base.
2206 2109 *
2207   - * @param cf
2208   - * the cf
2209   - * @param level
2210   - * the level
2211   - * @param dataCollector
2212   - * the data collector
2213   - * @param positionsData
2214   - * the positions data
2215   - * @param spansNumberData
2216   - * the spans number data
2217   - * @param facetData
2218   - * the facet data
2219   - * @param docSet
2220   - * the doc set
2221   - * @throws IOException
2222   - * Signals that an I/O exception has occurred.
  2110 + * @param cf the cf
  2111 + * @param level the level
  2112 + * @param dataCollector the data collector
  2113 + * @param positionsData the positions data
  2114 + * @param spansNumberData the spans number data
  2115 + * @param facetData the facet data
  2116 + * @param docSet the doc set
  2117 + * @throws IOException Signals that an I/O exception has occurred.
2223 2118 */
2224 2119 private static void createFacetBase(ComponentFacet cf, int level,
2225 2120 MtasDataCollector<?, ?> dataCollector,
... ... @@ -2490,28 +2385,17 @@ public class CodecCollector {
2490 2385 /**
2491 2386 * Creates the facet.
2492 2387 *
2493   - * @param facetList
2494   - * the facet list
2495   - * @param positionsData
2496   - * the positions data
2497   - * @param spansNumberData
2498   - * the spans number data
2499   - * @param facetData
2500   - * the facet data
2501   - * @param docSet
2502   - * the doc set
2503   - * @param field
2504   - * the field
2505   - * @param docBase
2506   - * the doc base
2507   - * @param uniqueKeyField
2508   - * the unique key field
2509   - * @param mtasCodecInfo
2510   - * the mtas codec info
2511   - * @param searcher
2512   - * the searcher
2513   - * @throws IOException
2514   - * Signals that an I/O exception has occurred.
  2388 + * @param facetList the facet list
  2389 + * @param positionsData the positions data
  2390 + * @param spansNumberData the spans number data
  2391 + * @param facetData the facet data
  2392 + * @param docSet the doc set
  2393 + * @param field the field
  2394 + * @param docBase the doc base
  2395 + * @param uniqueKeyField the unique key field
  2396 + * @param mtasCodecInfo the mtas codec info
  2397 + * @param searcher the searcher
  2398 + * @throws IOException Signals that an I/O exception has occurred.
2515 2399 */
2516 2400 private static void createFacet(List<ComponentFacet> facetList,
2517 2401 HashMap<Integer, Integer> positionsData,
... ... @@ -2531,6 +2415,18 @@ public class CodecCollector {
2531 2415 }
2532 2416 }
2533 2417  
  2418 + /**
  2419 + * Creates the termvector full.
  2420 + *
  2421 + * @param termVectorList the term vector list
  2422 + * @param positionsData the positions data
  2423 + * @param docSet the doc set
  2424 + * @param field the field
  2425 + * @param t the t
  2426 + * @param r the r
  2427 + * @param lrc the lrc
  2428 + * @throws IOException Signals that an I/O exception has occurred.
  2429 + */
2534 2430 private static void createTermvectorFull(
2535 2431 List<ComponentTermVector> termVectorList,
2536 2432 HashMap<Integer, Integer> positionsData, List<Integer> docSet,
... ... @@ -2691,22 +2587,14 @@ public class CodecCollector {
2691 2587 /**
2692 2588 * Creates the termvector first round.
2693 2589 *
2694   - * @param termVectorList
2695   - * the term vector list
2696   - * @param positionsData
2697   - * the positions data
2698   - * @param docSet
2699   - * the doc set
2700   - * @param field
2701   - * the field
2702   - * @param t
2703   - * the t
2704   - * @param r
2705   - * the r
2706   - * @param lrc
2707   - * the lrc
2708   - * @throws IOException
2709   - * Signals that an I/O exception has occurred.
  2590 + * @param termVectorList the term vector list
  2591 + * @param positionsData the positions data
  2592 + * @param docSet the doc set
  2593 + * @param field the field
  2594 + * @param t the t
  2595 + * @param r the r
  2596 + * @param lrc the lrc
  2597 + * @throws IOException Signals that an I/O exception has occurred.
2710 2598 */
2711 2599 private static void createTermvectorFirstRound(
2712 2600 List<ComponentTermVector> termVectorList,
... ... @@ -2851,22 +2739,14 @@ public class CodecCollector {
2851 2739 /**
2852 2740 * Creates the termvector second round.
2853 2741 *
2854   - * @param termVectorList
2855   - * the term vector list
2856   - * @param positionsData
2857   - * the positions data
2858   - * @param docSet
2859   - * the doc set
2860   - * @param field
2861   - * the field
2862   - * @param t
2863   - * the t
2864   - * @param r
2865   - * the r
2866   - * @param lrc
2867   - * the lrc
2868   - * @throws IOException
2869   - * Signals that an I/O exception has occurred.
  2742 + * @param termVectorList the term vector list
  2743 + * @param positionsData the positions data
  2744 + * @param docSet the doc set
  2745 + * @param field the field
  2746 + * @param t the t
  2747 + * @param r the r
  2748 + * @param lrc the lrc
  2749 + * @throws IOException Signals that an I/O exception has occurred.
2870 2750 */
2871 2751 private static void createTermvectorSecondRound(
2872 2752 List<ComponentTermVector> termVectorList,
... ... @@ -2944,11 +2824,9 @@ public class CodecCollector {
2944 2824 /**
2945 2825 * Need second round termvector.
2946 2826 *
2947   - * @param termVectorList
2948   - * the term vector list
  2827 + * @param termVectorList the term vector list
2949 2828 * @return true, if successful
2950   - * @throws IOException
2951   - * Signals that an I/O exception has occurred.
  2829 + * @throws IOException Signals that an I/O exception has occurred.
2952 2830 */
2953 2831 private static boolean needSecondRoundTermvector(
2954 2832 List<ComponentTermVector> termVectorList) throws IOException {
... ... @@ -3022,8 +2900,7 @@ public class CodecCollector {
3022 2900 /**
3023 2901 * Instantiates a new termvector number full.
3024 2902 *
3025   - * @param maxSize
3026   - * the max size
  2903 + * @param maxSize the max size
3027 2904 */
3028 2905 TermvectorNumberFull(int maxSize) {
3029 2906 args = new long[maxSize];
... ... @@ -3046,10 +2923,8 @@ public class CodecCollector {
3046 2923 /**
3047 2924 * Instantiates a new register status.
3048 2925 *
3049   - * @param sortValue
3050   - * the sort value
3051   - * @param force
3052   - * the force
  2926 + * @param sortValue the sort value
  2927 + * @param force the force
3053 2928 */
3054 2929 RegisterStatus(long sortValue, boolean force) {
3055 2930 this.sortValue = sortValue;
... ... @@ -3060,21 +2935,15 @@ public class CodecCollector {
3060 2935 /**
3061 2936 * Register value.
3062 2937 *
3063   - * @param term
3064   - * the term
3065   - * @param termVector
3066   - * the term vector
3067   - * @param number
3068   - * the number
3069   - * @param termNumberMaximum
3070   - * the term number maximum
3071   - * @param segmentNumber
3072   - * the segment number
3073   - * @param forceAccept
3074   - * the force accept
  2938 + * @param term the term
  2939 + * @param termVector the term vector
  2940 + * @param number the number
  2941 + * @param termNumberMaximum the term number maximum
  2942 + * @param segmentNumber the segment number
  2943 + * @param forceAccept the force accept
  2944 + * @param mutableKey the mutable key
3075 2945 * @return the register status
3076   - * @throws IOException
3077   - * Signals that an I/O exception has occurred.
  2946 + * @throws IOException Signals that an I/O exception has occurred.
3078 2947 */
3079 2948 @SuppressWarnings("unchecked")
3080 2949 private static RegisterStatus registerValue(BytesRef term,
... ... @@ -3109,7 +2978,7 @@ public class CodecCollector {
3109 2978 if (mutableKey[0] == null) {
3110 2979 mutableKey[0] = MtasToken.getPostfixFromValue(term);
3111 2980 }
3112   - String segmentStatus = dataCollector.validateSegmentValueOld(
  2981 + String segmentStatus = dataCollector.validateSegmentValue(
3113 2982 mutableKey[0], sortValue, termNumberMaximum, segmentNumber,
3114 2983 false);
3115 2984 if (segmentStatus != null) {
... ... @@ -3130,7 +2999,7 @@ public class CodecCollector {
3130 2999 } else if (segmentStatus
3131 3000 .equals(MtasDataCollector.SEGMENT_POSSIBLE_KEY)) {
3132 3001 mutableKey[0] = MtasToken.getPostfixFromValue(term);
3133   - segmentStatus = dataCollector.validateSegmentValueOld(mutableKey[0],
  3002 + segmentStatus = dataCollector.validateSegmentValue(mutableKey[0],
3134 3003 sortValue, termNumberMaximum, segmentNumber, true);
3135 3004 if (segmentStatus != null) {
3136 3005 possibleAddItem = true;
... ... @@ -3145,7 +3014,7 @@ public class CodecCollector {
3145 3014 if (mutableKey[0] == null) {
3146 3015 mutableKey[0] = MtasToken.getPostfixFromValue(term);
3147 3016 }
3148   - segmentStatus = dataCollector.validateSegmentValueOld(mutableKey[0],
  3017 + segmentStatus = dataCollector.validateSegmentValue(mutableKey[0],
3149 3018 sortValue, termNumberMaximum, segmentNumber, false);
3150 3019 if (segmentStatus != null) {
3151 3020 addItem = true;
... ... @@ -3202,19 +3071,14 @@ public class CodecCollector {
3202 3071 /**
3203 3072 * Preliminary register value.
3204 3073 *
3205   - * @param term
3206   - * the term
3207   - * @param termVector
3208   - * the term vector
3209   - * @param number
3210   - * the number
3211   - * @param termNumberMaximum
3212   - * the term number maximum
3213   - * @param segmentNumber
3214   - * the segment number
  3074 + * @param term the term
  3075 + * @param termVector the term vector
  3076 + * @param number the number
  3077 + * @param termNumberMaximum the term number maximum
  3078 + * @param segmentNumber the segment number
  3079 + * @param mutableKey the mutable key
3215 3080 * @return true, if successful
3216   - * @throws IOException
3217   - * Signals that an I/O exception has occurred.
  3081 + * @throws IOException Signals that an I/O exception has occurred.
3218 3082 */
3219 3083 private static boolean preliminaryRegisterValue(BytesRef term,
3220 3084 ComponentTermVector termVector, TermvectorNumberBasic number,
... ... @@ -3246,7 +3110,7 @@ public class CodecCollector {
3246 3110 } else if (segmentStatus
3247 3111 .equals(MtasDataCollector.SEGMENT_POSSIBLE_KEY)) {
3248 3112 mutableKey[0] = MtasToken.getPostfixFromValue(term);
3249   - segmentStatus = dataCollector.validateSegmentValueOld(mutableKey[0],
  3113 + segmentStatus = dataCollector.validateSegmentValue(mutableKey[0],
3250 3114 sortValue, termNumberMaximum, segmentNumber, true);
3251 3115 if (segmentStatus != null) {
3252 3116 return true;
... ... @@ -3266,18 +3130,13 @@ public class CodecCollector {
3266 3130 /**
3267 3131 * Register value.
3268 3132 *
3269   - * @param term
3270   - * the term
3271   - * @param termVector
3272   - * the term vector
3273   - * @param number
3274   - * the number
3275   - * @param termNumberMaximum
3276   - * the term number maximum
3277   - * @param segmentNumber
3278   - * the segment number
3279   - * @throws IOException
3280   - * Signals that an I/O exception has occurred.
  3133 + * @param term the term
  3134 + * @param termVector the term vector
  3135 + * @param number the number
  3136 + * @param termNumberMaximum the term number maximum
  3137 + * @param segmentNumber the segment number
  3138 + * @param mutableKey the mutable key
  3139 + * @throws IOException Signals that an I/O exception has occurred.
3281 3140 */
3282 3141 @SuppressWarnings("unchecked")
3283 3142 private static void registerValue(BytesRef term,
... ... @@ -3339,13 +3198,10 @@ public class CodecCollector {
3339 3198 /**
3340 3199 * Compute termvector number basic.
3341 3200 *
3342   - * @param termsEnum
3343   - * the terms enum
3344   - * @param r
3345   - * the r
  3201 + * @param termsEnum the terms enum
  3202 + * @param r the r
3346 3203 * @return the termvector number basic
3347   - * @throws IOException
3348   - * Signals that an I/O exception has occurred.
  3204 + * @throws IOException Signals that an I/O exception has occurred.
3349 3205 */
3350 3206 private static TermvectorNumberBasic computeTermvectorNumberBasic(
3351 3207 TermsEnum termsEnum, LeafReader r) throws IOException {
... ... @@ -3364,21 +3220,14 @@ public class CodecCollector {
3364 3220 /**
3365 3221 * Compute termvector number basic.
3366 3222 *
3367   - * @param docSet
3368   - * the doc set
3369   - * @param termDocId
3370   - * the term doc id
3371   - * @param termsEnum
3372   - * the terms enum
3373   - * @param r
3374   - * the r
3375   - * @param lrc
3376   - * the lrc
3377   - * @param postingsEnum
3378   - * the postings enum
  3223 + * @param docSet the doc set
  3224 + * @param termDocId the term doc id
  3225 + * @param termsEnum the terms enum
  3226 + * @param r the r
  3227 + * @param lrc the lrc
  3228 + * @param postingsEnum the postings enum
3379 3229 * @return the termvector number basic
3380   - * @throws IOException
3381   - * Signals that an I/O exception has occurred.
  3230 + * @throws IOException Signals that an I/O exception has occurred.
3382 3231 */
3383 3232 private static TermvectorNumberBasic computeTermvectorNumberBasic(
3384 3233 List<Integer> docSet, int termDocId, TermsEnum termsEnum, LeafReader r,
... ... @@ -3417,23 +3266,15 @@ public class CodecCollector {
3417 3266 /**
3418 3267 * Compute termvector number full.
3419 3268 *
3420   - * @param docSet
3421   - * the doc set
3422   - * @param termDocId
3423   - * the term doc id
3424   - * @param termsEnum
3425   - * the terms enum
3426   - * @param r
3427   - * the r
3428   - * @param lrc
3429   - * the lrc
3430   - * @param postingsEnum
3431   - * the postings enum
3432   - * @param positionsData
3433   - * the positions data
  3269 + * @param docSet the doc set
  3270 + * @param termDocId the term doc id
  3271 + * @param termsEnum the terms enum
  3272 + * @param r the r
  3273 + * @param lrc the lrc
  3274 + * @param postingsEnum the postings enum
  3275 + * @param positionsData the positions data
3434 3276 * @return the termvector number full
3435   - * @throws IOException
3436   - * Signals that an I/O exception has occurred.
  3277 + * @throws IOException Signals that an I/O exception has occurred.
3437 3278 */
3438 3279 private static TermvectorNumberFull computeTermvectorNumberFull(
3439 3280 List<Integer> docSet, int termDocId, TermsEnum termsEnum, LeafReader r,
... ...
src/mtas/codec/util/CodecComponent.java
... ... @@ -282,10 +282,10 @@ public class CodecComponent {
282 282 public HashMap<Integer, String> uniqueKey;
283 283  
284 284 /** The stats. */
285   - public HashMap<Integer, MtasDataCollector> stats;
  285 + public HashMap<Integer, MtasDataCollector<?,?>> stats;
286 286  
287 287 /** The list. */
288   - public HashMap<Integer, MtasDataCollector> list;
  288 + public HashMap<Integer, MtasDataCollector<?,?>> list;
289 289  
290 290 /**
291 291 * Instantiates a new component distinct.
... ... @@ -315,9 +315,9 @@ public class CodecComponent {
315 315 prefix + MtasToken.DELIMITER + regexp + "\u0000*");
316 316 compiledAutomaton = new CompiledAutomaton(re.toAutomaton());
317 317 }
318   - this.stats = new HashMap<Integer, MtasDataCollector>();
  318 + this.stats = new HashMap<Integer, MtasDataCollector<?,?>>();
319 319 if (this.number > 0) {
320   - this.list = new HashMap<Integer, MtasDataCollector>();
  320 + this.list = new HashMap<Integer, MtasDataCollector<?,?>>();
321 321 } else {
322 322 this.list = null;
323 323 }
... ... @@ -2106,12 +2106,12 @@ public class CodecComponent {
2106 2106 * @param newKey the new key
2107 2107 * @return the hash map[]
2108 2108 */
2109   - private static HashMap[] keyToSubSubObject(String key,
  2109 + private static HashMap<String, String>[] keyToSubSubObject(String key,
2110 2110 StringBuilder newKey) {
2111 2111 if (key != "") {
2112 2112 newKey.append(" [");
2113 2113 String prefix, postfix, parts[] = key.split(Pattern.quote("&"));
2114   - HashMap[] result = new HashMap[parts.length];
  2114 + HashMap<String,String>[] result = new HashMap[parts.length];
2115 2115 Pattern pattern = Pattern.compile("^([^\\.]*)\\.([^\\.]*)$");
2116 2116 Decoder decoder = Base64.getDecoder();
2117 2117 Matcher matcher;
... ... @@ -2164,8 +2164,8 @@ public class CodecComponent {
2164 2164 * @param newKey the new key
2165 2165 * @return the hash map
2166 2166 */
2167   - private static HashMap keyToSubObject(String key, StringBuilder newKey) {
2168   - HashMap<Integer, HashMap[]> result = new HashMap<Integer, HashMap[]>();
  2167 + private static HashMap<Integer, HashMap<String,String>[]> keyToSubObject(String key, StringBuilder newKey) {
  2168 + HashMap<Integer, HashMap<String,String>[]> result = new HashMap();
2169 2169 if (key == null || key.trim().equals("")) {
2170 2170 return null;
2171 2171 } else {
... ... @@ -2188,13 +2188,13 @@ public class CodecComponent {
2188 2188 * @param newKey the new key
2189 2189 * @return the hash map
2190 2190 */
2191   - public static HashMap keyToObject(String key, StringBuilder newKey) {
  2191 + public static HashMap<String, HashMap<Integer, HashMap<String,String>[]>> keyToObject(String key, StringBuilder newKey) {
2192 2192 if (key.startsWith(KEY_START)) {
2193 2193 String content = key.substring(KEY_START.length());
2194 2194 StringBuilder keyLeft = new StringBuilder(""),
2195 2195 keyHit = new StringBuilder(""), keyRight = new StringBuilder("");
2196   - HashMap<String, HashMap<Integer, HashMap[]>> result = new HashMap<String, HashMap<Integer, HashMap[]>>();
2197   - HashMap<Integer, HashMap[]> resultLeft = null, resultHit = null,
  2196 + HashMap<String, HashMap<Integer, HashMap<String,String>[]>> result = new HashMap<String, HashMap<Integer, HashMap<String,String>[]>>();
  2197 + HashMap<Integer, HashMap<String,String>[]> resultLeft = null, resultHit = null,
2198 2198 resultRight = null;
2199 2199 String[] parts = content.split(Pattern.quote("|"), -1);
2200 2200 if (parts.length == 3) {
... ...
src/mtas/codec/util/collector/MtasDataCollector.java
... ... @@ -111,7 +111,7 @@ public abstract class MtasDataCollector&lt;T1 extends Number &amp; Comparable&lt;T1&gt;, T2 e
111 111  
112 112 /** The segment keys. */
113 113 public transient HashSet<String> segmentKeys;
114   -
  114 +
115 115 /** The segment values boundary. */
116 116 protected transient LinkedHashMap<String, T1> segmentValuesBoundary;
117 117  
... ... @@ -180,7 +180,8 @@ public abstract class MtasDataCollector&lt;T1 extends Number &amp; Comparable&lt;T1&gt;, T2 e
180 180  
181 181 /** The new error list. */
182 182 protected transient HashMap<String, Integer>[] newErrorList;
183   -
  183 +
  184 + /** The new known key found in segment. */
184 185 public transient HashSet<String> newKnownKeyFoundInSegment;
185 186  
186 187 /** The new sub collector types. */
... ... @@ -560,7 +561,7 @@ public abstract class MtasDataCollector&lt;T1 extends Number &amp; Comparable&lt;T1&gt;, T2 e
560 561 "collector should be " + DataCollector.COLLECTOR_TYPE_LIST);
561 562 } else if (key == null) {
562 563 throw new IOException("key shouldn't be null");
563   - } else {
  564 + } else {
564 565 // check previous added
565 566 if ((newPosition > 0)
566 567 && newKeyList[(newPosition - 1)].compareTo(key) >= 0) {
... ... @@ -603,8 +604,8 @@ public abstract class MtasDataCollector&lt;T1 extends Number &amp; Comparable&lt;T1&gt;, T2 e
603 604 }
604 605 newCurrentPosition = newPosition - 1;
605 606 newCurrentExisting = true;
606   - //register known key found again in segment
607   - newKnownKeyFoundInSegment.add(key);
  607 + // register known key found again in segment
  608 + newKnownKeyFoundInSegment.add(key);
608 609 // ready
609 610 if (hasSub) {
610 611 return newSubCollectorListNextLevel[newCurrentPosition];
... ... @@ -892,7 +893,6 @@ public abstract class MtasDataCollector&lt;T1 extends Number &amp; Comparable&lt;T1&gt;, T2 e
892 893 * @param value the value
893 894 * @param maximumNumber the maximum number
894 895 * @param segmentNumber the segment number
895   - * @param alreadyFound the already found
896 896 * @return the string
897 897 * @throws IOException Signals that an I/O exception has occurred.
898 898 */
... ... @@ -923,7 +923,7 @@ public abstract class MtasDataCollector&lt;T1 extends Number &amp; Comparable&lt;T1&gt;, T2 e
923 923 }
924 924  
925 925 /**
926   - * Validate segment value old.
  926 + * Validate segment value.
927 927 *
928 928 * @param key the key
929 929 * @param value the value
... ... @@ -933,7 +933,7 @@ public abstract class MtasDataCollector&lt;T1 extends Number &amp; Comparable&lt;T1&gt;, T2 e
933 933 * @return the string
934 934 * @throws IOException Signals that an I/O exception has occurred.
935 935 */
936   - public String validateSegmentValueOld(String key, T1 value, int maximumNumber,
  936 + public String validateSegmentValue(String key, T1 value, int maximumNumber,
937 937 int segmentNumber, boolean test) throws IOException {
938 938 if (!closed) {
939 939 if (segmentRegistration != null) {
... ...
src/site/markdown/index.md
1 1 #Multi Tier Annotation Search
2 2  
3   -
4   -
5   -
  3 +In recent years, multiple solutions have become available providing search on huge amounts of plain text and metadata. Scalable searchability on annotated text, however, still appears to be problematic. We add annotation layers and structure to the existing Lucene approach of creating and searching indexes, and furthermore present an implementation as a Solr plugin providing both searchability and scalability.
... ...
src/site/markdown/indexing.md
1 1 #Indexing
2 2  
3   -
  3 +To the existing Lucene approach of creating and searching indexes, we add annotation and structure by using prefixes to distinguish between text and the different annotations, and we use the payload to encode this additional information. The use of prefixes provides a direct solution to store and search for annotations on separate words within a text, and only an adjusted tokenizer is needed to offer the correct token stream to the indexer. To be able to store ranges of words (e.g. sentences, paragraphs, chapters), distinct sets of words (e.g. named entities), and hierarchical relations between all of these annotations or words, we encode this information as an array of bytes and store it as a payload.
4 4  
5 5  
... ...
src/site/markdown/installation.md.vm
1   -#Getting started
  1 +#set($h1 = '#')
  2 +#set($h2 = '##')
  3 +#set($h3 = '###')
  4 +#set($h4 = '####')
  5 +$h1 Getting started
2 6  
3 7 Prebuilt jar libraries are available from the [download](download.html) page; the current version is [${currentDevelopmentVersion}, release ${currentDevelopmentRelease}](https://github.com/meertensinstituut/mtas/releases/download/${currentDevelopmentRelease}/mtas-${currentDevelopmentVersion}.jar) (development).
4 8  
5   -###Build from Source with Maven
  9 +$h3 Build from Source with Maven
6 10  
7 11 Download the source code from GitHub
8 12  
... ... @@ -18,11 +22,13 @@ mvn package
18 22  
19 23 After a successful build, the directory `target` will contain the new jar library.
20 24  
21   -###Installation
  25 +$h3 Installation
22 26  
23 27 Mtas can be used as a plugin for [Apache Solr](http://lucene.apache.org/solr/) or as a library in combination with [Apache Lucene](http://lucene.apache.org/).
24 28  
25 29 - Getting started with [Mtas and Lucene](installation_lucene.html)
26 30 - Getting started with [Mtas and Solr](installation_solr.html)
27 31  
  32 +or see [Mtas and Docker](installation_docker.html) to get started quickly.
  33 +
28 34 Versioning follows the required version of both Solr and Lucene, the latest available version is ${currentDevelopmentVersion}.
... ...
src/site/markdown/installation_docker.md 0 → 100644
  1 +#Mtas and Docker
... ...
src/site/markdown/installation_lucene.md
1 1 #Mtas and Lucene
  2 +
... ...
src/site/markdown/installation_solr.md deleted
1   -#Mtas and Solr
src/site/markdown/installation_solr.md.vm 0 → 100644
  1 +#set($h1 = '#')
  2 +#set($h2 = '##')
  3 +#set($h3 = '###')
  4 +#set($h4 = '####')
  5 +$h1 Mtas and Solr
  6 +
  7 +Mtas can be used as a plugin for Apache Solr.
  8 +
  9 +Prerequisites
  10 +
  11 +- Installed [Apache Solr](http://lucene.apache.org/solr/)
  12 +- Currently supported and advised version is ${currentDevelopmentVersion}
  13 +
  14 +Start with a new Solr core.
  15 +
  16 +$h3 Libraries
  17 +
  18 +Add the `mtas-${currentDevelopmentVersion}.jar` to the `lib` directory of the new Solr core.
  19 +A prebuilt `mtas-${currentDevelopmentVersion}.jar` is available from the [download](download.html) page.
  20 +
  21 +Furthermore, add the [Apache Commons Mathematics Library](http://commons.apache.org/proper/commons-math/) to the `lib` directory of the new Solr core.
  22 +
  23 +$h3 Solrconfig.xml
  24 +
  25 +Some changes have to be made within the `solrconfig.xml` file: elements have to be added to the `<config/>` element, or existing elements have to be adjusted.
  26 +
  27 +Define a new **mtas searchComponent**:
  28 +
  29 +```console
  30 +<searchComponent name="mtas" class="mtas.solr.handler.component.MtasSolrSearchComponent"/>
  31 +```
  32 +
  33 +Add this component to the select requestHandler by inserting the following within the
  34 +`<requestHandler/>` with name `"/select"`:
  35 +
  36 +``` console
  37 +<arr name="last-components">
  38 + <str>mtas</str>
  39 +</arr>
  40 +```
  41 +
  42 +Define a new **mtas_cql queryParser**:
  43 +
  44 +```console
  45 +<queryParser name="mtas_cql" class="mtas.solr.search.MtasSolrCQLQParserPlugin"/>
  46 +```
  47 +
  48 +Define a new **mtas requestHandler**:
  49 +
  50 +```console
  51 +<requestHandler name="/mtas" class="mtas.solr.handler.MtasRequestHandler" />
  52 +```
  53 +
  54 +Define a new updateRequestProcessorChain:
  55 +
  56 +```console
  57 +<updateRequestProcessorChain name="mtasUpdateProcessor">
  58 + <processor class="mtas.solr.update.processor.MtasUpdateRequestProcessorFactory" />
  59 + <processor class="solr.LogUpdateProcessorFactory" />
  60 + <processor class="solr.RunUpdateProcessorFactory" />
  61 +</updateRequestProcessorChain>
  62 +```
  63 +
  64 +Define or adjust the update requestHandler with this updateRequestProcessorChain:
  65 +
  66 +```console
  67 +<requestHandler name="/update" class="solr.UpdateRequestHandler">
  68 + <lst name="defaults">
  69 + <str name="update.chain">mtasUpdateProcessor</str>
  70 + </lst>
  71 +</requestHandler>
  72 +```
  73 +
  74 +Finally, in this instruction we will use a classic schema instead of the managed-schema. Therefore define the correct schemaFactory.
  75 +
  76 +```console
  77 +<schemaFactory class="ClassicIndexSchemaFactory"/>
  78 +```
  79 +
  80 +
... ...
src/site/site.xml
... ... @@ -24,6 +24,7 @@
24 24 <item name="Getting started" href="installation.html" collapse="true">
25 25 <item name="Lucene" href="installation_lucene.html"/>
26 26 <item name="Solr" href="installation_solr.html"/>
  27 + <item name="Docker" href="installation_docker.html"/>
27 28 </item>
28 29 <item name="Indexing" href="indexing.html" collapse="true">
29 30 <item name="Configuration" href="indexing_configuration.html"/>
... ...