From 7179e4b769f7bea15982a6bc97cdd9f6e2a4adf0 Mon Sep 17 00:00:00 2001 From: Matthijs Brouwer <matthijs@brouwer.info> Date: Thu, 7 Sep 2017 07:24:55 +0200 Subject: [PATCH] documentation, bugfixes --- conf/parser/mtas/chat_test.xml | 9 --------- conf/parser/mtas/folia_pm.xml | 3 ++- conf/solr/solrconfig.xml | 11 ++++++----- docker/Dockerfile | 4 ++-- docker/solrconfig.xml | 8 ++++---- junit/data/conf/solrconfig.xml | 8 ++++---- junit/mtas/solr/MtasSolrBase.java | 182 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------------------- junit/mtas/solr/MtasSolrTestDistributedSearchConsistency.java | 371 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----- junit/mtas/solr/MtasSolrTestSearchConsistency.java | 385 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------ pom.xml | 5 +++-- src/mtas/analysis/parser/MtasXMLParser.java | 2 +- src/mtas/analysis/util/MtasCharFilterFactory.java | 41 +++++++++++++++++++++++++++-------------- src/mtas/analysis/util/MtasTokenizerFactory.java | 4 +--- src/mtas/codec/MtasFieldsProducer.java | 47 +++++++++++++++++++++++++++++++++-------------- src/mtas/codec/util/CodecCollector.java | 83 +++++++++++++++++++++++++++++++++++++++++++++++------------------------------------ 
src/mtas/codec/util/CodecComponent.java |src/mtas/codec/util/CodecUtil.java | 14 +++++++------- src/mtas/codec/util/collector/MtasDataCollector.java | 2 +- src/mtas/solr/handler/component/MtasSolrSearchComponent.java | 156 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------------------------------------------- src/mtas/solr/handler/component/util/MtasSolrCollectionResult.java | 355 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/mtas/solr/handler/component/util/MtasSolrComponentCollection.java |src/mtas/solr/handler/component/util/MtasSolrComponentDocument.java | 10 ++++++---- src/mtas/solr/handler/component/util/MtasSolrComponentFacet.java | 4 ++-- src/mtas/solr/handler/component/util/MtasSolrComponentGroup.java | 12 +++++++++--- src/mtas/solr/handler/component/util/MtasSolrComponentJoin.java | 193 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- src/mtas/solr/handler/component/util/MtasSolrComponentPrefix.java | 4 ++-- src/mtas/solr/handler/component/util/MtasSolrComponentStats.java | 8 ++++---- src/mtas/solr/handler/component/util/MtasSolrComponentTermvector.java | 4 ++-- src/mtas/solr/handler/component/util/MtasSolrJoinResult.java | 55 ------------------------------------------------------- src/mtas/solr/handler/component/util/MtasSolrResultMerge.java | 76 +++++++++++++--------------------------------------------------------------- src/mtas/solr/handler/component/util/MtasSolrResultUtil.java | 163 
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------------------------------------------- src/mtas/solr/search/MtasJoinQParser.java | 86 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------- src/mtas/solr/search/MtasSolrCollectionCache.java | 565 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/mtas/solr/search/MtasSolrJoinCache.java | 364 ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- src/mtas/solr/search/MtasSolrJoinQParserPlugin.java | 7 +++++++ src/mtas/solr/update/processor/MtasUpdateRequestProcessorFactory.java | 5 +++-- src/site/markdown/features.md | 6 +++--- src/site/markdown/index.md | 2 +- src/site/markdown/installation_lucene.md.vm | 2 +- src/site/markdown/search.md | 2 +- src/site/markdown/search_component.md | 41 +++++++++++++++++++++++++++++++++++++++++ src/site/markdown/search_component_collection.md | 26 ++++++++++++++++++++++++++ src/site/markdown/search_component_document.md | 284 
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/site/markdown/search_component_facet.md | 266 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/site/markdown/search_component_group.md |src/site/markdown/search_component_kwic.md | 326 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/site/markdown/search_component_list.md | 347 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/site/markdown/search_component_prefix.md | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/site/markdown/search_component_stats.md | 10 ++++++++++ src/site/markdown/search_component_stats_positions.md | 142 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/site/markdown/search_component_stats_spans.md | 416 
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/site/markdown/search_component_stats_tokens.md | 142 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/site/markdown/search_component_termvector.md | 326 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/site/markdown/search_configuration.md | 6 ++---- src/site/markdown/search_cql.md | 4 ++-- src/site/markdown/search_handler.md | 1 + src/site/markdown/search_parser.md | 5 +++++ src/site/markdown/search_parser_cql.md | 4 ++++ src/site/markdown/search_parser_join.md | 4 ++++ src/site/markdown/search_query.md | 41 ----------------------------------------- src/site/markdown/search_query_document.md | 284 -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- src/site/markdown/search_query_facet.md | 12 ------------ src/site/markdown/search_query_group.md | 11 ----------- src/site/markdown/search_query_kwic.md | 326 
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- src/site/markdown/search_query_list.md | 347 ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- src/site/markdown/search_query_prefix.md | 73 ------------------------------------------------------------------------- src/site/markdown/search_query_stats.md | 10 ---------- src/site/markdown/search_query_stats_positions.md | 142 ---------------------------------------------------------------------------------------------------------------------------------------------- src/site/markdown/search_query_stats_spans.md | 416 -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- src/site/markdown/search_query_stats_tokens.md | 142 ---------------------------------------------------------------------------------------------------------------------------------------------- src/site/markdown/search_query_termvector.md | 326 
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- src/site/markdown/search_sharding.md | 2 +- src/site/markdown/search_stats.md | 4 ++-- src/site/site.xml | 31 +++++++++++++++++++------------ 74 files changed, 6682 insertions(+), 3404 deletions(-) create mode 100644 src/mtas/solr/handler/component/util/MtasSolrCollectionResult.java create mode 100644 src/mtas/solr/handler/component/util/MtasSolrComponentCollection.java delete mode 100644 src/mtas/solr/handler/component/util/MtasSolrComponentJoin.java delete mode 100644 src/mtas/solr/handler/component/util/MtasSolrJoinResult.java create mode 100644 src/mtas/solr/search/MtasSolrCollectionCache.java delete mode 100644 src/mtas/solr/search/MtasSolrJoinCache.java create mode 100644 src/site/markdown/search_component.md create mode 100644 src/site/markdown/search_component_collection.md create mode 100644 src/site/markdown/search_component_document.md create mode 100644 src/site/markdown/search_component_facet.md create mode 100644 src/site/markdown/search_component_group.md create mode 100644 src/site/markdown/search_component_kwic.md create mode 100644 src/site/markdown/search_component_list.md create mode 100644 src/site/markdown/search_component_prefix.md create mode 100644 src/site/markdown/search_component_stats.md create mode 100644 src/site/markdown/search_component_stats_positions.md create mode 100644 src/site/markdown/search_component_stats_spans.md create mode 100644 src/site/markdown/search_component_stats_tokens.md create mode 100644 src/site/markdown/search_component_termvector.md create mode 100644 src/site/markdown/search_handler.md create mode 100644 src/site/markdown/search_parser.md create mode 100644 
src/site/markdown/search_parser_cql.md create mode 100644 src/site/markdown/search_parser_join.md delete mode 100644 src/site/markdown/search_query.md delete mode 100644 src/site/markdown/search_query_document.md delete mode 100644 src/site/markdown/search_query_facet.md delete mode 100644 src/site/markdown/search_query_group.md delete mode 100644 src/site/markdown/search_query_kwic.md delete mode 100644 src/site/markdown/search_query_list.md delete mode 100644 src/site/markdown/search_query_prefix.md delete mode 100644 src/site/markdown/search_query_stats.md delete mode 100644 src/site/markdown/search_query_stats_positions.md delete mode 100644 src/site/markdown/search_query_stats_spans.md delete mode 100644 src/site/markdown/search_query_stats_tokens.md delete mode 100644 src/site/markdown/search_query_termvector.md diff --git a/conf/parser/mtas/chat_test.xml b/conf/parser/mtas/chat_test.xml index b537397..79fbfa4 100644 --- a/conf/parser/mtas/chat_test.xml +++ b/conf/parser/mtas/chat_test.xml @@ -103,15 +103,6 @@ <item type="text" /> </post> </token> - <token type="string" offset="false"> - <pre> - <item type="name" /> - <item type="string" value="_lc" /> - </pre> - <post> - <item type="text" filter="ascii,lowercase" /> - </post> - </token> </mapping> <mapping type="word" name="t"> <token type="string" offset="false"> diff --git a/conf/parser/mtas/folia_pm.xml b/conf/parser/mtas/folia_pm.xml index 46e63c3..3a6d6f4 100644 --- a/conf/parser/mtas/folia_pm.xml +++ b/conf/parser/mtas/folia_pm.xml @@ -25,7 +25,7 @@ <!-- START REFERENCES --> <references> <reference name="wref" ref="id" /> - </references> + </references> <!-- END REFERENCES --> <!-- START MAPPINGS --> @@ -284,6 +284,7 @@ <token type="string" offset="false" realoffset="false" parent="false"> <pre> <item type="name" /> + <item type="ancestorGroupAttribute" name="class" prefix="." /> <item type="attribute" name="subset" prefix="." 
/> </pre> <post> diff --git a/conf/solr/solrconfig.xml b/conf/solr/solrconfig.xml index e2450d1..8d59a17 100644 --- a/conf/solr/solrconfig.xml +++ b/conf/solr/solrconfig.xml @@ -1171,7 +1171,8 @@ http://wiki.apache.org/solr/TermVectorComponent --> - <searchComponent name="tvComponent" class="solr.TermVectorComponent"/> + <!-- <searchComponent name="tvComponent" class="solr.TermVectorComponent"/> --> + <searchComponent name="tvComponent" class="org.apache.solr.handler.component.TermVectorComponent"/> <!-- A request handler for demonstrating the term vector component @@ -1347,10 +1348,10 @@ </searchComponent> <searchComponent name="mtas" class="mtas.solr.handler.component.MtasSolrSearchComponent"> - <!-- <str name="joinCacheDirectory">${solr.core.instanceDir}/cache/join</str> - <long name="joinLifetime">86400</long> - <int name="joinMaximumNumber">1000</int> - <int name="joinMaximumOverflow">10</int> --> + <str name="collectionCacheDirectory">${solr.core.instanceDir}/cache/collection</str> + <long name="collectionLifetime">86400</long> + <int name="collectionMaximumNumber">1000</int> + <int name="collectionMaximumOverflow">10</int> </searchComponent> <!-- Update Processors diff --git a/docker/Dockerfile b/docker/Dockerfile index 156a74d..0eba193 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,5 +1,5 @@ # Automatically generated Dockerfile -# - Build 2017-07-13 14:19 +# - Build 2017-09-06 06:47 # - Lucene/Solr version 6.6.0 # - Mtas release 20170713 # @@ -74,7 +74,7 @@ RUN service apache2 stop && \ chmod -R 755 /var/www/html && \ printf "echo\n" >> /start.sh && \ printf "echo \"================ Mtas -- Multi Tier Annotation Search =================\"\n" >> /start.sh && \ - printf "echo \" Timestamp 2017-07-13 14:19\"\n" >> /start.sh && \ + printf "echo \" Timestamp 2017-09-06 06:47\"\n" >> /start.sh && \ printf "echo \" Lucene/Solr version 6.6.0\"\n" >> /start.sh && \ printf "echo \" Mtas release 20170713\"\n" >> /start.sh && \ printf "echo \" See 
https://meertensinstituut.github.io/mtas/ for more information\"\n" >> /start.sh && \ diff --git a/docker/solrconfig.xml b/docker/solrconfig.xml index 0a358a8..38e523b 100644 --- a/docker/solrconfig.xml +++ b/docker/solrconfig.xml @@ -1246,10 +1246,10 @@ <!-- MTAS: searchComponent --> <searchComponent name="mtas" class="mtas.solr.handler.component.MtasSolrSearchComponent"> - <!-- <str name="joinCacheDirectory">${solr.core.instanceDir}/cache/join</str> - <long name="joinLifetime">86400</long> - <int name="joinMaximumNumber">1000</int> - <int name="joinMaximumOverflow">10</int> --> + <str name="collectionCacheDirectory">${solr.core.instanceDir}/cache/collection</str> + <long name="collectionLifetime">86400</long> + <int name="collectionMaximumNumber">1000</int> + <int name="collectionMaximumOverflow">10</int> </searchComponent> diff --git a/junit/data/conf/solrconfig.xml b/junit/data/conf/solrconfig.xml index d813a82..5d68f58 100644 --- a/junit/data/conf/solrconfig.xml +++ b/junit/data/conf/solrconfig.xml @@ -1347,10 +1347,10 @@ </searchComponent> <searchComponent name="mtas" class="mtas.solr.handler.component.MtasSolrSearchComponent"> - <!-- <str name="joinCacheDirectory">${solr.core.instanceDir}/cache/join</str> - <long name="joinLifetime">86400</long> - <int name="joinMaximumNumber">1000</int> - <int name="joinMaximumOverflow">10</int> --> + <str name="collectionCacheDirectory">${solr.core.instanceDir}/cache/collection</str> + <long name="collectionLifetime">86400</long> + <int name="collectionMaximumNumber">1000</int> + <int name="collectionMaximumOverflow">10</int> </searchComponent> <!-- Update Processors diff --git a/junit/mtas/solr/MtasSolrBase.java b/junit/mtas/solr/MtasSolrBase.java index 6466065..ace8bea 100644 --- a/junit/mtas/solr/MtasSolrBase.java +++ b/junit/mtas/solr/MtasSolrBase.java @@ -14,6 +14,7 @@ import java.util.Map.Entry; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import 
org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.util.NamedList; @@ -22,6 +23,24 @@ import org.apache.solr.common.util.NamedList; */ public class MtasSolrBase { + /** The field id. */ + public static String FIELD_ID = "id"; + + /** The field title. */ + public static String FIELD_TITLE = "title"; + + /** The field text. */ + public static String FIELD_TEXT = "text"; + + /** The field mtas. */ + public static String FIELD_MTAS = "mtas"; + + /** The field mtas advanced. */ + public static String FIELD_MTAS_ADVANCED = "mtasAdvanced"; + + /** The field source. */ + public static String FIELD_SOURCE = "source"; + /** * Instantiates a new mtas solr base. */ @@ -33,6 +52,28 @@ public class MtasSolrBase { private static Log log = LogFactory.getLog(MtasSolrBase.class); /** + * Gets the num found. + * + * @param response the response + * @return the num found + */ + public static long getNumFound(NamedList<Object> response) { + if (response == null) { + log.error("no (valid); response"); + } else { + Object mtasResponseRaw = response.get("response"); + if (mtasResponseRaw != null + && mtasResponseRaw instanceof SolrDocumentList) { + SolrDocumentList mtasResponse = (SolrDocumentList) mtasResponseRaw; + return mtasResponse.getNumFound(); + } else { + log.error("unexpected " + mtasResponseRaw); + } + } + return 0; + } + + /** * Gets the from stats. 
* * @param response the response @@ -52,11 +93,11 @@ public class MtasSolrBase { Object mtasStatsFieldsRaw = mtasStats.get("stats_fields"); if (mtasStatsFieldsRaw != null && mtasStatsFieldsRaw instanceof NamedList) { - NamedList<Object> mtasStatsFields = (NamedList) mtasStatsFieldsRaw; + NamedList<Object> mtasStatsFields = (NamedList<Object>) mtasStatsFieldsRaw; Object mtasStatsFieldsFieldRaw = mtasStatsFields.get(field); if (mtasStatsFieldsFieldRaw != null && mtasStatsFieldsFieldRaw instanceof NamedList) { - NamedList<Object> mtasStatsFieldsField = (NamedList) mtasStatsFieldsFieldRaw; + NamedList<Object> mtasStatsFieldsField = (NamedList<Object>) mtasStatsFieldsFieldRaw; Object mtasStatsFieldsFieldNameRaw = mtasStatsFieldsField.get(name); if (mtasStatsFieldsFieldNameRaw != null && mtasStatsFieldsFieldNameRaw instanceof Number) { @@ -148,18 +189,19 @@ public class MtasSolrBase { * @param key the key * @return the from mtas termvector */ - public static List<NamedList> getFromMtasTermvector( + public static List<NamedList<Object>> getFromMtasTermvector( NamedList<Object> response, String key) { if (response == null) { log.error("no (valid); response"); } else { Object mtasResponseRaw = response.get("mtas"); if (mtasResponseRaw != null && mtasResponseRaw instanceof NamedList) { - NamedList<Object> mtasResponse = (NamedList) response.get("mtas"); + NamedList<Object> mtasResponse = (NamedList<Object>) response + .get("mtas"); Object mtasTermvectorResponseRaw = mtasResponse.get("termvector"); if (mtasTermvectorResponseRaw != null && mtasTermvectorResponseRaw instanceof List) { - List<NamedList> mtasTermvectorResponse = (List) mtasTermvectorResponseRaw; + List<NamedList<Object>> mtasTermvectorResponse = (List) mtasTermvectorResponseRaw; if (mtasTermvectorResponse.isEmpty()) { log.error("no (valid) mtas termvector response"); } else { @@ -173,9 +215,9 @@ public class MtasSolrBase { } } assertFalse("no item with key " + key, item == null); - if (item.get("list") != null + 
if (item != null && item.get("list") != null && (item.get("list") instanceof List)) { - return (List<NamedList>) item.get("list"); + return (List<NamedList<Object>>) item.get("list"); } } } else { @@ -221,14 +263,16 @@ public class MtasSolrBase { } assertFalse("no item with key " + key, item == null); Map<String, List<String>> result = new HashMap<>(); - Iterator<Entry<String, Object>> it = item.iterator(); - Entry<String, Object> entry; - while (it.hasNext()) { - entry = it.next(); - if (!entry.getKey().equals("key")) { - assertTrue("invalid entry prefix", - entry.getValue() instanceof List); - result.put(entry.getKey(), (List) entry.getValue()); + if (item != null) { + Iterator<Entry<String, Object>> it = item.iterator(); + Entry<String, Object> entry; + while (it.hasNext()) { + entry = it.next(); + if (!entry.getKey().equals("key")) { + assertTrue("invalid entry prefix", + entry.getValue() instanceof List); + result.put(entry.getKey(), (List) entry.getValue()); + } } } return result; @@ -240,6 +284,78 @@ public class MtasSolrBase { } /** + * Gets the from mtas collection. 
+ * + * @param response the response + * @param key the key + * @return the from mtas collection + */ + public static NamedList<Object> getFromMtasCollection( + NamedList<Object> response, String key) { + if (response == null) { + log.error("no (valid); response"); + } else { + Object mtasResponseRaw = response.get("mtas"); + if (mtasResponseRaw != null && mtasResponseRaw instanceof NamedList) { + NamedList<Object> mtasResponse = (NamedList<Object>) response + .get("mtas"); + Object mtasCollectionResponseRaw = mtasResponse.get("collection"); + if (mtasCollectionResponseRaw != null + && mtasCollectionResponseRaw instanceof List) { + List<NamedList<Object>> mtasCollectionResponse = (List<NamedList<Object>>) mtasCollectionResponseRaw; + if (mtasCollectionResponse.isEmpty()) { + log.error("no (valid) mtas join response"); + } else { + for (NamedList<Object> mtasCollectionResponseItem : mtasCollectionResponse) { + if (mtasCollectionResponseItem.get("key") != null + && (mtasCollectionResponseItem.get("key") instanceof String) + && mtasCollectionResponseItem.get("key").equals(key)) { + return mtasCollectionResponseItem; + } + } + } + } else { + log.error("unexpected " + mtasCollectionResponseRaw); + } + } else { + log.error("unexpected " + mtasResponseRaw); + } + } + return null; + } + + /** + * Gets the from mtas collection list. 
+ * + * @param response the response + * @param key the key + * @param id the id + * @return the from mtas collection list + */ + public static NamedList<Object> getFromMtasCollectionList( + NamedList<Object> response, String key, String id) { + NamedList<Object> collectionResponse = getFromMtasCollection(response, key); + if (collectionResponse != null) { + Object collectionResponseListRaw = collectionResponse.get("list"); + if (collectionResponseListRaw != null && collectionResponseListRaw instanceof List) { + List<NamedList<Object>> collectionResponseList = (List<NamedList<Object>>) collectionResponseListRaw; + for (NamedList<Object> item : collectionResponseList) { + if (item.get("id") != null && item.get("id") instanceof String) { + if (id.equals((String) item.get("id"))) { + return item; + } + } + } + } else { + log.error("unexpected " + collectionResponseListRaw + " (searching list)"); + } + } else { + log.error("no collectionResponse (searching key " + key + ")"); + } + return null; + } + + /** * Delete directory. 
* * @param directory the directory @@ -273,38 +389,38 @@ public class MtasSolrBase { Path dataPath = Paths.get("junit").resolve("data"); // data SolrInputDocument newDoc1 = new SolrInputDocument(); - newDoc1.addField("id", "1"); - newDoc1.addField("title", "Een onaangenaam mens in de Haarlemmerhout"); - newDoc1.addField("text", "Een onaangenaam mens in de Haarlemmerhout"); - newDoc1.addField("mtas", dataPath.resolve("resources") + newDoc1.addField(FIELD_ID, "1"); + newDoc1.addField(FIELD_TITLE, "Een onaangenaam mens in de Haarlemmerhout"); + newDoc1.addField(FIELD_TEXT, "Een onaangenaam mens in de Haarlemmerhout"); + newDoc1.addField(FIELD_MTAS, dataPath.resolve("resources") .resolve("beets1.xml.gz").toFile().getAbsolutePath()); if (includeAdvanced) { - newDoc1.addField("source", "source1"); - newDoc1.addField("mtasAdvanced", dataPath.resolve("resources") + newDoc1.addField(FIELD_SOURCE, "source1"); + newDoc1.addField(FIELD_MTAS_ADVANCED, dataPath.resolve("resources") .resolve("beets1").toFile().getAbsolutePath()); } solrDocuments.put(1, newDoc1); SolrInputDocument newDoc2 = new SolrInputDocument(); - newDoc2.addField("id", "2"); - newDoc2.addField("title", "Een oude kennis"); - newDoc2.addField("text", "Een oude kennis"); - newDoc2.addField("mtas", dataPath.resolve("resources") + newDoc2.addField(FIELD_ID, "2"); + newDoc2.addField(FIELD_TITLE, "Een oude kennis"); + newDoc2.addField(FIELD_TEXT, "Een oude kennis"); + newDoc2.addField(FIELD_MTAS, dataPath.resolve("resources") .resolve("beets2.xml.gz").toFile().getAbsolutePath()); if (includeAdvanced) { - newDoc2.addField("source", "source2"); - newDoc2.addField("mtasAdvanced", dataPath.resolve("resources") + newDoc2.addField(FIELD_SOURCE, "source2"); + newDoc2.addField(FIELD_MTAS_ADVANCED, dataPath.resolve("resources") .resolve("beets2.xml").toFile().getAbsolutePath()); } SolrInputDocument newDoc3 = new SolrInputDocument(); solrDocuments.put(2, newDoc2); - newDoc3.addField("id", "3"); - newDoc3.addField("title", 
"Varen en Rijden"); - newDoc3.addField("text", "Varen en Rijden"); - newDoc3.addField("mtas", dataPath.resolve("resources") + newDoc3.addField(FIELD_ID, "3"); + newDoc3.addField(FIELD_TITLE, "Varen en Rijden"); + newDoc3.addField(FIELD_TEXT, "Varen en Rijden"); + newDoc3.addField(FIELD_MTAS, dataPath.resolve("resources") .resolve("beets3.xml.gz").toFile().getAbsolutePath()); if (includeAdvanced) { - newDoc3.addField("source", "source3"); - newDoc3.addField("mtasAdvanced", dataPath.resolve("resources") + newDoc3.addField(FIELD_SOURCE, "source3"); + newDoc3.addField(FIELD_MTAS_ADVANCED, dataPath.resolve("resources") .resolve("beets3.xml.gz").toFile().getAbsolutePath()); } solrDocuments.put(3, newDoc3); diff --git a/junit/mtas/solr/MtasSolrTestDistributedSearchConsistency.java b/junit/mtas/solr/MtasSolrTestDistributedSearchConsistency.java index be3b761..041c8aa 100644 --- a/junit/mtas/solr/MtasSolrTestDistributedSearchConsistency.java +++ b/junit/mtas/solr/MtasSolrTestDistributedSearchConsistency.java @@ -225,7 +225,7 @@ public class MtasSolrTestDistributedSearchConsistency { list.get(COLLECTION_DISTRIBUTED).getResponse(), "tv", new String[] { "n", "sum" }); for (Entry<String, QueryResponse> entry : list.entrySet()) { - List<NamedList> tv = MtasSolrBase + List<NamedList<Object>> tv = MtasSolrBase .getFromMtasTermvector(entry.getValue().getResponse(), "tv"); for (NamedList<Object> item : tv) { String key = item.get("key").toString(); @@ -311,6 +311,310 @@ public class MtasSolrTestDistributedSearchConsistency { } /** + * Mtas request handler collection 1. + * + * @throws IOException Signals that an I/O exception has occurred. 
+ */ + @org.junit.Test + public void mtasRequestHandlerCollection1() throws IOException { + String[] collections = new String[] { COLLECTION_ALL_OPTIMIZED, + COLLECTION_ALL_MULTIPLE_SEGMENTS, COLLECTION_DISTRIBUTED }; + String[] collectionsParts = new String[] { COLLECTION_PART1_OPTIMIZED, + COLLECTION_PART2_MULTIPLE_SEGMENTS }; + Map<String, String> listCreateVersion = new HashMap<>(); + Map<String, Number> listCreateSize = new HashMap<>(); + Map<String, String> listPostVersion = new HashMap<>(); + Map<String, Number> listPostSize = new HashMap<>(); + // create + ModifiableSolrParams paramsCreate = new ModifiableSolrParams(); + paramsCreate.set("q", "*:*"); + paramsCreate.set("rows", "0"); + paramsCreate.set("mtas", "true"); + paramsCreate.set("mtas.collection", "true"); + paramsCreate.set("mtas.collection.0.key", "create"); + paramsCreate.set("mtas.collection.0.action", "create"); + paramsCreate.set("mtas.collection.0.id", "idCreate"); + paramsCreate.set("mtas.collection.0.field", MtasSolrBase.FIELD_ID); + Map<String, QueryResponse> listCreate = createResults(paramsCreate, + Arrays.asList(collections)); + for (Entry<String, QueryResponse> entry : listCreate.entrySet()) { + long size = MtasSolrBase.getNumFound(entry.getValue().getResponse()); + NamedList<Object> create = MtasSolrBase + .getFromMtasCollection(entry.getValue().getResponse(), "create"); + createCollectionAssertions(create, entry.getKey(), "idCreate", null, size, + entry.getKey().equals(COLLECTION_DISTRIBUTED) ? 
2 : 0); + listCreateVersion.put(entry.getKey(), (String) create.get("version")); + listCreateSize.put(entry.getKey(), (Number) create.get("size")); + } + // post + ModifiableSolrParams paramsPost = new ModifiableSolrParams(); + paramsPost.set("q", "*:*"); + paramsPost.set("rows", "0"); + paramsPost.set("mtas", "true"); + paramsPost.set("mtas.collection", "true"); + paramsPost.set("mtas.collection.0.key", "post"); + paramsPost.set("mtas.collection.0.action", "post"); + paramsPost.set("mtas.collection.0.id", "idPost"); + paramsPost.set("mtas.collection.0.post", "[1,2,3,4]"); + Map<String, QueryResponse> listPost = createResults(paramsPost, + Arrays.asList(collections)); + for (Entry<String, QueryResponse> entry : listPost.entrySet()) { + long size = 4; + NamedList<Object> post = MtasSolrBase + .getFromMtasCollection(entry.getValue().getResponse(), "post"); + createCollectionAssertions(post, entry.getKey(), "idPost", null, size, + entry.getKey().equals(COLLECTION_DISTRIBUTED) ? 2 : 0); + listPostVersion.put(entry.getKey(), (String) post.get("version")); + listPostSize.put(entry.getKey(), (Number) post.get("size")); + } + // list + ModifiableSolrParams paramsList = new ModifiableSolrParams(); + paramsList.set("q", "*:*"); + paramsList.set("rows", "0"); + paramsList.set("mtas", "true"); + paramsList.set("mtas.collection", "true"); + paramsList.set("mtas.collection.0.key", "list"); + paramsList.set("mtas.collection.0.action", "list"); + Map<String, QueryResponse> listList = createResults(paramsList, + Arrays.asList(collections)); + for (Entry<String, QueryResponse> entry : listList.entrySet()) { + // check create + NamedList<Object> listCreateItem1 = MtasSolrBase + .getFromMtasCollectionList(entry.getValue().getResponse(), "list", + "idCreate"); + createCollectionAssertions(listCreateItem1, entry.getKey(), "idCreate", + listCreateVersion.get(entry.getKey()), + listCreateSize.get(entry.getKey()), + entry.getKey().equals(COLLECTION_DISTRIBUTED) ? 
2 : 0); + // check post + NamedList<Object> listPostItem1 = MtasSolrBase.getFromMtasCollectionList( + entry.getValue().getResponse(), "list", "idPost"); + createCollectionAssertions(listPostItem1, entry.getKey(), "idPost", + listPostVersion.get(entry.getKey()), listPostSize.get(entry.getKey()), + entry.getKey().equals(COLLECTION_DISTRIBUTED) ? 2 : 0); + } + // check + ModifiableSolrParams paramsCheck = new ModifiableSolrParams(); + paramsCheck.set("q", "*:*"); + paramsCheck.set("rows", "0"); + paramsCheck.set("mtas", "true"); + paramsCheck.set("mtas.collection", "true"); + paramsCheck.set("mtas.collection.0.key", "check1"); + paramsCheck.set("mtas.collection.0.action", "check"); + paramsCheck.set("mtas.collection.0.id", "idCreate"); + paramsCheck.set("mtas.collection.1.key", "check2"); + paramsCheck.set("mtas.collection.1.action", "check"); + paramsCheck.set("mtas.collection.1.id", "idPost"); + // check on all + Map<String, QueryResponse> listCheck = createResults(paramsCheck, + Arrays.asList(collections)); + for (Entry<String, QueryResponse> entry : listCheck.entrySet()) { + NamedList<Object> listItemCheck1 = MtasSolrBase + .getFromMtasCollection(entry.getValue().getResponse(), "check1"); + createCollectionAssertions(listItemCheck1, entry.getKey(), "idCreate", + listCreateVersion.get(entry.getKey()), + listCreateSize.get(entry.getKey()), + entry.getKey().equals(COLLECTION_DISTRIBUTED) ? 2 : 0); + NamedList<Object> listItemCheck2 = MtasSolrBase + .getFromMtasCollection(entry.getValue().getResponse(), "check2"); + createCollectionAssertions(listItemCheck2, entry.getKey(), "idPost", + listPostVersion.get(entry.getKey()), listPostSize.get(entry.getKey()), + entry.getKey().equals(COLLECTION_DISTRIBUTED) ? 
2 : 0); + } + // check on parts + createResults(paramsCheck, Arrays.asList(collectionsParts)); + for (Entry<String, QueryResponse> entry : listCheck.entrySet()) { + NamedList<Object> listItemCheck1 = MtasSolrBase + .getFromMtasCollection(entry.getValue().getResponse(), "check1"); + createCollectionAssertions(listItemCheck1, entry.getKey(), "idCreate", + listCreateVersion.get(entry.getKey()), + listCreateSize.get(entry.getKey()), + entry.getKey().equals(COLLECTION_DISTRIBUTED) ? 2 : 0); + NamedList<Object> listItemCheck2 = MtasSolrBase + .getFromMtasCollection(entry.getValue().getResponse(), "check2"); + createCollectionAssertions(listItemCheck2, entry.getKey(), "idPost", + listPostVersion.get(entry.getKey()), listPostSize.get(entry.getKey()), + entry.getKey().equals(COLLECTION_DISTRIBUTED) ? 2 : 0); + } + // delete + ModifiableSolrParams paramsDelete = new ModifiableSolrParams(); + paramsDelete.set("q", "*:*"); + paramsDelete.set("rows", "0"); + paramsDelete.set("mtas", "true"); + paramsDelete.set("mtas.collection", "true"); + paramsDelete.set("mtas.collection.0.key", "delete1"); + paramsDelete.set("mtas.collection.0.action", "delete"); + paramsDelete.set("mtas.collection.0.id", "idCreate"); + paramsDelete.set("mtas.collection.1.key", "delete2"); + paramsDelete.set("mtas.collection.1.action", "delete"); + paramsDelete.set("mtas.collection.1.id", "idPost"); + // delete on parts + createResults(paramsDelete, Arrays.asList(collectionsParts)); + // recheck on parts + Map<String, QueryResponse> listCheckParts = createResults(paramsCheck, + Arrays.asList(collectionsParts)); + for (Entry<String, QueryResponse> entry : listCheckParts.entrySet()) { + NamedList<Object> listItemCheck1 = MtasSolrBase + .getFromMtasCollection(entry.getValue().getResponse(), "check1"); + assertTrue( + entry.getKey() + " - create - should be removed: " + listItemCheck1, + listItemCheck1 != null && listItemCheck1.get("id") == null); + NamedList<Object> listItemCheck2 = MtasSolrBase + 
.getFromMtasCollection(entry.getValue().getResponse(), "check2"); + assertTrue( + entry.getKey() + " - post - should be removed: " + listItemCheck2, + listItemCheck2 != null && listItemCheck2.get("id") == null); + } + // list with empty parts + listList = createResults(paramsList, Arrays.asList(collections)); + for (Entry<String, QueryResponse> entry : listList.entrySet()) { + // check create + NamedList<Object> listCreateItem1 = MtasSolrBase + .getFromMtasCollectionList(entry.getValue().getResponse(), "list", + "idCreate"); + createCollectionAssertions(listCreateItem1, entry.getKey(), "idCreate", + listCreateVersion.get(entry.getKey()), + listCreateSize.get(entry.getKey()), 0); + // check post + NamedList<Object> listPostItem1 = MtasSolrBase.getFromMtasCollectionList( + entry.getValue().getResponse(), "list", "idPost"); + createCollectionAssertions(listPostItem1, entry.getKey(), "idPost", + listPostVersion.get(entry.getKey()), listPostSize.get(entry.getKey()), + 0); + } + // recheck on all, assuming empty parts, autofix + listCheck = createResults(paramsCheck, Arrays.asList(collections)); + for (Entry<String, QueryResponse> entry : listCheck.entrySet()) { + NamedList<Object> listItemCheck1 = MtasSolrBase + .getFromMtasCollection(entry.getValue().getResponse(), "check1"); + createCollectionAssertions(listItemCheck1, entry.getKey(), "idCreate", + listCreateVersion.get(entry.getKey()), + listCreateSize.get(entry.getKey()), + entry.getKey().equals(COLLECTION_DISTRIBUTED) ? 2 : 0); + NamedList<Object> listItemCheck2 = MtasSolrBase + .getFromMtasCollection(entry.getValue().getResponse(), "check2"); + createCollectionAssertions(listItemCheck2, entry.getKey(), "idPost", + listPostVersion.get(entry.getKey()), listPostSize.get(entry.getKey()), + entry.getKey().equals(COLLECTION_DISTRIBUTED) ? 
2 : 0); + } + // recheck on parts + listCheckParts = createResults(paramsCheck, + Arrays.asList(collectionsParts)); + for (Entry<String, QueryResponse> entry : listCheck.entrySet()) { + NamedList<Object> listItemCheck1 = MtasSolrBase + .getFromMtasCollection(entry.getValue().getResponse(), "check1"); + createCollectionAssertions(listItemCheck1, entry.getKey(), "idCreate", + listCreateVersion.get(entry.getKey()), + listCreateSize.get(entry.getKey()), + entry.getKey().equals(COLLECTION_DISTRIBUTED) ? 2 : 0); + NamedList<Object> listItemCheck2 = MtasSolrBase + .getFromMtasCollection(entry.getValue().getResponse(), "check2"); + createCollectionAssertions(listItemCheck2, entry.getKey(), "idPost", + listPostVersion.get(entry.getKey()), listPostSize.get(entry.getKey()), + entry.getKey().equals(COLLECTION_DISTRIBUTED) ? 2 : 0); + } + // full delete + createResults(paramsDelete, Arrays.asList(collections)); + // final check + listCheck = createResults(paramsCheck, Arrays.asList(collections)); + for (Entry<String, QueryResponse> entry : listCheck.entrySet()) { + NamedList<Object> listItemCheck1 = MtasSolrBase + .getFromMtasCollection(entry.getValue().getResponse(), "check1"); + assertTrue( + entry.getKey() + " - create - should be removed: " + listItemCheck1, + listItemCheck1 != null && listItemCheck1.get("id") == null); + NamedList<Object> listItemCheck2 = MtasSolrBase + .getFromMtasCollection(entry.getValue().getResponse(), "check2"); + assertTrue( + entry.getKey() + " - post - should be removed: " + listItemCheck2, + listItemCheck2 != null && listItemCheck2.get("id") == null); + } + } + + /** + * Mtas request handler collection 2. + * + * @throws IOException Signals that an I/O exception has occurred. 
+ */ + @org.junit.Test + public void mtasRequestHandlerCollection2() throws IOException { + String[] collections = new String[] { COLLECTION_ALL_OPTIMIZED, + COLLECTION_ALL_MULTIPLE_SEGMENTS, COLLECTION_DISTRIBUTED }; + // post + ModifiableSolrParams paramsPost = new ModifiableSolrParams(); + paramsPost.set("q", "*:*"); + paramsPost.set("mtas", "true"); + paramsPost.set("mtas.collection", "true"); + paramsPost.set("mtas.collection.0.key", "postKey1"); + paramsPost.set("mtas.collection.0.action", "post"); + paramsPost.set("mtas.collection.0.id", "postSet1"); + paramsPost.set("mtas.collection.0.post", "[1,3,4]"); + paramsPost.set("mtas.collection.1.key", "postKey2"); + paramsPost.set("mtas.collection.1.action", "post"); + paramsPost.set("mtas.collection.1.id", "postSet2"); + paramsPost.set("mtas.collection.1.post", "[2]"); + paramsPost.set("mtas.collection.2.key", "createKey1"); + paramsPost.set("mtas.collection.2.action", "create"); + paramsPost.set("mtas.collection.2.id", "createSet1"); + paramsPost.set("mtas.collection.2.field", MtasSolrBase.FIELD_ID); + createResults(paramsPost, Arrays.asList(collections)); + // query set1 + ModifiableSolrParams paramsSelect1 = new ModifiableSolrParams(); + paramsSelect1.set("q", "{!mtas_join field=\"" + MtasSolrBase.FIELD_ID + + "\" collection=\"postSet1\"}"); + paramsSelect1.set("rows", "0"); + Map<String, QueryResponse> listPost1 = createResults(paramsSelect1, + Arrays.asList(collections)); + for (Entry<String, QueryResponse> entry : listPost1.entrySet()) { + long n = MtasSolrBase.getNumFound(entry.getValue().getResponse()); + assertTrue( + entry.getKey() + " - incorrect number of matching documents : " + n, + n == 2); + } + // query set2 + ModifiableSolrParams paramsSelect2 = new ModifiableSolrParams(); + paramsSelect2.set("q", "{!mtas_join field=\"" + MtasSolrBase.FIELD_ID + + "\" collection=\"postSet2\"}"); + paramsSelect2.set("rows", "0"); + Map<String, QueryResponse> listPost2 = createResults(paramsSelect2, + 
Arrays.asList(collections)); + for (Entry<String, QueryResponse> entry : listPost2.entrySet()) { + long n = MtasSolrBase.getNumFound(entry.getValue().getResponse()); + assertTrue( + entry.getKey() + " - incorrect number of matching documents : " + n, + n == 1); + } + // query set3 + ModifiableSolrParams paramsSelect3 = new ModifiableSolrParams(); + paramsSelect3.set("q", "{!mtas_join field=\"" + MtasSolrBase.FIELD_ID + + "\" collection=\"createSet1\"}"); + paramsSelect3.set("rows", "0"); + Map<String, QueryResponse> listPost3 = createResults(paramsSelect3, + Arrays.asList(collections)); + for (Entry<String, QueryResponse> entry : listPost3.entrySet()) { + long n = MtasSolrBase.getNumFound(entry.getValue().getResponse()); + assertTrue( + entry.getKey() + " - incorrect number of matching documents : " + n, + n == 3); + } + // query set1 or set2 + ModifiableSolrParams paramsSelect4 = new ModifiableSolrParams(); + paramsSelect4.set("q", + "({!mtas_join field=\"" + MtasSolrBase.FIELD_ID + + "\" collection=\"postSet1\"}) OR ({!mtas_join field=\"" + + MtasSolrBase.FIELD_ID + "\" collection=\"postSet2\"})"); + paramsSelect4.set("rows", "0"); + Map<String, QueryResponse> listPost4 = createResults(paramsSelect4, + Arrays.asList(collections)); + for (Entry<String, QueryResponse> entry : listPost4.entrySet()) { + long n = MtasSolrBase.getNumFound(entry.getValue().getResponse()); + assertTrue( + entry.getKey() + " - incorrect number of matching documents : " + n, + n == 3); + } + } + + /** * Mtas request handler prefix. * * @throws IOException Signals that an I/O exception has occurred. @@ -336,12 +640,16 @@ public class MtasSolrTestDistributedSearchConsistency { /** * Creates the results. 
* - * @param params the params + * @param initialParams the initial params * @param collections the collections * @return the hash map */ private static HashMap<String, QueryResponse> createResults( - final ModifiableSolrParams params, List<String> collections) { + final ModifiableSolrParams initialParams, List<String> collections) { + // use initial params + ModifiableSolrParams params = new ModifiableSolrParams(); + params.add(initialParams); + // continue HashMap<String, QueryResponse> list = new HashMap<>(); CloudSolrClient client = cloudCluster.getSolrClient(); try { @@ -450,8 +758,10 @@ public class MtasSolrTestDistributedSearchConsistency { */ private static void createTermvectorAssertions(NamedList<Object> response1, NamedList<Object> response2, String key, String[] names) { - List<NamedList> list1 = MtasSolrBase.getFromMtasTermvector(response1, key); - List<NamedList> list2 = MtasSolrBase.getFromMtasTermvector(response2, key); + List<NamedList<Object>> list1 = MtasSolrBase + .getFromMtasTermvector(response1, key); + List<NamedList<Object>> list2 = MtasSolrBase + .getFromMtasTermvector(response2, key); assertFalse("list should be defined", list1 == null || list2 == null); if (list1 != null && list2 != null) { assertEquals("lists should have equal size", list1.size(), list2.size()); @@ -479,6 +789,57 @@ public class MtasSolrTestDistributedSearchConsistency { } /** + * Creates the collection assertions. 
+ * + * @param create the create + * @param collection the collection + * @param id the id + * @param version the version + * @param size the size + * @param shards the shards + */ + private static void createCollectionAssertions(NamedList<Object> create, + String collection, String id, String version, Number size, int shards) { + assertFalse(collection + ": create - not found", create == null); + assertTrue(collection + ": create - no valid version", + create.get("id") != null && create.get("id") instanceof String); + assertTrue(collection + ": create - id incorrect, '" + id + + "' not equal to '" + create.get("id") + "'", + ((String) create.get("id")).equals(id)); + assertTrue( + collection + ": create - no valid version, '" + version + + "' not equal to '" + create.get("version") + "'", + create.get("version") != null + && create.get("version") instanceof String); + if (version != null) { + assertTrue(collection + ": create - version incorrect", + ((String) create.get("version")).equals(version)); + } + assertTrue(collection + ": create - no valid size", + create.get("size") != null && create.get("size") instanceof Number); + Number createSize = (Number) create.get("size"); + assertEquals(collection + ": number of values", size.longValue(), + createSize.longValue()); + if (shards > 0) { + assertTrue("no (valid) shards", + create.get("shards") != null && create.get("shards") instanceof List + && ((List) create.get("shards")).size() == shards); + for (Object shardItem : (List<Object>) create.get("shards")) { + assertTrue(collection + ": invalid shardItem", + shardItem instanceof NamedList); + Object sizeRaw = ((NamedList<Object>) shardItem).get("size"); + assertTrue(collection + ": incorrect size", + sizeRaw != null && sizeRaw instanceof Number + && ((Number) sizeRaw).longValue() == createSize.longValue()); + } + } else { + assertFalse(collection + ": shards found : " + create.get("shards"), + create.get("shards") != null && create.get("shards") instanceof List + 
&& !((List) create.get("shards")).isEmpty()); + } + } + + /** * Creates the cloud. */ private static void createCloud() { diff --git a/junit/mtas/solr/MtasSolrTestSearchConsistency.java b/junit/mtas/solr/MtasSolrTestSearchConsistency.java index 4fabf4f..df4d2cb 100644 --- a/junit/mtas/solr/MtasSolrTestSearchConsistency.java +++ b/junit/mtas/solr/MtasSolrTestSearchConsistency.java @@ -221,13 +221,13 @@ public class MtasSolrTestSearchConsistency { params.set("mtas", "true"); params.set("mtas.stats", "true"); params.set("mtas.stats.spans", "true"); - params.set("mtas.stats.spans.0.field", "mtas"); + params.set("mtas.stats.spans.0.field", MtasSolrBase.FIELD_MTAS); params.set("mtas.stats.spans.0.key", "statsKey"); params.set("mtas.stats.spans.0.query.0.type", "cql"); params.set("mtas.stats.spans.0.query.0.value", "[]"); params.set("mtas.stats.spans.0.type", "n,sum,mean"); params.set("mtas.stats.positions", "true"); - params.set("mtas.stats.positions.0.field", "mtas"); + params.set("mtas.stats.positions.0.field", MtasSolrBase.FIELD_MTAS); params.set("mtas.stats.positions.0.key", "statsKey"); params.set("mtas.stats.positions.0.type", "n,sum,mean"); params.set("rows", "0"); @@ -264,7 +264,7 @@ public class MtasSolrTestSearchConsistency { params.set("mtas", "true"); params.set("mtas.stats", "true"); params.set("mtas.stats.tokens", "true"); - params.set("mtas.stats.tokens.0.field", "mtas"); + params.set("mtas.stats.tokens.0.field", MtasSolrBase.FIELD_MTAS); params.set("mtas.stats.tokens.0.key", "statsKey"); params.set("mtas.stats.tokens.0.type", String.join(",", types)); params.set("mtas.stats.tokens.0.minimum", 1); @@ -293,7 +293,7 @@ public class MtasSolrTestSearchConsistency { params.set("rows", 0); params.set("mtas", "true"); params.set("mtas.termvector", "true"); - params.set("mtas.termvector.0.field", "mtas"); + params.set("mtas.termvector.0.field", MtasSolrBase.FIELD_MTAS); params.set("mtas.termvector.0.prefix", "t_lc"); params.set("mtas.termvector.0.key", "tv"); 
params.set("mtas.termvector.0.sort.type", "sum"); @@ -342,7 +342,7 @@ public class MtasSolrTestSearchConsistency { params.set("rows", 0); params.set("mtas", "true"); params.set("mtas.termvector", "true"); - params.set("mtas.termvector.0.field", "mtas"); + params.set("mtas.termvector.0.field", MtasSolrBase.FIELD_MTAS); params.set("mtas.termvector.0.prefix", "t_lc"); params.set("mtas.termvector.0.key", "tv"); params.set("mtas.termvector.0.type", String.join(",", types)); @@ -355,7 +355,8 @@ public class MtasSolrTestSearchConsistency { } catch (SolrServerException e) { throw new IOException(e); } - List<NamedList> tv = MtasSolrBase.getFromMtasTermvector(response, "tv"); + List<NamedList<Object>> tv = MtasSolrBase.getFromMtasTermvector(response, + "tv"); for (String key : list) { params.clear(); params.set("q", "*:*"); @@ -363,13 +364,13 @@ public class MtasSolrTestSearchConsistency { params.set("mtas", "true"); params.set("mtas.stats", "true"); params.set("mtas.stats.spans", "true"); - params.set("mtas.stats.spans.0.field", "mtas"); + params.set("mtas.stats.spans.0.field", MtasSolrBase.FIELD_MTAS); params.set("mtas.stats.spans.0.key", "statsKey0"); params.set("mtas.stats.spans.0.minimum", 1); params.set("mtas.stats.spans.0.query.0.type", "cql"); params.set("mtas.stats.spans.0.query.0.value", "[t_lc=\"" + key + "\"]"); params.set("mtas.stats.spans.0.type", String.join(",", types)); - params.set("mtas.stats.spans.1.field", "mtas"); + params.set("mtas.stats.spans.1.field", MtasSolrBase.FIELD_MTAS); params.set("mtas.stats.spans.1.key", "statsKey1"); params.set("mtas.stats.spans.1.minimum", 0); params.set("mtas.stats.spans.1.query.0.type", "cql"); @@ -419,7 +420,7 @@ public class MtasSolrTestSearchConsistency { params.set("rows", 0); params.set("mtas", "true"); params.set("mtas.termvector", "true"); - params.set("mtas.termvector.0.field", "mtas"); + params.set("mtas.termvector.0.field", MtasSolrBase.FIELD_MTAS); params.set("mtas.termvector.0.prefix", "t_lc"); 
params.set("mtas.termvector.0.key", "tv"); params.set("mtas.termvector.0.regexp", "een[a-z]*"); @@ -435,7 +436,8 @@ public class MtasSolrTestSearchConsistency { } catch (SolrServerException e) { throw new IOException(e); } - List<NamedList> tv = MtasSolrBase.getFromMtasTermvector(response, "tv"); + List<NamedList<Object>> tv = MtasSolrBase.getFromMtasTermvector(response, + "tv"); Set<String> keys = new HashSet<>(); for (NamedList<Object> item : tv) { if (item != null && item.get("key") != null @@ -455,6 +457,350 @@ public class MtasSolrTestSearchConsistency { } /** + * Mtas request handler collection 1. + * + * @throws IOException Signals that an I/O exception has occurred. + */ + @org.junit.Test + public void mtasRequestHandlerCollection1() throws IOException { + // create + ModifiableSolrParams paramsCreate = new ModifiableSolrParams(); + paramsCreate.set("q", "*:*"); + paramsCreate.set("mtas", "true"); + paramsCreate.set("mtas.collection", "true"); + paramsCreate.set("mtas.collection.0.key", "create"); + paramsCreate.set("mtas.collection.0.action", "create"); + paramsCreate.set("mtas.collection.0.id", "idCreate"); + paramsCreate.set("mtas.collection.0.field", "id"); + SolrRequest<?> requestCreate = new QueryRequest(paramsCreate, METHOD.POST); + NamedList<Object> responseCreate; + try { + responseCreate = server.request(requestCreate, "collection1"); + } catch (SolrServerException e) { + throw new IOException(e); + } + long n = MtasSolrBase.getNumFound(responseCreate); + NamedList<Object> create = MtasSolrBase + .getFromMtasCollection(responseCreate, "create"); + assertFalse("create - id not found", create == null); + assertTrue("create - no valid version", create.get("version") != null + && create.get("version") instanceof String); + assertTrue("create - no valid size", + create.get("size") != null && create.get("size") instanceof Number); + String createVersion = (String) create.get("version"); + Number createSize = (Number) create.get("size"); + 
assertEquals("number of values", n, createSize.longValue()); + // post + ModifiableSolrParams paramsPost = new ModifiableSolrParams(); + paramsPost.set("q", "*:*"); + paramsPost.set("mtas", "true"); + paramsPost.set("mtas.collection", "true"); + paramsPost.set("mtas.collection.0.key", "post"); + paramsPost.set("mtas.collection.0.action", "post"); + paramsPost.set("mtas.collection.0.id", "idPost"); + paramsPost.set("mtas.collection.0.post", "[1,2,3,4]"); + SolrRequest<?> requestPost = new QueryRequest(paramsPost, METHOD.POST); + NamedList<Object> responsePost; + try { + responsePost = server.request(requestPost, "collection1"); + } catch (SolrServerException e) { + throw new IOException(e); + } + NamedList<Object> post = MtasSolrBase.getFromMtasCollection(responsePost, + "post"); + assertFalse("post - id not found", post == null); + assertTrue("post - no valid version", + post.get("version") != null && post.get("version") instanceof String); + assertTrue("post - no valid size", + post.get("size") != null && post.get("size") instanceof Number); + String postVersion = (String) post.get("version"); + Number postSize = (Number) post.get("size"); + assertTrue("post - incorrect size", postSize.longValue() == 4); + // list + ModifiableSolrParams paramsList = new ModifiableSolrParams(); + paramsList.set("q", "*:*"); + paramsList.set("mtas", "true"); + paramsList.set("mtas.collection", "true"); + paramsList.set("mtas.collection.0.key", "list"); + paramsList.set("mtas.collection.0.action", "list"); + SolrRequest<?> requestList1 = new QueryRequest(paramsList, METHOD.POST); + NamedList<Object> responseList1; + try { + responseList1 = server.request(requestList1, "collection1"); + } catch (SolrServerException e) { + throw new IOException(e); + } + // check create + NamedList<Object> listCreateItem1 = MtasSolrBase + .getFromMtasCollectionList(responseList1, "list", "idCreate"); + assertFalse("list - create - id not found", listCreateItem1 == null); + assertTrue("list - create - 
incorrect version", + listCreateItem1.get("version") != null + && listCreateItem1.get("version") instanceof String + && listCreateItem1.get("version").equals(createVersion)); + assertTrue("list - create - incorrect size", + listCreateItem1.get("size") != null + && listCreateItem1.get("size") instanceof Number + && ((Number) listCreateItem1.get("size")).longValue() == createSize + .longValue()); + // check post + NamedList<Object> listPostItem1 = MtasSolrBase + .getFromMtasCollectionList(responseList1, "list", "idPost"); + assertFalse("list - post - id not found", listPostItem1 == null); + assertTrue("list - post - incorrect version", + listPostItem1.get("version") != null + && listPostItem1.get("version") instanceof String + && listPostItem1.get("version").equals(postVersion)); + assertTrue("list - post - incorrect size", + listPostItem1.get("size") != null + && listPostItem1.get("size") instanceof Number + && ((Number) listPostItem1.get("size")).longValue() == postSize + .longValue()); + // check + ModifiableSolrParams paramsCheck = new ModifiableSolrParams(); + paramsCheck.set("q", "*:*"); + paramsCheck.set("mtas", "true"); + paramsCheck.set("mtas.collection", "true"); + paramsCheck.set("mtas.collection.0.key", "check1"); + paramsCheck.set("mtas.collection.0.action", "check"); + paramsCheck.set("mtas.collection.0.id", "idCreate"); + paramsCheck.set("mtas.collection.1.key", "check2"); + paramsCheck.set("mtas.collection.1.action", "check"); + paramsCheck.set("mtas.collection.1.id", "idPost"); + SolrRequest<?> requestCheck = new QueryRequest(paramsCheck, METHOD.POST); + NamedList<Object> responseCheck; + try { + responseCheck = server.request(requestCheck, "collection1"); + } catch (SolrServerException e) { + throw new IOException(e); + } + // check create + NamedList<Object> check1 = MtasSolrBase.getFromMtasCollection(responseCheck, + "check1"); + assertFalse("check - create - no response", check1 == null); + assertTrue("check - create - no valid version", + 
check1.get("version") != null + && check1.get("version") instanceof String); + assertTrue("check - create - no valid size", + check1.get("size") != null && check1.get("size") instanceof Number); + String check1Version = (String) check1.get("version"); + Number check1Size = (Number) check1.get("size"); + assertEquals("check - create - version", check1Version, createVersion); + assertEquals("check - create - number of values", check1Size.longValue(), + createSize.longValue()); + // check post + NamedList<Object> check2 = MtasSolrBase.getFromMtasCollection(responseCheck, + "check2"); + assertFalse("check - post - no response", check2 == null); + assertTrue("check - post - no valid version", check2.get("version") != null + && check2.get("version") instanceof String); + assertTrue("check - post - no valid size", + check2.get("size") != null && check2.get("size") instanceof Number); + String check2Version = (String) check2.get("version"); + Number check2Size = (Number) check2.get("size"); + assertEquals("check - post - version", check2Version, postVersion); + assertEquals("check - post - number of values", check2Size.longValue(), 4); + // delete + ModifiableSolrParams paramsDelete = new ModifiableSolrParams(); + paramsDelete.set("q", "*:*"); + paramsDelete.set("mtas", "true"); + paramsDelete.set("mtas.collection", "true"); + paramsDelete.set("mtas.collection.0.key", "delete1"); + paramsDelete.set("mtas.collection.0.action", "delete"); + paramsDelete.set("mtas.collection.0.id", "idCreate"); + paramsDelete.set("mtas.collection.1.key", "delete2"); + paramsDelete.set("mtas.collection.1.action", "delete"); + paramsDelete.set("mtas.collection.1.id", "idPost"); + SolrRequest<?> requestDelete = new QueryRequest(paramsDelete, METHOD.POST); + NamedList<Object> responseDelete; + try { + responseDelete = server.request(requestDelete, "collection1"); + } catch (SolrServerException e) { + throw new IOException(e); + } + // check create + NamedList<Object> delete1 = MtasSolrBase + 
.getFromMtasCollection(responseDelete, "delete1"); + assertFalse("delete - create - no response", delete1 == null); + // check post + NamedList<Object> delete2 = MtasSolrBase + .getFromMtasCollection(responseDelete, "delete2"); + assertFalse("delete - post - no response", delete2 == null); + // list (again) + SolrRequest<?> requestList2 = new QueryRequest(paramsList, METHOD.POST); + NamedList<Object> responseList2; + try { + responseList2 = server.request(requestList2, "collection1"); + } catch (SolrServerException e) { + throw new IOException(e); + } + // check create + NamedList<Object> listCreateItem2 = MtasSolrBase + .getFromMtasCollectionList(responseList2, "list", "idCreate"); + assertTrue("list - create - id found", listCreateItem2 == null); + // check post + NamedList<Object> listPostItem2 = MtasSolrBase + .getFromMtasCollectionList(responseList2, "list", "idPost"); + assertTrue("list - post - id found", listPostItem2 == null); + } + + /** + * Mtas request handler collection 2. + * + * @throws IOException Signals that an I/O exception has occurred. 
+ */ + @org.junit.Test + public void mtasRequestHandlerCollection2() throws IOException { + // post + ModifiableSolrParams paramsPost = new ModifiableSolrParams(); + paramsPost.set("q", "*:*"); + paramsPost.set("mtas", "true"); + paramsPost.set("mtas.collection", "true"); + paramsPost.set("mtas.collection.0.key", "postKey1"); + paramsPost.set("mtas.collection.0.action", "post"); + paramsPost.set("mtas.collection.0.id", "postSet1"); + paramsPost.set("mtas.collection.0.post", "[1,3,4]"); + paramsPost.set("mtas.collection.1.key", "postKey2"); + paramsPost.set("mtas.collection.1.action", "post"); + paramsPost.set("mtas.collection.1.id", "postSet2"); + paramsPost.set("mtas.collection.1.post", "[2]"); + paramsPost.set("mtas.collection.2.key", "createKey1"); + paramsPost.set("mtas.collection.2.action", "create"); + paramsPost.set("mtas.collection.2.id", "createSet1"); + paramsPost.set("mtas.collection.2.field", MtasSolrBase.FIELD_ID); + SolrRequest<?> requestPost = new QueryRequest(paramsPost, METHOD.POST); + NamedList<Object> responsePost; + try { + responsePost = server.request(requestPost, "collection1"); + } catch (SolrServerException e) { + throw new IOException(e); + } + MtasSolrBase.getFromMtasCollection(responsePost, "post"); + // query set1 + ModifiableSolrParams paramsSelect1 = new ModifiableSolrParams(); + paramsSelect1.set("q", "{!mtas_join field=\"" + MtasSolrBase.FIELD_ID + + "\" collection=\"postSet1\"}"); + paramsSelect1.set("rows", "0"); + SolrRequest<?> request1 = new QueryRequest(paramsSelect1, METHOD.POST); + NamedList<Object> response1; + try { + response1 = server.request(request1, "collection1"); + } catch (SolrServerException e) { + throw new IOException(e); + } + long n1 = MtasSolrBase.getNumFound(response1); + assertTrue("incorrect number of matching documents : " + n1, n1 == 2); + // query set2 + ModifiableSolrParams paramsSelect2 = new ModifiableSolrParams(); + paramsSelect2.set("q", "{!mtas_join field=\"" + MtasSolrBase.FIELD_ID + + "\" 
collection=\"postSet2\"}"); + paramsSelect2.set("rows", "0"); + SolrRequest<?> request2 = new QueryRequest(paramsSelect2, METHOD.POST); + NamedList<Object> response2; + try { + response2 = server.request(request2, "collection1"); + } catch (SolrServerException e) { + throw new IOException(e); + } + long n2 = MtasSolrBase.getNumFound(response2); + assertTrue("incorrect number of matching documents : " + n2, n2 == 1); + // query set3 + ModifiableSolrParams paramsSelect3 = new ModifiableSolrParams(); + paramsSelect3.set("q", "{!mtas_join field=\"" + MtasSolrBase.FIELD_ID + + "\" collection=\"createSet1\"}"); + paramsSelect3.set("rows", "0"); + SolrRequest<?> request3 = new QueryRequest(paramsSelect3, METHOD.POST); + NamedList<Object> response3; + try { + response3 = server.request(request3, "collection1"); + } catch (SolrServerException e) { + throw new IOException(e); + } + long n3 = MtasSolrBase.getNumFound(response3); + assertTrue("incorrect number of matching documents : " + n3, n3 == 3); + // query set1 or set2 + ModifiableSolrParams paramsSelect4 = new ModifiableSolrParams(); + paramsSelect4.set("q", + "({!mtas_join field=\"" + MtasSolrBase.FIELD_ID + + "\" collection=\"postSet1\"}) OR ({!mtas_join field=\"" + + MtasSolrBase.FIELD_ID + "\" collection=\"postSet2\"})"); + paramsSelect4.set("rows", "0"); + SolrRequest<?> request4 = new QueryRequest(paramsSelect4, METHOD.POST); + NamedList<Object> response4; + try { + response4 = server.request(request4, "collection1"); + } catch (SolrServerException e) { + throw new IOException(e); + } + long n4 = MtasSolrBase.getNumFound(response4); + assertTrue("incorrect number of matching documents : " + n4, n4 == 3); + } + + @org.junit.Test + public void mtasRequestHandlerCollection3() throws IOException { + // post + ModifiableSolrParams paramsPost = new ModifiableSolrParams(); + paramsPost.set("q", "*:*"); + paramsPost.set("mtas", "true"); + paramsPost.set("mtas.collection", "true"); + paramsPost.set("mtas.collection.0.key", 
"setCreatedByPost"); + paramsPost.set("mtas.collection.0.action", "post"); + paramsPost.set("mtas.collection.0.id", "setCreatedByPost"); + paramsPost.set("mtas.collection.0.post", "[1,3,4]"); + SolrRequest<?> requestPost = new QueryRequest(paramsPost, METHOD.POST); + try { + server.request(requestPost, "collection1"); + } catch (SolrServerException e) { + throw new IOException(e); + } + // import + ModifiableSolrParams paramsImport = new ModifiableSolrParams(); + paramsImport.set("q", "*:*"); + paramsImport.set("mtas", "true"); + paramsImport.set("mtas.collection", "true"); + paramsImport.set("mtas.collection.0.key", "setCreatedByImport"); + paramsImport.set("mtas.collection.0.action", "post"); + paramsImport.set("mtas.collection.0.id", "setCreatedByImport"); + paramsImport.set("mtas.collection.0.post", "[1,3,4]"); + SolrRequest<?> requestImport = new QueryRequest(paramsImport, METHOD.POST); + try { + server.request(requestImport, "collection1"); + } catch (SolrServerException e) { + throw new IOException(e); + } + // query post + ModifiableSolrParams paramsSelect1 = new ModifiableSolrParams(); + paramsSelect1.set("q", "{!mtas_join field=\"" + MtasSolrBase.FIELD_ID + + "\" collection=\"setCreatedByPost\"}"); + paramsSelect1.set("rows", "0"); + SolrRequest<?> request1 = new QueryRequest(paramsSelect1, METHOD.POST); + NamedList<Object> response1; + try { + response1 = server.request(request1, "collection1"); + } catch (SolrServerException e) { + throw new IOException(e); + } + long n1 = MtasSolrBase.getNumFound(response1); + assertTrue ("no matching documents for posted set: " + n1, n1>0); + // query import + ModifiableSolrParams paramsSelect2 = new ModifiableSolrParams(); + paramsSelect2.set("q", "{!mtas_join field=\"" + MtasSolrBase.FIELD_ID + + "\" collection=\"setCreatedByImport\"}"); + paramsSelect2.set("rows", "0"); + SolrRequest<?> request2 = new QueryRequest(paramsSelect2, METHOD.POST); + NamedList<Object> response2; + try { + response2 = 
server.request(request2, "collection1"); + } catch (SolrServerException e) { + throw new IOException(e); + } + long n2 = MtasSolrBase.getNumFound(response2); + assertTrue ("no matching documents for imported set: " + n2, n2>0); + //compare + assertTrue("posted set and imported set give different results : "+n1+" and "+n2, n1==n2); + } + + /** * Mtas solr schema pre analyzed parser and field. * * @throws IOException Signals that an I/O exception has occurred. @@ -468,17 +814,17 @@ public class MtasSolrTestSearchConsistency { params.set("mtas", "true"); params.set("mtas.stats", "true"); params.set("mtas.stats.spans", "true"); - params.set("mtas.stats.spans.0.field", "mtas"); + params.set("mtas.stats.spans.0.field", MtasSolrBase.FIELD_MTAS); params.set("mtas.stats.spans.0.key", "statsKey"); params.set("mtas.stats.spans.0.query.0.type", "cql"); params.set("mtas.stats.spans.0.query.0.value", "[]"); params.set("mtas.stats.spans.0.type", "n,sum,sumsq"); params.set("mtas.stats.positions", "true"); - params.set("mtas.stats.positions.0.field", "mtas"); + params.set("mtas.stats.positions.0.field", MtasSolrBase.FIELD_MTAS); params.set("mtas.stats.positions.0.key", "statsKey"); params.set("mtas.stats.positions.0.type", "n,sum,sumsq"); params.set("mtas.stats.tokens", "true"); - params.set("mtas.stats.tokens.0.field", "mtas"); + params.set("mtas.stats.tokens.0.field", MtasSolrBase.FIELD_MTAS); params.set("mtas.stats.tokens.0.key", "statsKey"); params.set("mtas.stats.tokens.0.type", "n,sum,sumsq"); params.set("rows", "0"); @@ -493,9 +839,10 @@ public class MtasSolrTestSearchConsistency { params.remove("mtas.stats.spans.0.field"); params.remove("mtas.stats.positions.0.field"); params.remove("mtas.stats.tokens.0.field"); - params.set("mtas.stats.spans.0.field", "mtasAdvanced"); - params.set("mtas.stats.positions.0.field", "mtasAdvanced"); - params.set("mtas.stats.tokens.0.field", "mtasAdvanced"); + params.set("mtas.stats.spans.0.field", MtasSolrBase.FIELD_MTAS_ADVANCED); + 
params.set("mtas.stats.positions.0.field", + MtasSolrBase.FIELD_MTAS_ADVANCED); + params.set("mtas.stats.tokens.0.field", MtasSolrBase.FIELD_MTAS_ADVANCED); try { response2 = server.request(request, "collection1"); } catch (SolrServerException e) { @@ -523,8 +870,10 @@ public class MtasSolrTestSearchConsistency { */ private static void createTermvectorAssertions(NamedList<Object> response1, NamedList<Object> response2, String key, String[] names) { - List<NamedList> list1 = MtasSolrBase.getFromMtasTermvector(response1, key); - List<NamedList> list2 = MtasSolrBase.getFromMtasTermvector(response2, key); + List<NamedList<Object>> list1 = MtasSolrBase + .getFromMtasTermvector(response1, key); + List<NamedList<Object>> list2 = MtasSolrBase + .getFromMtasTermvector(response2, key); assertFalse("list should be defined", list1 == null || list2 == null); if (list1 != null && list2 != null) { assertFalse("first list should not be longer", diff --git a/pom.xml b/pom.xml index 66e68ae..616e448 100644 --- a/pom.xml +++ b/pom.xml @@ -144,7 +144,8 @@ <artifactId>maven-site-plugin</artifactId> <version>3.6</version> <configuration> - <outputDirectory>${project.basedir}/gh-pages/</outputDirectory> + <siteDirectory>${project.basedir}/src/site/</siteDirectory> + <outputDirectory>${project.basedir}/gh-pages/</outputDirectory> </configuration> </plugin> <plugin> @@ -306,7 +307,7 @@ <!-- <plugin> <groupId>org.codehaus.mojo</groupId> <artifactId>versions-maven-plugin</artifactId> <version>2.2</version> <reportSets> <reportSet> <reports> <report>dependency-updates-report</report> <report>plugin-updates-report</report> <report>property-updates-report</report> - </reports> </reportSet> </reportSets> </plugin> --> + </reports> </reportSet> </reportSets> </plugin> --> </plugins> </reporting> <repositories> diff --git a/src/mtas/analysis/parser/MtasXMLParser.java b/src/mtas/analysis/parser/MtasXMLParser.java index eb6f1c4..dcc46a7 100644 --- a/src/mtas/analysis/parser/MtasXMLParser.java +++ 
b/src/mtas/analysis/parser/MtasXMLParser.java @@ -126,7 +126,7 @@ abstract class MtasXMLParser extends MtasBasicParser { super(config); try { initParser(); - // System.out.print(printConfig()); + //System.out.print(printConfig()); } catch (MtasConfigException e) { log.error(e); } diff --git a/src/mtas/analysis/util/MtasCharFilterFactory.java b/src/mtas/analysis/util/MtasCharFilterFactory.java index 6e3f6cb..e6801b1 100644 --- a/src/mtas/analysis/util/MtasCharFilterFactory.java +++ b/src/mtas/analysis/util/MtasCharFilterFactory.java @@ -66,8 +66,10 @@ public class MtasCharFilterFactory extends CharFilterFactory /** * Instantiates a new mtas char filter factory. * - * @param args the args - * @throws IOException Signals that an I/O exception has occurred. + * @param args + * the args + * @throws IOException + * Signals that an I/O exception has occurred. */ public MtasCharFilterFactory(Map<String, String> args) throws IOException { this(args, null); @@ -76,12 +78,15 @@ public class MtasCharFilterFactory extends CharFilterFactory /** * Instantiates a new mtas char filter factory. * - * @param args the args - * @param resourceLoader the resource loader - * @throws IOException Signals that an I/O exception has occurred. + * @param args + * the args + * @param resourceLoader + * the resource loader + * @throws IOException + * Signals that an I/O exception has occurred. */ public MtasCharFilterFactory(Map<String, String> args, - SolrResourceLoader resourceLoader) throws IOException { + ResourceLoader resourceLoader) throws IOException { super(args); typeArgument = get(args, ARGUMENT_TYPE); prefixArgument = get(args, ARGUMENT_PREFIX); @@ -110,8 +115,10 @@ public class MtasCharFilterFactory extends CharFilterFactory /** * Inits the. * - * @param resourceLoader the resource loader - * @throws IOException Signals that an I/O exception has occurred. + * @param resourceLoader + * the resource loader + * @throws IOException + * Signals that an I/O exception has occurred. 
*/ private void init(ResourceLoader resourceLoader) throws IOException { if (config == null && configs == null) { @@ -164,10 +171,13 @@ public class MtasCharFilterFactory extends CharFilterFactory /** * Creates the. * - * @param input the input - * @param configuration the configuration + * @param input + * the input + * @param configuration + * the configuration * @return the reader - * @throws IOException Signals that an I/O exception has occurred. + * @throws IOException + * Signals that an I/O exception has occurred. */ public Reader create(Reader input, String configuration) throws IOException { if (configs != null && configs.size() > 0) { @@ -210,10 +220,13 @@ public class MtasCharFilterFactory extends CharFilterFactory /** * Creates the. * - * @param input the input - * @param config the config + * @param input + * the input + * @param config + * the config * @return the reader - * @throws IOException Signals that an I/O exception has occurred. + * @throws IOException + * Signals that an I/O exception has occurred. */ public Reader create(Reader input, MtasConfiguration config) throws IOException { diff --git a/src/mtas/analysis/util/MtasTokenizerFactory.java b/src/mtas/analysis/util/MtasTokenizerFactory.java index 7d6350e..b0f2e12 100644 --- a/src/mtas/analysis/util/MtasTokenizerFactory.java +++ b/src/mtas/analysis/util/MtasTokenizerFactory.java @@ -9,8 +9,6 @@ import org.apache.lucene.analysis.util.ResourceLoader; import org.apache.lucene.analysis.util.ResourceLoaderAware; import org.apache.lucene.analysis.util.TokenizerFactory; import org.apache.lucene.util.AttributeFactory; -import org.apache.solr.core.SolrResourceLoader; - import java.io.IOException; import java.util.HashMap; import java.util.Map; @@ -66,7 +64,7 @@ public class MtasTokenizerFactory extends TokenizerFactory * @throws IOException Signals that an I/O exception has occurred. 
*/ public MtasTokenizerFactory(Map<String, String> args, - SolrResourceLoader resourceLoader) throws IOException { + ResourceLoader resourceLoader) throws IOException { super(args); configFileArgument = get(args, ARGUMENT_CONFIGFILE); configArgument = get(args, ARGUMENT_CONFIG); diff --git a/src/mtas/codec/MtasFieldsProducer.java b/src/mtas/codec/MtasFieldsProducer.java index 39526e5..6d2c3df 100644 --- a/src/mtas/codec/MtasFieldsProducer.java +++ b/src/mtas/codec/MtasFieldsProducer.java @@ -1,6 +1,9 @@ package mtas.codec; +import java.io.EOFException; +import java.io.FileNotFoundException; import java.io.IOException; +import java.nio.file.NoSuchFileException; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; @@ -52,8 +55,8 @@ public class MtasFieldsProducer extends FieldsProducer { public MtasFieldsProducer(SegmentReadState state, String name) throws IOException { String postingsFormatName = null; - indexInputList = new HashMap<String, IndexInput>(); - indexInputOffsetList = new HashMap<String, Long>(); + indexInputList = new HashMap<>(); + indexInputOffsetList = new HashMap<>(); version = MtasCodecPostingsFormat.VERSION_CURRENT; postingsFormatName = addIndexInputToList("object", openMtasFile(state, name, @@ -109,20 +112,25 @@ public class MtasFieldsProducer extends FieldsProducer { * @throws IOException Signals that an I/O exception has occurred. 
*/ private String addIndexInputToList(String name, IndexInput in, - String postingsFormatName) throws IOException { + String postingsFormatName) throws IOException { if (indexInputList.get(name) != null) { indexInputList.get(name).close(); } - String localPostingsFormatName = postingsFormatName; - if (localPostingsFormatName == null) { - localPostingsFormatName = in.readString(); - } else if (!in.readString().equals(localPostingsFormatName)) { - throw new IOException("delegate codec " + name + " doesn't equal " - + localPostingsFormatName); + if(in!=null) { + String localPostingsFormatName = postingsFormatName; + if (localPostingsFormatName == null) { + localPostingsFormatName = in.readString(); + } else if (!in.readString().equals(localPostingsFormatName)) { + throw new IOException("delegate codec " + name + " doesn't equal " + + localPostingsFormatName); + } + indexInputList.put(name, in); + indexInputOffsetList.put(name, in.getFilePointer()); + return localPostingsFormatName; + } else { + log.debug("no "+name+" registered"); + return null; } - indexInputList.put(name, in); - indexInputOffsetList.put(name, in.getFilePointer()); - return localPostingsFormatName; } /* @@ -232,7 +240,13 @@ public class MtasFieldsProducer extends FieldsProducer { String fileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, extension); IndexInput object; - object = state.directory.openInput(fileName, state.context); + try { + object = state.directory.openInput(fileName, state.context); + } catch (FileNotFoundException | NoSuchFileException e) { + log.debug(e); + //throw new NoSuchFileException(e.getMessage()); + return null; + } int minVersion = (minimum == null) ? MtasCodecPostingsFormat.VERSION_START : minimum.intValue(); int maxVersion = (maximum == null) ? 
MtasCodecPostingsFormat.VERSION_CURRENT @@ -245,7 +259,12 @@ public class MtasFieldsProducer extends FieldsProducer { log.debug(e); throw new IndexFormatTooOldException(e.getMessage(), e.getVersion(), e.getMinVersion(), e.getMaxVersion()); - } + } catch (EOFException e) { + object.close(); + log.debug(e); + //throw new EOFException(e.getMessage()); + return null; + } return object; } diff --git a/src/mtas/codec/util/CodecCollector.java b/src/mtas/codec/util/CodecCollector.java index 73d80eb..0dcdf58 100644 --- a/src/mtas/codec/util/CodecCollector.java +++ b/src/mtas/codec/util/CodecCollector.java @@ -27,7 +27,7 @@ import mtas.codec.util.CodecComponent.ComponentDocument; import mtas.codec.util.CodecComponent.ComponentFacet; import mtas.codec.util.CodecComponent.ComponentField; import mtas.codec.util.CodecComponent.ComponentGroup; -import mtas.codec.util.CodecComponent.ComponentJoin; +import mtas.codec.util.CodecComponent.ComponentCollection; import mtas.codec.util.CodecComponent.ComponentKwic; import mtas.codec.util.CodecComponent.ComponentList; import mtas.codec.util.CodecComponent.ComponentPosition; @@ -203,40 +203,52 @@ public class CodecCollector { } /** - * Collect join. + * Collect collection. * * @param reader the reader * @param docSet the doc set - * @param joinInfo the join info + * @param collectionInfo the collection info * @throws IOException Signals that an I/O exception has occurred. 
*/ - public static void collectJoin(IndexReader reader, List<Integer> docSet, - ComponentJoin joinInfo) throws IOException { - BytesRef term = null; - PostingsEnum postingsEnum = null; - Integer docId; - Integer termDocId = -1; - Terms terms; - LeafReaderContext lrc; - LeafReader r; - ListIterator<LeafReaderContext> iterator = reader.leaves().listIterator(); - while (iterator.hasNext()) { - lrc = iterator.next(); - r = lrc.reader(); - for (String field : joinInfo.fields()) { - if ((terms = r.fields().terms(field)) != null) { - TermsEnum termsEnum = terms.iterator(); - termDocId = -1; - while ((term = termsEnum.next()) != null) { - Iterator<Integer> docIterator = docSet.iterator(); - postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE); - while (docIterator.hasNext()) { - docId = docIterator.next() - lrc.docBase; - if ((docId >= termDocId) && ((docId.equals(termDocId)) - || ((termDocId = postingsEnum.advance(docId)) - .equals(docId)))) { - joinInfo.add(term.utf8ToString()); - break; + public static void collectCollection(IndexReader reader, List<Integer> docSet, + ComponentCollection collectionInfo) throws IOException { + if (collectionInfo.action().equals(ComponentCollection.ACTION_CHECK)) { + // can't do anything in lucene for check + } else if (collectionInfo.action() + .equals(ComponentCollection.ACTION_LIST)) { + // can't do anything in lucene for list + } else if (collectionInfo.action() + .equals(ComponentCollection.ACTION_CREATE)) { + BytesRef term = null; + PostingsEnum postingsEnum = null; + Integer docId; + Integer termDocId = -1; + Terms terms; + LeafReaderContext lrc; + LeafReader r; + ListIterator<LeafReaderContext> iterator = reader.leaves().listIterator(); + while (iterator.hasNext()) { + lrc = iterator.next(); + r = lrc.reader(); + for (String field : collectionInfo.fields()) { + if ((terms = r.fields().terms(field)) != null) { + TermsEnum termsEnum = terms.iterator(); + while ((term = termsEnum.next()) != null) { + Iterator<Integer> 
docIterator = docSet.iterator(); + postingsEnum = termsEnum.postings(postingsEnum, + PostingsEnum.NONE); + termDocId = -1; + while (docIterator.hasNext()) { + docId = docIterator.next() - lrc.docBase; + if ((docId >= termDocId) && ((docId.equals(termDocId)) + || ((termDocId = postingsEnum.advance(docId)) + .equals(docId)))) { + collectionInfo.addValue(term.utf8ToString()); + break; + } + if (termDocId.equals(PostingsEnum.NO_MORE_DOCS)) { + break; + } } } } @@ -270,7 +282,7 @@ public class CodecCollector { boolean needSpans = false; boolean needPositions = false; boolean needTokens = false; - + // results Map<Integer, Integer> positionsData = null; Map<Integer, Integer> tokensData = null; @@ -1517,7 +1529,6 @@ public class CodecCollector { } } } - } else { int maximumNumberOfDocuments = 0; int boundaryMinimumNumberOfDocuments = 1; @@ -1671,9 +1682,9 @@ public class CodecCollector { m.endPosition - 1); } if (group.hitRight != null) { - start = Math.min(m.endPosition - group.hitRight.length + 1, + start = Math.min(m.endPosition - group.hitRight.length, m.startPosition); - end = end == null ? m.endPosition : Math.max(end, m.endPosition); + end = end == null ? (m.endPosition - 1) : Math.max(end, (m.endPosition - 1)); } if (group.left != null) { start = start == null ? m.startPosition - group.left.length @@ -1683,8 +1694,8 @@ public class CodecCollector { } if (group.right != null) { start = start == null ? m.endPosition : Math.min(m.endPosition, start); - end = end == null ? m.endPosition + group.right.length - : Math.max(m.endPosition + group.right.length, end); + end = end == null ? 
m.endPosition + group.right.length - 1 + : Math.max(m.endPosition + group.right.length - 1, end); } return new IntervalTreeNodeData<>(start, end, m.startPosition, m.endPosition - 1); diff --git a/src/mtas/codec/util/CodecComponent.java b/src/mtas/codec/util/CodecComponent.java index 35ff9ab..1cfdb78 100644 --- a/src/mtas/codec/util/CodecComponent.java +++ b/src/mtas/codec/util/CodecComponent.java @@ -2,8 +2,13 @@ package mtas.codec.util; import java.io.BufferedReader; import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; import java.io.StringReader; import java.io.UnsupportedEncodingException; +import java.net.HttpURLConnection; +import java.net.URL; +import java.net.URLEncoder; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Arrays; @@ -31,8 +36,12 @@ import mtas.parser.function.util.MtasFunctionParserFunction; import mtas.parser.function.util.MtasFunctionParserFunctionDefault; import mtas.search.spans.util.MtasSpanQuery; +import org.apache.commons.io.IOUtils; import org.apache.commons.lang.ArrayUtils; import org.apache.lucene.util.BytesRef; +import org.noggit.JSONParser; +import org.noggit.ObjectBuilder; + /** * The Class CodecComponent. @@ -53,8 +62,8 @@ public class CodecComponent { /** The list. */ public Map<String, ComponentField> list; - /** The join. */ - public ComponentJoin join; + /** The collection. */ + public List<ComponentCollection> collection; /** The do document. */ public boolean doDocument; @@ -89,15 +98,15 @@ public class CodecComponent { /** The do facet. */ public boolean doFacet; - /** The do join. */ - public boolean doJoin; + /** The do collection. */ + public boolean doCollection; /** * Instantiates a new component fields. 
*/ public ComponentFields() { list = new HashMap<>(); - join = null; + collection = new ArrayList<>(); doDocument = false; doKwic = false; doList = false; @@ -109,7 +118,7 @@ public class CodecComponent { doStatsTokens = false; doPrefix = false; doFacet = false; - doJoin = false; + doCollection = false; } } @@ -163,7 +172,8 @@ public class CodecComponent { /** * Instantiates a new component field. * - * @param uniqueKeyField the unique key field + * @param uniqueKeyField + * the unique key field */ public ComponentField(String uniqueKeyField) { this.uniqueKeyField = uniqueKeyField; @@ -205,7 +215,8 @@ public class CodecComponent { /** * Instantiates a new component prefix. * - * @param key the key + * @param key + * the key */ public ComponentPrefix(String key) { this.key = key; @@ -218,7 +229,8 @@ public class CodecComponent { /** * Adds the single position. * - * @param prefix the prefix + * @param prefix + * the prefix */ public void addSinglePosition(String prefix) { if (!prefix.trim().isEmpty() && !singlePositionList.contains(prefix) @@ -230,7 +242,8 @@ public class CodecComponent { /** * Adds the multiple position. * - * @param prefix the prefix + * @param prefix + * the prefix */ public void addMultiplePosition(String prefix) { if (!prefix.trim().isEmpty()) { @@ -248,7 +261,8 @@ public class CodecComponent { /** * Adds the set position. * - * @param prefix the prefix + * @param prefix + * the prefix */ public void addSetPosition(String prefix) { if (!prefix.trim().isEmpty()) { @@ -266,7 +280,8 @@ public class CodecComponent { /** * Adds the intersecting. * - * @param prefix the prefix + * @param prefix + * the prefix */ public void addIntersecting(String prefix) { if (!prefix.trim().isEmpty()) { @@ -335,19 +350,32 @@ public class CodecComponent { /** * Instantiates a new component document. 
* - * @param key the key - * @param prefix the prefix - * @param statsType the stats type - * @param regexp the regexp - * @param list the list - * @param listNumber the list number - * @param listRegexp the list regexp - * @param listExpand the list expand - * @param listExpandNumber the list expand number - * @param ignoreRegexp the ignore regexp - * @param ignoreList the ignore list - * @param ignoreListRegexp the ignore list regexp - * @throws IOException Signals that an I/O exception has occurred. + * @param key + * the key + * @param prefix + * the prefix + * @param statsType + * the stats type + * @param regexp + * the regexp + * @param list + * the list + * @param listNumber + * the list number + * @param listRegexp + * the list regexp + * @param listExpand + * the list expand + * @param listExpandNumber + * the list expand number + * @param ignoreRegexp + * the ignore regexp + * @param ignoreList + * the ignore list + * @param ignoreListRegexp + * the ignore list regexp + * @throws IOException + * Signals that an I/O exception has occurred. */ public ComponentDocument(String key, String prefix, String statsType, String regexp, String[] list, int listNumber, Boolean listRegexp, @@ -451,15 +479,24 @@ public class CodecComponent { /** * Instantiates a new component kwic. * - * @param query the query - * @param key the key - * @param prefixes the prefixes - * @param number the number - * @param start the start - * @param left the left - * @param right the right - * @param output the output - * @throws IOException Signals that an I/O exception has occurred. + * @param query + * the query + * @param key + * the key + * @param prefixes + * the prefixes + * @param number + * the number + * @param start + * the start + * @param left + * the left + * @param right + * the right + * @param output + * the output + * @throws IOException + * Signals that an I/O exception has occurred. 
*/ public ComponentKwic(MtasSpanQuery query, String key, String prefixes, Integer number, int start, int left, int right, String output) @@ -585,22 +622,38 @@ public class CodecComponent { /** * Instantiates a new component list. * - * @param spanQuery the span query - * @param field the field - * @param queryValue the query value - * @param queryType the query type - * @param queryPrefix the query prefix - * @param queryVariables the query variables - * @param queryIgnore the query ignore - * @param queryMaximumIgnoreLength the query maximum ignore length - * @param key the key - * @param prefix the prefix - * @param start the start - * @param number the number - * @param left the left - * @param right the right - * @param output the output - * @throws IOException Signals that an I/O exception has occurred. + * @param spanQuery + * the span query + * @param field + * the field + * @param queryValue + * the query value + * @param queryType + * the query type + * @param queryPrefix + * the query prefix + * @param queryVariables + * the query variables + * @param queryIgnore + * the query ignore + * @param queryMaximumIgnoreLength + * the query maximum ignore length + * @param key + * the key + * @param prefix + * the prefix + * @param start + * the start + * @param number + * the number + * @param left + * the left + * @param right + * the right + * @param output + * the output + * @throws IOException + * Signals that an I/O exception has occurred. */ public ComponentList(MtasSpanQuery spanQuery, String field, String queryValue, String queryType, String queryPrefix, @@ -715,23 +768,40 @@ public class CodecComponent { /** * Instantiates a new component group. 
* - * @param spanQuery the span query - * @param key the key - * @param number the number - * @param groupingHitInsidePrefixes the grouping hit inside prefixes - * @param groupingHitInsideLeftPosition the grouping hit inside left position - * @param groupingHitInsideLeftPrefixes the grouping hit inside left prefixes - * @param groupingHitInsideRightPosition the grouping hit inside right position - * @param groupingHitInsideRightPrefixes the grouping hit inside right prefixes - * @param groupingHitLeftPosition the grouping hit left position - * @param groupingHitLeftPrefixes the grouping hit left prefixes - * @param groupingHitRightPosition the grouping hit right position - * @param groupingHitRightPrefixes the grouping hit right prefixes - * @param groupingLeftPosition the grouping left position - * @param groupingLeftPrefixes the grouping left prefixes - * @param groupingRightPosition the grouping right position - * @param groupingRightPrefixes the grouping right prefixes - * @throws IOException Signals that an I/O exception has occurred. 
+ * @param spanQuery + * the span query + * @param key + * the key + * @param number + * the number + * @param groupingHitInsidePrefixes + * the grouping hit inside prefixes + * @param groupingHitInsideLeftPosition + * the grouping hit inside left position + * @param groupingHitInsideLeftPrefixes + * the grouping hit inside left prefixes + * @param groupingHitInsideRightPosition + * the grouping hit inside right position + * @param groupingHitInsideRightPrefixes + * the grouping hit inside right prefixes + * @param groupingHitLeftPosition + * the grouping hit left position + * @param groupingHitLeftPrefixes + * the grouping hit left prefixes + * @param groupingHitRightPosition + * the grouping hit right position + * @param groupingHitRightPrefixes + * the grouping hit right prefixes + * @param groupingLeftPosition + * the grouping left position + * @param groupingLeftPrefixes + * the grouping left prefixes + * @param groupingRightPosition + * the grouping right position + * @param groupingRightPrefixes + * the grouping right prefixes + * @throws IOException + * Signals that an I/O exception has occurred. */ public ComponentGroup(MtasSpanQuery spanQuery, String key, int number, String groupingHitInsidePrefixes, @@ -791,11 +861,15 @@ public class CodecComponent { /** * Creates the positioned prefixes. * - * @param prefixList the prefix list - * @param position the position - * @param prefixes the prefixes + * @param prefixList + * the prefix list + * @param position + * the position + * @param prefixes + * the prefixes * @return the hash set[] - * @throws IOException Signals that an I/O exception has occurred. + * @throws IOException + * Signals that an I/O exception has occurred. */ private static HashSet<String>[] createPositionedPrefixes( HashSet<String> prefixList, String[] position, String[] prefixes) @@ -958,24 +1032,42 @@ public class CodecComponent { /** * Instantiates a new component facet. 
* - * @param spanQueries the span queries - * @param field the field - * @param key the key - * @param baseFields the base fields - * @param baseFieldTypes the base field types - * @param baseTypes the base types - * @param baseRangeSizes the base range sizes - * @param baseRangeBases the base range bases - * @param baseSortTypes the base sort types - * @param baseSortDirections the base sort directions - * @param baseNumbers the base numbers - * @param baseMinimumDoubles the base minimum doubles - * @param baseMaximumDoubles the base maximum doubles - * @param baseFunctionKeys the base function keys - * @param baseFunctionExpressions the base function expressions - * @param baseFunctionTypes the base function types - * @throws IOException Signals that an I/O exception has occurred. - * @throws ParseException the parse exception + * @param spanQueries + * the span queries + * @param field + * the field + * @param key + * the key + * @param baseFields + * the base fields + * @param baseFieldTypes + * the base field types + * @param baseTypes + * the base types + * @param baseRangeSizes + * the base range sizes + * @param baseRangeBases + * the base range bases + * @param baseSortTypes + * the base sort types + * @param baseSortDirections + * the base sort directions + * @param baseNumbers + * the base numbers + * @param baseMinimumDoubles + * the base minimum doubles + * @param baseMaximumDoubles + * the base maximum doubles + * @param baseFunctionKeys + * the base function keys + * @param baseFunctionExpressions + * the base function expressions + * @param baseFunctionTypes + * the base function types + * @throws IOException + * Signals that an I/O exception has occurred. + * @throws ParseException + * the parse exception */ @SuppressWarnings("unchecked") public ComponentFacet(MtasSpanQuery[] spanQueries, String field, String key, @@ -1235,26 +1327,46 @@ public class CodecComponent { /** * Instantiates a new component term vector. 
* - * @param key the key - * @param prefix the prefix - * @param regexp the regexp - * @param full the full - * @param type the type - * @param sortType the sort type - * @param sortDirection the sort direction - * @param startValue the start value - * @param number the number - * @param functionKey the function key - * @param functionExpression the function expression - * @param functionType the function type - * @param boundary the boundary - * @param list the list - * @param listRegexp the list regexp - * @param ignoreRegexp the ignore regexp - * @param ignoreList the ignore list - * @param ignoreListRegexp the ignore list regexp - * @throws IOException Signals that an I/O exception has occurred. - * @throws ParseException the parse exception + * @param key + * the key + * @param prefix + * the prefix + * @param regexp + * the regexp + * @param full + * the full + * @param type + * the type + * @param sortType + * the sort type + * @param sortDirection + * the sort direction + * @param startValue + * the start value + * @param number + * the number + * @param functionKey + * the function key + * @param functionExpression + * the function expression + * @param functionType + * the function type + * @param boundary + * the boundary + * @param list + * the list + * @param listRegexp + * the list regexp + * @param ignoreRegexp + * the ignore regexp + * @param ignoreList + * the ignore list + * @param ignoreListRegexp + * the ignore list regexp + * @throws IOException + * Signals that an I/O exception has occurred. + * @throws ParseException + * the parse exception */ @SuppressWarnings({ "unchecked", "rawtypes" }) public ComponentTermVector(String key, String prefix, String regexp, @@ -1461,16 +1573,26 @@ public class CodecComponent { /** * Instantiates a new component span. 
* - * @param queries the queries - * @param key the key - * @param minimumDouble the minimum double - * @param maximumDouble the maximum double - * @param type the type - * @param functionKey the function key - * @param functionExpression the function expression - * @param functionType the function type - * @throws IOException Signals that an I/O exception has occurred. - * @throws ParseException the parse exception + * @param queries + * the queries + * @param key + * the key + * @param minimumDouble + * the minimum double + * @param maximumDouble + * the maximum double + * @param type + * the type + * @param functionKey + * the function key + * @param functionExpression + * the function expression + * @param functionType + * the function type + * @throws IOException + * Signals that an I/O exception has occurred. + * @throws ParseException + * the parse exception */ public ComponentSpan(MtasSpanQuery[] queries, String key, Double minimumDouble, Double maximumDouble, String type, @@ -1603,12 +1725,18 @@ public class CodecComponent { /** * Instantiates a new component position. * - * @param key the key - * @param minimumDouble the minimum double - * @param maximumDouble the maximum double - * @param statsType the stats type - * @throws IOException Signals that an I/O exception has occurred. - * @throws ParseException the parse exception + * @param key + * the key + * @param minimumDouble + * the minimum double + * @param maximumDouble + * the maximum double + * @param statsType + * the stats type + * @throws IOException + * Signals that an I/O exception has occurred. + * @throws ParseException + * the parse exception */ public ComponentPosition(String key, Double minimumDouble, Double maximumDouble, String statsType) @@ -1662,12 +1790,18 @@ public class CodecComponent { /** * Instantiates a new component token. 
* - * @param key the key - * @param minimumDouble the minimum double - * @param maximumDouble the maximum double - * @param statsType the stats type - * @throws IOException Signals that an I/O exception has occurred. - * @throws ParseException the parse exception + * @param key + * the key + * @param minimumDouble + * the minimum double + * @param maximumDouble + * the maximum double + * @param statsType + * the stats type + * @throws IOException + * Signals that an I/O exception has occurred. + * @throws ParseException + * the parse exception */ public ComponentToken(String key, Double minimumDouble, Double maximumDouble, String statsType) @@ -1693,65 +1827,267 @@ public class CodecComponent { } /** - * The Class ComponentJoin. + * The Class ComponentCollection. */ - public static class ComponentJoin implements BasicComponent { + public static class ComponentCollection implements BasicComponent { + + /** The Constant ACTION_CREATE. */ + public static final String ACTION_CREATE = "create"; + + /** The Constant ACTION_CHECK. */ + public static final String ACTION_CHECK = "check"; + + /** The Constant ACTION_LIST. */ + public static final String ACTION_LIST = "list"; + + /** The Constant ACTION_POST. */ + public static final String ACTION_POST = "post"; + + /** The Constant ACTION_IMPORT. */ + public static final String ACTION_IMPORT = "import"; + + /** The Constant ACTION_DELETE. */ + public static final String ACTION_DELETE = "delete"; + + /** The Constant ACTION_EMPTY. */ + public static final String ACTION_EMPTY = "empty"; + + /** The Constant ACTION_GET. */ + public static final String ACTION_GET = "get"; + + /** The key. */ + public String key; + + /** The version. */ + public String version; + + /** The id. */ + public String id; + + /** The action. */ + private String action; /** The fields. */ private Set<String> fields; /** The values. */ - private Set<String> values; - - /** The key. 
*/ - private String key; + private HashSet<String> values; /** - * Instantiates a new component join. + * Instantiates a new component collection. * - * @param fields the fields - * @param key the key + * @param key + * the key + * @param action + * the action */ - public ComponentJoin(Set<String> fields, String key) { - this.fields = fields; + public ComponentCollection(String key, String action) { this.key = key; - this.values = new HashSet<>(); + this.action = action; + this.version = null; + values = new HashSet<>(); } /** - * Adds the. + * Sets the list variables. * - * @param value the value + * @throws IOException + * Signals that an I/O exception has occurred. */ - public void add(String value) { - values.add(value); + public void setListVariables() throws IOException { + if (action.equals(ACTION_LIST)) { + // do nothing + } else { + throw new IOException("not allowed with action " + action); + } } /** - * Adds the. + * Sets the create variables. * - * @param values the values + * @param id + * the id + * @param fields + * the fields + * @throws IOException + * Signals that an I/O exception has occurred. */ - public void add(Set<String> values) { - this.values.addAll(values); + public void setCreateVariables(String id, Set<String> fields) + throws IOException { + if (action.equals(ACTION_CREATE)) { + this.id = id; + this.fields = fields; + } else { + throw new IOException("not allowed with action " + action); + } } /** - * Values. + * Sets the check variables. * - * @return the sets the + * @param id + * the new check variables + * @throws IOException + * Signals that an I/O exception has occurred. */ - public Set<String> values() { - return values; + public void setCheckVariables(String id) throws IOException { + if (action.equals(ACTION_CHECK)) { + this.id = id; + } else { + throw new IOException("not allowed with action " + action); + } + } + + /** + * Sets the gets the variables. 
+ * + * @param id + * the new gets the variables + * @throws IOException + * Signals that an I/O exception has occurred. + */ + public void setGetVariables(String id) throws IOException { + if (action.equals(ACTION_GET)) { + this.id = id; + } else { + throw new IOException("not allowed with action " + action); + } + } + + /** + * Sets the post variables. + * + * @param id + * the id + * @param values + * the values + * @throws IOException + * Signals that an I/O exception has occurred. + */ + public void setPostVariables(String id, HashSet<String> values) + throws IOException { + if (action.equals(ACTION_POST)) { + this.id = id; + this.values = values; + } else { + throw new IOException("not allowed with action " + action); + } + } + + public void setImportVariables(String id, String url, String collection) + throws IOException { + if (action.equals(ACTION_IMPORT)) { + this.id = id; + StringBuilder importUrlBuffer = new StringBuilder(url); + importUrlBuffer.append("select"); + importUrlBuffer.append("?q=*:*&rows=0&wt=json"); + importUrlBuffer.append("&mtas=true&mtas.collection=true"); + importUrlBuffer.append("&mtas.collection.0.key=0"); + importUrlBuffer.append("&mtas.collection.0.action=get"); + importUrlBuffer.append( + "&mtas.collection.0.id=" + URLEncoder.encode(collection, "UTF-8")); + Map<String, Object> params = getImport(importUrlBuffer.toString()); + try { + if (params.containsKey("mtas") + && params.get("mtas") instanceof Map) { + Map<String, Object> mtasParams = (Map<String, Object>) params + .get("mtas"); + if (mtasParams.containsKey("collection") + && mtasParams.get("collection") instanceof List) { + List<Object> mtasCollectionList = (List<Object>) mtasParams + .get("collection"); + if (mtasCollectionList.size() == 1 + && mtasCollectionList.get(0) instanceof Map) { + Map<String, Object> collectionData = (Map<String, Object>) mtasCollectionList + .get(0); + if (collectionData.containsKey("values") + && collectionData.get("values") instanceof List) { + 
List<String> valuesList = (List<String>) collectionData + .get("values"); + for (String valueItem : valuesList) { + values.add(valueItem); + } + } else { + throw new IOException("no values in response"); + } + } else { + throw new IOException( + "no valid mtas collection item in response"); + } + } else { + throw new IOException("no valid mtas collection in response"); + } + } else { + throw new IOException("no valid mtas in response"); + } + } catch (ClassCastException e) { + throw new IOException("unexpected response", e); + } + } else { + throw new IOException("not allowed with action " + action); + } + } + + private Map<String, Object> getImport(String collectionGetUrl) + throws IOException { + // get data + URL url = new URL(collectionGetUrl); + HttpURLConnection connection = (HttpURLConnection) url.openConnection(); + connection.setDoOutput(false); + connection.setDoInput(true); + connection.setInstanceFollowRedirects(false); + connection.setRequestMethod("GET"); + connection.setRequestProperty("Content-Type", + "application/json; charset=UTF-8"); + connection.setRequestProperty("charset", "utf-8"); + connection.setUseCaches(false); + // process response + InputStream is = null; + try { + is = connection.getInputStream(); + } catch (IOException ioe) { + throw new IOException("Couldn't get data from url"); + } + InputStreamReader in = new InputStreamReader((InputStream) is, "UTF8"); + Map<String, Object> params = new HashMap<>(); + getParamsFromJSON(params, + IOUtils.toString(in)); + connection.disconnect(); + return params; } /** - * Key. + * Sets the delete variables. + * + * @param id + * the new delete variables + * @throws IOException + * Signals that an I/O exception has occurred. + */ + public void setDeleteVariables(String id) throws IOException { + if (action.equals(ACTION_DELETE)) { + this.id = id; + } else { + throw new IOException("not allowed with action " + action); + } + } + + /** + * Action. 
* * @return the string */ - public String key() { - return key; + public String action() { + return action; + } + + /** + * Values. + * + * @return the hash set + */ + public HashSet<String> values() { + return values; } /** @@ -1763,6 +2099,61 @@ public class CodecComponent { return fields; } + /** + * Adds the value. + * + * @param value + * the value + * @throws IOException + * Signals that an I/O exception has occurred. + */ + public void addValue(String value) throws IOException { + if (action.equals(ACTION_CREATE)) { + if (version == null) { + values.add(value); + } else { + throw new IOException("version already set"); + } + } else { + throw new IOException("not allowed for action '" + action + "'"); + } + } + + private static void getParamsFromJSON(Map<String, Object> params, + String json) { + JSONParser parser = new JSONParser(json); + try { + Object o = ObjectBuilder.getVal(parser); + if (!(o instanceof Map)) + return; + Map<String, Object> map = (Map<String, Object>) o; + // To make consistent with json.param handling, we should make query + // params come after json params (i.e. query params should + // appear to overwrite json params. + + // Solr params are based on String though, so we need to convert + for (Map.Entry<String, Object> entry : map.entrySet()) { + String key = entry.getKey(); + Object val = entry.getValue(); + if (params.get(key) != null) { + continue; + } + + if (val == null) { + params.remove(key); + } else { + params.put(key, val); + } + } + + } catch (Exception e) { + // ignore parse exceptions at this stage, they may be caused by incomplete + // macro expansions + return; + } + + } + } /** @@ -1803,18 +2194,30 @@ public class CodecComponent { /** * Instantiates a new sub component function. 
* - * @param collectorType the collector type - * @param key the key - * @param type the type - * @param parserFunction the parser function - * @param sortType the sort type - * @param sortDirection the sort direction - * @param start the start - * @param number the number - * @param segmentRegistration the segment registration - * @param boundary the boundary - * @throws ParseException the parse exception - * @throws IOException Signals that an I/O exception has occurred. + * @param collectorType + * the collector type + * @param key + * the key + * @param type + * the type + * @param parserFunction + * the parser function + * @param sortType + * the sort type + * @param sortDirection + * the sort direction + * @param start + * the start + * @param number + * the number + * @param segmentRegistration + * the segment registration + * @param boundary + * the boundary + * @throws ParseException + * the parse exception + * @throws IOException + * Signals that an I/O exception has occurred. */ public SubComponentFunction(String collectorType, String key, String type, MtasFunctionParserFunction parserFunction, String sortType, @@ -1847,12 +2250,18 @@ public class CodecComponent { /** * Instantiates a new sub component function. * - * @param collectorType the collector type - * @param key the key - * @param expression the expression - * @param type the type - * @throws ParseException the parse exception - * @throws IOException Signals that an I/O exception has occurred. + * @param collectorType + * the collector type + * @param key + * the key + * @param expression + * the expression + * @param type + * the type + * @throws ParseException + * the parse exception + * @throws IOException + * Signals that an I/O exception has occurred. */ public SubComponentFunction(String collectorType, String key, String expression, String type) throws ParseException, IOException { @@ -1895,8 +2304,10 @@ public class CodecComponent { /** * Instantiates a new kwic token. 
* - * @param match the match - * @param tokens the tokens + * @param match + * the match + * @param tokens + * the tokens */ public KwicToken(Match match, List<MtasTokenString> tokens) { startPosition = match.startPosition; @@ -1923,8 +2334,10 @@ public class CodecComponent { /** * Instantiates a new kwic hit. * - * @param match the match - * @param hits the hits + * @param match + * the match + * @param hits + * the hits */ public KwicHit(Match match, Map<Integer, List<String>> hits) { startPosition = match.startPosition; @@ -1996,7 +2409,8 @@ public class CodecComponent { /** * Sort. * - * @param data the data + * @param data + * the data * @return the list */ private List<MtasTreeHit<String>> sort(List<MtasTreeHit<String>> data) { @@ -2015,30 +2429,35 @@ public class CodecComponent { /** * Instantiates a new group hit. * - * @param list the list - * @param start the start - * @param end the end - * @param hitStart the hit start - * @param hitEnd the hit end - * @param group the group - * @param knownPrefixes the known prefixes - * @throws UnsupportedEncodingException the unsupported encoding exception + * @param list + * the list + * @param start + * the start + * @param end + * the end + * @param hitStart + * the hit start + * @param hitEnd + * the hit end + * @param group + * the group + * @param knownPrefixes + * the known prefixes + * @throws UnsupportedEncodingException + * the unsupported encoding exception */ @SuppressWarnings("unchecked") public GroupHit(List<MtasTreeHit<String>> list, int start, int end, int hitStart, int hitEnd, ComponentGroup group, Set<String> knownPrefixes) throws UnsupportedEncodingException { - // System.out.println("init: "+start+"-"+end+"\t"+hitStart+"-"+hitEnd); // compute dimensions int leftRangeStart = start; - int leftRangeEnd = Math.min(end - 1, hitStart - 1); + int leftRangeEnd = Math.min(end, hitStart - 1); int leftRangeLength = Math.max(0, 1 + leftRangeEnd - leftRangeStart); int hitLength = 1 + hitEnd - hitStart; int 
rightRangeStart = Math.max(start, hitEnd + 1); int rightRangeEnd = end; int rightRangeLength = Math.max(0, 1 + rightRangeEnd - rightRangeStart); - // System.out.println(leftRangeStart+"\t"+leftRangeEnd+"\t"+leftRangeLength+" - // - "+rightRangeStart+"\t"+rightRangeEnd+"\t"+rightRangeLength); // create initial arrays if (leftRangeLength > 0) { keyLeft = ""; @@ -2112,15 +2531,8 @@ public class CodecComponent { } } if (group.hitInsideRight != null) { - // System.out.println(missingHit.length + " items in missingHit"); - // System.out.println( - // group.hitInsideRight.length + " items in group.hitInsideRight"); - for (int p = 0; p < group.hitInsideRight.length; p++) { - // System.out.println(" - " + group.hitInsideRight[p]); - } for (int p = Math.max(hitStart, hitEnd - group.hitInsideRight.length + 1); p <= hitEnd; p++) { - // System.out.println("Test voor p is " + (p - hitStart)); if (group.hitInsideRight[hitEnd - p] != null) { missingHit[p - hitStart].addAll(group.hitInsideRight[hitEnd - p]); } @@ -2142,7 +2554,7 @@ public class CodecComponent { } } if (group.hitRight != null) { - for (int p = 0; p <= Math.min(leftRangeLength, + for (int p = 0; p < Math.min(leftRangeLength, group.hitRight.length - dataHit.length); p++) { if (group.hitRight[p + dataHit.length] != null) { missingLeft[p].addAll(group.hitRight[p + dataHit.length]); @@ -2152,13 +2564,17 @@ public class CodecComponent { if (group.right != null) { for (int p = 0; p < Math.min(rightRangeLength, group.right.length); p++) { - missingRight[p].addAll(group.right[p]); + if (group.right[p] != null) { + missingRight[p].addAll(group.right[p]); + } } } - if (group.hitRight != null) { - for (int p = 0; p <= Math.min(rightRangeLength, + if (group.hitLeft != null) { + for (int p = 0; p < Math.min(rightRangeLength, group.hitLeft.length - dataHit.length); p++) { - missingRight[p].addAll(group.hitLeft[p + dataHit.length]); + if(group.hitLeft[p + dataHit.length]!=null) { + missingRight[p].addAll(group.hitLeft[p + 
dataHit.length]); + } } } @@ -2170,12 +2586,9 @@ public class CodecComponent { && group.hitInside.contains(hit.idData)) { for (int p = Math.max(hitStart, hit.startPosition); p <= Math .min(hitEnd, hit.endPosition); p++) { - // keyHit += hit.refData; dataHit[p - hitStart].add(hit.refData); missingHit[p - hitStart] - .remove(MtasToken.getPrefixFromValue(hit.refData)); - // System.out.print(p + "." + hit.idData + ":" + hit.refData + - // "\t"); + .remove(MtasToken.getPrefixFromValue(hit.refData)); } } else if ((group.hitInsideLeft != null || group.hitLeft != null || group.hitInsideRight != null || group.hitRight != null) @@ -2191,9 +2604,7 @@ public class CodecComponent { // keyHit += hit.refData; dataHit[p - hitStart].add(hit.refData); missingHit[p - hitStart] - .remove(MtasToken.getPrefixFromValue(hit.refData)); - // System.out.print(p+"."+hit.idData + ":" + hit.additionalRef + - // "\t"); + .remove(MtasToken.getPrefixFromValue(hit.refData)); } else if (group.hitLeft != null && pHitLeft <= (group.hitLeft.length - 1) && group.hitLeft[pHitLeft] != null @@ -2201,29 +2612,21 @@ public class CodecComponent { // keyHit += hit.refData; dataHit[p - hitStart].add(hit.refData); missingHit[p - hitStart] - .remove(MtasToken.getPrefixFromValue(hit.refData)); - // System.out.print(p+"."+hit.idData + ":" + hit.additionalRef + - // "\t"); + .remove(MtasToken.getPrefixFromValue(hit.refData)); } else if (group.hitInsideRight != null && pHitRight <= (group.hitInsideRight.length - 1) && group.hitInsideRight[pHitRight] != null && group.hitInsideRight[pHitRight].contains(hit.idData)) { - // keyHit += hit.refData; dataHit[p - hitStart].add(hit.refData); missingHit[p - hitStart] - .remove(MtasToken.getPrefixFromValue(hit.refData)); - // System.out.print(p+"."+hit.idData + ":" + hit.additionalRef + - // "\t"); + .remove(MtasToken.getPrefixFromValue(hit.refData)); } else if (group.hitRight != null && pHitRight <= (group.hitRight.length - 1) && group.hitRight[pHitRight] != null && 
group.hitRight[pHitRight].contains(hit.idData)) { - // keyHit += hit.refData; dataHit[p - hitStart].add(hit.refData); missingHit[p - hitStart] - .remove(MtasToken.getPrefixFromValue(hit.refData)); - // System.out.print(p+"."+hit.idData + ":" + hit.additionalRef + - // "\t"); + .remove(MtasToken.getPrefixFromValue(hit.refData)); } } } @@ -2239,18 +2642,16 @@ public class CodecComponent { if (group.left != null && pLeft <= (group.left.length - 1) && group.left[pLeft] != null && group.left[pLeft].contains(hit.idData)) { - dataLeft[p - leftRangeStart].add(hit.refData); - missingLeft[p - leftRangeStart] + dataLeft[hitStart - 1 - p].add(hit.refData); + missingLeft[hitStart - 1 - p] .remove(MtasToken.getPrefixFromValue(hit.refData)); - // System.out.print("L"+p+"."+prefix + ":" + value + "\t"); } else if (group.hitRight != null && pHitRight <= (group.hitRight.length - 1) && group.hitRight[pHitRight] != null && group.hitRight[pHitRight].contains(hit.idData)) { - dataLeft[p - leftRangeStart].add(hit.refData); - missingLeft[p - leftRangeStart] + dataLeft[hitStart - 1 - p].add(hit.refData); + missingLeft[hitStart - 1 - p] .remove(MtasToken.getPrefixFromValue(hit.refData)); - // System.out.print("L"+p+"."+prefix + ":" + value + "\t"); } } } @@ -2270,7 +2671,6 @@ public class CodecComponent { dataRight[p - rightRangeStart].add(hit.refData); missingRight[p - rightRangeStart] .remove(MtasToken.getPrefixFromValue(hit.refData)); - // System.out.print("R"+p+"."+prefix + ":" + value + "\t"); } else if (group.hitLeft != null && pHitLeft <= (group.hitLeft.length - 1) && group.hitLeft[pHitLeft] != null @@ -2278,7 +2678,6 @@ public class CodecComponent { dataRight[p - rightRangeStart].add(hit.refData); missingRight[p - rightRangeStart] .remove(MtasToken.getPrefixFromValue(hit.refData)); - // System.out.print("R"+p+"."+prefix + ":" + value + "\t"); } } } @@ -2354,8 +2753,10 @@ public class CodecComponent { /** * Data equals. 
* - * @param d1 the d 1 - * @param d2 the d 2 + * @param d1 + * the d 1 + * @param d2 + * the d 2 * @return true, if successful */ private boolean dataEquals(List<String>[] d1, List<String>[] d2) { @@ -2415,10 +2816,13 @@ public class CodecComponent { /** * Data to string. * - * @param data the data - * @param missing the missing + * @param data + * the data + * @param missing + * the missing * @return the string - * @throws UnsupportedEncodingException the unsupported encoding exception + * @throws UnsupportedEncodingException + * the unsupported encoding exception */ private String dataToString(List<String>[] data, Set<String>[] missing) throws UnsupportedEncodingException { @@ -2475,8 +2879,10 @@ public class CodecComponent { /** * Key to sub sub object. * - * @param key the key - * @param newKey the new key + * @param key + * the key + * @param newKey + * the new key * @return the map[] */ private static Map<String, String>[] keyToSubSubObject(String key, @@ -2543,8 +2949,10 @@ public class CodecComponent { /** * Key to sub object. * - * @param key the key - * @param newKey the new key + * @param key + * the key + * @param newKey + * the new key * @return the map */ private static Map<Integer, Map<String, String>[]> keyToSubObject( @@ -2568,8 +2976,10 @@ public class CodecComponent { /** * Key to object. * - * @param key the key - * @param newKey the new key + * @param key + * the key + * @param newKey + * the new key * @return the map */ public static Map<String, Map<Integer, Map<String, String>[]>> keyToObject( @@ -2634,10 +3044,14 @@ public class CodecComponent { /** * Instantiates a new list token. 
* - * @param docId the doc id - * @param docPosition the doc position - * @param match the match - * @param tokens the tokens + * @param docId + * the doc id + * @param docPosition + * the doc position + * @param match + * the match + * @param tokens + * the tokens */ public ListToken(Integer docId, Integer docPosition, Match match, List<MtasTokenString> tokens) { @@ -2672,10 +3086,14 @@ public class CodecComponent { /** * Instantiates a new list hit. * - * @param docId the doc id - * @param docPosition the doc position - * @param match the match - * @param hits the hits + * @param docId + * the doc id + * @param docPosition + * the doc position + * @param match + * the match + * @param hits + * the hits */ public ListHit(Integer docId, Integer docPosition, Match match, Map<Integer, List<String>> hits) { @@ -2701,8 +3119,10 @@ public class CodecComponent { /** * Instantiates a new match. * - * @param startPosition the start position - * @param endPosition the end position + * @param startPosition + * the start position + * @param endPosition + * the end position */ public Match(int startPosition, int endPosition) { this.startPosition = startPosition; diff --git a/src/mtas/codec/util/CodecUtil.java b/src/mtas/codec/util/CodecUtil.java index 9da9f94..6bf1be9 100644 --- a/src/mtas/codec/util/CodecUtil.java +++ b/src/mtas/codec/util/CodecUtil.java @@ -16,7 +16,7 @@ import mtas.codec.MtasCodecPostingsFormat; import mtas.parser.function.util.MtasFunctionParserFunction; import mtas.search.spans.util.MtasSpanQuery; import mtas.codec.util.CodecComponent.ComponentField; -import mtas.codec.util.CodecComponent.ComponentJoin; +import mtas.codec.util.CodecComponent.ComponentCollection; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.IndexReader; @@ -247,18 +247,18 @@ public class CodecUtil { } /** - * Collect join. + * Collect collection. 
* * @param reader the reader * @param fullDocSet the full doc set - * @param joinInfo the join info + * @param collectionInfo the collection info * @throws IOException Signals that an I/O exception has occurred. */ - public static void collectJoin(IndexReader reader, - ArrayList<Integer> fullDocSet, ComponentJoin joinInfo) + public static void collectCollection(IndexReader reader, + List<Integer> fullDocSet, ComponentCollection collectionInfo) throws IOException { - if (joinInfo != null) { - CodecCollector.collectJoin(reader, fullDocSet, joinInfo); + if (collectionInfo != null) { + CodecCollector.collectCollection(reader, fullDocSet, collectionInfo); } } diff --git a/src/mtas/codec/util/collector/MtasDataCollector.java b/src/mtas/codec/util/collector/MtasDataCollector.java index 2e57f3a..976acbc 100644 --- a/src/mtas/codec/util/collector/MtasDataCollector.java +++ b/src/mtas/codec/util/collector/MtasDataCollector.java @@ -1336,7 +1336,7 @@ public abstract class MtasDataCollector<T1 extends Number & Comparable<T1>, T2 e text.append("\tclosed: " + closed + "\n"); text.append("\tkeylist: " + Arrays.asList(keyList) + "\n"); text.append("\tkeylist: " + Arrays.asList(keyList).contains("1") + "\n"); - text.append("\tsegmentKeys: " + segmentKeys.contains("1") + "\n"); + text.append("\tsegmentKeys: " + (segmentKeys!=null?segmentKeys.contains("1"):"null") + "\n"); text.append("\tnewKeys: " + Arrays.asList(newKeyList).contains("1") + "\n"); return text.toString().trim(); } diff --git a/src/mtas/solr/handler/component/MtasSolrSearchComponent.java b/src/mtas/solr/handler/component/MtasSolrSearchComponent.java index e781f57..188d3ca 100644 --- a/src/mtas/solr/handler/component/MtasSolrSearchComponent.java +++ b/src/mtas/solr/handler/component/MtasSolrSearchComponent.java @@ -11,6 +11,7 @@ import mtas.codec.util.CodecComponent.ComponentDocument; import mtas.codec.util.CodecComponent.ComponentFacet; import mtas.codec.util.CodecComponent.ComponentFields; import 
mtas.codec.util.CodecComponent.ComponentGroup; +import mtas.codec.util.CodecComponent.ComponentCollection; import mtas.codec.util.CodecComponent.ComponentKwic; import mtas.codec.util.CodecComponent.ComponentList; import mtas.codec.util.CodecComponent.ComponentPosition; @@ -19,11 +20,11 @@ import mtas.codec.util.CodecComponent.ComponentTermVector; import mtas.codec.util.CodecComponent.ComponentToken; import mtas.codec.util.CodecUtil; import mtas.solr.handler.component.util.MtasSolrResultMerge; -import mtas.solr.search.MtasSolrJoinCache; +import mtas.solr.search.MtasSolrCollectionCache; import mtas.solr.handler.component.util.MtasSolrComponentDocument; import mtas.solr.handler.component.util.MtasSolrComponentFacet; import mtas.solr.handler.component.util.MtasSolrComponentGroup; -import mtas.solr.handler.component.util.MtasSolrComponentJoin; +import mtas.solr.handler.component.util.MtasSolrComponentCollection; import mtas.solr.handler.component.util.MtasSolrComponentKwic; import mtas.solr.handler.component.util.MtasSolrComponentList; import mtas.solr.handler.component.util.MtasSolrComponentPrefix; @@ -51,17 +52,17 @@ public class MtasSolrSearchComponent extends SearchComponent { /** The search component. */ MtasSolrSearchComponent searchComponent; - /** The Constant CONFIG_JOIN_CACHE_DIRECTORY. */ - public static final String CONFIG_JOIN_CACHE_DIRECTORY = "joinCacheDirectory"; + /** The Constant CONFIG_COLLECTION_CACHE_DIRECTORY. */ + public static final String CONFIG_COLLECTION_CACHE_DIRECTORY = "collectionCacheDirectory"; - /** The Constant CONFIG_JOIN_LIFETIME. */ - public static final String CONFIG_JOIN_LIFETIME = "joinLifetime"; + /** The Constant CONFIG_COLLECTION_LIFETIME. */ + public static final String CONFIG_COLLECTION_LIFETIME = "collectionLifetime"; - /** The Constant CONFIG_JOIN_MAXIMUM_NUMBER. */ - public static final String CONFIG_JOIN_MAXIMUM_NUMBER = "joinMaximumNumber"; + /** The Constant CONFIG_COLLECTION_MAXIMUM_NUMBER. 
*/ + public static final String CONFIG_COLLECTION_MAXIMUM_NUMBER = "collectionMaximumNumber"; - /** The Constant CONFIG_JOIN_MAXIMUM_OVERFLOW. */ - public static final String CONFIG_JOIN_MAXIMUM_OVERFLOW = "joinMaximumOverflow"; + /** The Constant CONFIG_COLLECTION_MAXIMUM_OVERFLOW. */ + public static final String CONFIG_COLLECTION_MAXIMUM_OVERFLOW = "collectionMaximumOverflow"; /** The Constant PARAM_MTAS. */ public static final String PARAM_MTAS = "mtas"; @@ -97,8 +98,13 @@ public class MtasSolrSearchComponent extends SearchComponent { public static final int STAGE_GROUP = ResponseBuilder.STAGE_EXECUTE_QUERY + 60; - /** The Constant STAGE_JOIN. */ - public static final int STAGE_JOIN = ResponseBuilder.STAGE_EXECUTE_QUERY + 70; + /** The Constant STAGE_COLLECTION_INIT. */ + public static final int STAGE_COLLECTION_INIT = ResponseBuilder.STAGE_EXECUTE_QUERY + + 70; + + /** The Constant STAGE_COLLECTION_FINISH. */ + public static final int STAGE_COLLECTION_FINISH = ResponseBuilder.STAGE_EXECUTE_QUERY + + 71; /** The Constant STAGE_DOCUMENT. */ public static final int STAGE_DOCUMENT = ResponseBuilder.STAGE_GET_FIELDS @@ -131,11 +137,11 @@ public class MtasSolrSearchComponent extends SearchComponent { /** The search document. */ private MtasSolrComponentDocument searchDocument; - /** The search join. */ - private MtasSolrComponentJoin searchJoin; + /** The search collection. */ + private MtasSolrComponentCollection searchCollection; - /** The join cache. */ - private MtasSolrJoinCache joinCache = null; + /** The collection cache. 
*/ + private MtasSolrCollectionCache collectionCache = null; /* * (non-Javadoc) @@ -148,7 +154,7 @@ public class MtasSolrSearchComponent extends SearchComponent { public void init(NamedList args) { super.init(args); // init components - searchDocument = new MtasSolrComponentDocument(); + searchDocument = new MtasSolrComponentDocument(this); searchKwic = new MtasSolrComponentKwic(this); searchList = new MtasSolrComponentList(this); searchGroup = new MtasSolrComponentGroup(this); @@ -156,30 +162,33 @@ public class MtasSolrSearchComponent extends SearchComponent { searchPrefix = new MtasSolrComponentPrefix(this); searchStats = new MtasSolrComponentStats(this); searchFacet = new MtasSolrComponentFacet(this); - searchJoin = new MtasSolrComponentJoin(this); - // init join - String joinCacheDirectory = null; - Long joinLifetime = null; - Integer joinMaximumNumber = null; - Integer joinMaximumOverflow = null; - if (args.get(CONFIG_JOIN_CACHE_DIRECTORY) != null - && args.get(CONFIG_JOIN_CACHE_DIRECTORY) instanceof String) { - joinCacheDirectory = (String) args.get(CONFIG_JOIN_CACHE_DIRECTORY); + searchCollection = new MtasSolrComponentCollection(this); + // init collection + String collectionCacheDirectory = null; + Long collectionLifetime = null; + Integer collectionMaximumNumber = null; + Integer collectionMaximumOverflow = null; + if (args.get(CONFIG_COLLECTION_CACHE_DIRECTORY) != null + && args.get(CONFIG_COLLECTION_CACHE_DIRECTORY) instanceof String) { + collectionCacheDirectory = (String) args + .get(CONFIG_COLLECTION_CACHE_DIRECTORY); } - if (args.get(CONFIG_JOIN_LIFETIME) != null - && args.get(CONFIG_JOIN_LIFETIME) instanceof Long) { - joinLifetime = (Long) args.get(CONFIG_JOIN_LIFETIME); + if (args.get(CONFIG_COLLECTION_LIFETIME) != null + && args.get(CONFIG_COLLECTION_LIFETIME) instanceof Long) { + collectionLifetime = (Long) args.get(CONFIG_COLLECTION_LIFETIME); } - if (args.get(CONFIG_JOIN_MAXIMUM_NUMBER) != null - && args.get(CONFIG_JOIN_MAXIMUM_NUMBER) 
instanceof Integer) { - joinMaximumNumber = (Integer) args.get(CONFIG_JOIN_MAXIMUM_NUMBER); + if (args.get(CONFIG_COLLECTION_MAXIMUM_NUMBER) != null + && args.get(CONFIG_COLLECTION_MAXIMUM_NUMBER) instanceof Integer) { + collectionMaximumNumber = (Integer) args + .get(CONFIG_COLLECTION_MAXIMUM_NUMBER); } - if (args.get(CONFIG_JOIN_MAXIMUM_OVERFLOW) != null - && args.get(CONFIG_JOIN_MAXIMUM_OVERFLOW) instanceof Integer) { - joinMaximumNumber = (Integer) args.get(CONFIG_JOIN_MAXIMUM_OVERFLOW); + if (args.get(CONFIG_COLLECTION_MAXIMUM_OVERFLOW) != null + && args.get(CONFIG_COLLECTION_MAXIMUM_OVERFLOW) instanceof Integer) { + collectionMaximumNumber = (Integer) args + .get(CONFIG_COLLECTION_MAXIMUM_OVERFLOW); } - joinCache = new MtasSolrJoinCache(joinCacheDirectory, joinLifetime, - joinMaximumNumber, joinMaximumOverflow); + collectionCache = new MtasSolrCollectionCache(collectionCacheDirectory, + collectionLifetime, collectionMaximumNumber, collectionMaximumOverflow); } /* @@ -202,6 +211,15 @@ public class MtasSolrSearchComponent extends SearchComponent { return "Mtas"; } + /** + * Gets the collection cache. 
+ * + * @return the collection cache + */ + public MtasSolrCollectionCache getCollectionCache() { + return collectionCache; + } + /* * (non-Javadoc) * @@ -258,10 +276,10 @@ public class MtasSolrSearchComponent extends SearchComponent { false)) { searchFacet.prepare(rb, mtasFields); } - // get settings join - if (rb.req.getParams().getBool(MtasSolrComponentJoin.PARAM_MTAS_JOIN, - false)) { - searchJoin.prepare(rb, mtasFields); + // get settings collection + if (rb.req.getParams() + .getBool(MtasSolrComponentCollection.PARAM_MTAS_COLLECTION, false)) { + searchCollection.prepare(rb, mtasFields); } rb.req.getContext().put(ComponentFields.class, mtasFields); } @@ -286,7 +304,7 @@ public class MtasSolrSearchComponent extends SearchComponent { DocList docList = rb.getResults().docList; if (mtasFields.doStats || mtasFields.doDocument || mtasFields.doKwic || mtasFields.doList || mtasFields.doGroup || mtasFields.doFacet - || mtasFields.doJoin || mtasFields.doTermVector + || mtasFields.doCollection || mtasFields.doTermVector || mtasFields.doPrefix) { SolrIndexSearcher searcher = rb.req.getSearcher(); ArrayList<Integer> docSetList = null; @@ -317,8 +335,10 @@ public class MtasSolrSearchComponent extends SearchComponent { throw new IOException(e); } } - CodecUtil.collectJoin(searcher.getRawReader(), docSetList, - mtasFields.join); + for (ComponentCollection collection : mtasFields.collection) { + CodecUtil.collectCollection(searcher.getRawReader(), docSetList, + collection); + } NamedList<Object> mtasResponse = new SimpleOrderedMap<>(); if (mtasFields.doDocument) { ArrayList<NamedList<?>> mtasDocumentResponses = new ArrayList<>(); @@ -355,13 +375,19 @@ public class MtasSolrSearchComponent extends SearchComponent { // add to response mtasResponse.add("facet", mtasFacetResponses); } - if (mtasFields.doJoin) { - // add to response - if (rb.req.getParams().getBool("isShard", false)) { - mtasResponse.add("join", searchJoin.create(mtasFields.join, true)); - } else { - 
mtasResponse.add("join", searchJoin.create(mtasFields.join, false)); + if (mtasFields.doCollection) { + ArrayList<NamedList<?>> mtasCollectionResponses = new ArrayList<>(); + for (ComponentCollection collection : mtasFields.collection) { + if (rb.req.getParams().getBool("isShard", false)) { + mtasCollectionResponses + .add(searchCollection.create(collection, true)); + } else { + mtasCollectionResponses + .add(searchCollection.create(collection, false)); + } } + // add to response + mtasResponse.add("collection", mtasCollectionResponses); } if (mtasFields.doList) { ArrayList<NamedList<?>> mtasListResponses = new ArrayList<>(); @@ -492,7 +518,8 @@ public class MtasSolrSearchComponent extends SearchComponent { @Override public void modifyRequest(ResponseBuilder rb, SearchComponent who, ShardRequest sreq) { - // System.out.println(Thread.currentThread().getId() + " - " + // System.out.println(System.nanoTime() + " - " + + // Thread.currentThread().getId() + " - " // + rb.req.getParams().getBool("isShard", false) + " MODIFY REQUEST " // + rb.stage + " " + rb.req.getParamString()); if (sreq.params.getBool(PARAM_MTAS, false)) { @@ -510,8 +537,9 @@ public class MtasSolrSearchComponent extends SearchComponent { if (sreq.params.getBool(MtasSolrComponentFacet.PARAM_MTAS_FACET, false)) { searchFacet.modifyRequest(rb, who, sreq); } - if (sreq.params.getBool(MtasSolrComponentJoin.PARAM_MTAS_JOIN, false)) { - searchJoin.modifyRequest(rb, who, sreq); + if (sreq.params.getBool(MtasSolrComponentCollection.PARAM_MTAS_COLLECTION, + false)) { + searchCollection.modifyRequest(rb, who, sreq); } if (sreq.params.getBool(MtasSolrComponentGroup.PARAM_MTAS_GROUP, false)) { searchGroup.modifyRequest(rb, who, sreq); @@ -574,9 +602,9 @@ public class MtasSolrSearchComponent extends SearchComponent { false)) { searchFacet.finishStage(rb); } - if (rb.req.getParams().getBool(MtasSolrComponentJoin.PARAM_MTAS_JOIN, - false)) { - searchJoin.finishStage(rb); + if (rb.req.getParams() + 
.getBool(MtasSolrComponentCollection.PARAM_MTAS_COLLECTION, false)) { + searchCollection.finishStage(rb); } if (rb.req.getParams().getBool(MtasSolrComponentGroup.PARAM_MTAS_GROUP, false)) { @@ -630,9 +658,10 @@ public class MtasSolrSearchComponent extends SearchComponent { } else if (rb.stage == STAGE_FACET) { ComponentFields mtasFields = getMtasFields(rb); searchFacet.distributedProcess(rb, mtasFields); - } else if (rb.stage == STAGE_JOIN) { + } else if (rb.stage == STAGE_COLLECTION_INIT + || rb.stage == STAGE_COLLECTION_FINISH) { ComponentFields mtasFields = getMtasFields(rb); - searchJoin.distributedProcess(rb, mtasFields); + searchCollection.distributedProcess(rb, mtasFields); } else if (rb.stage == STAGE_GROUP) { ComponentFields mtasFields = getMtasFields(rb); searchGroup.distributedProcess(rb, mtasFields); @@ -670,9 +699,14 @@ public class MtasSolrSearchComponent extends SearchComponent { } else if (rb.stage < STAGE_GROUP && rb.req.getParams() .getBool(MtasSolrComponentGroup.PARAM_MTAS_GROUP, false)) { return STAGE_GROUP; - } else if (rb.stage < STAGE_JOIN && rb.req.getParams() - .getBool(MtasSolrComponentJoin.PARAM_MTAS_JOIN, false)) { - return STAGE_JOIN; + } else if (rb.stage < STAGE_COLLECTION_INIT + && rb.req.getParams().getBool( + MtasSolrComponentCollection.PARAM_MTAS_COLLECTION, false)) { + return STAGE_COLLECTION_INIT; + } else if (rb.stage < STAGE_COLLECTION_FINISH + && rb.req.getParams().getBool( + MtasSolrComponentCollection.PARAM_MTAS_COLLECTION, false)) { + return STAGE_COLLECTION_FINISH; } } else if (rb.stage >= ResponseBuilder.STAGE_GET_FIELDS && rb.stage < ResponseBuilder.STAGE_DONE) { diff --git a/src/mtas/solr/handler/component/util/MtasSolrCollectionResult.java b/src/mtas/solr/handler/component/util/MtasSolrCollectionResult.java new file mode 100644 index 0000000..e676ce6 --- /dev/null +++ b/src/mtas/solr/handler/component/util/MtasSolrCollectionResult.java @@ -0,0 +1,355 @@ +package mtas.solr.handler.component.util; + +import 
java.io.IOException; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map.Entry; +import org.apache.solr.common.util.SimpleOrderedMap; + +import mtas.codec.util.CodecComponent.ComponentCollection; +import mtas.solr.handler.component.MtasSolrSearchComponent; + +/** + * The Class MtasSolrCollectionResult. + */ +public class MtasSolrCollectionResult implements Serializable { + + /** The Constant serialVersionUID. */ + private static final long serialVersionUID = 1L; + + /** The values. */ + private HashSet<String> values; + + /** The id. */ + private String id; + + /** The action. */ + private String action; + + /** The now. */ + private Long now; + + /** The list. */ + private List<SimpleOrderedMap<Object>> list; + + /** The status. */ + public SimpleOrderedMap<Object> status; + + /** The component collection. */ + private transient ComponentCollection componentCollection = null; + + /** + * Instantiates a new mtas solr collection result. + * + * @param componentCollection the component collection + * @throws IOException Signals that an I/O exception has occurred. 
+ */ + public MtasSolrCollectionResult(ComponentCollection componentCollection) + throws IOException { + this.componentCollection = componentCollection; + if (componentCollection != null) { + action = componentCollection.action(); + id = null; + values = null; + now = null; + list = null; + switch (action) { + case ComponentCollection.ACTION_CREATE: + values = componentCollection.values(); + id = componentCollection.id; + break; + case ComponentCollection.ACTION_CHECK: + case ComponentCollection.ACTION_GET: + case ComponentCollection.ACTION_DELETE: + id = componentCollection.id; + break; + case ComponentCollection.ACTION_POST: + case ComponentCollection.ACTION_IMPORT: + id = componentCollection.id; + values = componentCollection.values(); + break; + case ComponentCollection.ACTION_LIST: + case ComponentCollection.ACTION_EMPTY: + // do nothing + break; + default: + throw new IOException("action " + action + " not allowed"); + } + } else { + throw new IOException("no componentCollection available"); + } + } + + /** + * Sets the list. + * + * @param now the now + * @param list the list + * @throws IOException Signals that an I/O exception has occurred. + */ + public void setList(long now, List<SimpleOrderedMap<Object>> list) + throws IOException { + if (action.equals(ComponentCollection.ACTION_LIST)) { + this.now = now; + this.list = list; + } else { + throw new IOException("not allowed with action '" + action + "'"); + } + } + + /** + * Sets the check. + * + * @param now the now + * @param status the status + * @throws IOException Signals that an I/O exception has occurred. + */ + public void setCheck(long now, SimpleOrderedMap<Object> status) + throws IOException { + if (action.equals(ComponentCollection.ACTION_CHECK)) { + this.now = now; + this.status = status; + } else { + throw new IOException("not allowed with action '" + action + "'"); + } + } + + /** + * Sets the get. 
+ * + * @param now the now + * @param status the status + * @param stringValues the string values + * @throws IOException Signals that an I/O exception has occurred. + */ + public void setGet(long now, SimpleOrderedMap<Object> status, + HashSet<String> stringValues) throws IOException { + if (action.equals(ComponentCollection.ACTION_GET)) { + this.now = now; + this.status = status; + this.values = stringValues; + } else { + throw new IOException("not allowed with action '" + action + "'"); + } + } + + /** + * Sets the post. + * + * @param now the now + * @param status the status + * @throws IOException Signals that an I/O exception has occurred. + */ + public void setPost(long now, SimpleOrderedMap<Object> status) + throws IOException { + if (action.equals(ComponentCollection.ACTION_POST)) { + this.now = now; + this.status = status; + } else { + throw new IOException("not allowed with action '" + action + "'"); + } + } + + public void setImport(long now, SimpleOrderedMap<Object> status) + throws IOException { + if (action.equals(ComponentCollection.ACTION_IMPORT)) { + this.now = now; + this.status = status; + } else { + throw new IOException("not allowed with action '" + action + "'"); + } + } + + /** + * Sets the create. + * + * @param now the now + * @param status the status + * @throws IOException Signals that an I/O exception has occurred. + */ + public void setCreate(long now, SimpleOrderedMap<Object> status) + throws IOException { + if (action.equals(ComponentCollection.ACTION_CREATE)) { + this.now = now; + this.status = status; + } else { + throw new IOException("not allowed with action '" + action + "'"); + } + } + + /** + * Id. + * + * @return the string + */ + public String id() { + return id; + } + + /** + * Action. + * + * @return the string + */ + public String action() { + return action; + } + + /** + * Rewrite. 
+ * + * @param searchComponent the search component + * @return the simple ordered map + * @throws IOException Signals that an I/O exception has occurred. + */ + public SimpleOrderedMap<Object> rewrite( + MtasSolrSearchComponent searchComponent) throws IOException { + SimpleOrderedMap<Object> response = new SimpleOrderedMap<>(); + Iterator<Entry<String, Object>> it; + switch (action) { + case ComponentCollection.ACTION_LIST: + response.add("now", now); + response.add("list", list); + break; + case ComponentCollection.ACTION_CREATE: + case ComponentCollection.ACTION_POST: + case ComponentCollection.ACTION_IMPORT: + if (componentCollection != null && status != null) { + it = status.iterator(); + while (it.hasNext()) { + Entry<String, Object> entry = it.next(); + response.add(entry.getKey(), entry.getValue()); + } + } + break; + case ComponentCollection.ACTION_CHECK: + if (status != null) { + it = status.iterator(); + while (it.hasNext()) { + Entry<String, Object> entry = it.next(); + response.add(entry.getKey(), entry.getValue()); + } + } + break; + case ComponentCollection.ACTION_GET: + if (status != null) { + it = status.iterator(); + while (it.hasNext()) { + Entry<String, Object> entry = it.next(); + response.add(entry.getKey(), entry.getValue()); + } + } + if (values != null) { + response.add("values", values); + } + break; + default: + break; + } + return response; + } + + /** + * Merge. + * + * @param newItem the new item + * @throws IOException Signals that an I/O exception has occurred. 
+ */ + public void merge(MtasSolrCollectionResult newItem) throws IOException { + if (action != null && newItem.action != null) { + if (action.equals(ComponentCollection.ACTION_CREATE) + && newItem.action.equals(ComponentCollection.ACTION_CREATE)) { + values.addAll(newItem.values); + if (id != null && (newItem.id == null || !newItem.id.equals(id))) { + id = null; + } + } else if (action.equals(ComponentCollection.ACTION_LIST)) { + if (list != null) { + HashMap<String, SimpleOrderedMap<Object>> index = new HashMap<>(); + for (SimpleOrderedMap<Object> item : list) { + if (item.get("id") != null && item.get("id") instanceof String) { + index.put((String) item.get("id"), item); + if (item.get("shards") == null + || !(item.get("shards") instanceof List)) { + item.add("shards", new ArrayList<>()); + } + } + } + for (SimpleOrderedMap<Object> item : newItem.list) { + if (item.get("id") != null && item.get("id") instanceof String) { + String id = (String) item.get("id"); + if (index.containsKey(id)) { + SimpleOrderedMap<Object> indexItem = index.get(id); + List<SimpleOrderedMap<Object>> shards; + if (indexItem.get("shards") != null + && indexItem.get("shards") instanceof List) { + shards = (List<SimpleOrderedMap<Object>>) indexItem + .get("shards"); + } else { + shards = new ArrayList<>(); + indexItem.add("shards", shards); + } + shards.add(item); + } + } + } + } + } else if (action.equals(ComponentCollection.ACTION_CHECK) + || action.equals(ComponentCollection.ACTION_POST) + || action.equals(ComponentCollection.ACTION_IMPORT) + || action.equals(ComponentCollection.ACTION_CREATE) + || action.equals(ComponentCollection.ACTION_GET)) { + if (status != null && status.get("id") != null + && status.get("id") instanceof String) { + String id = (String) status.get("id"); + if (id.equals(newItem.id)) { + List<SimpleOrderedMap<Object>> shards; + if (status.get("shards") != null + && status.get("shards") instanceof List) { + shards = (List<SimpleOrderedMap<Object>>) 
status.get("shards"); + } else { + shards = new ArrayList<>(); + status.add("shards", shards); + } + if (newItem.status != null) { + if (action.equals(ComponentCollection.ACTION_GET)) { + newItem.status.add("values", newItem.values); + } + shards.add(newItem.status); + } + } + } + } else { + throw new IOException("not allowed for action '" + action + "'"); + } + } + } + + /* + * (non-Javadoc) + * + * @see java.lang.Object#toString() + */ + @Override + public String toString() { + StringBuilder text = new StringBuilder(""); + text.append(MtasSolrCollectionResult.class.getSimpleName() + "["); + text.append(action + ", "); + text.append(id + ", "); + if (componentCollection != null) { + text.append(componentCollection.version + ", "); + } else if (status != null) { + text.append(status.get("version") + ", "); + } else { + text.append("null, "); + } + text.append((values != null) ? values.size() : "null"); + text.append("]"); + return text.toString(); + } + +} diff --git a/src/mtas/solr/handler/component/util/MtasSolrComponentCollection.java b/src/mtas/solr/handler/component/util/MtasSolrComponentCollection.java new file mode 100644 index 0000000..69a7d73 --- /dev/null +++ b/src/mtas/solr/handler/component/util/MtasSolrComponentCollection.java @@ -0,0 +1,766 @@ +package mtas.solr.handler.component.util; + +import java.io.IOException; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; +import java.util.Map.Entry; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.lucene.util.BytesRef; +import org.apache.solr.common.params.ModifiableSolrParams; +import org.apache.solr.common.util.NamedList; +import org.apache.solr.common.util.SimpleOrderedMap; +import org.apache.solr.handler.component.ResponseBuilder; +import org.apache.solr.handler.component.SearchComponent; +import 
org.apache.solr.handler.component.ShardRequest; +import org.apache.solr.handler.component.ShardResponse; +import org.noggit.JSONParser; +import org.noggit.JSONUtil; + +import mtas.codec.util.CodecComponent.ComponentFields; +import mtas.codec.util.CodecComponent.ComponentCollection; +import mtas.solr.handler.component.MtasSolrSearchComponent; + +/** + * The Class MtasSolrComponentCollection. + */ +public class MtasSolrComponentCollection + implements MtasSolrComponent<ComponentCollection> { + + /** The Constant log. */ + private static final Log log = LogFactory + .getLog(MtasSolrComponentCollection.class); + + /** The Constant PARAM_MTAS_COLLECTION. */ + public static final String PARAM_MTAS_COLLECTION = MtasSolrSearchComponent.PARAM_MTAS + + ".collection"; + + /** The Constant NAME_MTAS_COLLECTION_ACTION. */ + public static final String NAME_MTAS_COLLECTION_ACTION = "action"; + + /** The Constant NAME_MTAS_COLLECTION_ID. */ + public static final String NAME_MTAS_COLLECTION_ID = "id"; + + /** The Constant NAME_MTAS_COLLECTION_FIELD. */ + public static final String NAME_MTAS_COLLECTION_FIELD = "field"; + + /** The Constant NAME_MTAS_COLLECTION_POST. */ + public static final String NAME_MTAS_COLLECTION_POST = "post"; + + /** The Constant NAME_MTAS_COLLECTION_URL. */ + public static final String NAME_MTAS_COLLECTION_URL = "url"; + + /** The Constant NAME_MTAS_COLLECTION_COLLECTION. */ + public static final String NAME_MTAS_COLLECTION_COLLECTION = "collection"; + + /** The Constant NAME_MTAS_COLLECTION_KEY. */ + public static final String NAME_MTAS_COLLECTION_KEY = "key"; + + /** The search component. */ + private MtasSolrSearchComponent searchComponent; + + /** + * Instantiates a new mtas solr component collection. 
+ * + * @param searchComponent + * the search component + */ + public MtasSolrComponentCollection(MtasSolrSearchComponent searchComponent) { + this.searchComponent = searchComponent; + } + + /* + * (non-Javadoc) + * + * @see + * mtas.solr.handler.component.util.MtasSolrComponent#prepare(org.apache.solr. + * handler.component.ResponseBuilder, + * mtas.codec.util.CodecComponent.ComponentFields) + */ + public void prepare(ResponseBuilder rb, ComponentFields mtasFields) + throws IOException { + // System.out.println( + // "collection: " + System.nanoTime() + " - " + + // Thread.currentThread().getId() + // + " - " + rb.req.getParams().getBool("isShard", false) + " PREPARE " + // + rb.stage + " " + rb.req.getParamString()); + Set<String> ids = MtasSolrResultUtil + .getIdsFromParameters(rb.req.getParams(), PARAM_MTAS_COLLECTION); + if (!ids.isEmpty()) { + int tmpCounter = 0; + String[] keys = new String[ids.size()]; + String[] actions = new String[ids.size()]; + String[] fields = new String[ids.size()]; + String[] collectionIds = new String[ids.size()]; + String[] posts = new String[ids.size()]; + String[] urls = new String[ids.size()]; + String[] collections = new String[ids.size()]; + for (String id : ids) { + actions[tmpCounter] = rb.req.getParams().get(PARAM_MTAS_COLLECTION + "." + + id + "." + NAME_MTAS_COLLECTION_ACTION, null); + keys[tmpCounter] = rb.req.getParams().get( + PARAM_MTAS_COLLECTION + "." + id + "." + NAME_MTAS_COLLECTION_KEY, + String.valueOf(tmpCounter)).trim(); + fields[tmpCounter] = rb.req.getParams().get( + PARAM_MTAS_COLLECTION + "." + id + "." + NAME_MTAS_COLLECTION_FIELD, + null); + collectionIds[tmpCounter] = rb.req.getParams().get( + PARAM_MTAS_COLLECTION + "." + id + "." + NAME_MTAS_COLLECTION_ID, + null); + posts[tmpCounter] = rb.req.getParams().get( + PARAM_MTAS_COLLECTION + "." + id + "." + NAME_MTAS_COLLECTION_POST, + null); + urls[tmpCounter] = rb.req.getParams().get( + PARAM_MTAS_COLLECTION + "." + id + "." 
+ NAME_MTAS_COLLECTION_URL, + null); + collections[tmpCounter] = rb.req.getParams().get( + PARAM_MTAS_COLLECTION + "." + id + "." + NAME_MTAS_COLLECTION_COLLECTION, + null); + tmpCounter++; + } + mtasFields.doCollection = true; + MtasSolrResultUtil.compareAndCheck(keys, actions, + NAME_MTAS_COLLECTION_KEY, NAME_MTAS_COLLECTION_ACTION, true); + MtasSolrResultUtil.compareAndCheck(keys, fields, NAME_MTAS_COLLECTION_KEY, + NAME_MTAS_COLLECTION_FIELD, false); + MtasSolrResultUtil.compareAndCheck(keys, collectionIds, + NAME_MTAS_COLLECTION_KEY, NAME_MTAS_COLLECTION_ID, false); + MtasSolrResultUtil.compareAndCheck(keys, posts, NAME_MTAS_COLLECTION_KEY, + NAME_MTAS_COLLECTION_POST, false); + MtasSolrResultUtil.compareAndCheck(keys, urls, NAME_MTAS_COLLECTION_KEY, + NAME_MTAS_COLLECTION_URL, false); + MtasSolrResultUtil.compareAndCheck(keys, collections, NAME_MTAS_COLLECTION_KEY, + NAME_MTAS_COLLECTION_COLLECTION, false); + for (int i = 0; i < actions.length; i++) { + if (actions[i] != null) { + ComponentCollection componentCollection; + switch (actions[i]) { + case ComponentCollection.ACTION_LIST: + componentCollection = new ComponentCollection(keys[i], + ComponentCollection.ACTION_LIST); + componentCollection.setListVariables(); + mtasFields.collection.add(componentCollection); + break; + case ComponentCollection.ACTION_CHECK: + if (collectionIds[i] != null) { + componentCollection = new ComponentCollection(keys[i], + ComponentCollection.ACTION_CHECK); + componentCollection.setCheckVariables(collectionIds[i]); + mtasFields.collection.add(componentCollection); + } else { + throw new IOException( + "no id defined for collection (" + actions[i] + ")"); + } + break; + case ComponentCollection.ACTION_GET: + if (collectionIds[i] != null) { + componentCollection = new ComponentCollection(keys[i], + ComponentCollection.ACTION_GET); + componentCollection.setGetVariables(collectionIds[i]); + mtasFields.collection.add(componentCollection); + } else { + throw new IOException( + "no 
id defined for collection (" + actions[i] + ")"); + } + break; + case ComponentCollection.ACTION_CREATE: + if (fields[i] != null) { + Set<String> fieldList = new HashSet<>( + Arrays.asList(fields[i].split(","))); + componentCollection = new ComponentCollection(keys[i], + ComponentCollection.ACTION_CREATE); + componentCollection.setCreateVariables(collectionIds[i], + fieldList); + mtasFields.doCollection = true; + mtasFields.collection.add(componentCollection); + rb.setNeedDocSet(true); + } else { + throw new IOException( + "no field defined for collection (" + actions[i] + ")"); + } + break; + case ComponentCollection.ACTION_POST: + if (posts[i] != null) { + componentCollection = new ComponentCollection(keys[i], + ComponentCollection.ACTION_POST); + componentCollection.setPostVariables(collectionIds[i], + stringToStringValues(posts[i])); + mtasFields.collection.add(componentCollection); + } else { + throw new IOException( + "no post defined for collection (" + actions[i] + ")"); + } + break; + case ComponentCollection.ACTION_IMPORT: + if (urls[i] != null && collections[i]!=null) { + componentCollection = new ComponentCollection(keys[i], + ComponentCollection.ACTION_IMPORT); + componentCollection.setImportVariables(collectionIds[i], + urls[i], collections[i]); + mtasFields.collection.add(componentCollection); + } else { + throw new IOException( + "no post defined for collection (" + actions[i] + ")"); + } + break; + case ComponentCollection.ACTION_DELETE: + if (collectionIds[i] != null) { + componentCollection = new ComponentCollection(keys[i], + ComponentCollection.ACTION_DELETE); + componentCollection.setDeleteVariables(collectionIds[i]); + mtasFields.collection.add(componentCollection); + } else { + throw new IOException( + "no id defined for collection (" + actions[i] + ")"); + } + break; + case ComponentCollection.ACTION_EMPTY: + componentCollection = new ComponentCollection(keys[i], + ComponentCollection.ACTION_EMPTY); + 
mtasFields.collection.add(componentCollection); + break; + default: + throw new IOException( + "unrecognized action '" + actions[i] + "' for collection"); + } + } else { + throw new IOException("no action defined for collection"); + } + } + } + } + + /* + * (non-Javadoc) + * + * @see + * mtas.solr.handler.component.util.MtasSolrComponent#modifyRequest(org.apache + * .solr.handler.component.ResponseBuilder, + * org.apache.solr.handler.component.SearchComponent, + * org.apache.solr.handler.component.ShardRequest) + */ + public void modifyRequest(ResponseBuilder rb, SearchComponent who, + ShardRequest sreq) { + // System.out.println( + // "collection: " + System.nanoTime() + " - " + + // Thread.currentThread().getId() + // + " - " + rb.req.getParams().getBool("isShard", false) + // + " MODIFYREQUEST " + rb.stage + " " + rb.req.getParamString()); + if (sreq.params.getBool(MtasSolrSearchComponent.PARAM_MTAS, false) + && sreq.params.getBool(PARAM_MTAS_COLLECTION, false)) { + if ((sreq.purpose & ShardRequest.PURPOSE_GET_TOP_IDS) != 0) { + // do nothing + } else { + // remove for other requests + Set<String> keys = MtasSolrResultUtil + .getIdsFromParameters(rb.req.getParams(), PARAM_MTAS_COLLECTION); + sreq.params.remove(PARAM_MTAS_COLLECTION); + for (String key : keys) { + sreq.params.remove(PARAM_MTAS_COLLECTION + "." + key + "." + + NAME_MTAS_COLLECTION_ACTION); + sreq.params.remove(PARAM_MTAS_COLLECTION + "." + key + "." + + NAME_MTAS_COLLECTION_ID); + sreq.params.remove(PARAM_MTAS_COLLECTION + "." + key + "." + + NAME_MTAS_COLLECTION_FIELD); + sreq.params.remove(PARAM_MTAS_COLLECTION + "." + key + "." + + NAME_MTAS_COLLECTION_POST); + sreq.params.remove(PARAM_MTAS_COLLECTION + "." + key + "." + + NAME_MTAS_COLLECTION_KEY); + sreq.params.remove(PARAM_MTAS_COLLECTION + "." + key + "." + + NAME_MTAS_COLLECTION_URL); + sreq.params.remove(PARAM_MTAS_COLLECTION + "." + key + "." 
+ + NAME_MTAS_COLLECTION_COLLECTION); + } + } + } + } + + /* + * (non-Javadoc) + * + * @see + * mtas.solr.handler.component.util.MtasSolrComponent#create(mtas.codec.util. + * CodecComponent.BasicComponent, java.lang.Boolean) + */ + public SimpleOrderedMap<Object> create( + ComponentCollection componentCollection, Boolean encode) + throws IOException { + MtasSolrCollectionResult data = createMtasSolrCollectionResult( + componentCollection, encode ? false : true); + // Create response + SimpleOrderedMap<Object> mtasCollectionResponse = new SimpleOrderedMap<>(); + mtasCollectionResponse.add("key", componentCollection.key); + if (encode) { + mtasCollectionResponse.add("_encoded_data", + MtasSolrResultUtil.encode(data)); + } else { + mtasCollectionResponse.add("data", data); + MtasSolrResultUtil.rewrite(mtasCollectionResponse, searchComponent); + } + return mtasCollectionResponse; + } + + /** + * Creates the mtas solr collection result. + * + * @param componentCollection the component collection + * @param storeIfRelevant the store if relevant + * @return the mtas solr collection result + * @throws IOException Signals that an I/O exception has occurred. + */ + /* + * (non-Javadoc) + * + * @see + * mtas.solr.handler.component.util.MtasSolrComponent#create(mtas.codec.util. 
+ * CodecComponent.BasicComponent, java.lang.Boolean) + */ + private MtasSolrCollectionResult createMtasSolrCollectionResult( + ComponentCollection componentCollection, boolean storeIfRelevant) + throws IOException { + // System.out.println("collection: " + System.nanoTime() + " - " + // + Thread.currentThread().getId() + " - " + " CREATE "); + if (componentCollection != null) { + // Create response + MtasSolrCollectionResult data = new MtasSolrCollectionResult( + componentCollection); + if (componentCollection.action() + .equals(ComponentCollection.ACTION_CREATE)) { + if (storeIfRelevant && componentCollection.version == null) { + componentCollection.version = searchComponent.getCollectionCache() + .create(componentCollection.id, + componentCollection.values().size(), + componentCollection.values()); + } + data.setCreate(searchComponent.getCollectionCache().now(), + searchComponent.getCollectionCache().check(componentCollection.id)); + } else if (componentCollection.action() + .equals(ComponentCollection.ACTION_LIST)) { + // retrieve and add list to result + data.setList(searchComponent.getCollectionCache().now(), + searchComponent.getCollectionCache().list()); + } else if (componentCollection.action() + .equals(ComponentCollection.ACTION_CHECK)) { + // retrieve and add status to result + data.setCheck(searchComponent.getCollectionCache().now(), + searchComponent.getCollectionCache().check(componentCollection.id)); + } else if (componentCollection.action() + .equals(ComponentCollection.ACTION_GET)) { + // retrieve and add status to result + HashSet<String> values = searchComponent.getCollectionCache() + .getDataById(componentCollection.id); + if (values != null) { + data.setGet(searchComponent.getCollectionCache().now(), + searchComponent.getCollectionCache() + .check(componentCollection.id), + values); + } + } else if (componentCollection.action() + .equals(ComponentCollection.ACTION_EMPTY)) { + // empty + searchComponent.getCollectionCache().empty(); + } else if 
(componentCollection.action() + .equals(ComponentCollection.ACTION_POST)) { + // store if not already stored + if (componentCollection.version == null) { + componentCollection.version = searchComponent.getCollectionCache() + .create(componentCollection.id, + componentCollection.values().size(), + componentCollection.values()); + } + // add status to result + data.setPost(searchComponent.getCollectionCache().now(), + searchComponent.getCollectionCache().check(componentCollection.id)); + } else if (componentCollection.action() + .equals(ComponentCollection.ACTION_IMPORT)) { + // import if not already stored + if (componentCollection.version == null) { + componentCollection.version = searchComponent.getCollectionCache() + .create(componentCollection.id, + componentCollection.values().size(), + componentCollection.values()); + } + // add status to result + data.setImport(searchComponent.getCollectionCache().now(), + searchComponent.getCollectionCache().check(componentCollection.id)); + } else if (componentCollection.action() + .equals(ComponentCollection.ACTION_DELETE)) { + searchComponent.getCollectionCache().deleteById(componentCollection.id); + } + return data; + } else { + throw new IOException("no componentCollection available"); + } + } + + /* + * (non-Javadoc) + * + * @see + * mtas.solr.handler.component.util.MtasSolrComponent#finishStage(org.apache. 
+ * solr.handler.component.ResponseBuilder) + */ + public void finishStage(ResponseBuilder rb) { + // System.out.println( + // "collection: " + System.nanoTime() + " - " + Thread.currentThread().getId() + // + " - " + rb.req.getParams().getBool("isShard", false) + // + " FINISHSTAGE " + rb.stage + " " + rb.req.getParamString()); + if (rb.req.getParams().getBool(MtasSolrSearchComponent.PARAM_MTAS, false)) { + if (rb.stage >= ResponseBuilder.STAGE_EXECUTE_QUERY + && rb.stage < ResponseBuilder.STAGE_GET_FIELDS) { + ComponentFields mtasFields = getMtasFields(rb); + if (mtasFields.doCollection) { + if (rb.stage == ResponseBuilder.STAGE_EXECUTE_QUERY) { + // mtas response + NamedList<Object> mtasResponse = null; + try { + mtasResponse = (NamedList<Object>) rb.rsp.getValues().get("mtas"); + } catch (ClassCastException e) { + log.debug(e); + mtasResponse = null; + } + if (mtasResponse == null) { + mtasResponse = new SimpleOrderedMap<>(); + rb.rsp.add("mtas", mtasResponse); + } + ArrayList<Object> mtasCollectionResponses; + if (mtasResponse.get("collection") != null + && mtasResponse.get("collection") instanceof ArrayList) { + mtasCollectionResponses = (ArrayList<Object>) mtasResponse.get("collection"); + } else { + mtasCollectionResponses = new ArrayList<>(); + mtasResponse.add("collection", mtasCollectionResponses); + } + MtasSolrCollectionResult collectionResult; + for (ComponentCollection componentCollection : mtasFields.collection) { + try { + collectionResult = createMtasSolrCollectionResult(componentCollection, + false); + // Create response + SimpleOrderedMap<Object> mtasCollectionResponse = new SimpleOrderedMap<>(); + mtasCollectionResponse.add("key", componentCollection.key); + mtasCollectionResponse.add("data", collectionResult); + mtasCollectionResponses.add(mtasCollectionResponse); + } catch (IOException e) { + log.debug(e); + } + } + } + // decode shard responses + for (ShardRequest sreq : rb.finished) { + if 
(sreq.params.getBool(MtasSolrSearchComponent.PARAM_MTAS, false) + && sreq.params.getBool(PARAM_MTAS_COLLECTION, false)) { + for (ShardResponse shardResponse : sreq.responses) { + NamedList<Object> solrShardResponse = shardResponse + .getSolrResponse().getResponse(); + try { + ArrayList<SimpleOrderedMap<Object>> data = (ArrayList<SimpleOrderedMap<Object>>) solrShardResponse + .findRecursive("mtas", "collection"); + if (data != null) { + MtasSolrResultUtil.decode(data); + if (rb.stage > ResponseBuilder.STAGE_EXECUTE_QUERY) { + ArrayList<SimpleOrderedMap<Object>> filteredData = new ArrayList<>(); + for (SimpleOrderedMap<Object> dataItem : data) { + if (dataItem.get("data") != null && dataItem + .get("data") instanceof MtasSolrCollectionResult) { + MtasSolrCollectionResult collectionResult = (MtasSolrCollectionResult) dataItem + .get("data"); + if (rb.stage <= MtasSolrSearchComponent.STAGE_COLLECTION_INIT) { + if (!collectionResult.action() + .equals(ComponentCollection.ACTION_CREATE) + && !collectionResult.action() + .equals(ComponentCollection.ACTION_LIST) + && !collectionResult.action() + .equals(ComponentCollection.ACTION_CHECK)) { + filteredData.add(dataItem); + } + } else if (rb.stage <= MtasSolrSearchComponent.STAGE_COLLECTION_FINISH) { + if (!collectionResult.action() + .equals(ComponentCollection.ACTION_POST) + && !collectionResult.action() + .equals(ComponentCollection.ACTION_IMPORT) && !collectionResult.action() + .equals(ComponentCollection.ACTION_CHECK)) { + filteredData.add(dataItem); + } + } + } else { + filteredData.add(dataItem); + } + } + data.clear(); + data.addAll(filteredData); + } + } + } catch (ClassCastException e) { + log.debug(e); + // shouldn't happen + } + } + + } + } + } + } + } + } + + /* + * (non-Javadoc) + * + * @see + * mtas.solr.handler.component.util.MtasSolrComponent#distributedProcess(org. 
+ * apache.solr.handler.component.ResponseBuilder, + * mtas.codec.util.CodecComponent.ComponentFields) + */ + @SuppressWarnings("unchecked") + public void distributedProcess(ResponseBuilder rb, ComponentFields mtasFields) + throws IOException { + // System.out.println("collection: " + System.nanoTime() + " - " + // + Thread.currentThread().getId() + " - " + // + rb.req.getParams().getBool("isShard", false) + " DISTRIBUTEDPROCESS " + // + rb.stage + " " + rb.req.getParamString()); + NamedList<Object> mtasResponse = null; + try { + mtasResponse = (NamedList<Object>) rb.rsp.getValues().get("mtas"); + } catch (ClassCastException e) { + log.debug(e); + mtasResponse = null; + } + if (mtasResponse != null) { + if (rb.stage == MtasSolrSearchComponent.STAGE_COLLECTION_INIT) { + // build index + Map<String, MtasSolrCollectionResult> index = new HashMap<>(); + ArrayList<Object> mtasResponseCollection; + try { + mtasResponseCollection = (ArrayList<Object>) mtasResponse.get("collection"); + for (Object item : mtasResponseCollection) { + if (item instanceof SimpleOrderedMap) { + SimpleOrderedMap<Object> itemMap = (SimpleOrderedMap<Object>) item; + if (itemMap.get("data") != null + && itemMap.get("data") instanceof MtasSolrCollectionResult) { + MtasSolrCollectionResult collectionItem = (MtasSolrCollectionResult) itemMap + .get("data"); + index.put(collectionItem.id(), collectionItem); + } + } + } + } catch (ClassCastException e) { + log.debug(e); + mtasResponse.remove("collection"); + } + // check and remove previous responses + Map<String, Set<String>> createPostAfterMissingCheckResult = new HashMap<>(); + for (ShardRequest sreq : rb.finished) { + if (sreq.params.getBool(MtasSolrSearchComponent.PARAM_MTAS, false) + && sreq.params.getBool(PARAM_MTAS_COLLECTION, false)) { + for (ShardResponse shardResponse : sreq.responses) { + NamedList<Object> solrShardResponse = shardResponse + .getSolrResponse().getResponse(); + try { + ArrayList<SimpleOrderedMap<Object>> data = 
(ArrayList<SimpleOrderedMap<Object>>) solrShardResponse + .findRecursive("mtas", "collection"); + if (data != null) { + for (SimpleOrderedMap<Object> dataItem : data) { + if (dataItem.get("data") != null && dataItem + .get("data") instanceof MtasSolrCollectionResult) { + MtasSolrCollectionResult dataItemResult = (MtasSolrCollectionResult) dataItem + .get("data"); + if (index.containsKey(dataItemResult.id()) + && index.get(dataItemResult.id()).action() + .equals(ComponentCollection.ACTION_CHECK)) { + if (dataItemResult.status == null) { + if (!createPostAfterMissingCheckResult + .containsKey(shardResponse.getShard())) { + createPostAfterMissingCheckResult + .put(shardResponse.getShard(), new HashSet<>()); + } + createPostAfterMissingCheckResult + .get(shardResponse.getShard()) + .add(dataItemResult.id()); + } + } + } + } + data.clear(); + } + } catch (ClassCastException e) { + log.debug(e); + // shouldn't happen + } + } + } + } + // construct new requests + HashMap<String, ModifiableSolrParams> requestParamList = new HashMap<>(); + int id = 0; + for (ComponentCollection componentCollection : mtasFields.collection) { + if (componentCollection.action().equals(ComponentCollection.ACTION_CHECK)) { + for (String shardAddress : rb.shards) { + if (createPostAfterMissingCheckResult.containsKey(shardAddress)) { + if (createPostAfterMissingCheckResult.get(shardAddress) + .contains(componentCollection.id)) { + HashSet<String> values = searchComponent.getCollectionCache() + .getDataById(componentCollection.id); + if (values != null) { + ModifiableSolrParams paramsNewRequest; + if (!requestParamList.containsKey(shardAddress)) { + paramsNewRequest = new ModifiableSolrParams(); + requestParamList.put(shardAddress, paramsNewRequest); + } else { + paramsNewRequest = requestParamList.get(shardAddress); + } + paramsNewRequest.add(PARAM_MTAS_COLLECTION + "." + id + "." + + NAME_MTAS_COLLECTION_KEY, componentCollection.key); + paramsNewRequest.add(PARAM_MTAS_COLLECTION + "." + id + "." 
+ + NAME_MTAS_COLLECTION_ID, componentCollection.id); + paramsNewRequest.add( + PARAM_MTAS_COLLECTION + "." + id + "." + + NAME_MTAS_COLLECTION_ACTION, + ComponentCollection.ACTION_POST); + paramsNewRequest.add( + PARAM_MTAS_COLLECTION + "." + id + "." + + NAME_MTAS_COLLECTION_POST, + stringValuesToString(values)); + id++; + } + } + } + } + } else if (componentCollection.action() + .equals(ComponentCollection.ACTION_CREATE)) { + if (componentCollection.version == null) { + componentCollection.version = searchComponent.getCollectionCache() + .create(componentCollection.id, componentCollection.values().size(), + componentCollection.values()); + } + if (index.containsKey(componentCollection.id)) { + index.get(componentCollection.id).setCreate( + searchComponent.getCollectionCache().now(), + searchComponent.getCollectionCache().check(componentCollection.id)); + } + for (String shardAddress : rb.shards) { + ModifiableSolrParams paramsNewRequest; + if (!requestParamList.containsKey(shardAddress)) { + paramsNewRequest = new ModifiableSolrParams(); + requestParamList.put(shardAddress, paramsNewRequest); + } else { + paramsNewRequest = requestParamList.get(shardAddress); + } + paramsNewRequest.add(PARAM_MTAS_COLLECTION + "." + id + "." + + NAME_MTAS_COLLECTION_KEY, componentCollection.key); + paramsNewRequest.add(PARAM_MTAS_COLLECTION + "." + id + "." + + NAME_MTAS_COLLECTION_ID, componentCollection.id); + paramsNewRequest.add( + PARAM_MTAS_COLLECTION + "." + id + "." + + NAME_MTAS_COLLECTION_ACTION, + ComponentCollection.ACTION_POST); + paramsNewRequest.add( + PARAM_MTAS_COLLECTION + "." + id + "." 
+ + NAME_MTAS_COLLECTION_POST, + stringValuesToString(componentCollection.values())); + } + } + id++; + } + // add new requests + for (Entry<String, ModifiableSolrParams> entry : requestParamList + .entrySet()) { + ShardRequest newSreq = new ShardRequest(); + newSreq.shards = new String[] { entry.getKey() }; + newSreq.purpose = ShardRequest.PURPOSE_PRIVATE; + newSreq.params = entry.getValue(); + newSreq.params.add("q", "*"); + newSreq.params.add("rows", "0"); + newSreq.params.add(MtasSolrSearchComponent.PARAM_MTAS, + rb.req.getOriginalParams() + .getParams(MtasSolrSearchComponent.PARAM_MTAS)); + newSreq.params.add(PARAM_MTAS_COLLECTION, + rb.req.getOriginalParams().getParams(PARAM_MTAS_COLLECTION)); + rb.addRequest(searchComponent, newSreq); + } + } else if (rb.stage == MtasSolrSearchComponent.STAGE_COLLECTION_FINISH) { + // just rewrite + ArrayList<Object> mtasResponseCollection; + try { + mtasResponseCollection = (ArrayList<Object>) mtasResponse.get("collection"); + if (mtasResponseCollection != null) { + MtasSolrResultUtil.rewrite(mtasResponseCollection, searchComponent); + } + } catch (ClassCastException e) { + log.debug(e); + mtasResponse.remove("collection"); + } + } + } + } + + /** + * Gets the mtas fields. + * + * @param rb + * the rb + * @return the mtas fields + */ + private ComponentFields getMtasFields(ResponseBuilder rb) { + return (ComponentFields) rb.req.getContext().get(ComponentFields.class); + } + + /** + * String values to string. + * + * @param stringValues + * the string values + * @return the string + */ + private static String stringValuesToString(HashSet<String> stringValues) { + return JSONUtil.toJSON(stringValues); + } + + /** + * String to string values. + * + * @param stringValue + * the string value + * @return the hash set + * @throws IOException + * Signals that an I/O exception has occurred. 
+ */ + private static HashSet<String> stringToStringValues(String stringValue) + throws IOException { + // should be improved to support escaped characters + HashSet<String> stringValues = new HashSet<>(); + JSONParser jsonParser = new JSONParser(stringValue); + int event = jsonParser.nextEvent(); + if (event == JSONParser.ARRAY_START) { + while ((event = jsonParser.nextEvent()) != JSONParser.ARRAY_END) { + if (jsonParser.getLevel() == 1) { + switch (event) { + case JSONParser.STRING: + stringValues.add(jsonParser.getString()); + break; + case JSONParser.BIGNUMBER: + case JSONParser.NUMBER: + case JSONParser.LONG: + stringValues.add(jsonParser.getNumberChars().toString()); + break; + case JSONParser.BOOLEAN: + stringValues.add(Boolean.toString(jsonParser.getBoolean())); + break; + default: + // do nothing + } + } + } + } else { + throw new IOException("unsupported json structure"); + } + return stringValues; + } + +} diff --git a/src/mtas/solr/handler/component/util/MtasSolrComponentDocument.java b/src/mtas/solr/handler/component/util/MtasSolrComponentDocument.java index 58abef1..ea15c59 100644 --- a/src/mtas/solr/handler/component/util/MtasSolrComponentDocument.java +++ b/src/mtas/solr/handler/component/util/MtasSolrComponentDocument.java @@ -71,12 +71,14 @@ public class MtasSolrComponentDocument /** The Constant NAME_MTAS_DOCUMENT_NUMBER. */ public static final String NAME_MTAS_DOCUMENT_NUMBER = "number"; + + private MtasSolrSearchComponent searchComponent; /** * Instantiates a new mtas solr component document. 
*/ - public MtasSolrComponentDocument() { - // do nothing for now + public MtasSolrComponentDocument(MtasSolrSearchComponent searchComponent) { + this.searchComponent = searchComponent; } /* @@ -257,7 +259,7 @@ public class MtasSolrComponentDocument mtasDocumentItemResponses.add(mtasDocumentItemResponse); } mtasDocumentResponse.add("list", mtasDocumentItemResponses); - MtasSolrResultUtil.rewrite(mtasDocumentResponse); + MtasSolrResultUtil.rewrite(mtasDocumentResponse, searchComponent); return mtasDocumentResponse; } @@ -337,7 +339,7 @@ public class MtasSolrComponentDocument try { mtasResponseDocument = (ArrayList<Object>) mtasResponse.get("document"); if (mtasResponseDocument != null) { - MtasSolrResultUtil.rewrite(mtasResponseDocument); + MtasSolrResultUtil.rewrite(mtasResponseDocument, searchComponent); } } catch (ClassCastException e) { log.debug(e); diff --git a/src/mtas/solr/handler/component/util/MtasSolrComponentFacet.java b/src/mtas/solr/handler/component/util/MtasSolrComponentFacet.java index ddb5019..a8e1f64 100644 --- a/src/mtas/solr/handler/component/util/MtasSolrComponentFacet.java +++ b/src/mtas/solr/handler/component/util/MtasSolrComponentFacet.java @@ -573,7 +573,7 @@ public class MtasSolrComponentFacet mtasFacetResponse.add("_encoded_list", MtasSolrResultUtil.encode(data)); } else { mtasFacetResponse.add("list", data); - MtasSolrResultUtil.rewrite(mtasFacetResponse); + MtasSolrResultUtil.rewrite(mtasFacetResponse, searchComponent); } return mtasFacetResponse; } @@ -636,7 +636,7 @@ public class MtasSolrComponentFacet try { mtasResponseFacet = (ArrayList<Object>) mtasResponse.get("facet"); if (mtasResponseFacet != null) { - MtasSolrResultUtil.rewrite(mtasResponseFacet); + MtasSolrResultUtil.rewrite(mtasResponseFacet, searchComponent); } } catch (ClassCastException e) { log.debug(e); diff --git a/src/mtas/solr/handler/component/util/MtasSolrComponentGroup.java b/src/mtas/solr/handler/component/util/MtasSolrComponentGroup.java index 14841e7..4e33e57 
100644 --- a/src/mtas/solr/handler/component/util/MtasSolrComponentGroup.java +++ b/src/mtas/solr/handler/component/util/MtasSolrComponentGroup.java @@ -333,9 +333,10 @@ public class MtasSolrComponentGroup * @param name the name * @param positions the positions * @param prefixes the prefixes + * @throws IOException */ private void prepare(SolrParams solrParams, SortedSet<String> gids, - String name, String[] positions, String[] prefixes) { + String name, String[] positions, String[] prefixes) throws IOException { if (!gids.isEmpty()) { int tmpSubCounter = 0; for (String gid : gids) { @@ -343,6 +344,11 @@ public class MtasSolrComponentGroup name + "." + gid + "." + NAME_MTAS_GROUP_GROUPING_POSITION, null); prefixes[tmpSubCounter] = solrParams.get( name + "." + gid + "." + NAME_MTAS_GROUP_GROUPING_PREFIXES, null); + if(positions[tmpSubCounter]==null) { + throw new IOException("no position for "+gid); + } else if(prefixes[tmpSubCounter]==null) { + throw new IOException("no prefix for "+gid); + } tmpSubCounter++; } } @@ -465,7 +471,7 @@ public class MtasSolrComponentGroup mtasGroupResponse.add("_encoded_list", MtasSolrResultUtil.encode(data)); } else { mtasGroupResponse.add("list", data); - MtasSolrResultUtil.rewrite(mtasGroupResponse); + MtasSolrResultUtil.rewrite(mtasGroupResponse, searchComponent); } return mtasGroupResponse; } @@ -528,7 +534,7 @@ public class MtasSolrComponentGroup try { mtasResponseGroup = (ArrayList<Object>) mtasResponse.get("group"); if (mtasResponseGroup != null) { - MtasSolrResultUtil.rewrite(mtasResponseGroup); + MtasSolrResultUtil.rewrite(mtasResponseGroup, searchComponent); } } catch (ClassCastException e) { log.debug(e); diff --git a/src/mtas/solr/handler/component/util/MtasSolrComponentJoin.java b/src/mtas/solr/handler/component/util/MtasSolrComponentJoin.java deleted file mode 100644 index d20b438..0000000 --- a/src/mtas/solr/handler/component/util/MtasSolrComponentJoin.java +++ /dev/null @@ -1,193 +0,0 @@ -package 
mtas.solr.handler.component.util; - -import java.io.IOException; -import java.util.Arrays; -import java.util.HashSet; -import java.util.Set; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.solr.common.util.NamedList; -import org.apache.solr.common.util.SimpleOrderedMap; -import org.apache.solr.handler.component.ResponseBuilder; -import org.apache.solr.handler.component.SearchComponent; -import org.apache.solr.handler.component.ShardRequest; -import org.apache.solr.handler.component.ShardResponse; - -import mtas.codec.util.CodecComponent.ComponentFields; -import mtas.codec.util.CodecComponent.ComponentJoin; -import mtas.solr.handler.component.MtasSolrSearchComponent; - -/** - * The Class MtasSolrComponentJoin. - */ -@SuppressWarnings("deprecation") -public class MtasSolrComponentJoin implements MtasSolrComponent<ComponentJoin> { - - /** The Constant log. */ - private static final Log log = LogFactory.getLog(MtasSolrComponentJoin.class); - - /** The Constant PARAM_MTAS_JOIN. */ - public static final String PARAM_MTAS_JOIN = MtasSolrSearchComponent.PARAM_MTAS - + ".join"; - - /** The Constant NAME_MTAS_JOIN_FIELD. */ - public static final String NAME_MTAS_JOIN_FIELD = "field"; - - /** - * Instantiates a new mtas solr component join. - * - * @param searchComponent the search component - */ - public MtasSolrComponentJoin(MtasSolrSearchComponent searchComponent) { - } - - /* - * (non-Javadoc) - * - * @see - * mtas.solr.handler.component.util.MtasSolrComponent#prepare(org.apache.solr. - * handler.component.ResponseBuilder, - * mtas.codec.util.CodecComponent.ComponentFields) - */ - public void prepare(ResponseBuilder rb, ComponentFields mtasFields) - throws IOException { - if (rb.req.getParams().get(PARAM_MTAS_JOIN + "." + NAME_MTAS_JOIN_FIELD, - null) != null) { - Set<String> fields = new HashSet<>(Arrays.asList(rb.req.getParams() - .get(PARAM_MTAS_JOIN + "." 
+ NAME_MTAS_JOIN_FIELD).split(","))); - String key = createKeyFromRequest(rb); - mtasFields.doJoin = true; - mtasFields.join = new ComponentJoin(fields, key); - rb.setNeedDocSet(true); - } - - } - - /* - * (non-Javadoc) - * - * @see - * mtas.solr.handler.component.util.MtasSolrComponent#modifyRequest(org.apache - * .solr.handler.component.ResponseBuilder, - * org.apache.solr.handler.component.SearchComponent, - * org.apache.solr.handler.component.ShardRequest) - */ - public void modifyRequest(ResponseBuilder rb, SearchComponent who, - ShardRequest sreq) { - if (sreq.params.getBool(MtasSolrSearchComponent.PARAM_MTAS, false) - && sreq.params.getBool(PARAM_MTAS_JOIN, false)) { - if ((sreq.purpose & ShardRequest.PURPOSE_GET_TOP_IDS) != 0) { - // do nothing - } else { - // remove for other requests - Set<String> keys = MtasSolrResultUtil - .getIdsFromParameters(rb.req.getParams(), PARAM_MTAS_JOIN); - sreq.params.remove(PARAM_MTAS_JOIN); - for (String key : keys) { - sreq.params.remove(PARAM_MTAS_JOIN + "." + key); - } - } - } - } - - /* - * (non-Javadoc) - * - * @see - * mtas.solr.handler.component.util.MtasSolrComponent#create(mtas.codec.util. - * CodecComponent.BasicComponent, java.lang.Boolean) - */ - public SimpleOrderedMap<Object> create(ComponentJoin join, Boolean encode) - throws IOException { - MtasSolrJoinResult data = new MtasSolrJoinResult(join); - SimpleOrderedMap<Object> mtasJoinResponse = new SimpleOrderedMap<>(); - if (encode) { - mtasJoinResponse.add("_encoded_data", MtasSolrResultUtil.encode(data)); - } else { - mtasJoinResponse.add("data", data.rewrite()); - } - return mtasJoinResponse; - } - - /* - * (non-Javadoc) - * - * @see - * mtas.solr.handler.component.util.MtasSolrComponent#finishStage(org.apache. 
- * solr.handler.component.ResponseBuilder) - */ - @SuppressWarnings("unchecked") - public void finishStage(ResponseBuilder rb) { - if (rb.req.getParams().getBool(MtasSolrSearchComponent.PARAM_MTAS, false) - && rb.stage == MtasSolrSearchComponent.STAGE_JOIN) { - for (ShardRequest sreq : rb.finished) { - if (sreq.params.getBool(MtasSolrSearchComponent.PARAM_MTAS, false) - && sreq.params.getBool(PARAM_MTAS_JOIN, false)) { - for (ShardResponse shardResponse : sreq.responses) { - NamedList<Object> response = shardResponse.getSolrResponse() - .getResponse(); - try { - Object data = response.findRecursive("mtas", "join"); - if (data != null && data instanceof String) { - NamedList<Object> mtasResponse = (NamedList<Object>) response - .get("mtas"); - mtasResponse.remove("join"); - mtasResponse.add("join", - MtasSolrResultUtil.decode((String) data)); - } - } catch (ClassCastException e) { - log.debug(e); - // shouldn't happen - } - } - } - } - } - } - - /* - * (non-Javadoc) - * - * @see - * mtas.solr.handler.component.util.MtasSolrComponent#distributedProcess(org. - * apache.solr.handler.component.ResponseBuilder, - * mtas.codec.util.CodecComponent.ComponentFields) - */ - @SuppressWarnings("unchecked") - public void distributedProcess(ResponseBuilder rb, ComponentFields mtasFields) - throws IOException { - // rewrite - NamedList<Object> mtasResponse = null; - try { - mtasResponse = (NamedList<Object>) rb.rsp.getValues().get("mtas"); - } catch (ClassCastException e) { - log.debug(e); - mtasResponse = null; - } - if (mtasResponse != null) { - MtasSolrJoinResult mtasSolrJoinResult; - try { - mtasSolrJoinResult = (MtasSolrJoinResult) mtasResponse.get("join"); - if (mtasSolrJoinResult != null) { - mtasResponse.removeAll("join"); - mtasResponse.add("join", mtasSolrJoinResult.rewrite()); - } - } catch (ClassCastException e) { - log.debug(e); - mtasResponse.remove("join"); - } - } - } - - /** - * Creates the key from request. 
- * - * @param rb the rb - * @return the string - */ - private String createKeyFromRequest(ResponseBuilder rb) { - return rb.req.getParams().toQueryString(); - } - -} diff --git a/src/mtas/solr/handler/component/util/MtasSolrComponentPrefix.java b/src/mtas/solr/handler/component/util/MtasSolrComponentPrefix.java index 9d8f2fc..4b8e721 100644 --- a/src/mtas/solr/handler/component/util/MtasSolrComponentPrefix.java +++ b/src/mtas/solr/handler/component/util/MtasSolrComponentPrefix.java @@ -136,7 +136,7 @@ public class MtasSolrComponentPrefix * CodecComponent.BasicComponent, java.lang.Boolean) */ public SimpleOrderedMap<Object> create(ComponentPrefix prefix, - Boolean encode) { + Boolean encode) throws IOException { SimpleOrderedMap<Object> mtasPrefixResponse = new SimpleOrderedMap<>(); mtasPrefixResponse.add("key", prefix.key); if (encode) { @@ -219,7 +219,7 @@ public class MtasSolrComponentPrefix for (Object mtasResponsePrefixItemRaw : mtasResponsePrefix) { mtasResponsePrefixItem = (NamedList<Object>) mtasResponsePrefixItemRaw; repairPrefixItems(mtasResponsePrefixItem); - MtasSolrResultUtil.rewrite(mtasResponsePrefixItem); + MtasSolrResultUtil.rewrite(mtasResponsePrefixItem, searchComponent); } } } catch (ClassCastException e) { diff --git a/src/mtas/solr/handler/component/util/MtasSolrComponentStats.java b/src/mtas/solr/handler/component/util/MtasSolrComponentStats.java index 55cda9c..5e4a1fc 100644 --- a/src/mtas/solr/handler/component/util/MtasSolrComponentStats.java +++ b/src/mtas/solr/handler/component/util/MtasSolrComponentStats.java @@ -878,7 +878,7 @@ public class MtasSolrComponentStats MtasSolrResultUtil.encode(data)); } else { mtasPositionResponse.add(position.dataCollector.getCollectorType(), data); - MtasSolrResultUtil.rewrite(mtasPositionResponse); + MtasSolrResultUtil.rewrite(mtasPositionResponse, searchComponent); } return mtasPositionResponse; } @@ -903,7 +903,7 @@ public class MtasSolrComponentStats mtasTokenResponse.add("_encoded_data", 
MtasSolrResultUtil.encode(data)); } else { mtasTokenResponse.add(token.dataCollector.getCollectorType(), data); - MtasSolrResultUtil.rewrite(mtasTokenResponse); + MtasSolrResultUtil.rewrite(mtasTokenResponse, searchComponent); } return mtasTokenResponse; } @@ -944,7 +944,7 @@ public class MtasSolrComponentStats mtasSpanResponse.add("_encoded_data", MtasSolrResultUtil.encode(data)); } else { mtasSpanResponse.add(span.dataCollector.getCollectorType(), data); - MtasSolrResultUtil.rewrite(mtasSpanResponse); + MtasSolrResultUtil.rewrite(mtasSpanResponse, searchComponent); } return mtasSpanResponse; } @@ -1007,7 +1007,7 @@ public class MtasSolrComponentStats try { mtasResponseStats = (NamedList<Object>) mtasResponse.get("stats"); if (mtasResponseStats != null) { - MtasSolrResultUtil.rewrite(mtasResponseStats); + MtasSolrResultUtil.rewrite(mtasResponseStats, searchComponent); } } catch (ClassCastException e) { log.debug(e); diff --git a/src/mtas/solr/handler/component/util/MtasSolrComponentTermvector.java b/src/mtas/solr/handler/component/util/MtasSolrComponentTermvector.java index cb15461..b25acf9 100644 --- a/src/mtas/solr/handler/component/util/MtasSolrComponentTermvector.java +++ b/src/mtas/solr/handler/component/util/MtasSolrComponentTermvector.java @@ -476,7 +476,7 @@ public class MtasSolrComponentTermvector MtasSolrResultUtil.encode(data)); } else { mtasTermVectorResponse.add("list", data); - MtasSolrResultUtil.rewrite(mtasTermVectorResponse); + MtasSolrResultUtil.rewrite(mtasTermVectorResponse, searchComponent); } return mtasTermVectorResponse; } @@ -627,7 +627,7 @@ public class MtasSolrComponentTermvector if ((mtasResponseTermvectorRaw = mtasResponse.get("termvector")) != null && mtasResponseTermvectorRaw instanceof ArrayList) { MtasSolrResultUtil - .rewrite((ArrayList<Object>) mtasResponseTermvectorRaw); + .rewrite((ArrayList<Object>) mtasResponseTermvectorRaw, searchComponent); } } } diff --git a/src/mtas/solr/handler/component/util/MtasSolrJoinResult.java 
b/src/mtas/solr/handler/component/util/MtasSolrJoinResult.java deleted file mode 100644 index 77e37cf..0000000 --- a/src/mtas/solr/handler/component/util/MtasSolrJoinResult.java +++ /dev/null @@ -1,55 +0,0 @@ -package mtas.solr.handler.component.util; - -import java.io.Serializable; -import java.util.Set; - -import org.apache.solr.common.util.NamedList; - -import mtas.codec.util.CodecComponent.ComponentJoin; - -/** - * The Class MtasSolrJoinResult. - */ -public class MtasSolrJoinResult implements Serializable { - - /** The Constant serialVersionUID. */ - private static final long serialVersionUID = 1L; - - /** The values. */ - private Set<String> values; - - /** The key. */ - private String key; - - /** - * Instantiates a new mtas solr join result. - * - * @param join the join - */ - public MtasSolrJoinResult(ComponentJoin join) { - values = join.values(); - key = join.key(); - } - - /** - * Rewrite. - * - * @return the named list - */ - public NamedList<Object> rewrite() { - NamedList<Object> response = new NamedList<>(); - response.add("values", values); - response.add("key", key); - return response; - } - - /** - * Merge. 
- * - * @param newItem the new item - */ - public void merge(MtasSolrJoinResult newItem) { - values.addAll(newItem.values); - } - -} diff --git a/src/mtas/solr/handler/component/util/MtasSolrResultMerge.java b/src/mtas/solr/handler/component/util/MtasSolrResultMerge.java index eb7bc7b..ba8d3fc 100644 --- a/src/mtas/solr/handler/component/util/MtasSolrResultMerge.java +++ b/src/mtas/solr/handler/component/util/MtasSolrResultMerge.java @@ -15,7 +15,6 @@ import org.apache.solr.handler.component.ResponseBuilder; import org.apache.solr.handler.component.ShardRequest; import org.apache.solr.handler.component.ShardResponse; -import mtas.codec.util.CodecComponent.ComponentFields; import mtas.solr.handler.component.MtasSolrSearchComponent; /** @@ -64,21 +63,22 @@ public class MtasSolrResultMerge { .getBool(MtasSolrComponentFacet.PARAM_MTAS_FACET, false)) { mergeArrayList(sreq, mtasResponse, "facet", null, false); } - // merge join - if (rb.req.getParams().getBool(MtasSolrComponentJoin.PARAM_MTAS_JOIN, + // merge collection + if (rb.req.getParams().getBool(MtasSolrComponentCollection.PARAM_MTAS_COLLECTION, false)) { - ComponentFields componentFields = (ComponentFields) rb.req - .getContext().get(ComponentFields.class); - mtasResponse.add("join", - new MtasSolrJoinResult(componentFields.join)); - mergeJoinResult(sreq, mtasResponse, "join", null); - + mergeArrayList(sreq, mtasResponse, "collection", null, false); } // merge prefix if (rb.req.getParams() .getBool(MtasSolrComponentPrefix.PARAM_MTAS_PREFIX, false)) { mergeArrayList(sreq, mtasResponse, "prefix", null, false); } + } else if (rb.stage == MtasSolrSearchComponent.STAGE_COLLECTION_INIT) { + // merge collection + if (rb.req.getParams().getBool( + MtasSolrComponentCollection.PARAM_MTAS_COLLECTION, false)) { + mergeArrayList(sreq, mtasResponse, "collection", null, false); + } } else if (rb.stage == MtasSolrSearchComponent.STAGE_TERMVECTOR_MISSING_KEY) { // merge termvector if (rb.req.getParams().getBool( @@ -109,57 +109,7 
@@ public class MtasSolrResultMerge { } } } - - /** - * Merge join result. - * - * @param sreq the sreq - * @param mtasResponse the mtas response - * @param key the key - * @param preferredPurpose the preferred purpose - */ - @SuppressWarnings("unchecked") - private void mergeJoinResult(ShardRequest sreq, - NamedList<Object> mtasResponse, String key, Integer preferredPurpose) { - Object o = mtasResponse.get(key); - MtasSolrJoinResult mtasJoinResponse; - if (o instanceof MtasSolrJoinResult) { - mtasJoinResponse = (MtasSolrJoinResult) o; - } else { - mtasJoinResponse = null; - } - // collect responses for each shard - HashMap<String, NamedList<Object>> mtasListShardResponses = new HashMap<>(); - for (ShardResponse response : sreq.responses) { - // only continue if new shard or preferred purpose - if (mtasListShardResponses.containsKey(response.getShard()) - && ((preferredPurpose == null) - || (sreq.purpose != preferredPurpose))) { - break; - } - // update - try { - NamedList<Object> result = response.getSolrResponse().getResponse(); - String data = (String) result.findRecursive("mtas", key); - if (data != null) { - MtasSolrJoinResult decodedData = (MtasSolrJoinResult) MtasSolrResultUtil - .decode(data); - if (mtasJoinResponse == null) { - mtasJoinResponse = decodedData; - } else { - mtasJoinResponse.merge(decodedData); - } - } - } catch (ClassCastException e) { - log.debug(e); - } - } - if (mtasJoinResponse != null) { - mtasResponse.removeAll(key); - mtasResponse.add(key, mtasJoinResponse); - } - } - + /** * Merge named list. 
* @@ -381,9 +331,9 @@ public class MtasSolrResultMerge { } else if (original instanceof MtasSolrMtasResult) { MtasSolrMtasResult originalComponentResult = (MtasSolrMtasResult) original; originalComponentResult.merge((MtasSolrMtasResult) shardValue); - } else if (original instanceof MtasSolrJoinResult) { - MtasSolrJoinResult originalComponentResult = (MtasSolrJoinResult) original; - originalComponentResult.merge((MtasSolrJoinResult) shardValue); + } else if (original instanceof MtasSolrCollectionResult) { + MtasSolrCollectionResult originalComponentResult = (MtasSolrCollectionResult) original; + originalComponentResult.merge((MtasSolrCollectionResult) shardValue); } else if (original instanceof String) { // ignore? } else if (original instanceof Integer) { diff --git a/src/mtas/solr/handler/component/util/MtasSolrResultUtil.java b/src/mtas/solr/handler/component/util/MtasSolrResultUtil.java index dc95dbe..8224761 100644 --- a/src/mtas/solr/handler/component/util/MtasSolrResultUtil.java +++ b/src/mtas/solr/handler/component/util/MtasSolrResultUtil.java @@ -7,6 +7,7 @@ import java.io.IOException; import java.io.ObjectInputStream; import java.io.ObjectOutputStream; import java.io.Reader; +import java.io.Serializable; import java.io.StringReader; import java.util.ArrayList; import java.util.HashMap; @@ -32,6 +33,7 @@ import mtas.codec.util.collector.MtasDataItem; import mtas.parser.cql.MtasCQLParser; import mtas.parser.cql.TokenMgrError; import mtas.search.spans.util.MtasSpanQuery; +import mtas.solr.handler.component.MtasSolrSearchComponent; /** * The Class MtasSolrResultUtil. @@ -58,16 +60,18 @@ public class MtasSolrResultUtil { /** * Rewrite. * - * @param al the al - * @throws IOException Signals that an I/O exception has occurred. + * @param al + * the al + * @throws IOException + * Signals that an I/O exception has occurred. 
*/ @SuppressWarnings({ "unchecked", "rawtypes" }) - public static void rewrite(ArrayList<?> al) throws IOException { + public static void rewrite(ArrayList<?> al, MtasSolrSearchComponent searchComponent) throws IOException { for (int i = 0; i < al.size(); i++) { if (al.get(i) instanceof NamedList) { - rewrite((NamedList) al.get(i)); + rewrite((NamedList) al.get(i), searchComponent); } else if (al.get(i) instanceof ArrayList) { - rewrite((ArrayList) al.get(i)); + rewrite((ArrayList) al.get(i), searchComponent); } } } @@ -75,22 +79,27 @@ public class MtasSolrResultUtil { /** * Rewrite. * - * @param nl the nl - * @throws IOException Signals that an I/O exception has occurred. + * @param nl + * the nl + * @throws IOException + * Signals that an I/O exception has occurred. */ - public static void rewrite(NamedList<Object> nl) throws IOException { - rewrite(nl, true); + public static void rewrite(NamedList<Object> nl, MtasSolrSearchComponent searchComponent) throws IOException { + rewrite(nl, searchComponent, true); } /** * Rewrite. * - * @param nl the nl - * @param doCollapse the do collapse - * @throws IOException Signals that an I/O exception has occurred. + * @param nl + * the nl + * @param doCollapse + * the do collapse + * @throws IOException + * Signals that an I/O exception has occurred. 
*/ @SuppressWarnings({ "rawtypes", "unchecked" }) - private static void rewrite(NamedList<Object> nl, boolean doCollapse) + private static void rewrite(NamedList<Object> nl, MtasSolrSearchComponent searchComponent, boolean doCollapse) throws IOException { boolean showDebugInfo = false; HashMap<String, NamedList<Object>> collapseNamedList = new HashMap<>(); @@ -98,15 +107,18 @@ public class MtasSolrResultUtil { for (int i = 0; i < length; i++) { if (nl.getVal(i) instanceof NamedList) { NamedList o = (NamedList) nl.getVal(i); - rewrite(o, true); + rewrite(o, searchComponent, true); nl.setVal(i, o); } else if (nl.getVal(i) instanceof ArrayList) { ArrayList o = (ArrayList) nl.getVal(i); - rewrite(o); + rewrite(o, searchComponent); nl.setVal(i, o); } else if (nl.getVal(i) instanceof MtasDataItem) { MtasDataItem dataItem = (MtasDataItem) nl.getVal(i); nl.setVal(i, dataItem.rewrite(showDebugInfo)); + } else if (nl.getVal(i) instanceof MtasSolrCollectionResult) { + MtasSolrCollectionResult o = (MtasSolrCollectionResult) nl.getVal(i); + collapseNamedList.put(nl.getName(i), o.rewrite(searchComponent)); } else if (nl.getVal(i) instanceof MtasSolrMtasResult) { MtasSolrMtasResult o = (MtasSolrMtasResult) nl.getVal(i); if (o.dataCollector.getCollectorType() @@ -144,7 +156,7 @@ public class MtasSolrResultUtil { .equals(DataCollector.COLLECTOR_TYPE_DATA)) { NamedList<Object> nnl = o.getData(showDebugInfo); if (nnl.size() > 0) { - rewrite(nnl); + rewrite(nnl, searchComponent); collapseNamedList.put(nl.getName(i), nnl); nl.setVal(i, nnl); } else { @@ -161,13 +173,14 @@ public class MtasSolrResultUtil { for (NamedList<Object> items : collapseNamedList.values()) { nl.addAll(items); } - } + } } /** * Rewrite to array. * - * @param nnl the nnl + * @param nnl + * the nnl * @return the array list */ private static ArrayList<NamedList<Object>> rewriteToArray( @@ -194,10 +207,14 @@ public class MtasSolrResultUtil { /** * Rewrite merge list. 
* - * @param key the key - * @param subKey the sub key - * @param snl the snl - * @param tnl the tnl + * @param key + * the key + * @param subKey + * the sub key + * @param snl + * the snl + * @param tnl + * the tnl */ @SuppressWarnings({ "unchecked", "unused" }) private static void rewriteMergeList(String key, String subKey, @@ -222,10 +239,14 @@ public class MtasSolrResultUtil { /** * Rewrite merge data. * - * @param key the key - * @param subKey the sub key - * @param snl the snl - * @param tnl the tnl + * @param key + * the key + * @param subKey + * the sub key + * @param snl + * the snl + * @param tnl + * the tnl */ @SuppressWarnings({ "unused", "unchecked" }) private static void rewriteMergeData(String key, String subKey, @@ -246,28 +267,36 @@ public class MtasSolrResultUtil { /** * Encode. * - * @param o the o + * @param o + * the o * @return the string + * @throws IOException */ - public static String encode(Object o) { - ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream(); - ObjectOutputStream objectOutputStream; - try { - objectOutputStream = new ObjectOutputStream(byteArrayOutputStream); - objectOutputStream.writeObject(o); - objectOutputStream.close(); - byte[] byteArray = byteArrayOutputStream.toByteArray(); - return Base64.byteArrayToBase64(byteArray); - } catch (IOException e) { - log.error(e); - return null; + public static String encode(Object o) throws IOException { + if (o instanceof Serializable) { + ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream(); + ObjectOutputStream objectOutputStream; + try { + objectOutputStream = new ObjectOutputStream(byteArrayOutputStream); + objectOutputStream.writeObject(o); + objectOutputStream.close(); + byte[] byteArray = byteArrayOutputStream.toByteArray(); + return Base64.byteArrayToBase64(byteArray); + } catch (IOException e) { + e.printStackTrace(); + log.error(e); + return null; + } + } else { + throw new IOException("no serializable object"); } } /** * Decode. 
* - * @param s the s + * @param s + * the s * @return the object */ static Object decode(String s) { @@ -286,7 +315,8 @@ public class MtasSolrResultUtil { /** * Decode. * - * @param l the l + * @param l + * the l * @return the array list */ @SuppressWarnings({ "rawtypes", "unchecked" }) @@ -304,7 +334,8 @@ public class MtasSolrResultUtil { /** * Decode. * - * @param nl the nl + * @param nl + * the nl * @return the named list */ @SuppressWarnings({ "rawtypes", "unchecked" }) @@ -346,8 +377,10 @@ public class MtasSolrResultUtil { /** * Gets the ids from parameters. * - * @param params the params - * @param prefix the prefix + * @param params + * the params + * @param prefix + * the prefix * @return the ids from parameters */ public static SortedSet<String> getIdsFromParameters(SolrParams params, @@ -369,12 +402,18 @@ public class MtasSolrResultUtil { /** * Compare and check. * - * @param list the list - * @param original the original - * @param nameNew the name new - * @param nameOriginal the name original - * @param unique the unique - * @throws IOException Signals that an I/O exception has occurred. + * @param list + * the list + * @param original + * the original + * @param nameNew + * the name new + * @param nameOriginal + * the name original + * @param unique + * the unique + * @throws IOException + * Signals that an I/O exception has occurred. */ public static void compareAndCheck(String[] list, String[] original, String nameNew, String nameOriginal, Boolean unique) throws IOException { @@ -398,15 +437,23 @@ public class MtasSolrResultUtil { /** * Construct query. 
* - * @param queryValue the query value - * @param queryType the query type - * @param queryPrefix the query prefix - * @param queryVariables the query variables - * @param field the field - * @param queryIgnore the query ignore - * @param maximumIgnoreLength the maximum ignore length + * @param queryValue + * the query value + * @param queryType + * the query type + * @param queryPrefix + * the query prefix + * @param queryVariables + * the query variables + * @param field + * the field + * @param queryIgnore + * the query ignore + * @param maximumIgnoreLength + * the maximum ignore length * @return the mtas span query - * @throws IOException Signals that an I/O exception has occurred. + * @throws IOException + * Signals that an I/O exception has occurred. */ public static MtasSpanQuery constructQuery(String queryValue, String queryType, String queryPrefix, diff --git a/src/mtas/solr/search/MtasJoinQParser.java b/src/mtas/solr/search/MtasJoinQParser.java index 7e27ef9..db18735 100644 --- a/src/mtas/solr/search/MtasJoinQParser.java +++ b/src/mtas/solr/search/MtasJoinQParser.java @@ -1,27 +1,38 @@ package mtas.solr.search; +import java.io.IOException; + +import org.apache.lucene.index.Term; +import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.Query; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.search.AutomatonQuery; +import org.apache.lucene.search.BooleanClause.Occur; import org.apache.solr.common.params.SolrParams; +import org.apache.solr.core.PluginBag.PluginHolder; +import org.apache.solr.handler.component.SearchComponent; import org.apache.solr.request.SolrQueryRequest; import org.apache.solr.search.QParser; import org.apache.solr.search.SyntaxError; +import mtas.solr.handler.component.MtasSolrSearchComponent; + /** * The Class MtasJoinQParser. */ public class MtasJoinQParser extends QParser { - /** The Constant MTAS_JOIN_QPARSER_URL. 
*/ - public static final String MTAS_JOIN_QPARSER_URL = "url"; + /** The Constant MTAS_JOIN_QPARSER_ID. */ + public static final String MTAS_JOIN_QPARSER_COLLECTION = "collection"; - /** The Constant MTAS_JOIN_QPARSER_REQUEST. */ - public static final String MTAS_JOIN_QPARSER_REQUEST = "request"; + /** The Constant MTAS_JOIN_QPARSER_FIELD. */ + public static final String MTAS_JOIN_QPARSER_FIELD = "field"; - /** The url. */ - String url = null; + /** The id. */ + private String id = null; - /** The request. */ - String request = null; + /** The fields. */ + private String[] fields = null; /** * Instantiates a new mtas join Q parser. @@ -35,17 +46,16 @@ public class MtasJoinQParser extends QParser { SolrQueryRequest req) { super(qstr, localParams, params, req); - // SearchComponent sc = req.getCore().getSearchComponent("mtas"); - // if ((sc != null) && (sc instanceof MtasSolrSearchComponent)) { - // msc = (MtasSolrSearchComponent) sc; - // } - if ((localParams.getParams(MTAS_JOIN_QPARSER_URL) != null) - && (localParams.getParams(MTAS_JOIN_QPARSER_URL).length == 1)) { - url = localParams.getParams(MTAS_JOIN_QPARSER_URL)[0]; + if ((localParams.getParams(MTAS_JOIN_QPARSER_COLLECTION) != null) + && (localParams.getParams(MTAS_JOIN_QPARSER_COLLECTION).length == 1)) { + id = localParams.getParams(MTAS_JOIN_QPARSER_COLLECTION)[0]; } - if ((localParams.getParams(MTAS_JOIN_QPARSER_REQUEST) != null) - && (localParams.getParams(MTAS_JOIN_QPARSER_REQUEST).length == 1)) { - request = localParams.getParams(MTAS_JOIN_QPARSER_REQUEST)[0]; + if ((localParams.getParams(MTAS_JOIN_QPARSER_FIELD) != null) + && (localParams.getParams(MTAS_JOIN_QPARSER_FIELD).length > 0)) { + fields = new String[localParams + .getParams(MTAS_JOIN_QPARSER_FIELD).length]; + System.arraycopy(localParams.getParams(MTAS_JOIN_QPARSER_FIELD), 0, + fields, 0, localParams.getParams(MTAS_JOIN_QPARSER_FIELD).length); } } @@ -56,12 +66,42 @@ public class MtasJoinQParser extends QParser { */ @Override public Query 
parse() throws SyntaxError { - if (url == null) { - throw new SyntaxError("no " + MTAS_JOIN_QPARSER_URL); - } else if (request == null) { - throw new SyntaxError("no " + MTAS_JOIN_QPARSER_REQUEST); + if (id == null) { + throw new SyntaxError("no " + MTAS_JOIN_QPARSER_COLLECTION); + } else if (fields == null) { + throw new SyntaxError("no " + MTAS_JOIN_QPARSER_FIELD); } else { - return null; + + BooleanQuery.Builder booleanQueryBuilder = new BooleanQuery.Builder(); + + MtasSolrCollectionCache mtasSolrJoinCache = null; + for (PluginHolder<SearchComponent> item : req.getCore() + .getSearchComponents().getRegistry().values()) { + if (item.get() instanceof MtasSolrSearchComponent) { + mtasSolrJoinCache = ((MtasSolrSearchComponent) item.get()) + .getCollectionCache(); + } + } + if (mtasSolrJoinCache != null) { + Automaton automaton; + try { + automaton = mtasSolrJoinCache.getAutomatonById(id); + if (automaton != null) { + for (String field : fields) { + booleanQueryBuilder.add( + new AutomatonQuery(new Term(field), automaton), + Occur.SHOULD); + } + } else { + throw new IOException("no data for collection '"+id+"'"); + } + } catch (IOException e) { + throw new SyntaxError("could not construct automaton: "+e.getMessage(), e); + } + return booleanQueryBuilder.build(); + } else { + throw new SyntaxError("no MtasSolrSearchComponent found"); + } } } diff --git a/src/mtas/solr/search/MtasSolrCollectionCache.java b/src/mtas/solr/search/MtasSolrCollectionCache.java new file mode 100644 index 0000000..7b9b832 --- /dev/null +++ b/src/mtas/solr/search/MtasSolrCollectionCache.java @@ -0,0 +1,565 @@ +package mtas.solr.search; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.io.OutputStream; +import java.io.OutputStreamWriter; +import java.io.Serializable; +import java.io.Writer; +import 
java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Date; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Set; +import java.util.UUID; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.lucene.index.Term; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.automaton.Automata; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.solr.common.util.Base64; +import org.apache.solr.common.util.SimpleOrderedMap; + +/** + * The Class MtasSolrCollectionCache. + */ +public class MtasSolrCollectionCache { + + /** The Constant log. */ + private static final Log log = LogFactory + .getLog(MtasSolrCollectionCache.class); + + /** The Constant DEFAULT_LIFETIME. */ + private static final long DEFAULT_LIFETIME = 86400; + + /** The Constant DEFAULT_MAXIMUM_NUMBER. */ + private static final int DEFAULT_MAXIMUM_NUMBER = 1000; + + /** The Constant DEFAULT_MAXIMUM_OVERFLOW. */ + private static final int DEFAULT_MAXIMUM_OVERFLOW = 10; + + /** The id to version. */ + private Map<String, String> idToVersion; + + /** The version to item. */ + private Map<String, MtasSolrCollectionCacheItem> versionToItem; + + /** The expiration version. */ + private Map<String, Long> expirationVersion; + + /** The collection cache path. */ + private Path collectionCachePath; + + /** The life time. */ + private long lifeTime; + + /** The maximum number. */ + private int maximumNumber; + + /** The maximum overflow. */ + private int maximumOverflow; + + /** + * Instantiates a new mtas solr collection cache. 
+ * + * @param cacheDirectory the cache directory + * @param lifeTime the life time + * @param maximumNumber the maximum number + * @param maximumOverflow the maximum overflow + */ + public MtasSolrCollectionCache(String cacheDirectory, Long lifeTime, + Integer maximumNumber, Integer maximumOverflow) { + this.lifeTime = (lifeTime != null && lifeTime > 0) ? lifeTime + : DEFAULT_LIFETIME; + this.maximumNumber = (maximumNumber != null && maximumNumber > 0) + ? maximumNumber : DEFAULT_MAXIMUM_NUMBER; + this.maximumOverflow = (maximumOverflow != null && maximumOverflow > 0) + ? maximumOverflow : DEFAULT_MAXIMUM_OVERFLOW; + idToVersion = new HashMap<>(); + expirationVersion = new HashMap<>(); + versionToItem = new HashMap<>(); + if (cacheDirectory != null) { + try { + collectionCachePath = Files + .createDirectories(Paths.get(cacheDirectory)); + // reconstruct administration + File[] fileList = collectionCachePath.toFile().listFiles(); + if (fileList != null) { + for (File file : fileList) { + if (file.isFile()) { + String version = file.getName(); + MtasSolrCollectionCacheItem item = read(version, null); + if (item != null) { + if (idToVersion.containsKey(item.id)) { + expirationVersion.remove(idToVersion.get(item.id)); + versionToItem.remove(idToVersion.get(item.id)); + idToVersion.remove(item.id); + if (!file.delete()) { + log.error("couldn't delete " + file); + } + } + // don't keep data or automaton in memory + item.data = null; + // store in memory + idToVersion.put(item.id, version); + expirationVersion.put(version, + file.lastModified() + (1000 * lifeTime)); + versionToItem.put(version, item); + } else { + if (!file.delete()) { + log.error("couldn't delete " + file); + } + } + } else if (file.isDirectory()) { + log.info("unexpected directory " + file.getName()); + } + } + clear(); + } + } catch (IOException e) { + collectionCachePath = null; + log.error("couldn't create cache directory " + cacheDirectory, e); + } + } + } + + /** + * Creates the. 
+ * + * @param size the size + * @param data the data + * @return the string + * @throws IOException Signals that an I/O exception has occurred. + */ + public String create(Integer size, HashSet<String> data) throws IOException { + return create(null, size, data); + } + + /** + * Creates the. + * + * @param id the id + * @param size the size + * @param data the data + * @return the string + * @throws IOException Signals that an I/O exception has occurred. + */ + public String create(String id, Integer size, HashSet<String> data) + throws IOException { + if (collectionCachePath != null) { + // initialization + Date date = clear(); + // create always new version + String version; + do { + version = UUID.randomUUID().toString(); + } while (versionToItem.containsKey(version)); + // create new item + MtasSolrCollectionCacheItem item; + if (id != null) { + item = new MtasSolrCollectionCacheItem(id, size, data); + // remove if item with id already exists + deleteById(id); + } else { + item = new MtasSolrCollectionCacheItem(version, size, data); + } + // register + idToVersion.put(id, version); + expirationVersion.put(version, date.getTime() + (1000 * lifeTime)); + versionToItem.put(version, item); + // store data in file + File file = collectionCachePath.resolve(version).toFile(); + try (OutputStream outputStream = new FileOutputStream(file); + Writer outputStreamWriter = new OutputStreamWriter(outputStream, + StandardCharsets.UTF_8);) { + outputStreamWriter.write(encode(item)); + // set correct time to reconstruct administration on restart + if (!file.setLastModified(date.getTime())) { + log.debug("couldn't change filetime " + file.getAbsolutePath()); + } + // don't store data in memory + item.data = null; + // return version + // System.out.println("STORED: " + version + " - " + item.size); + return version; + } catch (IOException e) { + idToVersion.remove(id); + expirationVersion.remove(version); + versionToItem.remove(version); + throw new IOException("couldn't create 
" + version, e); + } + } else { + throw new IOException("no cachePath available, can't store data"); + } + } + + /** + * List. + * + * @return the list + */ + public List<SimpleOrderedMap<Object>> list() { + List<SimpleOrderedMap<Object>> list = new ArrayList<>(); + for (Entry<String, String> entry : idToVersion.entrySet()) { + SimpleOrderedMap<Object> item = new SimpleOrderedMap<>(); + item.add("id", entry.getKey()); + item.add("size", versionToItem.get(entry.getValue()).size); + item.add("version", entry.getValue()); + item.add("expiration", expirationVersion.get(entry.getValue())); + list.add(item); + } + return list; + } + + /** + * Check. + * + * @param id the id + * @return the simple ordered map + * @throws IOException Signals that an I/O exception has occurred. + */ + public SimpleOrderedMap<Object> check(String id) throws IOException { + if (idToVersion.containsKey(id)) { + String version = idToVersion.get(id); + MtasSolrCollectionCacheItem item = versionToItem.get(version); + Date date = new Date(); + long now = date.getTime(); + if (verify(version, now)) { + SimpleOrderedMap<Object> data = new SimpleOrderedMap<>(); + data.add("now", now); + data.add("id", item.id); + data.add("size", item.size); + data.add("version", version); + data.add("expiration", expirationVersion.get(version)); + return data; + } else { + idToVersion.remove(id); + versionToItem.remove(version); + expirationVersion.remove(version); + return null; + } + } else { + return null; + } + } + + /** + * Now. + * + * @return the long + */ + public long now() { + return clear().getTime(); + } + + /** + * Gets the data by id. + * + * @param id the id + * @return the data by id + * @throws IOException Signals that an I/O exception has occurred. + */ + public HashSet<String> getDataById(String id) throws IOException { + if (idToVersion.containsKey(id)) { + return get(id); + } else { + return null; + } + } + + /** + * Gets the automaton by id. 
+ * + * @param id the id + * @return the automaton by id + * @throws IOException Signals that an I/O exception has occurred. + */ + public Automaton getAutomatonById(String id) throws IOException { + if (idToVersion.containsKey(id)) { + List<BytesRef> bytesArray = new ArrayList<>(); + Set<String> data = get(id); + if (data != null) { + Term term; + for (String item : data) { + term = new Term("dummy", item); + bytesArray.add(term.bytes()); + } + Collections.sort(bytesArray); + return Automata.makeStringUnion(bytesArray); + } + } + return null; + } + + /** + * Delete by id. + * + * @param id the id + */ + public void deleteById(String id) { + if (idToVersion.containsKey(id)) { + String version = idToVersion.remove(id); + expirationVersion.remove(version); + versionToItem.remove(version); + if (collectionCachePath != null + && !collectionCachePath.resolve(version).toFile().delete()) { + log.debug("couldn't delete " + version); + } + } + } + + /** + * Gets the. + * + * @param id the id + * @return the hash set + * @throws IOException Signals that an I/O exception has occurred. + */ + private HashSet<String> get(String id) throws IOException { + if (collectionCachePath != null) { + Date date = clear(); + if (idToVersion.containsKey(id)) { + String version = idToVersion.get(id); + expirationVersion.put(version, date.getTime() + (1000 * lifeTime)); + MtasSolrCollectionCacheItem newItem = read(version, date.getTime()); + if (newItem != null && newItem.id.equals(id)) { + return newItem.data; + } else { + log.error("couldn't get " + version); + // delete file and remove from index + if (!collectionCachePath.resolve(version).toFile().delete()) { + log.debug("couldn't delete " + version); + } + idToVersion.remove(id); + expirationVersion.remove(version); + versionToItem.remove(version); + } + } else { + log.error("doesn't exist anymore"); + } + return null; + } else + + { + throw new IOException("no cachePath available, can't get data"); + } + } + + /** + * Read. 
+ * + * @param version the version + * @param time the time + * @return the mtas solr collection cache item + */ + private MtasSolrCollectionCacheItem read(String version, Long time) { + try { + Path path = collectionCachePath.resolve(version); + String data = new String(Files.readAllBytes(path), + StandardCharsets.UTF_8); + MtasSolrCollectionCacheItem decodedData = decode(data); + + // set correct time to reconstruct administration on restart + if (time != null) { + File file = path.toFile(); + if (!file.setLastModified(time)) { + log.debug("couldn't change filetime " + file.getAbsolutePath()); + } + } + return decodedData; + } catch (IOException e) { + log.error("couldn't read " + version, e); + } + return null; + } + + /** + * Verify. + * + * @param version the version + * @param time the time + * @return true, if successful + */ + private boolean verify(String version, Long time) { + if (versionToItem.containsKey(version)) { + Path path = collectionCachePath.resolve(version); + File file = path.toFile(); + if (file.exists() && file.canRead() && file.canWrite()) { + if (time != null) { + if (!file.setLastModified(time)) { + log.debug("couldn't change filetime " + file.getAbsolutePath()); + } else { + expirationVersion.put(version, time + (1000 * lifeTime)); + } + } + return true; + } else { + return false; + } + } else { + return false; + } + } + + /** + * Clear. 
+ * + * @return the date + */ + private Date clear() { + Date date = new Date(); + Long timestamp = date.getTime(); + HashSet<String> idsToBeRemoved = new HashSet<>(); + // check expiration + for (Entry<String, Long> entry : expirationVersion.entrySet()) { + if (entry.getValue() < timestamp) { + String version = entry.getKey(); + if (versionToItem.containsKey(version)) { + idsToBeRemoved.add(versionToItem.get(version).id); + } else { + log.debug("could not remove " + version); + } + } + } + for (String id : idsToBeRemoved) { + deleteById(id); + } + idsToBeRemoved.clear(); + // check size + if (expirationVersion.size() > maximumNumber + maximumOverflow) { + Set<Entry<String, Long>> mapEntries = expirationVersion.entrySet(); + List<Entry<String, Long>> aList = new LinkedList<>(mapEntries); + Collections.sort(aList, + (Entry<String, Long> ele1, Entry<String, Long> ele2) -> ele2 + .getValue().compareTo(ele1.getValue())); + aList.subList(maximumNumber, aList.size()).clear(); + for (Entry<String, MtasSolrCollectionCacheItem> entry : versionToItem + .entrySet()) { + if (!expirationVersion.containsKey(entry.getKey())) { + idsToBeRemoved.add(entry.getValue().id); + } + } + for (String id : idsToBeRemoved) { + deleteById(id); + } + idsToBeRemoved.clear(); + } + return date; + } + + /** + * Encode. + * + * @param o the o + * @return the string + * @throws IOException Signals that an I/O exception has occurred. + */ + private String encode(MtasSolrCollectionCacheItem o) throws IOException { + if (o != null) { + ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream(); + ObjectOutputStream objectOutputStream; + objectOutputStream = new ObjectOutputStream(byteArrayOutputStream); + objectOutputStream.writeObject(o); + objectOutputStream.close(); + byte[] byteArray = byteArrayOutputStream.toByteArray(); + return Base64.byteArrayToBase64(byteArray); + } else { + throw new IOException("nothing to encode"); + } + } + + /** + * Decode. 
+ * + * @param s the s + * @return the mtas solr collection cache item + * @throws IOException Signals that an I/O exception has occurred. + */ + private MtasSolrCollectionCacheItem decode(String s) throws IOException { + byte[] bytes = Base64.base64ToByteArray(s); + ObjectInputStream objectInputStream; + objectInputStream = new ObjectInputStream(new ByteArrayInputStream(bytes)); + try { + Object o = objectInputStream.readObject(); + if (o instanceof MtasSolrCollectionCacheItem) { + return (MtasSolrCollectionCacheItem) o; + } else { + throw new IOException("unexpected " + o.getClass().getSimpleName()); + } + } catch (ClassNotFoundException e) { + throw new IOException(e); + } + } + + /** + * Empty. + */ + public void empty() { + for (Entry<String, String> entry : idToVersion.entrySet()) { + expirationVersion.remove(entry.getValue()); + versionToItem.remove(entry.getValue()); + if (collectionCachePath != null + && !collectionCachePath.resolve(entry.getValue()).toFile().delete()) { + log.debug("couldn't delete " + entry.getValue()); + } + } + idToVersion.clear(); + } + +} + +class MtasSolrCollectionCacheItem implements Serializable { + + /** + * + */ + private static final long serialVersionUID = 1L; + public String id; + public Integer size; + public HashSet<String> data = null; + + public MtasSolrCollectionCacheItem(String id, Integer size, + HashSet<String> data) throws IOException { + if (id != null) { + this.id = id; + this.size = size; + this.data = data; + } else { + throw new IOException("no id provided"); + } + } + + @Override + public int hashCode() { + int h = this.getClass().getSimpleName().hashCode(); + h = (h * 3) ^ id.hashCode(); + return h; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; + final MtasSolrCollectionCacheItem that = (MtasSolrCollectionCacheItem) obj; + return (id.equals(that.id)); + } +} diff --git 
a/src/mtas/solr/search/MtasSolrJoinCache.java b/src/mtas/solr/search/MtasSolrJoinCache.java deleted file mode 100644 index c7b2029..0000000 --- a/src/mtas/solr/search/MtasSolrJoinCache.java +++ /dev/null @@ -1,364 +0,0 @@ -package mtas.solr.search; - -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.ObjectInputStream; -import java.io.ObjectOutputStream; -import java.io.OutputStream; -import java.io.OutputStreamWriter; -import java.io.Serializable; -import java.io.Writer; -import java.nio.charset.StandardCharsets; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.util.Collections; -import java.util.Date; -import java.util.HashMap; -import java.util.HashSet; -import java.util.LinkedList; -import java.util.List; -import java.util.Map.Entry; -import java.util.Set; -import java.util.UUID; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.solr.common.util.Base64; - -/** - * The Class MtasSolrJoinCache. - */ -public class MtasSolrJoinCache { - - /** The Constant log. */ - private static final Log log = LogFactory.getLog(MtasSolrJoinCache.class); - - /** The Constant DEFAULT_LIFETIME. */ - private static final long DEFAULT_LIFETIME = 86400; - - /** The Constant DEFAULT_MAXIMUM_NUMBER. */ - private static final int DEFAULT_MAXIMUM_NUMBER = 1000; - - /** The Constant DEFAULT_MAXIMUM_OVERFLOW. */ - private static final int DEFAULT_MAXIMUM_OVERFLOW = 10; - - /** The administration. */ - private HashMap<MtasSolrJoinCacheItem, String> administration; - - /** The index. */ - private HashMap<String, MtasSolrJoinCacheItem> index; - - /** The expiration. */ - private HashMap<String, Long> expiration; - - /** The join cache path. */ - private Path joinCachePath; - - /** The life time. */ - private long lifeTime; - - /** The maximum number. 
*/ - private int maximumNumber; - - /** The maximum overflow. */ - private int maximumOverflow; - - /** - * Instantiates a new mtas solr join cache. - * - * @param cacheDirectory the cache directory - * @param lifeTime the life time - * @param maximumNumber the maximum number - * @param maximumOverflow the maximum overflow - */ - public MtasSolrJoinCache(String cacheDirectory, Long lifeTime, - Integer maximumNumber, Integer maximumOverflow) { - joinCachePath = null; - this.lifeTime = (lifeTime != null && lifeTime > 0) ? lifeTime - : DEFAULT_LIFETIME; - this.maximumNumber = (maximumNumber != null && maximumNumber > 0) - ? maximumNumber : DEFAULT_MAXIMUM_NUMBER; - this.maximumOverflow = (maximumOverflow != null && maximumOverflow > 0) - ? maximumOverflow : DEFAULT_MAXIMUM_OVERFLOW; - if (cacheDirectory != null) { - try { - joinCachePath = Files.createDirectories(Paths.get(cacheDirectory)); - File[] fileList = joinCachePath.toFile().listFiles(); - if (fileList != null) { - for (File file : fileList) { - if (file.isFile() && !file.delete()) { - log.error("couldn't delete " + file); - } else if (file.isDirectory()) { - log.info("unexpected directory " + file.getName()); - } - } - } - } catch (IOException e) { - joinCachePath = null; - log.info("couldn't create cache directory " + cacheDirectory, e); - } - } - administration = new HashMap<>(); - expiration = new HashMap<>(); - } - - /** - * Creates the. - * - * @param url the url - * @param request the request - * @param data the data - * @return the string - * @throws IOException Signals that an I/O exception has occurred. - */ - public String create(String url, String request, Serializable data) - throws IOException { - MtasSolrJoinCacheItem item = new MtasSolrJoinCacheItem(url, request, null); - return create(item, data); - } - - /** - * Creates the. - * - * @param item the item - * @param data the data - * @return the string - * @throws IOException Signals that an I/O exception has occurred. 
- */ - private String create(MtasSolrJoinCacheItem item, Serializable data) - throws IOException { - // initialisation - Date date = clear(); - delete(item); - // create always new key - String key; - do { - key = UUID.randomUUID().toString(); - } while (index.containsKey(key)); - // register - administration.put(item, key); - expiration.put(key, date.getTime() + lifeTime); - index.put(key, item); - // store data - if (joinCachePath != null) { - File file = joinCachePath.resolve(key).toFile(); - try (OutputStream outputStream = new FileOutputStream(file); - Writer outputStreamWriter = new OutputStreamWriter(outputStream, - StandardCharsets.UTF_8);) { - outputStreamWriter.write(encode(data)); - return key; - } catch (IOException e) { - administration.remove(item); - expiration.remove(key); - log.error("couldn't create " + key, e); - return null; - } - } else { - item.data = encode(data); - return key; - } - } - - /** - * Gets the. - * - * @param url the url - * @param request the request - * @return the object - * @throws IOException Signals that an I/O exception has occurred. - */ - public Object get(String url, String request) throws IOException { - MtasSolrJoinCacheItem item = new MtasSolrJoinCacheItem(url, request, null); - if (administration.containsKey(item)) { - return get(item); - } else { - return null; - } - } - - /** - * Gets the. - * - * @param key the key - * @return the object - * @throws IOException Signals that an I/O exception has occurred. - */ - public Object get(String key) throws IOException { - if (index.containsKey(key)) { - return get(index.get(key)); - } else { - return null; - } - } - - /** - * Gets the. - * - * @param item the item - * @return the object - * @throws IOException Signals that an I/O exception has occurred. 
- */ - private Object get(MtasSolrJoinCacheItem item) throws IOException { - Date date = clear(); - if (administration.containsKey(item)) { - String key = administration.get(item); - expiration.put(key, date.getTime() + lifeTime); - if (joinCachePath != null) { - try { - Path path = joinCachePath.resolve(key); - String data = new String(Files.readAllBytes(path), - StandardCharsets.UTF_8); - return decode(data); - } catch (IOException e) { - if (!joinCachePath.resolve(key).toFile().delete()) { - log.debug("couldn't delete " + key); - } - administration.remove(item); - expiration.remove(key); - log.error("couldn't get " + key, e); - } - } else { - if (item.data != null) { - return decode(item.data); - } else { - return null; - } - } - } else { - log.error("doesn't exist anymore"); - } - return null; - } - - /** - * Delete. - * - * @param item the item - */ - private void delete(MtasSolrJoinCacheItem item) { - if (administration.containsKey(item)) { - String key = administration.remove(item); - expiration.remove(key); - index.remove(key); - if (joinCachePath != null - && !joinCachePath.resolve(key).toFile().delete()) { - log.debug("couldn't delete " + key); - } - } - } - - /** - * Clear. 
- * - * @return the date - */ - private Date clear() { - Date date = new Date(); - Long timestamp = date.getTime(); - HashSet<MtasSolrJoinCacheItem> toBeRemoved = new HashSet<>(); - // check expiration - for (Entry<String, Long> entry : expiration.entrySet()) { - if (entry.getValue() < timestamp) { - for (Entry<MtasSolrJoinCacheItem, String> subEntry : administration - .entrySet()) { - if (subEntry.getValue().equals(entry.getKey())) { - toBeRemoved.add(subEntry.getKey()); - } - } - } - } - for (MtasSolrJoinCacheItem item : toBeRemoved) { - delete(item); - } - // check size - if (expiration.size() > maximumNumber + maximumOverflow) { - Set<Entry<String, Long>> mapEntries = expiration.entrySet(); - List<Entry<String, Long>> aList = new LinkedList<>(mapEntries); - Collections.sort(aList, - (Entry<String, Long> ele1, Entry<String, Long> ele2) -> ele2 - .getValue().compareTo(ele1.getValue())); - aList.subList(maximumNumber, aList.size()).clear(); - for (Entry<String, MtasSolrJoinCacheItem> entry : index.entrySet()) { - if (!expiration.containsKey(entry.getKey())) { - toBeRemoved.add(entry.getValue()); - } - } - for (MtasSolrJoinCacheItem item : toBeRemoved) { - delete(item); - } - } - return date; - } - - /** - * Encode. - * - * @param o the o - * @return the string - * @throws IOException Signals that an I/O exception has occurred. - */ - private String encode(Serializable o) throws IOException { - ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream(); - ObjectOutputStream objectOutputStream; - objectOutputStream = new ObjectOutputStream(byteArrayOutputStream); - objectOutputStream.writeObject(o); - objectOutputStream.close(); - byte[] byteArray = byteArrayOutputStream.toByteArray(); - return Base64.byteArrayToBase64(byteArray); - } - - /** - * Decode. - * - * @param s the s - * @return the object - * @throws IOException Signals that an I/O exception has occurred. 
- */ - private Object decode(String s) throws IOException { - byte[] bytes = Base64.base64ToByteArray(s); - ObjectInputStream objectInputStream; - objectInputStream = new ObjectInputStream(new ByteArrayInputStream(bytes)); - try { - return objectInputStream.readObject(); - } catch (ClassNotFoundException e) { - throw new IOException(e); - } - } - -} - -class MtasSolrJoinCacheItem { - - public String url; - public String request; - public String data; - - public MtasSolrJoinCacheItem(String url, String request, String data) { - this.url = url == null ? "" : url; - this.request = request == null ? "" : request; - this.data = data == null ? "" : data; - } - - @Override - public int hashCode() { - int h = this.getClass().getSimpleName().hashCode(); - h = (h * 3) ^ url.hashCode(); - h = (h * 5) ^ request.hashCode(); - return h; - } - - @Override - public boolean equals(Object obj) { - if (this == obj) - return true; - if (obj == null) - return false; - if (getClass() != obj.getClass()) - return false; - final MtasSolrJoinCacheItem that = (MtasSolrJoinCacheItem) obj; - return (url.equals(that.url)) && (request.equals(that.request)); - } -} diff --git a/src/mtas/solr/search/MtasSolrJoinQParserPlugin.java b/src/mtas/solr/search/MtasSolrJoinQParserPlugin.java index 7dc58f3..ed6d686 100644 --- a/src/mtas/solr/search/MtasSolrJoinQParserPlugin.java +++ b/src/mtas/solr/search/MtasSolrJoinQParserPlugin.java @@ -1,11 +1,18 @@ package mtas.solr.search; +import java.io.IOException; + import org.apache.solr.common.params.SolrParams; import org.apache.solr.common.util.NamedList; +import org.apache.solr.core.CoreContainer; +import org.apache.solr.core.PluginBag.PluginHolder; +import org.apache.solr.handler.component.SearchComponent; import org.apache.solr.request.SolrQueryRequest; import org.apache.solr.search.QParser; import org.apache.solr.search.QParserPlugin; +import mtas.solr.handler.component.MtasSolrSearchComponent; + /** * The Class MtasSolrJoinQParserPlugin. 
*/ diff --git a/src/mtas/solr/update/processor/MtasUpdateRequestProcessorFactory.java b/src/mtas/solr/update/processor/MtasUpdateRequestProcessorFactory.java index 633a1dd..007766a 100644 --- a/src/mtas/solr/update/processor/MtasUpdateRequestProcessorFactory.java +++ b/src/mtas/solr/update/processor/MtasUpdateRequestProcessorFactory.java @@ -21,6 +21,7 @@ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.util.CharFilterFactory; +import org.apache.lucene.analysis.util.ResourceLoader; import org.apache.lucene.util.BytesRef; import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.SolrInputField; @@ -161,7 +162,7 @@ public class MtasUpdateRequestProcessorFactory Class<?> cls = Class.forName((String) className); if (cls.isAssignableFrom(MtasCharFilterFactory.class)) { Class<?>[] types = { Map.class, - SolrResourceLoader.class }; + ResourceLoader.class }; Constructor<?> cnstr = cls.getConstructor(types); Object cff = cnstr.newInstance(args, resourceLoader); if (cff instanceof MtasCharFilterFactory) { @@ -216,7 +217,7 @@ public class MtasUpdateRequestProcessorFactory if (className != null) { try { Class<?> cls = Class.forName((String) className); - Class<?>[] types = { Map.class, SolrResourceLoader.class }; + Class<?>[] types = { Map.class, ResourceLoader.class }; Constructor<?> cnstr = cls.getConstructor(types); Object cff = cnstr.newInstance(args, resourceLoader); if (cff instanceof MtasTokenizerFactory) { diff --git a/src/site/markdown/features.md b/src/site/markdown/features.md index d04b012..99e3b97 100644 --- a/src/site/markdown/features.md +++ b/src/site/markdown/features.md @@ -11,10 +11,10 @@ Annotational layers and structure are added to the existing [Lucene](https://luc **Extension of search capabilities**: * Supports [CQL](search_cql.html) 
query language. -* [Statistics](search_stats.html) on number of [words](search_query_stats_positions.html), [tokens](search_query_stats_tokens.html) and [spans](search_query_stats_spans.html). +* [Statistics](search_stats.html) on number of [words](search_component_stats_positions.html), [tokens](search_component_stats_tokens.html) and [spans](search_component_stats_spans.html). * Usage of [functions](search_functions.html) to produce statistics for custom defined relations between multiple spans and/or number of words. -* [Facets](search_query_facet.html) with [statistics](search_stats.html) on hits. -* [Kwic](search_query_kwic.html), [Lists](search_query_list.html), [Document](search_query_document.html), [termvectors](search_query_termvector.html) and [grouping](search_query_group.html) for spans. +* [Facets](search_component_facet.html) with [statistics](search_stats.html) on hits. +* [Kwic](search_component_kwic.html), [Lists](search_component_list.html), [Document](search_component_document.html), [termvectors](search_component_termvector.html) and [grouping](search_component_group.html) for spans. **Supports existing Solr capabilities**: diff --git a/src/site/markdown/index.md b/src/site/markdown/index.md index 7675314..95ef638 100644 --- a/src/site/markdown/index.md +++ b/src/site/markdown/index.md @@ -6,7 +6,7 @@ In recent years, multiple solutions have come available providing search on huge > > `<entity="location/> within (<s/> containing [lemma="utrecht"])` -Parsers for several [document formats](indexing_formats.html) are provided, each with extended possibilities for [configuration](indexing_configuration.html), and advanced query [features](features.html) like [statistics](search_query_stats.html), [termvectors](search_query_termvector.html) and [kwic](search_query_kwic.html) are available. 
+Parsers for several [document formats](indexing_formats.html) are provided, each with extended possibilities for [configuration](indexing_configuration.html), and advanced query [features](features.html) like [statistics](search_component_stats.html), [termvectors](search_component_termvector.html) and [kwic](search_component_kwic.html) are available. Source code and releases are available on [GitHub](https://github.com/meertensinstituut/mtas/), see [installation instructions](installation.html) on how to get started. diff --git a/src/site/markdown/installation_lucene.md.vm b/src/site/markdown/installation_lucene.md.vm index 892c3ca..3addc96 100644 --- a/src/site/markdown/installation_lucene.md.vm +++ b/src/site/markdown/installation_lucene.md.vm @@ -57,7 +57,7 @@ indexReader.close(); **Advanced search** By using the provided `collect` method, also more advanced -options are available, like computing the [termvector](search_query_termvector.html) +options are available, like computing the [termvector](search_component_termvector.html) ```java IndexReader indexReader = DirectoryReader.open(directory); diff --git a/src/site/markdown/search.md b/src/site/markdown/search.md index f362f36..d49f399 100644 --- a/src/site/markdown/search.md +++ b/src/site/markdown/search.md @@ -1,4 +1,4 @@ #Search -To take advantage of the annotation and structure added to the index, a specific Mtas searchComponent and queryParser are available. This enables the use of [CQL](search_cql.html) in both regular and specific [Mtas queries](search_query.html). For Solr search requests, some adjustments have to be made within the [configuration](search_configuration.html). +To take advantage of the annotation and structure added to the index, a specific Mtas [searchComponent](search_component.html), [queryParser](search_parser.html) and [requestHandler](search_handler.html) are available. This enables the use of [CQL](search_cql.html) in both regular and specific Mtas queries. 
For Solr search requests, some adjustments have to be made within the [configuration](search_configuration.html). diff --git a/src/site/markdown/search_component.md b/src/site/markdown/search_component.md new file mode 100644 index 0000000..1b95d3b --- /dev/null +++ b/src/site/markdown/search_component.md @@ -0,0 +1,41 @@ +# Search Component + +To perform specific Mtas queries in Solr requests, the following parameter should be used. + +| Parameter | Value | Obligatory | +|-------------|--------|-------------| +| mtas | true | yes | + +See [statistics](search_component_stats.html), +[kwic](search_component_kwic.html), [list](search_component_list.html), [document](search_component_document.html), [termvector](search_component_termvector.html), [facet](search_component_facet.html), [group](search_component_group.html), [prefix](search_component_prefix.html) and [collection](search_component_collection.html) for more details and examples. + +--- + +**Regular queries** + +Besides from specific Mtas queries in Solr requests, also [CQL](search_cql.html) can be used in regular queries by [configuring](search_configuration.html) the Mtas query parser in solrconfig.xml. + +*Example 1* + +Search for documents containing the word "de" with a query. + +`q={!mtas_cql+field%3D"text"+query%3D"[t%3D\"de\"]"}&fl=*&start=0&rows=0&wt=json&indent=true` + +``` json +"response":{"numFound":1664241,"start":0,"docs":[] + } +``` + +*Example 2* + +Search for documents containing the word "de" with a filter query. 
+ +`fq={!mtas_cql+field%3D"text"+query%3D"[t%3D\"de\"]"}&q=*%3A*&fl=*&start=0&rows=0&wt=json&indent=true` + +``` json +"response":{"numFound":1664241,"start":0,"docs":[] + } +``` + + + diff --git a/src/site/markdown/search_component_collection.md b/src/site/markdown/search_component_collection.md new file mode 100644 index 0000000..60aa304 --- /dev/null +++ b/src/site/markdown/search_component_collection.md @@ -0,0 +1,26 @@ +# Collection + +Mtas provides a method to join query results, based on (temporarily) storing lists of values, and reusing these stored lists or collections in subsequent queries. + +To manage collections, in Solr requests, besides the parameter to enable the [Mtas query component](search_component.html), the following parameter should be provided. + +| Parameter | Value | Obligatory | +|-----------------------|--------|-------------| +| mtas.collection | true | yes | + +Multiple actions can be performed within the same request. To distinguish them, a unique identifier has to be provided for each of the required operations. + +## Create + +To make a new collection based on the set of unique values from one or multiple fieldnames, the `create` action can be used. + +| Parameter | Value | Info | Obligatory | +|-------------------------------------------------|--------------|--------------------------------|-------------| +| mtas.collection.\<identifier\>.key | \<string\> | key used in response | no | +| mtas.collection.\<identifier\>.action | create | | yes | +| mtas.collection.\<identifier\>.identifier | \<string\> | one or more comma separated fieldnames | yes | + +The values will be restricted to the set occurring within the listed fields for the set of documents matching the request. The provided identifier should be a unique string that can be used later on in other requests to refer to this set of data. Sharding is fully supported, i.e. 
the values are collected from all participating shards, and stored on both the main core and all these shards. + + + diff --git a/src/site/markdown/search_component_document.md b/src/site/markdown/search_component_document.md new file mode 100644 index 0000000..d28ba56 --- /dev/null +++ b/src/site/markdown/search_component_document.md @@ -0,0 +1,284 @@ +# Document + +Mtas can produce statistics on used terms for the individual listed documents. To get this information, in Solr requests, besides the parameter to enable the [Mtas query component](search_component.html), the following parameter should be provided. + +| Parameter | Value | Obligatory | +|-----------------------|--------|-------------| +| mtas.document | true | yes | + +Multiple document results can be produced within the same request. To distinguish them, a unique identifier has to be provided for each of the required document results. + +| Parameter | Value | Info | Obligatory | +|-------------------------------------------------|--------------|--------------------------------|-------------| +| mtas.document.\<identifier\>.key | \<string\> | key used in response | no | +| mtas.document.\<identifier\>.field | \<string\> | Mtas field | yes | +| mtas.document.\<identifier\>.prefix | \<string\> | prefix |yes | +| mtas.document.\<identifier\>.number | \<double\> | create list with specified number of most frequent items | no | +| mtas.document.\<identifier\>.type | \<string\> | required [type of statistics](search_stats.html) | no | +| mtas.document.\<identifier\>.regexp | \<string\> | regular expression condition on term | no | +| mtas.document.\<identifier\>.ignoreRegexp | \<string\> | regular expression condition for terms that have to be ignored | no | + +## List + +A list can be provided, specifying the set of terms to consider when computing the result. 
+ +| Parameter | Value | Info | Obligatory | +|-------------------------------------------------|--------------|--------------------------------|-------------| +| mtas.document.\<identifier\>.list | \<string\> | comma separated list of values | yes | +| mtas.document.\<identifier\>.listRegexp | \<boolean\> | list of values are to be interpreted as regular expressions | no | +| mtas.document.\<identifier\>.listExpand | \<boolean\> | expand the matches on values from list | no | +| mtas.document.\<identifier\>.listExpandNumber | \<double\> | number of expansions of matches on values from list | no | + +## Ignore list + +An ignore list can also be provided, specifying the set of terms not to consider when computing the result. + +| Parameter | Value | Info | Obligatory | +|-------------------------------------------------|--------------|--------------------------------|-------------| +| mtas.document.\<identifier\>.ignoreList | \<string\> | comma separated list of values | yes | +| mtas.document.\<identifier\>.ignoreListRegexp | \<boolean\> | list of values are to be interpreted as regular expressions | no | + +--- + +## Examples +1. [Basic](#basic) : Statistics on unique words for each document +2. [Regexp](#regexp) : Most frequent words containing only letters a-z and minimum length 5 +3. [List](#list) : Statistics for a provided list of words +4. [Ignore](#ignore) : Statistics for a provided list of regular expressions, ignoring another list of regular expressions + +--- + +<a name="basic"></a> + +### Basic + +**Example** +Statistics for set of unique tokens with prefix *t* (words) for each listed document. 
+ + +**Request and response** +`fq=%7B%21mtas_cql+field%3D%22text%22+query%3D%22%5B%5D%22+++%7D&q=%2A%3A%2A&mtas=true&mtas.document=true&mtas.document.0.field=text&mtas.document.0.prefix=t&mtas.document.0.key=words&mtas.document.0.type=all&fl=*&start=0&rows=2&wt=json&indent=true` + +```json +"mtas":{ + "document":[{ + "key":"words", + "list":[{ + "documentKey":"4115a95c-011c-11e4-b0ff-51bcbd7c379f", + "sumsq":113964.0, + "populationvariance":126.5639231447591, + "max":166.0, + "sum":3336.0, + "kurtosis":92.19837080635624, + "standarddeviation":11.257199352433314, + "n":789, + "quadraticmean":12.01836364230935, + "min":1.0, + "median":1.0, + "variance":126.72453726042504, + "mean":4.228136882129286, + "geometricmean":1.9285975498109995, + "sumoflogs":518.209740627951, + "skewness":8.377350653392202}, + { + "documentKey":"4115aac4-011c-11e4-b0ff-51bcbd7c379f", + "sumsq":25489.0, + "populationvariance":35.695641666666134, + "max":77.0, + "sum":1563.0, + "kurtosis":72.57030420433823, + "standarddeviation":5.979568021426876, + "n":600, + "quadraticmean":6.517796151051877, + "min":1.0, + "median":1.0, + "variance":35.75523372287092, + "mean":2.6050000000000004, + "geometricmean":1.5249529474773036, + "sumoflogs":253.1781332820801, + "skewness":7.70682353088895}]}]} +``` + +<a name="regexp"></a> + +### Regexp + +**Example** +Most frequent tokens containing only letters a-z and minimum length 5 with prefix *t* (words) for each listed document. 
+ +**Regexp**<br/> +`[a-z]{5,}` + +**Request and response** +`fq=%7B%21mtas_cql+field%3D%22text%22+query%3D%22%5B%5D%22+++%7D&q=%2A%3A%2A&mtas=true&mtas.document=true&mtas.document.0.field=NLContent_mtas&mtas.document.0.prefix=t&mtas.document.0.key=list+of+words&mtas.document.0.type=n%2Csum%2Cmean&mtas.document.0.regexp=%5Ba-z%5D%7B5%2C%7D&mtas.document.0.number=5&fl=%2A&start=0&rows=2&wt=json&indent=true` + +```json +"mtas":{ + "document":[{ + "key":"list of words", + "list":[{ + "documentKey":"c0c4200c-1eee-11e5-b891-f48ce0be173a", + "list":[{ + "sum":471, + "key":"zijne"}, + { + "sum":317, + "key":"eenen"}, + { + "sum":304, + "key":"zegde"}, + { + "sum":249, + "key":"hebben"}, + { + "sum":229, + "key":"welke"}], + "mean":4.552402402402403, + "sum":30319, + "n":6660}, + { + "documentKey":"c0c453d8-1eee-11e5-b891-f48ce0be173a", + "list":[{ + "sum":348, + "key":"heeft"}, + { + "sum":243, + "key":"hebben"}, + { + "sum":199, + "key":"prins"}, + { + "sum":173, + "key":"vader"}, + { + "sum":161, + "key":"komen"}], + "mean":4.641632967456191, + "sum":24104, + "n":5193}]}]} +``` + +<a name="list"></a> + +### List + +**Example** +Statistics for a provided list of words for each listed document. 
+ +**List**<br/> +`koe,paard,schaap,geit,kip` + +**Request and response** +`fq=%7B%21mtas_cql+field%3D%22text%22+query%3D%22%5Bt_lc%3D%5C%22koe%5C%22%7Ct_lc%3D%5C%22paard%5C%22%7Ct_lc%3D%5C%22schaap%5C%22%5D%22+++%7D&q=%2A%3A%2A&mtas=true&mtas.document=true&mtas.document.0.field=text&mtas.document.0.prefix=t_lc&mtas.document.0.key=list+of+words&mtas.document.0.type=n%2Csum%2Cmean&mtas.document.0.list=koe%2Cpaard%2Cschaap%2Cgeit%2Ckip&mtas.document.0.listRegexp=false&mtas.document.0.listExpand=false&mtas.document.0.number=100&fl=%2A&start=0&rows=2&wt=json&indent=true` + +```json +"mtas":{ + "document":[{ + "key":"list of words", + "list":[{ + "documentKey":"c0c46b7a-1eee-11e5-b891-f48ce0be173a", + "list":[{ + "sum":3, + "key":"paard"}, + { + "sum":2, + "key":"schaap"}], + "mean":2.5, + "sum":5, + "n":2}, + { + "documentKey":"c0c453d8-1eee-11e5-b891-f48ce0be173a", + "list":[{ + "sum":31, + "key":"paard"}, + { + "sum":1, + "key":"kip"}], + "mean":16.0, + "sum":32, + "n":2}]}]} +``` + +<a name="ignore"></a> + +### Ignore + +**Example** +Statistics for a provided list of regular expressions, ignoring another list of regular expressions for each listed document. 
+ +**Regexp**<br/> +`[a-z]{7,}` + +**Ignore**<br/> +`[a-z]{10,}` + +**List**<br/> +`een.*,.*heid` + +**Ignore list**<br/> +`een.*heid,ee.*nheid` + +**Request and response** +`fq=%7B%21mtas_cql+field%3D%22text%22+query%3D%22%5Bt_lc%3D%5C%22eenheid%5C%22%5D%22+++%7D&q=%2A%3A%2A&mtas=true&mtas.document=true&mtas.document.0.field=text&mtas.document.0.prefix=t_lc&mtas.document.0.key=advanced+list+of+words&mtas.document.0.type=n%2Csum%2Cmean&mtas.document.0.regexp=%5Ba-z%5D%7B7%2C%7D&mtas.document.0.list=een.%2A%2C.%2Aheid&mtas.document.0.listRegexp=true&mtas.document.0.listExpand=true&mtas.document.0.listExpandNumber=3&mtas.document.0.ignoreRegexp=%5Ba-z%5D%7B10%2C%7D&mtas.document.0.ignoreList=een.%2Aheid%2Cee.%2Anheid&mtas.document.0.ignoreListRegexp=true&mtas.document.0.number=10&fl=text_numberOfPositions%2CNLCore_NLIdentification_nederlabID%2CNLProfile_name%2CNLTitle_title&start=0&rows=2&wt=json&indent=true` + +```json +"mtas":{ + "document":[{ + "key":"advanced list of words", + "list":[{ + "documentKey":"c0c41486-1eee-11e5-b891-f48ce0be173a", + "list":[{ + "sum":166, + "list":{ + "droefheid":{ + "sum":36}, + "godheid":{ + "sum":22}, + "waarheid":{ + "sum":22}}, + "key":".*heid"}, + { + "sum":93, + "list":{ + "eenigen":{ + "sum":46}, + "eensklaps":{ + "sum":32}, + "eenigste":{ + "sum":3}}, + "key":"een.*"}], + "mean":5.886363636363637, + "sum":259, + "n":44}, + { + "documentKey":"c0c453d8-1eee-11e5-b891-f48ce0be173a", + "list":[{ + "sum":36, + "list":{ + "afscheid":{ + "sum":12}, + "hoogheid":{ + "sum":4}, + "bezigheid":{ + "sum":3}}, + "key":".*heid"}, + { + "sum":24, + "list":{ + "eenvoudig":{ + "sum":15}, + "eenzame":{ + "sum":3}, + "eenmaal":{ + "sum":2}}, + "key":"een.*"}], + "mean":3.1578947368421053, + "sum":60, + "n":19}]}]} +``` + +--- + +**Lucene** + +To get statistics on used terms for the listed documents [directly in Lucene](installation_lucene.html), *ComponentDocument* together with the provided *collect* method can be used. 
diff --git a/src/site/markdown/search_component_facet.md b/src/site/markdown/search_component_facet.md new file mode 100644 index 0000000..d9efd40 --- /dev/null +++ b/src/site/markdown/search_component_facet.md @@ -0,0 +1,266 @@ +# Facets + +Mtas can produce facets on metadata for Mtas queries. To get this information, in Solr requests, besides the parameter to enable the [Mtas query component](search_component.html), the following parameter should be provided. + +| Parameter | Value | Obligatory | +|-----------------------|--------|-------------| +| mtas.facet | true | yes | + +Multiple facet results can be produced within the same request. To distinguish them, a unique identifier has to be provided for each of the required facet results. + +| Parameter | Value | Info | Obligatory | +|-------------------------------------------------|--------------|--------------------------------|-------------| +| mtas.facet.\<identifier\>.key | \<string\> | key used in response | no | +| mtas.facet.\<identifier\>.field | \<string\> | Mtas field | yes | + +## Queries + +One or multiple queries on the defined Mtas field have to be defined. + +| Parameter | Value | Info | Obligatory | +|-------------------------------------------------|--------------|--------------------------------|-------------| +| mtas.facet.\<identifier\>.query.\<identifier query\>.type | \<string\> | query language: [cql](search_cql.html) | yes | +| mtas.facet.\<identifier\>.query.\<identifier query\>.value | \<string\> | query: [cql](search_cql.html) | yes | +| mtas.facet.\<identifier\>.query.\<identifier query\>.prefix | \<string\> | default prefix | no | +| mtas.facet.\<identifier\>.query.\<identifier query\>.ignore | \<string\> | ignore query: [cql](search_cql.html) | no | +| mtas.facet.\<identifier\>.query.\<identifier query\>.maximumIgnoreLength | \<integer\> | maximum number of succeeding occurrences to ignore | no | + +### Variables + +The query may contain one or more variables, and the value(s) of 
these variables have to be defined + +| Parameter | Value | Info | Obligatory | +|-------------------------------------------------|--------------|--------------------------------|-------------| +| mtas.facet.\<identifier\>.query.\<identifier query\>.variable.\<identifier variable\>.name | \<string\> | name of variable | yes | +| mtas.facet.\<identifier\>.query.\<identifier query\>.variable.\<identifier variable\>.value | \<string\> | comma separated list of values | yes | + +## Base + +One or multiple fields to produce facets over have to be defined + +| Parameter | Value | Info | Obligatory | +|-------------------------------------------------|--------------|--------------------------------|-------------| +| mtas.facet.\<identifier\>.base.\<identifier base\>.field | \<string\> | field to produce facet over | yes | +| mtas.facet.\<identifier\>.base.\<identifier base\>.type | \<string\> | required [type of statistics](search_stats.html) | no | +| mtas.facet.\<identifier\>.base.\<identifier base\>.sort.type | \<string\> | sort on term or [type of statistics](search_stats.html) | no | +| mtas.facet.\<identifier\>.base.\<identifier base\>.sort.direction | \<string\> | sort direction: asc or desc | no | +| mtas.facet.\<identifier\>.base.\<identifier base\>.number | \<double\> | number of facets | no | +| mtas.facet.\<identifier\>.base.\<identifier base\>.minimum | \<double\> | minimum number of occurrences span(s) | no | +| mtas.facet.\<identifier\>.base.\<identifier base\>.maximum | \<double\> | maximum number of occurrences span(s) | no | + +### Ranges + +Number values can be grouped into ranges by defining a size and optionally a base for these ranges. 
+ +| Parameter | Value | Info | Obligatory | +|-------------------------------------------------|--------------|--------------------------------|-------------| +| mtas.facet.\<identifier\>.base.\<identifier base\>.range.size | \<double\> | size of the range | yes | +| mtas.facet.\<identifier\>.base.\<identifier base\>.range.base | \<double\> | base for the ranges | no | + +### Functions + +To compute statistics for values based on the occurrence of one or multiple spans, optionally [functions](search_functions.html) can be added. The parameters for these functions are the number of occurrences *$q0*, *$q1*, ... for each span and the number of positions *$n* in a document. Statistics on the value computed for each document in the set are added to the response. + +| Parameter | Value | Info | Obligatory | +|-------------------------------------------------|--------------|--------------------------------|-------------| +| mtas.facet.\<identifier\>.base.\<identifier base\>.function.\<identifier function\>.key | \<string\> | key used in response | no | +| mtas.facet.\<identifier\>.base.\<identifier base\>.function.\<identifier function\>.expression | \<string\> | see [functions](search_functions.html) | yes | +| mtas.facet.\<identifier\>.base.\<identifier base\>.function.\<identifier function\>.type | \<string\> | required [type of statistics](search_stats.html) | no | + +The key is added to the response and may be used to distinguish between multiple functions, and should therefore be unique within each specified facet base. + +--- + +## Examples +1. [Basic](#basic) : basic facet on occurring part of speech +2. [Multiple](#multiple) : multiple facets on occurring part of speech +3. [Variable](#variable) : facets on occurring part of speech with variable +4. [Range](#range) : facet on occurring part of speech with range +5. 
[Function](#function) : facet on occurring part of speech with function + +--- + +<a name="basic"></a> + +### Basic + +**Example** +Facet over year for CQL query `[pos="N"]`. + +**Request and response** +`q=*:*&mtas=true&mtas.facet=true&mtas.facet.0.field=test&mtas.facet.0.key=example+-+basic&mtas.facet.0.query.0.type=cql&mtas.facet.0.query.0.value=[pos%3D"N"]&mtas.facet.0.base.0.field=year&mtas.facet.0.base.0.sort.type=sum&mtas.facet.0.base.0.sort.direction=desc&mtas.facet.0.base.0.number=3&rows=0&wt=json&indent=true` + +``` json +"mtas":{ + "facet":[{ + "key":"example - basic", + "listTotal":257, + "list":[{ + "mean":380.58187772925766, + "sum":697226, + "n":1832, + "key":"1997"}, + { + "mean":389.84488636363636, + "sum":686127, + "n":1760, + "key":"1999"}, + { + "mean":415.17861482381534, + "sum":683384, + "n":1646, + "key":"2002"}]}]} +``` + +<a name="multiple"></a> + +### Multiple + +**Example** +Facet over genre and year for CQL query `[pos="N"]`. + +**Request and response** +`q=*:*&mtas=true&mtas.facet=true&mtas.facet.0.field=test&mtas.facet.0.key=example+-+multiple&mtas.facet.0.query.0.type=cql&mtas.facet.0.query.0.value=[pos%3D"N"]&mtas.facet.0.base.0.field=genre&mtas.facet.0.base.0.sort.type=sum&mtas.facet.0.base.0.sort.direction=desc&mtas.facet.0.base.0.number=2&mtas.facet.0.base.1.field=year&mtas.facet.0.base.1.sort.type=sum&mtas.facet.0.base.1.sort.direction=desc&mtas.facet.0.base.1.number=2&rows=0&wt=json&indent=true` + +``` json +"mtas":{ + "facet":[{ + "key":"example - multiple", + "listTotal":26, + "list":[{ + "mean":409.7034217657067, + "sum":65015836, + "n":158690, + "listTotal":257, + "list":{ + "1997":{ + "mean":380.58187772925766, + "sum":697226, + "n":1832}, + "1999":{ + "mean":389.84488636363636, + "sum":686127, + "n":1760}}, + "key":"jaarboeken"}, + { + "mean":409.7034217657067, + "sum":65015836, + "n":158690, + "listTotal":257, + "list":{ + "1997":{ + "mean":380.58187772925766, + "sum":697226, + "n":1832}, + "1999":{ + 
"mean":389.84488636363636, + "sum":686127, + "n":1760}}, + "key":"periodieken"}]}]} +``` + +<a name="variable"></a> + +### Variable + +**Example** +Facet over year for CQL query `[pos=$1]` with `$1` equal to `N,ADJ`. + +**Request and response** +`q=*:*&mtas=true&mtas.facet=true&mtas.facet.0.field=text&mtas.facet.0.key=example+-+variable&mtas.facet.0.query.0.type=cql&mtas.facet.0.query.0.value=[pos%3D$1]&mtas.facet.0.query.0.variable.0.name=1&mtas.facet.0.query.0.variable.0.value=N,ADJ&mtas.facet.0.base.0.field=year&mtas.facet.0.base.0.sort.type=sum&mtas.facet.0.base.0.sort.direction=desc&mtas.facet.0.base.0.number=3&rows=0&wt=json&indent=true` + +``` json +"mtas":{ + "facet":[{ + "key":"example - variable", + "listTotal":257, + "list":[{ + "mean":531.8187772925764, + "sum":974292, + "n":1832, + "key":"1997"}, + { + "mean":545.3232954545455, + "sum":959769, + "n":1760, + "key":"1999"}, + { + "mean":573.460510328068, + "sum":943916, + "n":1646, + "key":"2002"}]}]} +``` + +<a name="range"></a> + +### Range + +**Example** +Facet over year with ranges of size 10 for CQL query `[pos="N"]`. + +**Request and response** +`q=*:*&mtas=true&mtas.facet=true&mtas.facet.0.field=test&mtas.facet.0.key=example+-+range&mtas.facet.0.query.0.type=cql&mtas.facet.0.query.0.value=[pos%3D"N"]&mtas.facet.0.base.0.field=year&mtas.facet.0.base.0.sort.type=sum&mtas.facet.0.base.0.sort.direction=desc&mtas.facet.0.base.0.number=3&mtas.facet.0.base.0.range.size=10&mtas.facet.0.base.0.range.base=0&rows=0&wt=json&indent=true` + +``` json +"mtas":{ + "facet":[{ + "key":"example - range", + "listTotal":29, + "list":[{ + "mean":369.9619179400794, + "sum":6149507, + "n":16622, + "key":"1990-1999"}, + { + "mean":559.2636835405855, + "sum":5711760, + "n":10213, + "key":"1900-1909"}, + { + "mean":482.52500238117915, + "sum":5066030, + "n":10499, + "key":"1910-1919"}]}]} +``` + +<a name="function"></a> + +### Function + +**Example** +Facet over year for CQL query `[pos="N"]` with function. 
+ +**Request and response** +`q=*:*&mtas=true&mtas.facet=true&mtas.facet.0.field=test&mtas.facet.0.key=example+-+basic&mtas.facet.0.query.0.type=cql&mtas.facet.0.query.0.value=[pos%3D"N"]&mtas.facet.0.base.0.field=year&mtas.facet.0.base.0.sort.type=sum&mtas.facet.0.base.0.sort.direction=desc&mtas.facet.0.base.0.number=2&mtas.facet.0.base.0.minimum=1&mtas.facet.0.base.0.function.0.key=relative&mtas.facet.0.base.0.function.0.expression=$q0/$n&rows=0&wt=json&indent=true` + +``` json +"mtas":{ + "facet":[{ + "key":"example - basic", + "listTotal":255, + "list":[{ + "mean":515.6997041420118, + "sum":697226, + "n":1352, + "functions":{ + "relative":{ + "mean":0.17235837258586809, + "sum":233.02851973609367, + "n":1352}}, + "key":"1997"}, + { + "mean":476.14642609299096, + "sum":686127, + "n":1441, + "functions":{ + "relative":{ + "mean":0.17248794525621, + "sum":248.55512911419862, + "n":1441}}, + "key":"1999"}]}]} +``` + + +**Lucene** + +To produce facets on metadata [directly in Lucene](installation_lucene.html), *ComponentFacet* together with the provided *collect* method can be used. \ No newline at end of file diff --git a/src/site/markdown/search_component_group.md b/src/site/markdown/search_component_group.md new file mode 100644 index 0000000..af9c9ab --- /dev/null +++ b/src/site/markdown/search_component_group.md @@ -0,0 +1,548 @@ +# Grouping + +Mtas can group results for Mtas queries within the (filtered) set of documents. To get this information, in Solr requests, besides the parameter to enable the [Mtas query component](search_component.html), the following parameter should be provided. + +| Parameter | Value | Obligatory | +|-----------------------|--------|-------------| +| mtas.group | true | yes | + +Multiple group results can be produced within the same request. To distinguish them, a unique identifier has to be provided for each of the required document results. 
+ +| Parameter | Value | Info | Obligatory | +|-------------------------------------------------|--------------|--------------------------------|-------------| +| mtas.group.\<identifier\>.key | \<string\> | key used in response | no | +| mtas.group.\<identifier\>.field | \<string\> | Mtas field | yes | +| mtas.group.\<identifier\>.number | \<integer\> | number of results | no | + +## Query + +A query on the defined Mtas field has to be defined + +| Parameter | Value | Info | Obligatory | +|-------------------------------------------------|--------------|--------------------------------|-------------| +| mtas.group.\<identifier\>.query.type | \<string\> | query language: [cql](search_cql.html) | yes | +| mtas.group.\<identifier\>.query.value | \<string\> | query: [cql](search_cql.html) | yes | +| mtas.group.\<identifier\>.query.prefix | \<string\> | default prefix | no | +| mtas.group.\<identifier\>.query.ignore | \<string\> | ignore query: [cql](search_cql.html) | no | +| mtas.group.\<identifier\>.query.maximumIgnoreLength | \<integer\> | maximum number of succeeding occurrences to ignore | no | + +### Variables + +The query may contain one or more variables, and the value(s) of these variables have to be defined + +| Parameter | Value | Info | Obligatory | +|-------------------------------------------------|--------------|--------------------------------|-------------| +| mtas.group.\<identifier\>.query.variable.\<identifier variable\>.name | \<string\> | name of variable | yes | +| mtas.group.\<identifier\>.query.variable.\<identifier variable\>.value | \<string\> | comma separated list of values | yes | + +### Group + +Finally, the exact grouping has to be specified. 
Specification of the prefixes can be made for + +* positions inside the hit, covering all positions +* specified positions inside the hit defined from the left or right +* specified positions inside the hit defined from the left or right, which may exceed the hit boundaries to the right or left respectively +* positions left or right from the hit + +--- + +**Grouping inside hit** + +The simplest form is grouping over a list of specified prefixes occurring at the position(s) inside the hit. + + +| Parameter | Value | Info | Obligatory | +|-------------------------------------------------|--------------|--------------------------------|-------------| +| mtas.group.\<identifier\>.grouping.hit.inside.prefixes | \<string\> | comma separated list of prefixes | yes | + +--- + +**Grouping left inside hit** + +To group over specified prefixes occurring at positions specified from the left side inside the hit, *insideLeft* can be used. + + +| Parameter | Value | Info | Obligatory | +|-------------------------------------------------|--------------|--------------------------------|-------------| +| mtas.group.\<identifier\>.grouping.hit.insideLeft.\<identifier insideLeft\>.prefixes | \<string\> | comma separated list of prefixes | yes | +| mtas.group.\<identifier\>.grouping.hit.insideLeft.\<identifier insideLeft\>.position | \<integer\>(-\<integer\>) | position(s) | yes | + +--- + +**Grouping right inside hit** + +To group over specified prefixes occurring at positions specified from the right side inside the hit, *insideRight* can be used. 
+ + +| Parameter | Value | Info | Obligatory | +|-------------------------------------------------|--------------|--------------------------------|-------------| +| mtas.group.\<identifier\>.grouping.hit.insideRight.\<identifier insideRight\>.prefixes | \<string\> | comma separated list of prefixes | yes | +| mtas.group.\<identifier\>.grouping.hit.insideRight.\<identifier insideRight\>.position | \<integer\>(-\<integer\>) | position(s) | yes | + +--- + +**Grouping left hit** + +To group over specified prefixes occurring at positions specified from the left side of the hit, optionally exceeding the right hit boundary, *left* is available. + + +| Parameter | Value | Info | Obligatory | +|-------------------------------------------------|--------------|--------------------------------|-------------| +| mtas.group.\<identifier\>.grouping.hit.left.\<identifier left\>.prefixes | \<string\> | comma separated list of prefixes | yes | +| mtas.group.\<identifier\>.grouping.hit.left.\<identifier left\>.position | \<integer\>(-\<integer\>) | position(s) | yes | + +--- + +**Grouping right hit** + +To group over specified prefixes occurring at positions specified from the right side of the hit, optionally exceeding the left hit boundary, *right* is available. + + +| Parameter | Value | Info | Obligatory | +|-------------------------------------------------|--------------|--------------------------------|-------------| +| mtas.group.\<identifier\>.grouping.hit.right.\<identifier right\>.prefixes | \<string\> | comma separated list of prefixes | yes | +| mtas.group.\<identifier\>.grouping.hit.right.\<identifier right\>.position | \<integer\>(-\<integer\>) | position(s) | yes | + +--- + +## Examples +1. [Inside hit](#inside_hit) : grouping based on prefixes inside the hit. +2. [Left inside hit](#inside_left_hit) : grouping based on prefixes occurring at positions specified from the left, inside the hit. +3. 
[Right inside hit](#inside_right_hit) : grouping based on prefixes occurring at positions specified from the right, inside the hit. +4. [Left hit](#left_hit) : grouping based on prefixes occurring at positions specified from the left, not necessarily inside the hit. +5. [Right hit](#right_hit) : grouping based on prefixes occurring at positions specified from the right, not necessarily inside the hit. +6. [Left](#left) : grouping based on prefixes occurring at positions at the left side from the hit. +7. [Right](#right) : grouping based on prefixes occurring at positions at the right side from the hit. + +--- + +<a name="inside_hit"></a> + +### Inside hit + +**Example** +Grouping over prefix `lemma` for CQL query `[pos="LID"]`. + +**Request and response** +`q=*:*&rows=0&mtas=true&mtas.group=true&mtas.group.0.field=text&mtas.group.0.query.type=cql&mtas.group.0.query.value=[pos="LID"]&mtas.group.0.grouping.hit.inside.prefixes=lemma&mtas.group.0.number=5&wt=json&indent=true` + +``` json +"mtas":{ + "group":[{ + "key":"0", + "listTotal":523, + "list":[{ + "mean":156.32153403822628, + "sum":20062462, + "n":128341, + "group":{"hit":{"0":[{ + "prefix":"lemma", + "value":"de"}]}}, + "key":"| [lemma=\"de\"] |"}, + { + "mean":55.123732635459874, + "sum":6698195, + "n":121512, + "group":{"hit":{"0":[{ + "prefix":"lemma", + "value":"het"}]}}, + "key":"| [lemma=\"het\"] |"}, + { + "mean":46.594516509433966, + "sum":5531701, + "n":118720, + "group":{"hit":{"0":[{ + "prefix":"lemma", + "value":"een"}]}}, + "key":"| [lemma=\"een\"] |"}]}]} +``` + +<a name="inside_left_hit"></a> + +### Left inside hit + +**Example** +Grouping over prefix `lemma` at position `0` from the left and prefix `pos` at position `1-3` from the left inside the hit for CQL query `[pos="LID"][pos="ADJ"]`. 
+ +**Request and response** +`q=*:*&rows=0&mtas=true&mtas.group=true&mtas.group.0.field=text&mtas.group.0.query.type=cql&mtas.group.0.query.value=[pos="LID"][pos="ADJ"]&mtas.group.0.grouping.hit.insideLeft.0.prefixes=lemma&mtas.group.0.grouping.hit.insideLeft.0.position=0&mtas.group.0.grouping.hit.insideLeft.1.prefixes=pos&mtas.group.0.grouping.hit.insideLeft.1.position=1-3&mtas.group.0.number=3&wt=json&indent=true` + +``` json +"mtas":{ + "group":[{ + "key":"0", + "listTotal":72, + "list":[{ + "mean":31.155598846589545, + "sum":3630375, + "n":116524, + "group":{"hit":{ + "0":[{ + "prefix":"lemma", + "value":"de"}], + "1":[{ + "prefix":"pos", + "value":"ADJ"}]}}, + "key":"| [lemma=\"de\"] [pos=\"ADJ\"] |"}, + { + "mean":17.898333524005643, + "sum":1877392, + "n":104892, + "group":{"hit":{ + "0":[{ + "prefix":"lemma", + "value":"een"}], + "1":[{ + "prefix":"pos", + "value":"ADJ"}]}}, + "key":"| [lemma=\"een\"] [pos=\"ADJ\"] |"}, + { + "mean":13.61732368967055, + "sum":1404518, + "n":103142, + "group":{"hit":{ + "0":[{ + "prefix":"lemma", + "value":"het"}], + "1":[{ + "prefix":"pos", + "value":"ADJ"}]}}, + "key":"| [lemma=\"het\"] [pos=\"ADJ\"] |"}]}]} +``` + +<a name="inside_right_hit"></a> + +### Right inside hit + +**Example** +Grouping over prefix `lemma` at position `0` from the right and prefix `pos` at position `1-3` from the right inside the hit for CQL query `[pos="LID"][pos="ADJ"]`. 
+ +**Request and response** +`q=*:*&rows=0&mtas=true&mtas.group=true&mtas.group.0.field=text&mtas.group.0.query.type=cql&mtas.group.0.query.value=[pos="LID"][pos="ADJ"]&mtas.group.0.grouping.hit.insideRight.0.prefixes=lemma&mtas.group.0.grouping.hit.insideRight.0.position=1&mtas.group.0.grouping.hit.insideRight.1.prefixes=pos&mtas.group.0.grouping.hit.insideRight.1.position=1-3&mtas.group.0.number=3&wt=json&indent=true` + +``` json +"mtas":{ + "group":[{ + "key":"0", + "listTotal":72, + "list":[{ + "mean":31.155598846589545, + "sum":3630375, + "n":116524, + "group":{"hit":{ + "0":[{ + "prefix":"lemma", + "value":"de"}, + { + "prefix":"pos", + "value":"LID"}], + "1":null}}, + "key":"| [lemma=\"de\" & pos=\"LID\"] [] |"}, + { + "mean":17.898333524005643, + "sum":1877392, + "n":104892, + "group":{"hit":{ + "0":[{ + "prefix":"lemma", + "value":"een"}, + { + "prefix":"pos", + "value":"LID"}], + "1":null}}, + "key":"| [lemma=\"een\" & pos=\"LID\"] [] |"}, + { + "mean":13.61732368967055, + "sum":1404518, + "n":103142, + "group":{"hit":{ + "0":[{ + "prefix":"lemma", + "value":"het"}, + { + "prefix":"pos", + "value":"LID"}], + "1":null}}, + "key":"| [lemma=\"het\" & pos=\"LID\"] [] |"}]}]} +``` + +<a name="left_hit"></a> + +### Left hit + +**Example** +Grouping over prefixes `lemma` and `pos` on position `3` from the left for CQL query `[pos="ADJ"]{2} followedby [][pos="LID"]`. 
+ +**Request and response** +`q=*:*&rows=0&mtas=true&mtas.group=true&mtas.group.0.field=NLContent_mtas&mtas.group.0.query.type=cql&mtas.group.0.query.value=[pos="ADJ"]{2} followedby [][pos="LID"]&mtas.group.0.grouping.hit.left.0.prefixes=pos,lemma&mtas.group.0.grouping.hit.left.0.position=3&mtas.group.0.number=3&wt=json&indent=true` + +``` json +"mtas":{ + "group":[{ + "key":"0", + "listTotal":12, + "list":[{ + "mean":1.791719691185204, + "sum":63357, + "n":35361, + "group":{ + "hit":{ + "0":null, + "1":null}, + "right":{ + "0":null, + "1":[{ + "prefix":"lemma", + "value":"de"}, + { + "prefix":"pos", + "value":"LID"}]}}, + "key":"| [] [] | [] [lemma=\"de\" & pos=\"LID\"]"}, + { + "mean":1.248066748066748, + "sum":18399, + "n":14742, + "group":{ + "hit":{ + "0":null, + "1":null}, + "right":{ + "0":null, + "1":[{ + "prefix":"lemma", + "value":"het"}, + { + "prefix":"pos", + "value":"LID"}]}}, + "key":"| [] [] | [] [lemma=\"het\" & pos=\"LID\"]"}, + { + "mean":1.2065838092038965, + "sum":14368, + "n":11908, + "group":{ + "hit":{ + "0":null, + "1":null}, + "right":{ + "0":null, + "1":[{ + "prefix":"lemma", + "value":"een"}, + { + "prefix":"pos", + "value":"LID"}]}}, + "key":"| [] [] | [] [lemma=\"een\" & pos=\"LID\"]"}]}]} +``` + +<a name="right_hit"></a> + +### Right hit + +**Example** +Grouping over prefix `pos` and `lemma` on position `3` from the right for CQL query `[pos="ADJ"]{2} precededby [pos="LID"][]`. 
+ +**Request and response** +`q=*:*&rows=0&mtas=true&mtas.group=true&mtas.group.0.field=text&mtas.group.0.query.type=cql&mtas.group.0.query.value=[pos="ADJ"]{2} precededby [pos="LID"][]&mtas.group.0.grouping.hit.right.0.prefixes=pos,lemma&mtas.group.0.grouping.hit.right.0.position=3&mtas.group.0.number=3&wt=json&indent=true` + +``` json +"mtas":{ + "group":[{ + "key":"0", + "listTotal":20, + "list":[{ + "mean":1.632708503124151, + "sum":48080, + "n":29448, + "group":{ + "hit":{ + "0":null, + "1":null}, + "left":{ + "0":null, + "1":[{ + "prefix":"lemma", + "value":"de"}, + { + "prefix":"pos", + "value":"LID"}]}}, + "key":"[] [lemma=\"de\" & pos=\"LID\"] | [] [] |"}, + { + "mean":1.4123518709740865, + "sum":28723, + "n":20337, + "group":{ + "hit":{ + "0":null, + "1":null}, + "left":{ + "0":null, + "1":[{ + "prefix":"lemma", + "value":"een"}, + { + "prefix":"pos", + "value":"LID"}]}}, + "key":"[] [lemma=\"een\" & pos=\"LID\"] | [] [] |"}, + { + "mean":1.255492025278363, + "sum":16688, + "n":13292, + "group":{ + "hit":{ + "0":null, + "1":null}, + "left":{ + "0":null, + "1":[{ + "prefix":"lemma", + "value":"het"}, + { + "prefix":"pos", + "value":"LID"}]}}, + "key":"[] [lemma=\"het\" & pos=\"LID\"] | [] [] |"}]}]} +``` + +--- + +<a name="left"></a> + +### Left + +**Example** +Grouping over prefixes `lemma` and `pos` on position `1` at the left side for CQL query `[pos="ADJ"]{2} precededby [pos="LID"][]`. 
+ +**Request and response** +`q=*:*&rows=0&mtas=true&mtas.group=true&mtas.group.0.field=NLContent_mtas&mtas.group.0.query.type=cql&mtas.group.0.query.value=[pos="ADJ"]{2} precededby [pos="LID"][]&mtas.group.0.grouping.left.0.prefixes=pos,lemma&mtas.group.0.grouping.left.0.position=1&mtas.group.0.number=3&wt=json&indent=true` + +``` json +"mtas":{ + "group":[{ + "key":"0", + "listTotal":20, + "list":[{ + "mean":1.632708503124151, + "sum":48080, + "n":29448, + "group":{ + "hit":{ + "0":null, + "1":null}, + "left":{ + "0":null, + "1":[{ + "prefix":"lemma", + "value":"de"}, + { + "prefix":"pos", + "value":"LID"}]}}, + "key":"[] [lemma=\"de\" & pos=\"LID\"] | [] [] |"}, + { + "mean":1.4123518709740865, + "sum":28723, + "n":20337, + "group":{ + "hit":{ + "0":null, + "1":null}, + "left":{ + "0":null, + "1":[{ + "prefix":"lemma", + "value":"een"}, + { + "prefix":"pos", + "value":"LID"}]}}, + "key":"[] [lemma=\"een\" & pos=\"LID\"] | [] [] |"}, + { + "mean":1.255492025278363, + "sum":16688, + "n":13292, + "group":{ + "hit":{ + "0":null, + "1":null}, + "left":{ + "0":null, + "1":[{ + "prefix":"lemma", + "value":"het"}, + { + "prefix":"pos", + "value":"LID"}]}}, + "key":"[] [lemma=\"het\" & pos=\"LID\"] | [] [] |"}]}]} +``` + +--- + +<a name="right"></a> + +### Right + +**Example** +Grouping over prefixes `lemma` and `pos` on position `1` at the right side for CQL query `[pos="ADJ"]{2} followedby [][pos="LID"]`. 
+ +**Request and response** +`q=*:*&rows=0&mtas=true&mtas.group=true&mtas.group.0.field=NLContent_mtas&mtas.group.0.query.type=cql&mtas.group.0.query.value=[pos="ADJ"]{2} followedby [][pos="LID"]&mtas.group.0.grouping.right.0.prefixes=pos,lemma&mtas.group.0.grouping.right.0.position=1&mtas.group.0.number=3&wt=json&indent=true` + +``` json +"mtas":{ + "group":[{ + "key":"0", + "listTotal":12, + "list":[{ + "mean":1.791719691185204, + "sum":63357, + "n":35361, + "group":{ + "hit":{ + "0":null, + "1":null}, + "right":{ + "0":null, + "1":[{ + "prefix":"lemma", + "value":"de"}, + { + "prefix":"pos", + "value":"LID"}]}}, + "key":"| [] [] | [] [lemma=\"de\" & pos=\"LID\"]"}, + { + "mean":1.248066748066748, + "sum":18399, + "n":14742, + "group":{ + "hit":{ + "0":null, + "1":null}, + "right":{ + "0":null, + "1":[{ + "prefix":"lemma", + "value":"het"}, + { + "prefix":"pos", + "value":"LID"}]}}, + "key":"| [] [] | [] [lemma=\"het\" & pos=\"LID\"]"}, + { + "mean":1.2065838092038965, + "sum":14368, + "n":11908, + "group":{ + "hit":{ + "0":null, + "1":null}, + "right":{ + "0":null, + "1":[{ + "prefix":"lemma", + "value":"een"}, + { + "prefix":"pos", + "value":"LID"}]}}, + "key":"| [] [] | [] [lemma=\"een\" & pos=\"LID\"]"}]}]} +``` + + +**Lucene** + +To group results [directly in Lucene](installation_lucene.html), *ComponentGroup* together with the provided *collect* method can be used. diff --git a/src/site/markdown/search_component_kwic.md b/src/site/markdown/search_component_kwic.md new file mode 100644 index 0000000..e145447 --- /dev/null +++ b/src/site/markdown/search_component_kwic.md @@ -0,0 +1,326 @@ +# Kwic + +Mtas can produce keywords in context (kwic) for Mtas queries within the listed documents. To get this information, in Solr requests, besides the parameter to enable the [Mtas query component](search_component.html), the following parameter should be provided. 
+ +| Parameter | Value | Obligatory | +|-----------------------|--------|-------------| +| mtas.kwic | true | yes | + +Keyword in context results on multiple spans can be produced within the same request. To distinguish them, a unique identifier has to be provided for each of the required kwics. + +| Parameter | Value | Info | Obligatory | +|-------------------------------------------------|--------------|--------------------------------|-------------| +| mtas.kwic.\<identifier\>.key | \<string\> | key used in response | no | +| mtas.kwic.\<identifier\>.field | \<string\> | Mtas field | yes | +| mtas.kwic.\<identifier\>.query.type | \<string\> | query language: [cql](search_cql.html) | yes | +| mtas.kwic.\<identifier\>.query.value | \<string\> | query: [cql](search_cql.html) | yes | +| mtas.kwic.\<identifier\>.query.prefix | \<string\> | default prefix | no | +| mtas.kwic.\<identifier\>.query.ignore | \<string\> | ignore query: [cql](search_cql.html) | no | +| mtas.kwic.\<identifier\>.query.maximumIgnoreLength | \<integer\> | maximum number of succeeding occurrences to ignore | no | +| mtas.kwic.\<identifier\>.prefix | \<string\> | comma separated list of prefixes | no | +| mtas.kwic.\<identifier\>.number | \<double\> | maximum number for selection of items for each document | no | +| mtas.kwic.\<identifier\>.start | \<double\> | offset for selection of items for each document | no | +| mtas.kwic.\<identifier\>.left | \<double\> | number of positions left of hit | no | +| mtas.kwic.\<identifier\>.right | \<double\> | number of positions right of hit | no | +| mtas.kwic.\<identifier\>.output | \<string\> | "token" or "hit" | no | + + +## Variables + +The query may contain one or more variables, and the value(s) of these variables have to be defined + +| Parameter | Value | Info | Obligatory | +|-------------------------------------------------|--------------|--------------------------------|-------------| +| mtas.kwic.\<identifier\>.query.variable.\<identifier 
variable\>.name | \<string\> | name of variable | yes | +| mtas.kwic.\<identifier\>.query.variable.\<identifier variable\>.value | \<string\> | comma separated list of values | yes | + +--- + +## Examples +1. [Token](#token) : List of tokens with prefix *t*, *pos* and *s* for adjectives followed by a noun +2. [Hit](#hit) : List of hits with prefix *t*, *pos* and *s* for articles followed by an adjective and a noun +3. [Left and Right](#left-and-right) : List of tokens with prefix *t* and *s* for sentences starting with an article, expanded to the left and the right +--- + +<a name="token"></a> + +### Token + +**Example** +Keyword in context with output type *token* and prefixes *t*, *pos* and *s* for adjectives followed by a noun + +**CQL** +`[pos="ADJ"][pos="N"]` + +**Request and response** +`fq=%7B%21mtas_cql+field%3D%22text%22+query%3D%22%5Bpos%3D%5C%22ADJ%5C%22%5D%5Bpos%3D%5C%22N%5C%22%5D%22+++%7D&q=%2A%3A%2A&mtas=true&mtas.kwic=true&mtas.kwic.0.field=text&mtas.kwic.0.query.type=cql&mtas.kwic.0.query.value=%5Bpos%3D%22ADJ%22%5D%5Bpos%3D%22N%22%5D&mtas.kwic.0.key=adjective%2Bnoun&mtas.kwic.0.prefix=t%2Cpos%2Cs&mtas.kwic.0.output=token&mtas.kwic.0.number=2&mtas.kwic.0.start=0&mtas.kwic.0.left=0&mtas.kwic.0.right=0&fl=%2A&start=0&rows=1&wt=json&indent=true` + +```json +"mtas":{ + "kwic":[{ + "key":"adjective+noun", + "list":[{ + "documentKey":"61d2a1b3-9068-4815-ba4d-3370e5a809d7", + "documentTotal":31, + "documentMinPosition":0, + "documentMaxPosition":673, + "list":[{ + "startPosition":0, + "endPosition":1, + "tokens":[{ + "mtasId":8, + "prefix":"t", + "value":"fusiebedrijf", + "positionStart":1, + "positionEnd":1, + "parentMtasId":81}, + { + "mtasId":15, + "prefix":"pos", + "value":"N", + "positionStart":1, + "positionEnd":1}, + { + "mtasId":81, + "prefix":"s", + "value":"", + "positionStart":0, + "positionEnd":8, + "parentMtasId":82}, + { + "mtasId":0, + "prefix":"t", + "value":"Nieuw", + "positionStart":0, + "positionEnd":0, + "parentMtasId":81}, + { + 
"mtasId":5, + "prefix":"pos", + "value":"ADJ", + "positionStart":0, + "positionEnd":0}]}, + { + "startPosition":5, + "endPosition":6, + "tokens":[{ + "mtasId":45, + "prefix":"t", + "value":"Belgische", + "positionStart":5, + "positionEnd":5, + "parentMtasId":81}, + { + "mtasId":51, + "prefix":"pos", + "value":"ADJ", + "positionStart":5, + "positionEnd":5}, + { + "mtasId":55, + "prefix":"t", + "value":"energiemarkt", + "positionStart":6, + "positionEnd":6, + "parentMtasId":81}, + { + "mtasId":62, + "prefix":"pos", + "value":"N", + "positionStart":6, + "positionEnd":6}, + { + "mtasId":81, + "prefix":"s", + "value":"", + "positionStart":0, + "positionEnd":8, + "parentMtasId":82}]}]}]}]} +``` + + +<a name="hit"></a> + +### Hit + +**Example** +Keyword in context with output type *hit* and prefixes *t*, *pos* and *s* for articles followed by an adjective and a noun + +**CQL** +`[pos="LID"][pos="ADJ"][pos="N"]` + +**Request and response** +`fq=%7B%21mtas_cql+field%3D%22text%22+query%3D%22%5Bpos%3D%5C%22LID%5C%22%5D%5Bpos%3D%5C%22ADJ%5C%22%5D%5Bpos%3D%5C%22N%5C%22%5D%22+++%7D&q=%2A%3A%2A&mtas=true&mtas.kwic=true&mtas.kwic.0.field=text&mtas.kwic.0.query.type=cql&mtas.kwic.0.query.value=%5Bpos%3D%22LID%22%5D%5Bpos%3D%22ADJ%22%5D%5Bpos%3D%22N%22%5D&mtas.kwic.0.key=article%2Badjective%2Bnoun&mtas.kwic.0.prefix=t%2Cpos%2Cs&mtas.kwic.0.output=hit&mtas.kwic.0.number=2&mtas.kwic.0.start=0&mtas.kwic.0.left=0&mtas.kwic.0.right=0&fl=%2A&start=0&rows=1&wt=json&indent=true` + +```json +"mtas":{ + "kwic":[{ + "key":"article+adjective+noun", + "list":[{ + "documentKey":"61d2a1b3-9068-4815-ba4d-3370e5a809d7", + "documentTotal":21, + "documentMinPosition":0, + "documentMaxPosition":673, + "list":[{ + "hit":{ + "92":[["t", + "De"], + ["pos", + "LID"], + ["s", + null]], + "93":[["t", + "nieuwe"], + ["pos", + "ADJ"], + ["s", + null]], + "94":[["t", + "fusiegroep"], + ["pos", + "N"], + ["s", + null]]}}, + { + "hit":{ + "106":[["t", + "De"], + ["pos", + "LID"], + ["s", + null]], + "107":[["t", 
+ "Belgische"], + ["pos", + "ADJ"], + ["s", + null]], + "108":[["t", + "regering"], + ["pos", + "N"], + ["s", + null]]}}]}]}]} +``` + +--- + +<a name="left-and-right"></a> + +### Left and Right + +**Example** +Keyword in context with output type *token* and prefixes *t* and *s* for sentences starting with an article, expanded two positions to the left and one position to the right + +**CQL** +`<s>[pos="LID"]` + +**Request and response** +`fq=%7B%21mtas_cql+field%3D%22text%22+query%3D%22%3Cs%3E%5Bpos%3D%5C%22LID%5C%22%5D%22+++%7D&q=%2A%3A%2A&mtas=true&mtas.kwic=true&mtas.kwic.0.field=text&mtas.kwic.0.query.type=cql&mtas.kwic.0.query.value=%3Cs%3E%5Bpos%3D%22LID%22%5D&mtas.kwic.0.key=sentence+starting+with+article&mtas.kwic.0.prefix=t%2Cs&mtas.kwic.0.output=token&mtas.kwic.0.number=2&mtas.kwic.0.start=0&mtas.kwic.0.left=2&mtas.kwic.0.right=1&fl=%2A&start=0&rows=1&wt=json&indent=true` + +```json +"mtas":{ + "kwic":[{ + "key":"sentence starting with article", + "list":[{ + "documentKey":"61d2a1b3-9068-4815-ba4d-3370e5a809d7", + "documentTotal":10, + "documentMinPosition":0, + "documentMaxPosition":673, + "list":[{ + "startPosition":14, + "endPosition":14, + "tokens":[{ + "mtasId":136, + "prefix":"t", + "value":"fusiegroep", + "positionStart":15, + "positionEnd":15, + "parentMtasId":295}, + { + "mtasId":295, + "prefix":"s", + "value":"", + "positionStart":14, + "positionEnd":36, + "parentMtasId":417}, + { + "mtasId":128, + "prefix":"t", + "value":"De", + "positionStart":14, + "positionEnd":14, + "parentMtasId":295}, + { + "mtasId":113, + "prefix":"t", + "value":"afslanking", + "positionStart":13, + "positionEnd":13, + "parentMtasId":126}, + { + "mtasId":107, + "prefix":"t", + "value":"tot", + "positionStart":12, + "positionEnd":12, + "parentMtasId":126}, + { + "mtasId":126, + "prefix":"s", + "value":"", + "positionStart":9, + "positionEnd":13, + "parentMtasId":127}]}, + { + "startPosition":92, + "endPosition":92, + "tokens":[{ + "mtasId":729, + "prefix":"t", + 
"value":".", + "positionStart":91, + "positionEnd":91, + "parentMtasId":737}, + { + "mtasId":746, + "prefix":"t", + "value":"nieuwe", + "positionStart":93, + "positionEnd":93, + "parentMtasId":853}, + { + "mtasId":738, + "prefix":"t", + "value":"De", + "positionStart":92, + "positionEnd":92, + "parentMtasId":853}, + { + "mtasId":853, + "prefix":"s", + "value":"", + "positionStart":92, + "positionEnd":105, + "parentMtasId":1114}, + { + "mtasId":723, + "prefix":"t", + "value":"Parijs", + "positionStart":90, + "positionEnd":90, + "parentMtasId":737}, + { + "mtasId":737, + "prefix":"s", + "value":"", + "positionStart":59, + "positionEnd":91, + "parentMtasId":1114}]}]}]}]} +``` + +--- + +##Lucene + +To use keywords in context [directly in Lucene](installation_lucene.html), *ComponentKwic* together with the provided *collect* method can be used. diff --git a/src/site/markdown/search_component_list.md b/src/site/markdown/search_component_list.md new file mode 100644 index 0000000..feeff11 --- /dev/null +++ b/src/site/markdown/search_component_list.md @@ -0,0 +1,347 @@ +#List + +Mtas can retrieve list of hits for Mtas queries within the (filtered) set of documents. To get this information, in Solr requests, besides the parameter to enable the [Mtas query component](search_component.html), the following parameter should be provided. + +| Parameter | Value | Obligatory | +|-----------------------|--------|-------------| +| mtas.list | true | yes | + +List results on multiple spans can be produced within the same request. To distinguish them, a unique identifier has to be provided for each of the required lists. 
+ +| Parameter | Value | Info | Obligatory | +|-------------------------------------------------|--------------|--------------------------------|-------------| +| mtas.list.\<identifier\>.key | \<string\> | key used in response | no | +| mtas.list.\<identifier\>.field | \<string\> | Mtas field | yes | +| mtas.list.\<identifier\>.query.type | \<string\> | query language: [cql](search_cql.html) | yes | +| mtas.list.\<identifier\>.query.value | \<string\> | query: [cql](search_cql.html) | yes | +| mtas.list.\<identifier\>.query.prefix | \<string\> | default prefix | no | +| mtas.list.\<identifier\>.query.ignore | \<string\> | ignore query: [cql](search_cql.html) | no | +| mtas.list.\<identifier\>.query.maximumIgnoreLength | \<integer\> | maximum number of succeeding occurrences to ignore | no | +| mtas.list.\<identifier\>.prefix | \<string\> | comma separated list of prefixes | no | +| mtas.list.\<identifier\>.number | \<double\> | maximum number of items in list | no | +| mtas.list.\<identifier\>.start | \<double\> | offset for selection of items in list | no | +| mtas.list.\<identifier\>.left | \<double\> | number of positions left of hit | no | +| mtas.list.\<identifier\>.right | \<double\> | number of positions right of hit | no | +| mtas.list.\<identifier\>.output | \<string\> | "token" or "hit" | no | + +## Variables + +The query may contain one or more variables, and the value(s) of these variables have to be defined + +| Parameter | Value | Info | Obligatory | +|-------------------------------------------------|--------------|--------------------------------|-------------| +| mtas.list.\<identifier\>.query.variable.\<identifier variable\>.name | \<string\> | name of variable | yes | +| mtas.list.\<identifier\>.query.variable.\<identifier variable\>.value | \<string\> | comma separated list of values | yes | + +--- + +## Examples +1. [Token](#token) : List of tokens with prefix *t*, *pos* and *s* for adjectives followed by a noun +2. 
[Hit](#hit) : List of hits with prefix *t*, *pos* and *s* for articles followed by an adjective and a noun +3. [Left and Right](#left-and-right) : List of tokens with prefix *t* and *s* for sentences starting with an article, expanded to the left and the right +--- + +<a name="token"></a> + +### Token + +**Example** +List with output type *token* and prefixes *t*, *pos* and *s* for adjectives followed by a noun + +**CQL** +`[pos="ADJ"][pos="N"]` + +**Request and response** +`fq=%7B%21mtas_cql+field%3D%22text%22+query%3D%22%5Bpos%3D%5C%22ADJ%5C%22%5D%5Bpos%3D%5C%22N%5C%22%5D%22+++%7D&q=%2A%3A%2A&mtas=true&mtas.list=true&mtas.list.0.field=text&mtas.list.0.query.type=cql&mtas.list.0.query.value=%5Bpos%3D%22ADJ%22%5D%5Bpos%3D%22N%22%5D&mtas.list.0.key=adjective%2Bnoun&mtas.list.0.prefix=t%2Cpos%2Cs&mtas.list.0.output=token&mtas.list.0.number=2&mtas.list.0.start=0&mtas.list.0.left=0&mtas.list.0.right=0&fl=%2A&rows=0&wt=json&indent=true` + +```json +"mtas":{ + "list":[{ + "key":"adjective+noun", + "number":2, + "list":[{ + "documentKey":"44e5620c-011c-11e4-b0ff-51bcbd7c379f", + "documentHitPosition":0, + "documentHitTotal":239, + "documentMinPosition":0, + "documentMaxPosition":6385, + "startPosition":29, + "endPosition":30, + "tokens":[{ + "mtasId":191, + "prefix":"t", + "value":"beknopte", + "positionStart":29, + "positionEnd":29, + "parentMtasId":337}, + { + "mtasId":197, + "prefix":"pos", + "value":"ADJ", + "positionStart":29, + "positionEnd":29}, + { + "mtasId":199, + "prefix":"t", + "value":"levensschets", + "positionStart":30, + "positionEnd":30, + "parentMtasId":337}, + { + "mtasId":204, + "prefix":"pos", + "value":"N", + "positionStart":30, + "positionEnd":30}, + { + "mtasId":337, + "prefix":"s", + "value":"", + "positionStart":7, + "positionEnd":49, + "parentMtasId":1152}]}, + { + "documentKey":"44e5620c-011c-11e4-b0ff-51bcbd7c379f", + "documentHitPosition":1, + "documentHitTotal":239, + "documentMinPosition":0, + "documentMaxPosition":6385, + 
"startPosition":56, + "endPosition":57, + "tokens":[{ + "mtasId":380, + "prefix":"t", + "value":"gebied", + "positionStart":57, + "positionEnd":57, + "parentMtasId":610}, + { + "mtasId":387, + "prefix":"pos", + "value":"N", + "positionStart":57, + "positionEnd":57}, + { + "mtasId":373, + "prefix":"t", + "value":"velerlei", + "positionStart":56, + "positionEnd":56, + "parentMtasId":610}, + { + "mtasId":378, + "prefix":"pos", + "value":"ADJ", + "positionStart":56, + "positionEnd":56}, + { + "mtasId":610, + "prefix":"s", + "value":"", + "positionStart":50, + "positionEnd":90, + "parentMtasId":1152}]}]}]} +``` + + +<a name="hit"></a> + +### Hit + +**Example** +List with output type *hit* and prefixes *t*, *pos* and *s* for articles followed by an adjective and a noun + +**CQL** +`[pos="LID"][pos="ADJ"][pos="N"]` + +**Request and response** +`fq=%7B%21mtas_cql+field%3D%22text%22+query%3D%22%5Bpos%3D%5C%22LID%5C%22%5D%5Bpos%3D%5C%22ADJ%5C%22%5D%5Bpos%3D%5C%22N%5C%22%5D%22+++%7D&q=%2A%3A%2A&mtas=true&mtas.list=true&mtas.list.0.field=text&mtas.list.0.query.type=cql&mtas.list.0.query.value=%5Bpos%3D%22LID%22%5D%5Bpos%3D%22ADJ%22%5D%5Bpos%3D%22N%22%5D&mtas.list.0.key=article%2Badjective%2Bnoun&mtas.list.0.prefix=t%2Cpos%2Cs&mtas.list.0.output=hit&mtas.list.0.number=2&mtas.list.0.start=0&mtas.list.0.left=0&mtas.list.0.right=0&fl=%2A&rows=0&wt=json&indent=true` + +```json +"mtas":{ + "list":[{ + "key":"article+adjective+noun", + "number":2, + "list":[{ + "documentKey":"44e5620c-011c-11e4-b0ff-51bcbd7c379f", + "documentHitPosition":0, + "documentHitTotal":80, + "documentMinPosition":0, + "documentMaxPosition":6385, + "startPosition":210, + "endPosition":212, + "hit":{ + "210":[["t", + "het"], + ["pos", + "LID"], + ["s", + null]], + "211":[["t", + "Middelbaar"], + ["pos", + "ADJ"], + ["s", + null]], + "212":[["t", + "Onderwijs"], + ["pos", + "N"], + ["s", + null]]}}, + { + "documentKey":"44e5620c-011c-11e4-b0ff-51bcbd7c379f", + "documentHitPosition":1, + "documentHitTotal":80, + 
"documentMinPosition":0, + "documentMaxPosition":6385, + "startPosition":237, + "endPosition":239, + "hit":{ + "237":[["t", + "het"], + ["pos", + "LID"], + ["s", + null]], + "238":[["t", + "Middelbaar"], + ["pos", + "ADJ"], + ["s", + null]], + "239":[["t", + "Onderwijs"], + ["pos", + "N"], + ["s", + null]]}}]}]} +``` + +--- + +<a name="left-and-right"></a> + +### Left and Right + +**Example** +List with output type *token* and prefixes *t* and *s* for sentences starting with an article, expanded two positions to the left and one position to the right + +**CQL** +`<s>[pos="LID"]` + +**Request and response** +`fq=%7B%21mtas_cql+field%3D%22text%22+query%3D%22%3Cs%3E%5Bpos%3D%5C%22LID%5C%22%5D%22+++%7D&q=%2A%3A%2A&mtas=true&mtas.list=true&mtas.list.0.field=text&mtas.list.0.query.type=cql&mtas.list.0.query.value=%3Cs%3E%5Bpos%3D%22LID%22%5D&mtas.list.0.key=sentence+starting+with+article&mtas.list.0.prefix=t%2Cs&mtas.list.0.output=token&mtas.list.0.number=2&mtas.list.0.start=0&mtas.list.0.left=2&mtas.list.0.right=1&fl=%2A&rows=0&wt=json&indent=true` + +```json +"mtas":{ + "list":[{ + "key":"sentence starting with article", + "number":2, + "list":[{ + "documentKey":"44e5620c-011c-11e4-b0ff-51bcbd7c379f", + "documentHitPosition":0, + "documentHitTotal":18, + "documentMinPosition":0, + "documentMaxPosition":6385, + "startPosition":378, + "endPosition":378, + "tokens":[{ + "mtasId":2534, + "prefix":"t", + "value":"leven", + "positionStart":379, + "positionEnd":379, + "parentMtasId":2914}, + { + "mtasId":2517, + "prefix":"t", + "value":".", + "positionStart":377, + "positionEnd":377, + "parentMtasId":2526}, + { + "mtasId":2527, + "prefix":"t", + "value":"Het", + "positionStart":378, + "positionEnd":378, + "parentMtasId":2914}, + { + "mtasId":2914, + "prefix":"s", + "value":"", + "positionStart":378, + "positionEnd":433, + "parentMtasId":2915}, + { + "mtasId":2512, + "prefix":"t", + "value":"Landbouwkundige", + "positionStart":376, + "positionEnd":376, + "parentMtasId":2526}, 
+ { + "mtasId":2526, + "prefix":"s", + "value":"", + "positionStart":307, + "positionEnd":377, + "parentMtasId":2915}]}, + { + "documentKey":"44e5620c-011c-11e4-b0ff-51bcbd7c379f", + "documentHitPosition":1, + "documentHitTotal":18, + "documentMinPosition":0, + "documentMaxPosition":6385, + "startPosition":878, + "endPosition":878, + "tokens":[{ + "mtasId":5794, + "prefix":"t", + "value":"De", + "positionStart":878, + "positionEnd":878, + "parentMtasId":5999}, + { + "mtasId":5801, + "prefix":"t", + "value":"eerzucht", + "positionStart":879, + "positionEnd":879, + "parentMtasId":5999}, + { + "mtasId":5999, + "prefix":"s", + "value":"", + "positionStart":878, + "positionEnd":908, + "parentMtasId":6305}, + { + "mtasId":5779, + "prefix":"t", + "value":"bewaarheid", + "positionStart":876, + "positionEnd":876, + "parentMtasId":5792}, + { + "mtasId":5786, + "prefix":"t", + "value":".", + "positionStart":877, + "positionEnd":877, + "parentMtasId":5792}, + { + "mtasId":5792, + "prefix":"s", + "value":"", + "positionStart":857, + "positionEnd":877, + "parentMtasId":5793}]}]}]} +``` + +--- + +**Lucene** + +To get a list of hits [directly in Lucene](installation_lucene.html), *ComponentList* together with the provided *collect* method can be used. diff --git a/src/site/markdown/search_component_prefix.md b/src/site/markdown/search_component_prefix.md new file mode 100644 index 0000000..b1d30b6 --- /dev/null +++ b/src/site/markdown/search_component_prefix.md @@ -0,0 +1,73 @@ +# Prefix + +Mtas can produce a list of available prefixes. To get this information, in Solr requests, besides the parameter to enable the [Mtas query component](search_component.html), the following parameter should be provided. + +| Parameter | Value | Obligatory | +|-----------------------|--------|-------------| +| mtas.prefix | true | yes | + +Information for multiple fields can be produced within the same request. 
To distinguish them, a unique identifier has to be provided for each of the required statistics. The list of available prefixes is independent of any restriction in the document set, and also prefixes of deleted documents can be taken into account when the core hasn't been optimized. + +| Parameter | Value | Info | Obligatory | +|-------------------------------------------------|--------------|--------------------------------|-------------| +| mtas.stats.prefix.\<identifier\>.key | \<string\> | key used in response | no | +| mtas.stats.prefix.\<identifier\>.field | \<string\> | Mtas field | yes | + +The *key* is added to the response and may be used to distinguish between multiple lists, and should therefore be unique. The response will contain three lists: prefixes strictly used for single position tokens, prefixes (also) used for multiple position tokens and prefixes used for multiple non adjacent positions. Notice that the last list will always be a subset of the second list. + +## Examples +1. [Basic](#basic) : list of available prefixes. + +<a name="basic"></a> + +### Basic + +**Example** +List of avilable prefixes. 
+ +**Request and response** +`q=*%3A*&mtas=true&mtas.prefix=true&mtas.prefix.0.field=text&mtas.prefix.0.key=example+-+basic&rows=0&wt=json&indent=true` + +``` json +"mtas":{ + "prefix":[{ + "key":"example - basic", + "singlePosition":["feat.buiging", + "feat.conjtype", + "feat.dial", + "feat.genus", + "feat.getal", + "feat.getal-n", + "feat.graad", + "feat.head", + "feat.lwtype", + "feat.naamval", + "feat.npagr", + "feat.ntype", + "feat.numtype", + "feat.pdtype", + "feat.persoon", + "feat.positie", + "feat.pvagr", + "feat.pvtijd", + "feat.spectype", + "feat.status", + "feat.vwtype", + "feat.vztype", + "feat.wvorm", + "lemma", + "morpheme", + "pos", + "t", + "t_lc"], + "multiplePosition":["div", + "entity", + "head", + "p", + "s"], + "setPosition":["entity"]}]} +``` + +**Lucene** + +To get a list of prefixes [directly in Lucene](installation_lucene.html), *ComponentPrefix* together with the provided *collect* method can be used. \ No newline at end of file diff --git a/src/site/markdown/search_component_stats.md b/src/site/markdown/search_component_stats.md new file mode 100644 index 0000000..6ba456c --- /dev/null +++ b/src/site/markdown/search_component_stats.md @@ -0,0 +1,10 @@ +#Statistics + +To get statistics in Solr requests, besides the parameter to enable the [Mtas query component](search_component.html), the following parameter should be used. + +| Parameter | Value | Obligatory | +|-------------|--------|-------------| +| mtas.stats | true | yes | + +Using this parameter, it is possible to add statistics on [positions](search_component_stats_positions.html), [tokens](search_component_stats_tokens.html) and [spans](search_component_stats_spans.html) to the response on a request. 
+ diff --git a/src/site/markdown/search_component_stats_positions.md b/src/site/markdown/search_component_stats_positions.md new file mode 100644 index 0000000..0e22b8d --- /dev/null +++ b/src/site/markdown/search_component_stats_positions.md @@ -0,0 +1,142 @@ +#Statistics - positions + +To get statistics on the number of positions within a set of documents in Solr requests, besides the parameter to enable [statistics](search_component_stats.html), the following parameter should be provided. + +| Parameter | Value | Obligatory | +|-----------------------|--------|-------------| +| mtas.stats.positions | true | yes | + +Multiple statistics on positions can be produced within the same request. +To distinguish them, a unique identifier has to be provided for +each of the required statistics. + +| Parameter | Value | Info | Obligatory | +|-------------------------------------------------|--------------|--------------------------------|-------------| +| mtas.stats.positions.\<identifier\>.key | \<string\> | key used in response | no | +| mtas.stats.positions.\<identifier\>.field | \<string\> | Mtas field | yes | +| mtas.stats.positions.\<identifier\>.type | \<string\> | required [type of statistics](search_stats.html) | no | +| mtas.stats.positions.\<identifier\>.minimum | \<double\> | minimum number of positions | no | +| mtas.stats.positions.\<identifier\>.maximum | \<double\> | maximum number of positions | no | + +The *key* is added to the response and may be used to distinguish between multiple statistics on positions, and should therefore be unique. The optional *minimum* and *maximum* can be used to focus only on documents satisfying a condition on the number of positions. + +--- + +## Examples +1. [Basic](#basic) : basic statistics on the number of positions. +2. [Minimum and maximum](#minimum-and-maximum) : statistics on the number of positions with restrictions on this number. +3. 
[Subset](#subset) : statistics on the number of positions within a subset of documents. + +--- + +<a name="basic"></a> + +### Basic + +**Example** +Total and average number of positions and the number of documents. + +**Request and response** +`q=*%3A*&rows=0&mtas=true&mtas.stats=true&mtas.stats.positions=true&mtas.stats.positions.0.field=text&mtas.stats.positions.0.key=example - basic&mtas.stats.positions.0.type=sum,mean,n&wt=json&indent=true` + +``` json +"mtas":{ + "stats":{ + "positions":[{ + "key":"example - basic", + "mean":244.26537188929916, + "sum":504361094, + "n":2064808}]}} +``` + +<a name="minimum-and-maximum"></a> + +### Minimum and maximum + +**Example** +Full statistics on positions for documents with a minimum of 100 positions, for documents with a maximum of 200 positions, and for documents with between 100 and 200 positions. + +**Request and response** +`q=*%3A*&rows=0&mtas=true&mtas.stats=true&mtas.stats.positions=true&mtas.stats.positions.0.field=text&mtas.stats.positions.0.key=example - minimum&mtas.stats.positions.0.type=all&mtas.stats.positions.0.minimum=100&mtas.stats.positions.1.field=text&mtas.stats.positions.1.key=example - maximum&mtas.stats.positions.1.type=all&mtas.stats.positions.1.maximum=200&mtas.stats.positions.2.field=text&mtas.stats.positions.2.key=example - minimum and maximum&mtas.stats.positions.2.type=all&mtas.stats.positions.2.minimum=100&mtas.stats.positions.2.maximum=200&wt=json&indent=true` + +``` json +"mtas":{ + "stats":{ + "positions":[{ + "key":"example - minimum", + "sumsq":4.407777345501E12, + "populationvariance":4021377.043206717, + "max":419252.0, + "sum":4.53494907E8, + "kurtosis":7589.040501278469, + "standarddeviation":2005.3380969650148, + "n":1047253, + "quadraticmean":2051.5590305379797, + "min":100.0, + "median":232.0, + "variance":4021380.883139267, + "mean":433.0328077360544, + "geometricmean":269.1549624469481, + "sumoflogs":5859681.392265234, + "skewness":70.39565176567714}, + { + "key":"example - 
maximum", + "sumsq":1.2589493055E10, + "populationvariance":2516.516960673755, + "max":200.0, + "sum":1.14146849E8, + "kurtosis":-0.5513713934014715, + "standarddeviation":50.164914844725146, + "n":1462493, + "quadraticmean":92.78060994263417, + "min":0.0, + "median":68.0, + "variance":2516.5186813785253, + "mean":78.04950109162947, + "geometricmean":0.0, + "sumoflogs":"-Infinity", + "skewness":0.6202671670124106}, + { + "key":"example - minimum and maximum", + "sumsq":9.370630488E9, + "populationvariance":832.9926334704653, + "max":200.0, + "sum":6.3280662E7, + "kurtosis":-1.0893405044786282, + "standarddeviation":28.861644194831847, + "n":444938, + "quadraticmean":145.12246855142547, + "min":100.0, + "median":139.0, + "variance":832.9945056290709, + "mean":142.22355024745016, + "geometricmean":139.3394542837307, + "sumoflogs":2196620.2289446634, + "skewness":0.31081665704505534}]}} +``` + +<a name="subset"></a> + +### Subset + +**Example** +Total and average number of positions and the number of documents for a subset of documents. + +**Request and response** +`q=text:koe&rows=0&mtas=true&mtas.stats=true&mtas.stats.positions=true&mtas.stats.positions.0.field=text&mtas.stats.positions.0.key=example - subset&mtas.stats.positions.0.type=sum,mean,n&wt=json&indent=true` + +``` json +"mtas":{ + "stats":{ + "positions":[{ + "key":"example - subset", + "mean":5265.321033210332, + "sum":14269020, + "n":2710}]}} +``` + +--- + +##Lucene + +To use statistics on the number of positions [directly in Lucene](installation_lucene.html), *ComponentPosition* together with the provided *collect* method can be used. 
diff --git a/src/site/markdown/search_component_stats_spans.md b/src/site/markdown/search_component_stats_spans.md new file mode 100644 index 0000000..4a2a0ec --- /dev/null +++ b/src/site/markdown/search_component_stats_spans.md @@ -0,0 +1,416 @@ +#Statistics - spans + +To get statistics on the occurrence of a span within a set of documents in Solr requests, besides the parameter to enable [statistics](search_component_stats.html), the following parameter should be provided. + +| Parameter | Value | Obligatory | +|-----------------------|--------|-------------| +| mtas.stats.spans | true | yes | + +Multiple statistics on the occurrence of a span can be produced within the same request. To distinguish them, a unique identifier has to be provided for each of the required statistics. Furthermore, statistics for the occurrence of multiple spans can be produced. Spans are described by a query, and to distinguish multiple spans, also a query identifier has to be provided. + +| Parameter | Value | Info | Obligatory | +|-------------------------------------------------|--------------|--------------------------------|-------------| +| mtas.stats.spans.\<identifier\>.key | \<string\> | key used in response | no | +| mtas.stats.spans.\<identifier\>.field | \<string\> | Mtas field | yes | +| mtas.stats.spans.\<identifier\>.query.\<identifier query\>.type | \<string\> | query language: [cql](search_cql.html) | yes | +| mtas.stats.spans.\<identifier\>.query.\<identifier query\>.value | \<string\> | query: [cql](search_cql.html) | yes | +| mtas.stats.spans.\<identifier\>.query.\<identifier query\>.prefix | \<string\> | default prefix | no | +| mtas.stats.spans.\<identifier\>.query.\<identifier query\>.ignore | \<string\> | ignore query: [cql](search_cql.html) | no | +| mtas.stats.spans.\<identifier\>.query.\<identifier query\>.maximumIgnoreLength | \<integer\> | maximum number of succeeding occurrences to ignore | no | +| mtas.stats.spans.\<identifier\>.type | \<string\> | 
required [type of statistics](search_stats.html) | no | +| mtas.stats.spans.\<identifier\>.minimum | \<double\> | minimum number of occurrences of the span | no | +| mtas.stats.spans.\<identifier\>.maximum | \<double\> | maximum number of occurrences of the span | no | + +The *key* is added to the response and may be used to distinguish between multiple statistics on the occurrence of spans, and should therefore be unique. The optional *minimum* and *maximum* can be used to focus only on documents satisfying a condition on the number of occurrences of the spans. When multiple queries are provided, these boundaries apply to the sum of the occurrences of the resulting spans. + +## Variables + +The query may contain one or more variables, and the value(s) of these variables have to be defined. + +| Parameter | Value | Info | Obligatory | +|-------------------------------------------------|--------------|--------------------------------|-------------| +| mtas.stats.spans.\<identifier\>.query.\<identifier query\>.variable.\<identifier variable\>.name | \<string\> | name of variable | yes | +| mtas.stats.spans.\<identifier\>.query.\<identifier query\>.variable.\<identifier variable\>.value | \<string\> | comma separated list of values | yes | + +## Functions + +To compute statistics for values based on the occurrence of one or multiple spans, optionally [functions](search_functions.html) can be added. The parameters for these functions are the number of occurrences *$q0*, *$q1*, ... for each span and the number of positions *$n* in a document. Statistics on the value computed for each document in the set are added to the response. 
+ +| Parameter | Value | Info | Obligatory | +|-------------------------------------------------|--------------|--------------------------------|-------------| +| mtas.stats.spans.\<identifier\>.function.\<identifier function\>.key | \<string\> | key used in response | no | +| mtas.stats.spans.\<identifier\>.function.\<identifier function\>.expression | \<string\> | see [functions](search_functions.html) | yes | +| mtas.stats.spans.\<identifier\>.function.\<identifier function\>.type | \<string\> | required [type of statistics](search_stats.html) | no | + +Again, the *key* is added to the response and may be used to distinguish between multiple functions, and should therefore be unique. + +--- + +## Examples +1. [Basic](#basic) : basic statistics on the occurrence of a word. +2. [Minimum and Maximum](#minimum-and-maximum) : statistics on the occurrence of a word with restrictions on the number of occurrences. +3. [Subset](#subset) : statistics on the occurrence of a word within a subset of documents. +4. [Multiple](#multiple) : statistics on the occurrence of multiple words. +5. [Prefix](#prefix) : default prefix for query. +6. [Ignore](#ignore) : query with ignore. +7. [Ignore and maximumIgnoreLength](#ignore-and-maximumignorelength) : query with ignore and maximumIgnoreLength. +8. [Functions](#functions) : statistics using functions. +9. [Multiple and Functions](#multiple-and-functions) : statistics using functions on the occurrence of multiple words. + +--- + +<a name="basic"></a> + +### Basic + +**Example** +Total and average number of occurrences of the word "de" and the number of documents. 
+ +**CQL** +`[t="de"]` + +**Request and response** +`q=*%3A*&mtas=true&mtas.stats=true&mtas.stats.spans=true&mtas.stats.spans.0.field=text&mtas.stats.spans.0.query.0.type=cql&mtas.stats.spans.0.query.0.value=%5Bt%3D%22de%22%5D&mtas.stats.spans.0.key=example - basic&mtas.stats.spans.0.type=n%2Csum%2Cmean&rows=0&wt=json&indent=true` + +``` json +"mtas":{ + "stats":{ + "spans":[{ + "key":"example - basic", + "mean":10.488239100197209, + "sum":21656200, + "n":2064808}]}} +``` + +<a name="minimum-and-maximum"></a> + +### Minimum and Maximum + +**Example** +Full statistics on the number of occurrences of the word "de" for documents with a minimum of 100 occurrences, for documents with a maximum of 200 occurrences, and for documents with between 100 and 200 occurrences. + +**CQL** +`[t="de"]` + +**Request and response** +`q=*%3A*&mtas=true&mtas.stats=true&mtas.stats.spans=true&mtas.stats.spans.0.field=text&mtas.stats.spans.0.query.0.type=cql&mtas.stats.spans.0.query.0.value=[t%3D"de"]&mtas.stats.spans.0.key=example - minimum&mtas.stats.spans.0.type=all&mtas.stats.spans.0.minimum=100&mtas.stats.spans.1.field=text&mtas.stats.spans.1.query.0.type=cql&mtas.stats.spans.1.query.0.value=[t%3D"de"]&mtas.stats.spans.1.key=example - maximum&mtas.stats.spans.1.type=all&mtas.stats.spans.1.maximum=200&mtas.stats.spans.2.field=text&mtas.stats.spans.2.query.0.type=cql&mtas.stats.spans.2.query.0.value=[t%3D"de"]&mtas.stats.spans.2.key=example - minimum and maximum&mtas.stats.spans.2.type=all&mtas.stats.spans.2.minimum=100&mtas.stats.spans.2.maximum=200&rows=0&wt=json&indent=true` + +``` json +"mtas":{ + "stats":{ + "spans":[{ + "key":"example - minimum", + "sumsq":8.697655383E9, + "populationvariance":419224.862744871, + "max":18192.0, + "sum":4531747.0, + "kurtosis":164.01633761739456, + "standarddeviation":647.4937185426337, + "n":18030, + "quadraticmean":694.5495506941058, + "min":100.0, + "median":136.0, + "variance":419248.1155521673, + "mean":251.3448141985584, + 
"geometricmean":160.50112302303313, + "sumoflogs":91561.76594051626, + "skewness":10.552060273112971}, + { + "key":"example - maximum", + "sumsq":7.37391079E8, + "populationvariance":271.8217238864797, + "max":200.0, + "sum":1.9102393E7, + "kurtosis":31.734626574581217, + "standarddeviation":16.487020826545898, + "n":2061623, + "quadraticmean":18.91229851589547, + "min":0.0, + "median":4.0, + "variance":271.82185573495815, + "mean":9.265706193615522, + "geometricmean":0.0, + "sumoflogs":"-Infinity", + "skewness":4.741031505227169}, + { + "key":"example - minimum and maximum", + "sumsq":2.73698488E8, + "populationvariance":684.3248008017308, + "max":200.0, + "sum":1977940.0, + "kurtosis":-0.47377181206297303, + "standarddeviation":26.16048359466255, + "n":14845, + "quadraticmean":135.78321834689768, + "min":100.0, + "median":127.0, + "variance":684.3709019066084, + "mean":133.23947457056252, + "geometricmean":130.83072059647412, + "sumoflogs":72353.10901272473, + "skewness":0.7177265003819447}]}} +``` + +<a name="subset"></a> + +### Subset + +**Example** +Total and average number of occurrences of the word "de" and the number of documents for a subset of documents. + +**CQL** +`[t="de"]` + +**Request and response** +`q=text:koe&rows=0&mtas=true&mtas.stats=true&mtas.stats.tokens=true&mtas.stats.tokens.0.field=text&mtas.stats.tokens.0.key=example - subset&mtas.stats.tokens.0.type=sum,mean,n&wt=json&indent=true` + +``` json +"mtas":{ + "stats":{ + "tokens":[{ + "key":"example - subset", + "mean":42901.60996309963, + "sum":116263363, + "n":2710}]}} +``` + +<a name="multiple"></a> + +### Multiple + +**Example** +Total and average number of occurrences of the word "de" and "het", and the number of documents. + +**CQL** +1. combined cql: `[t="de"|t="het"]` +2. combined regexp: `[t="(de|het)"]` +3. 
two queries: `[t="de"]` `[t="het"]` + +**Request and response** +`q=*%3A*&mtas=true&mtas.stats=true&mtas.stats.spans=true&mtas.stats.spans.0.field=text&mtas.stats.spans.0.query.0.type=cql&mtas.stats.spans.0.query.0.value=[t%3D"de"|t%3D"het"]&mtas.stats.spans.0.key=multiple+-+combined+cql&mtas.stats.spans.0.type=n%2Csum%2Cmean&mtas.stats.spans.1.field=text&mtas.stats.spans.1.query.0.type=cql&mtas.stats.spans.1.query.0.value=[t%3D"(de|het)"]&mtas.stats.spans.1.key=multiple+-+combined+regexp&mtas.stats.spans.1.type=n%2Csum%2Cmean&mtas.stats.spans.2.field=text&mtas.stats.spans.2.query.0.type=cql&mtas.stats.spans.2.query.0.value=[t%3D"de"]&mtas.stats.spans.2.query.1.type=cql&mtas.stats.spans.2.query.1.value=[t%3D"het"]&mtas.stats.spans.2.key=multiple+-+two+queries&mtas.stats.spans.2.type=n%2Csum%2Cmean&rows=0&wt=json&indent=true` + +``` json +"mtas":{ + "stats":{ + "spans":[{ + "key":"multiple - combined cql", + "mean":15.178130848001365, + "sum":31339926, + "n":2064808}, + { + "key":"multiple - combined regexp", + "mean":15.178130848001365, + "sum":31339926, + "n":2064808}, + { + "key":"multiple - two queries", + "mean":15.178130848001365, + "sum":31339926, + "n":2064808}]}} +``` + +<a name="prefix"></a> + +### Prefix + +**Example** +Total and average number of occurrences of the word "de" followed by an adjective. 
+ +**CQL** +`"de" [pos="ADJ"]` + +**Request and response** +`q=*%3A*&mtas=true&mtas.stats=true&mtas.stats.spans=true&mtas.stats.spans.0.field=text&mtas.stats.spans.0.query.0.type=cql&mtas.stats.spans.0.query.0.value="de" [pos%3D"ADJ"]&mtas.stats.spans.0.query.0.prefix=t_lc&mtas.stats.spans.0.key=example - prefix&mtas.stats.spans.0.type=n%2Csum%2Cmean&rows=0&wt=json&indent=true` + +``` json +"mtas":{ + "stats":{ + "spans":[{ + "key":"example - prefix", + "mean":2.1725308115815127, + "sum":4485859, + "n":2064808}]}} +``` + +<a name="ignore"></a> + +### Ignore + +**Example** +Total and average number of occurrences of an article followed by a noun, ignoring adjectives. + +**CQL** +`[pos="LID"][pos="N"]` + +**Ignore** +`[pos="ADJ"]` + + +**Request and response** +`q=*%3A*&mtas=true&mtas.stats=true&mtas.stats.spans=true&mtas.stats.spans.0.field=text&mtas.stats.spans.0.query.0.type=cql&mtas.stats.spans.0.query.0.value=[t_lc%3D"de"]&mtas.stats.spans.0.key=functions+-+de&mtas.stats.spans.0.type=n%2Csum%2Cmean&mtas.stats.spans.0.function.0.expression=%24q0%2F%24n&mtas.stats.spans.0.function.0.key=relative+frequency&mtas.stats.spans.0.function.0.type=mean%2Cstandarddeviation%2Cdistribution(start%3D0%2Cend%3D0.1%2Cnumber%3D10)&mtas.stats.spans.0.function.1.expression=%24n&mtas.stats.spans.0.function.1.key=number+of+words&mtas.stats.spans.0.function.1.type=n%2Csum&rows=0&wt=json&indent=true` + +``` json +"mtas":{ + "stats":{ + "spans":[{ + "key":"functions - de", + "mean":12.352043386116287, + "sum":25504598, + "n":2064808, + "functions":{ + "number of words":{ + "sum":504361094, + "n":2064808}, + "relative frequency":{ + "distribution(start=0,end=0.1,number=10)":{ + "[0.000,0.010)":390003, + "[0.010,0.020)":120903, + "[0.020,0.030)":173830, + "[0.030,0.040)":209994, + "[0.040,0.050)":245098, + "[0.050,0.060)":253528, + "[0.060,0.070)":218325, + "[0.070,0.080)":163982, + "[0.080,0.090)":115929, + "[0.090,0.100)":77207}, + "mean":0.04538673326024501, + "errorList":{"division by 
zero":1039}, + "standarddeviation":0.03284884758453086, + "errorNumber":1039}}}]}} +``` + +<a name="ignore-and-maximumignorelength"></a> + +### Ignore and maximumIgnoreLength + +<a name="functions"></a> + +### Functions + +**Example** +Statistics for the relative frequency of the word "de" and the total number of words in documents containing this word. + +**CQL** +`[t="de"]` + +**Functions** +`$q0/$n` +`$n` + +**Request and response** +`q=*%3A*&mtas=true&mtas.stats=true&mtas.stats.spans=true&mtas.stats.spans.0.field=text&mtas.stats.spans.0.query.0.type=cql&mtas.stats.spans.0.query.0.value=[t_lc%3D"de"]&mtas.stats.spans.0.key=functions+-+de&mtas.stats.spans.0.type=n%2Csum%2Cmean&mtas.stats.spans.0.function.0.expression=%24q0%2F%24n&mtas.stats.spans.0.function.0.key=relative+frequency&mtas.stats.spans.0.function.0.type=mean%2Cstandarddeviation%2Cdistribution(start%3D0%2Cend%3D0.1%2Cnumber%3D10)&mtas.stats.spans.0.function.1.expression=%24n&mtas.stats.spans.0.function.1.key=number+of+words&mtas.stats.spans.0.function.1.type=n%2Csum&rows=0&wt=json&indent=true` + +``` json +"mtas":{ + "stats":{ + "spans":[{ + "key":"functions - de", + "mean":12.352043386116287, + "sum":25504598, + "n":2064808, + "functions":{ + "number of words":{ + "sum":504361094, + "n":2064808}, + "relative frequency":{ + "distribution(start=0,end=0.1,number=10)":{ + "[0.000,0.010)":390003, + "[0.010,0.020)":120903, + "[0.020,0.030)":173830, + "[0.030,0.040)":209994, + "[0.040,0.050)":245098, + "[0.050,0.060)":253528, + "[0.060,0.070)":218325, + "[0.070,0.080)":163982, + "[0.080,0.090)":115929, + "[0.090,0.100)":77207}, + "mean":0.04538673326024501, + "errorList":{"division by zero":1039}, + "standarddeviation":0.03284884758453086, + "errorNumber":1039}}}]}} +``` + +<a name="multiple-and-functions"></a> + +### Multiple and Functions + +**Example** +Statistics for the absolute and relative frequency of the words "de", "het" and "een", for *part of speech* type "LID" and the total number of words in 
documents containing this word. + +**CQL** +`[t="de"]` +`[t="het"]` +`[t="een"]` +`[pos="LID"]` + +**Functions** +`$q0/$n` +`$q1/$n` +`$q2/$n` +`$q3/$n` +`$q0/$q3` +`$q1/$q3` +`$q2/$q3` +`($q0+$q1+$q2)/$q3` + +**Request and response** +`q=*%3A*&mtas=true&mtas.stats=true&mtas.stats.spans=true&mtas.stats.spans.0.field=text&mtas.stats.spans.0.query.0.type=cql&mtas.stats.spans.0.query.0.value=[t_lc%3D"de"]&mtas.stats.spans.0.query.1.type=cql&mtas.stats.spans.0.query.1.value=[t_lc%3D"het"]&mtas.stats.spans.0.query.2.type=cql&mtas.stats.spans.0.query.2.value=[t_lc%3D"een"]&mtas.stats.spans.0.query.3.type=cql&mtas.stats.spans.0.query.3.value=[pos%3D"LID"]&mtas.stats.spans.0.key=multiple+and+functions+-+de%2Bhet%2Been+and+LID&mtas.stats.spans.0.type=n&mtas.stats.spans.0.minimum=1&mtas.stats.spans.0.function.0.expression=%24q0&mtas.stats.spans.0.function.0.key=de+-+absolute&mtas.stats.spans.0.function.0.type=n%2Csum&mtas.stats.spans.0.function.1.expression=%24q1&mtas.stats.spans.0.function.1.key=het+-+absolute&mtas.stats.spans.0.function.1.type=n%2Csum&mtas.stats.spans.0.function.2.expression=%24q2&mtas.stats.spans.0.function.2.key=een+-+absolute&mtas.stats.spans.0.function.2.type=n%2Csum&mtas.stats.spans.0.function.3.expression=%24q3&mtas.stats.spans.0.function.3.key=LID+-+absolute&mtas.stats.spans.0.function.3.type=n%2Csum&mtas.stats.spans.0.function.4.expression=%24q0%2F%24n&mtas.stats.spans.0.function.4.key=de+-+relative+to+positions&mtas.stats.spans.0.function.4.type=n%2Cmean&mtas.stats.spans.0.function.5.expression=%24q1%2F%24n&mtas.stats.spans.0.function.5.key=het+-+relative+to+positions&mtas.stats.spans.0.function.5.type=n%2Cmean&mtas.stats.spans.0.function.6.expression=%24q2%2F%24n&mtas.stats.spans.0.function.6.key=een+-+relative+to+positions&mtas.stats.spans.0.function.6.type=n%2Cmean&mtas.stats.spans.0.function.7.expression=%24q3%2F%24n&mtas.stats.spans.0.function.7.key=LID+-+relative+to+positions&mtas.stats.spans.0.function.7.type=n%2Cmean&mtas.stats.spans.0.func
tion.8.expression=%24q0%2F%24q3&mtas.stats.spans.0.function.8.key=de+-+relative+to+LID&mtas.stats.spans.0.function.8.type=n%2Cmean&mtas.stats.spans.0.function.9.expression=%24q1%2F%24q3&mtas.stats.spans.0.function.9.key=het+-+relative+to+LID&mtas.stats.spans.0.function.9.type=n%2Cmean&mtas.stats.spans.0.function.10.expression=%24q2%2F%24q3&mtas.stats.spans.0.function.10.key=een+-+relative+to+LID&mtas.stats.spans.0.function.10.type=n%2Cmean&mtas.stats.spans.0.function.11.expression=(%24q0%2B%24q1%2B%24q2)%2F%24q3&mtas.stats.spans.0.function.11.key=de%2Bhet%2Been+-+relative+to+LID&mtas.stats.spans.0.function.11.type=n%2Cmean&rows=0&wt=json&indent=true` + +``` json +"mtas":{ + "stats":{ + "spans":[{ + "key":"multiple and functions - de+het+een and LID", + "n":1890377, + "functions":{ + "een - relative to LID":{ + "mean":0.26177400695591124, + "errorList":{"division by zero":24175}, + "n":1890377, + "errorNumber":24175}, + "LID - absolute":{ + "sum":44077220, + "n":1890377}, + "de+het+een - relative to LID":{ + "mean":1.0864079360130154, + "errorList":{"division by zero":24175}, + "n":1890377, + "errorNumber":24175}, + "het - relative to LID":{ + "mean":0.2740826070638114, + "errorList":{"division by zero":24175}, + "n":1890377, + "errorNumber":24175}, + "een - relative to positions":{ + "mean":0.021631171906706374, + "n":1890377}, + "een - absolute":{ + "sum":10620744, + "n":1890377}, + "het - relative to positions":{ + "mean":0.02235754528581941, + "n":1890377}, + "de - absolute":{ + "sum":25504598, + "n":1890377}, + "het - absolute":{ + "sum":11530937, + "n":1890377}, + "LID - relative to positions":{ + "mean":0.08693980190126971, + "n":1890377}, + "de - relative to LID":{ + "mean":0.5505513219945993, + "errorList":{"division by zero":24175}, + "n":1890377, + "errorNumber":24175}, + "de - relative to positions":{ + "mean":0.049574709134571515, + "n":1890377}}}]}} +``` + +--- + +##Lucene + +To use statistics on the occurrence of a span [directly in 
Lucene](installation_lucene.html), *ComponentSpan* together with the provided *collect* method can be used. \ No newline at end of file diff --git a/src/site/markdown/search_component_stats_tokens.md b/src/site/markdown/search_component_stats_tokens.md new file mode 100644 index 0000000..d9cad89 --- /dev/null +++ b/src/site/markdown/search_component_stats_tokens.md @@ -0,0 +1,142 @@ +#Statistics - tokens + +To get statistics on the number of tokens within a set of documents in Solr requests, besides the parameter to enable [statistics](search_component_stats.html), the following parameter should be provided. + +| Parameter | Value | Obligatory | +|-----------------------|--------|-------------| +| mtas.stats.tokens | true | yes | + +Multiple statistics on tokens can be produced within the same request. +To distinguish them, a unique identifier has to be provided for +each of the required statistics. + +| Parameter | Value | Info | Obligatory | +|-------------------------------------------------|--------------|--------------------------------|-------------| +| mtas.stats.tokens.\<identifier\>.key | \<string\> | key used in response | no | +| mtas.stats.tokens.\<identifier\>.field | \<string\> | Mtas field | yes | +| mtas.stats.tokens.\<identifier\>.type | \<string\> | required [type of statistics](search_stats.html) | no | +| mtas.stats.tokens.\<identifier\>.minimum | \<double\> | minimum number of tokens | no | +| mtas.stats.tokens.\<identifier\>.maximum | \<double\> | maximum number of tokens | no | + +The *key* is added to the response and may be used to distinguish between multiple statistics on tokens, and should therefore be unique. The optional *minimum* and *maximum* can be used to focus only on documents satisfying a condition on the number of tokens. + +--- + +## Examples +1. [Basic](#basic) : basic statistics on the number of tokens. +2. [Minimum and maximum](#minimum-and-maximum) : statistics on the number of tokens with restrictions on this number. +3. 
[Subset](#subset) : statistics on the number of tokens within a subset of documents. + +--- + +<a name="basic"></a> + +### Basic + +**Example** +Total and average number of tokens and the number of documents. + +**Request and response** +`q=*%3A*&rows=0&mtas=true&mtas.stats=true&mtas.stats.tokens=true&mtas.stats.tokens.0.field=text&mtas.stats.tokens.0.key=example - basic&mtas.stats.tokens.0.type=sum,mean,n&wt=json&indent=true` + +``` json +"mtas":{ + "stats":{ + "tokens":[{ + "key":"example - basic", + "mean":1949.101406523028, + "sum":4024520177, + "n":2064808}]}} +``` + +<a name="minimum-and-maximum"></a> + +### Minimum and maximum + +**Example** +Full statistics on tokens for documents with a minimum of 500 tokens, for documents with a maximum of 1000 tokens, and for documents with between 500 and 1000 tokens. + +**Request and response** +`q=*%3A*&rows=0&mtas=true&mtas.stats=true&mtas.stats.tokens=true&mtas.stats.tokens.0.field=text&mtas.stats.tokens.0.key=example - minimum&mtas.stats.tokens.0.type=all&mtas.stats.tokens.0.minimum=500&mtas.stats.tokens.1.field=text&mtas.stats.tokens.1.key=example - maximum&mtas.stats.tokens.1.type=all&mtas.stats.tokens.1.maximum=1000&mtas.stats.tokens.2.field=text&mtas.stats.tokens.2.key=example - minimum and maximum&mtas.stats.tokens.2.type=all&mtas.stats.tokens.2.minimum=500&mtas.stats.tokens.2.maximum=1000&wt=json&indent=true` + +``` json +"mtas":{ + "stats":{ + "tokens":[{ + "key":"example - minimum", + "sumsq":2.91825668357275E14, + "populationvariance":2.022964435797023E8, + "max":3320612.0, + "sum":3.837278477E9, + "kurtosis":9580.99014557769, + "standarddeviation":14223.100544366072, + "n":1390207, + "quadraticmean":14488.452755067281, + "min":500.0, + "median":1359.0, + "variance":2.0229658909514648E8, + "mean":2760.2209433559033, + "geometricmean":1584.8982392362057, + "sumoflogs":1.0243428152831953E7, + "skewness":79.47215006871889}, + { + "key":"example - maximum", + "sumsq":3.33432806009E11, + 
"populationvariance":65815.48228216589, + "max":1000.0, + "sum":5.49051031E8, + "kurtosis":-0.9495132030213522, + "standarddeviation":256.54539199058576, + "n":1178024, + "quadraticmean":532.0189410229931, + "min":0.0, + "median":441.0, + "variance":65815.53815160331, + "mean":466.07796700236827, + "geometricmean":0.0, + "sumoflogs":"-Infinity", + "skewness":0.2518109944817064}, + { + "key":"example - minimum and maximum", + "sumsq":2.70110872559E11, + "populationvariance":20021.06838039624, + "max":1000.0, + "sum":3.61809331E8, + "kurtosis":-1.0824803795579663, + "standarddeviation":141.49596513804715, + "n":503423, + "quadraticmean":732.4947329880449, + "min":500.0, + "median":704.0, + "variance":20021.108150347452, + "mean":718.6984523949043, + "geometricmean":704.889293672351, + "sumoflogs":3301468.553637138, + "skewness":0.2634725299866506}]}} +``` + +<a name="subset"></a> + +### Subset + +**Example** +Total and average number of tokens and the number of documents for a subset of documents. + +**Request and response** +`q=text:koe&rows=0&mtas=true&mtas.stats=true&mtas.stats.tokens=true&mtas.stats.tokens.0.field=text&mtas.stats.tokens.0.key=example - subset&mtas.stats.tokens.0.type=sum,mean,n&wt=json&indent=true` + +``` json +"mtas":{ + "stats":{ + "tokens":[{ + "key":"example - subset", + "mean":42901.60996309963, + "sum":116263363, + "n":2710}]}} +``` + +--- + +##Lucene + +To use statistics on the number of tokens [directly in Lucene](installation_lucene.html), *ComponentToken* together with the provided *collect* method can be used. \ No newline at end of file diff --git a/src/site/markdown/search_component_termvector.md b/src/site/markdown/search_component_termvector.md new file mode 100644 index 0000000..836dbdc --- /dev/null +++ b/src/site/markdown/search_component_termvector.md @@ -0,0 +1,326 @@ +#Termvector + +Mtas can produce termvectors for the set of documents satisfying the condition and/or filter. 
To get this information, in Solr requests, besides the parameter to enable the [Mtas query component](search_component.html), the following parameter should be provided. + +| Parameter | Value | Obligatory | +|-----------------------|--------|-------------| +| mtas.termvector | true | yes | + +Multiple termvector results can be produced within the same request. To distinguish them, a unique identifier has to be provided for each of the required termvector results. + +| Parameter | Value | Info | Obligatory | +|-------------------------------------------------|--------------|--------------------------------|-------------| +| mtas.termvector.\<identifier\>.key | \<string\> | key used in response | no | +| mtas.termvector.\<identifier\>.field | \<string\> | Mtas field | yes | +| mtas.termvector.\<identifier\>.prefix | \<string\> | prefix | yes | +| mtas.termvector.\<identifier\>.number | \<double\> | number of terms in list | no | +| mtas.termvector.\<identifier\>.start | \<string\> | begin list after provided term, only if sorted on term | no | +| mtas.termvector.\<identifier\>.type | \<string\> | required [type of statistics](search_stats.html) | no | +| mtas.termvector.\<identifier\>.regexp | \<string\> | regular expression condition on term | no | +| mtas.termvector.\<identifier\>.ignoreRegexp | \<string\> | regular expression condition for terms that have to be ignored | no | +| mtas.termvector.\<identifier\>.sort.type | \<string\> | sort on term or [type of statistics](search_stats.html) | no | +| mtas.termvector.\<identifier\>.sort.direction | \<string\> | sort direction: asc or desc | no | + + +## Full + +When using distributed search, instead of applying the more efficient default algorithm where in two rounds lists of terms are collected and combined from the participating cores, another approach can also be used. Using the *full* option, the complete list of terms (matching all requirements) is collected from the participating cores, and combined afterwards. 
This approach is likely to be less efficient when huge lists are involved, but necessary for example when results have to be sorted on specific statistics. + +| Parameter | Value | Info | Obligatory | +|-------------------------------------------------|--------------|--------------------------------|-------------| +| mtas.termvector.\<identifier\>.full | \<boolean\> | compute full list of terms | no | + +## List + +If a list of terms is provided, the termvector will be restricted to items from this list. These items may be configured to be interpreted as explicit terms or as regular expressions. + +| Parameter | Value | Info | Obligatory | +|-------------------------------------------------|--------------|--------------------------------|-------------| +| mtas.termvector.\<identifier\>.list | [\<string\>,...] | list of terms | yes | +| mtas.termvector.\<identifier\>.listRegexp | \<boolean\> | interpret items in provided list as regular expressions | no | + +Furthermore, a list of terms can be provided that should be ignored within the termvector. These items may also be configured to be interpreted as explicit terms or as regular expressions. + +| Parameter | Value | Info | Obligatory | +|-------------------------------------------------|--------------|--------------------------------|-------------| +| mtas.termvector.\<identifier\>.ignoreList | [\<string\>,...] | list of terms | yes | +| mtas.termvector.\<identifier\>.ignoreListRegexp | \<boolean\> | interpret items in provided ignoreList as regular expressions | no | + + +## Functions + +Besides the specified statistics on hits over the documents, also statistics on the computed value of functions on the number of hits and the total number of words over the documents can be provided. In the definition of such a function, the number of hits is referred to as *$q0*, and the number of words is referred to as *$n*. 
+ +| Parameter | Value | Info | Obligatory | +|-------------------------------------------------|--------------|--------------------------------|-------------| +| mtas.termvector.\<identifier\>.function.\<identifier function\>.key | \<string\> | key used in response | no | +| mtas.termvector.\<identifier\>.function.\<identifier function\>.expression | \<string\> | definition of function | yes | +| mtas.termvector.\<identifier\>.function.\<identifier function\>.type | \<string\> | required [type of statistics](search_stats.html) | no | + + +Again, the key is added to the response and may be used to distinguish between multiple functions, and should therefore be unique within each specified termvector. + +--- + +## Examples +1. [Basic](#basic) : basic statistics on occurring part of speech +2. [Regexp](#regexp) : words of length 5 containing only characters a-z, sorted descending by number of hits +3. [Ignore](#ignore) : previous result, ignoring words ending with $-e$. +4. [List](#list) : termvector for provided list of words. +5. [Start](#start) : termvector for words containing only characters a-z sorted by term and > *koe*. +6. [Functions](#functions) : statistics on hits, relative frequency and total number of words in document for words containing only characters a-z. + +--- + +<a name="basic"></a> + +### Basic + +**Example** +Total and average number of occurrences of part of speech (pos). 
+ +**Request and response** +`q=*%3A*&mtas=true&mtas.termvector=true&mtas.termvector.0.key=example - basic&mtas.termvector.0.field=text&mtas.termvector.0.prefix=pos&mtas.termvector.0.number=3&rows=0&wt=json&indent=true` + +``` json +"mtas":{ + "termvector":[{ + "key":"example - basic", + "list":[{ + "mean":200.22966889678833, + "sum":25797991, + "n":128842, + "key":"ADJ"}, + { + "mean":149.53835013602176, + "sum":18689303, + "n":124980, + "key":"BW"}, + { + "mean":459.93552395416265, + "sum":59963634, + "n":130374, + "key":"LET"}]}]} +``` + +<a name="regexp"></a> + +### Regexp + +**Example** +List of words with length 5 and containing only characters a-z, sorted descending by number of hits. + +**Regular expression** +`[a-z]{5}` + +**Request and response** +`q=*%3A*&mtas=true&mtas.termvector=true&mtas.termvector.0.key=example - regexp&mtas.termvector.0.field=text&mtas.termvector.0.prefix=t_lc&mtas.termvector.0.number=5&mtas.termvector.0.type=n,sum&mtas.termvector.0.regexp=[a-z]{5}&mtas.termvector.0.sort.type=sum&rows=0&wt=json&indent=true` + +``` json +"mtas":{ + "termvector":[{ + "key":"example - regexp", + "list":[{ + "sum":972687, + "n":94160, + "key":"heeft"}, + { + "sum":645227, + "n":84306, + "key":"wordt"}, + { + "sum":436038, + "n":82453, + "key":"onder"}, + { + "sum":391488, + "n":40512, + "key":"zijne"}, + { + "sum":314539, + "n":62316, + "key":"welke"}]}]} +``` + +<a name="ignore"></a> + +### Ignore + +**Example** +List of words with length 5 and containing only characters a-z, sorted descending by number of hits, ignoring all words ending with $-e$. 
+ +**Regular expressions** +`[a-z]{5}` +`.*e` + +**Request and response** +`q=*%3A*&mtas=true&mtas.termvector=true&mtas.termvector.0.key=example - ignore&mtas.termvector.0.field=text&mtas.termvector.0.prefix=t_lc&mtas.termvector.0.number=5&mtas.termvector.0.type=n,sum&mtas.termvector.0.regexp=[a-z]{5}&mtas.termvector.0.ignoreRegexp=.*e&mtas.termvector.0.sort.type=sum&rows=0&wt=json&indent=true` + +``` json +"mtas":{ + "termvector":[{ + "key":"example - ignore", + "list":[{ + "sum":972687, + "n":94160, + "key":"heeft"}, + { + "sum":645227, + "n":84306, + "key":"wordt"}, + { + "sum":436038, + "n":82453, + "key":"onder"}, + { + "sum":304620, + "n":60555, + "key":"leven"}, + { + "sum":297160, + "n":58263, + "key":"waren"}]}]} +``` + +<a name="list"></a> + +### List + +**Example** +Termvector for provided list of words. + +**List** +`koe,paard,schaap,geit,kip` + +**Request and response** +`q=*%3A*&mtas=true&mtas.termvector=true&mtas.termvector.0.key=example - list&mtas.termvector.0.field=text&mtas.termvector.0.prefix=t_lc&mtas.termvector.0.list=koe,paard,schaap,geit,kip&mtas.termvector.0.type=n,sum&rows=0&wt=json&indent=true` + +``` json +"mtas":{ + "termvector":[{ + "key":"example - list", + "list":[{ + "sum":1128, + "n":683, + "key":"geit"}, + { + "sum":1410, + "n":864, + "key":"kip"}, + { + "sum":4432, + "n":2344, + "key":"koe"}, + { + "sum":15478, + "n":7436, + "key":"paard"}, + { + "sum":2154, + "n":1591, + "key":"schaap"}]}]} +``` + +<a name="start"></a> + +### Start + +**Example** +Termvector for words containing only characters a-z sorted by term and > *koe*. 
+ +**Request and response** +`q=*%3A*&mtas=true&mtas.termvector=true&mtas.termvector.0.key=example - start&mtas.termvector.0.field=text&mtas.termvector.0.prefix=t_lc&mtas.termvector.0.regexp=[a-z]*&mtas.termvector.0.number=5&mtas.termvector.0.start=koe&rows=0&wt=json&indent=true` + +``` json +"mtas":{ + "termvector":[{ + "key":"example - start", + "list":[{ + "mean":2.0, + "sum":2, + "n":1, + "key":"koea"}, + { + "mean":1.0, + "sum":1, + "n":1, + "key":"koeaan"}, + { + "mean":1.0, + "sum":2, + "n":2, + "key":"koeachtig"}, + { + "mean":1.5, + "sum":3, + "n":2, + "key":"koeachtige"}, + { + "mean":1.0, + "sum":2, + "n":2, + "key":"koeala"}]}]} +``` + + +<a name="functions"></a> + +### Functions + +**Example** +List of words containing only characters a-z, sorted descending by number of hits, with statistics on hits, relative frequency and total number of words in document. + +**Regular expression** +`[a-z]*` + +**Functions** +`$q0/$n` +`$n` + +**Request and response** +`q=*%3A*&mtas=true&mtas.termvector=true&mtas.termvector.0.key=example - list&mtas.termvector.0.field=text&mtas.termvector.0.prefix=t_lc&mtas.termvector.0.regexp=[a-z]*&mtas.termvector.0.sort.type=sum&mtas.termvector.0.type=n,sum&mtas.termvector.0.function.0.expression=%24q0%2F%24n&mtas.termvector.0.function.0.key=relative+frequency&mtas.termvector.0.function.0.type=n%2Cmean&mtas.termvector.0.function.1.expression=%24n&mtas.termvector.0.function.1.key=total+number+of+words&mtas.termvector.0.function.1.type=n%2Csum&mtas.termvector.0.number=3&rows=0&wt=json&indent=true` + +``` json +"mtas":{ + "termvector":[{ + "key":"example - list", + "list":[{ + "sum":15975272, + "n":127444, + "functions":{ + "total number of words":{ + "sum":391924648, + "n":127444}, + "relative frequency":{ + "mean":0.040967994034336694, + "n":127444}}, + "key":"de"}, + { + "sum":10565895, + "n":126197, + "functions":{ + "total number of words":{ + "sum":391190126, + "n":126197}, + "relative frequency":{ + "mean":0.028072930308247233, 
+ "n":126197}}, + "key":"van"}, + { + "sum":8798835, + "n":125415, + "functions":{ + "total number of words":{ + "sum":391306760, + "n":125415}, + "relative frequency":{ + "mean":0.02376864203286862, + "n":125415}}, + "key":"en"}]}]} +``` + +**Lucene** + +To use termvectors [directly in Lucene](installation_lucene.html), *ComponentTermvector* together with the provided *collect* method can be used. + + diff --git a/src/site/markdown/search_configuration.md b/src/site/markdown/search_configuration.md index 5044e48..6527438 100644 --- a/src/site/markdown/search_configuration.md +++ b/src/site/markdown/search_configuration.md @@ -23,20 +23,18 @@ This enables the handling of all Mtas specific arguments within a select request **Mtas queryParser** -The `mtas.solr.search.MtasSolrCQLQParserPlugin` has to be included to enable the use of CQL queries: +The `mtas.solr.search.MtasSolrCQLQParserPlugin` has to be included to enable the use of [CQL queries](search_parser_cql.html): ```console <queryParser name="mtas_cql" class="mtas.solr.search.MtasSolrCQLQParserPlugin"/> ``` -And the `mtas.solr.search.MtasSolrJoinQParserPlugin` has to be included to enable the use of join queries: +The `mtas.solr.search.MtasSolrJoinQParserPlugin` has to be included to enable the use of [join queries](search_parser_join.html): ```console <queryParser name="mtas_join" class="mtas.solr.search.MtasSolrJoinQParserPlugin"/> ``` -This enables the use of expressions like `{!mtas_cql field="mtas" query="[pos=\"N\"]"}` within Solr (filter) queries. - **Mtas requestHandler** Adding the `mtas.solr.handler.MtasRequestHandler` enables additional Mtas functionality that doesn't belong in the select requestHandler. diff --git a/src/site/markdown/search_cql.md b/src/site/markdown/search_cql.md index 5e87a63..4f19013 100644 --- a/src/site/markdown/search_cql.md +++ b/src/site/markdown/search_cql.md @@ -8,13 +8,13 @@ To describe sets of tokens matching some condition, a query language is needed. 
#### Prefix -For each field containing Mtas tokenized text, every token is associated with a prefix. Within the field, only a limited set of prefixes is used to distinguish between the different types of annotation. By using a [prefix query](search_query_prefix.html) a full list of used prefixes can be produced. +For each field containing Mtas tokenized text, every token is associated with a prefix. Within the field, only a limited set of prefixes is used to distinguish between the different types of annotation. By using a [prefix query](search_component_prefix.html) a full list of used prefixes can be produced. <a name="value"></a> #### Value -The optional postfix associated with a token can be queried within CQL by providing a *value*. This is a regular expression, the supported syntax is documented in the RegExp class provided by Lucene. By using a [termvector query](search_query_termvector.html), for each [prefix](#prefix) a list of postfix values can be produced. +The optional postfix associated with a token can be queried within CQL by providing a *value*. This is a regular expression, the supported syntax is documented in the RegExp class provided by Lucene. By using a [termvector query](search_component_termvector.html), for each [prefix](#prefix) a list of postfix values can be produced. <a name="variable"></a> diff --git a/src/site/markdown/search_handler.md b/src/site/markdown/search_handler.md new file mode 100644 index 0000000..f00aaad --- /dev/null +++ b/src/site/markdown/search_handler.md @@ -0,0 +1 @@ +# Request Handler \ No newline at end of file diff --git a/src/site/markdown/search_parser.md b/src/site/markdown/search_parser.md new file mode 100644 index 0000000..6913aca --- /dev/null +++ b/src/site/markdown/search_parser.md @@ -0,0 +1,5 @@ +# Query Parser + +Two queryParsers are available to provide the use of [CQL queries](search_parser_cql.html) and [join queries](search_parser_join.html). 
+ + diff --git a/src/site/markdown/search_parser_cql.md b/src/site/markdown/search_parser_cql.md new file mode 100644 index 0000000..ae0b531 --- /dev/null +++ b/src/site/markdown/search_parser_cql.md @@ -0,0 +1,4 @@ +# CQL Query Parser + +The `mtas.solr.search.MtasSolrCQLQParserPlugin` enables the use of [CQL](search_cql.html) within Solr (filter) queries, e.g. `{!mtas_cql field="mtas" query="[pos=\"N\"]"}`. + diff --git a/src/site/markdown/search_parser_join.md b/src/site/markdown/search_parser_join.md new file mode 100644 index 0000000..011e5e5 --- /dev/null +++ b/src/site/markdown/search_parser_join.md @@ -0,0 +1,4 @@ +# Join Query Parser + +The `mtas.solr.search.MtasSolrJoinQParserPlugin` enables the use of joins within Solr (filter) queries. + diff --git a/src/site/markdown/search_query.md b/src/site/markdown/search_query.md deleted file mode 100644 index 04e5de3..0000000 --- a/src/site/markdown/search_query.md +++ /dev/null @@ -1,41 +0,0 @@ -# Query - -To perform specific Mtas queries in Solr requests, the following parameter should be used. - -| Parameter | Value | Obligatory | -|-------------|--------|-------------| -| mtas | true | yes | - -See [statistics](search_query_stats.html), -[kwic](search_query_kwic.html), [list](search_query_list.html), [document](search_query_document.html), [termvector](search_query_termvector.html), [facet](search_query_facet.html), [group](search_query_group.html) and [prefix](search_query_prefix.html) for more details and examples. - ---- - -**Regular queries** - -Besides from specific Mtas queries in Solr requests, also [CQL](search_cql.html) can be used in regular queries by [configuring](search_configuration.html) the Mtas query parser in solrconfig.xml. - -*Example 1* - -Search for documents containing the word "de" with a query. 
- -`q={!mtas_cql+field%3D"text"+query%3D"[t%3D\"de\"]"}&fl=*&start=0&rows=0&wt=json&indent=true` - -``` json -"response":{"numFound":1664241,"start":0,"docs":[] - } -``` - -*Example 2* - -Search for documents containing the word "de" with a filter query. - -`fq={!mtas_cql+field%3D"text"+query%3D"[t%3D\"de\"]"}&q=*%3A*&fl=*&start=0&rows=0&wt=json&indent=true` - -``` json -"response":{"numFound":1664241,"start":0,"docs":[] - } -``` - - - diff --git a/src/site/markdown/search_query_document.md b/src/site/markdown/search_query_document.md deleted file mode 100644 index 53e22be..0000000 --- a/src/site/markdown/search_query_document.md +++ /dev/null @@ -1,284 +0,0 @@ -# Document - -Mtas can produce statistics on used terms for the individual listed documents. To get this information, in Solr requests, besides the parameter to enable [Mtas queries](search_query.html), the following parameter should be provided. - -| Parameter | Value | Obligatory | -|-----------------------|--------|-------------| -| mtas.document | true | yes | - -Multiple document results can be produced within the same request. To distinguish them, a unique identifier has to be provided for each of the required document results. 
- -| Parameter | Value | Info | Obligatory | -|-------------------------------------------------|--------------|--------------------------------|-------------| -| mtas.document.\<identifier\>.key | \<string\> | key used in response | no | -| mtas.document.\<identifier\>.field | \<string\> | Mtas field | yes | -| mtas.document.\<identifier\>.prefix | \<string\> | prefix |yes | -| mtas.document.\<identifier\>.number | \<double\> | create list with specified number of most frequent items | no | -| mtas.document.\<identifier\>.type | \<string\> | required [type of statistics](search_stats.html) | no | -| mtas.document.\<identifier\>.regexp | \<string\> | regular expression condition on term | no | -| mtas.document.\<identifier\>.ignoreRegexp | \<string\> | regular expression condition for terms that have to be ignored | no | - -## List - -A list can be provided, specifying the set of terms to consider when computing the result. - -| Parameter | Value | Info | Obligatory | -|-------------------------------------------------|--------------|--------------------------------|-------------| -| mtas.document.\<identifier\>.list | \<string\> | comma separated list of values | yes | -| mtas.document.\<identifier\>.listRegexp | \<boolean\> | list of values are to be interpreted as regular expressions | no | -| mtas.document.\<identifier\>.listExpand | \<boolean\> | expand the matches on values from list | no | -| mtas.document.\<identifier\>.listExpandNumber | \<boolean\> | number of expansions of matches on values from list | no | - -## Ignore list - -Also a ignore list can be provided, specifying the set of terms not to consider when computing the result. 
- -| Parameter | Value | Info | Obligatory | -|-------------------------------------------------|--------------|--------------------------------|-------------| -| mtas.document.\<identifier\>.ignoreList | \<string\> | comma separated list of values | yes | -| mtas.document.\<identifier\>.ignoreListRegexp | \<boolean\> | list of values are to be interpreted as regular expressions | no | - ---- - -## Examples -1. [Basic](#basic) : Statistics unique words for each document -2. [Regexp](#regexp) : Most frequent words containing only letters a-z and minimum length 5 -3. [List](#list) : Statistics for a provided list of words -4. [Ignore](#ignore) : Statistics for a provided list of regular expressions, ignoring another list of regular expressions - ---- - -<a name="basic"></a> - -### Basic - -**Example** -Statistics for set of unique tokens with prefix *t* (words) for each listed document. - - -**Request and response** -`fq=%7B%21mtas_cql+field%3D%22text%22+query%3D%22%5B%5D%22+++%7D&q=%2A%3A%2A&mtas=true&mtas.document=true&mtas.document.0.field=text&mtas.document.0.prefix=t&mtas.document.0.key=words&mtas.document.0.type=all&fl=*&start=0&rows=2&wt=json&indent=true` - -```json -"mtas":{ - "document":[{ - "key":"words", - "list":[{ - "documentKey":"4115a95c-011c-11e4-b0ff-51bcbd7c379f", - "sumsq":113964.0, - "populationvariance":126.5639231447591, - "max":166.0, - "sum":3336.0, - "kurtosis":92.19837080635624, - "standarddeviation":11.257199352433314, - "n":789, - "quadraticmean":12.01836364230935, - "min":1.0, - "median":1.0, - "variance":126.72453726042504, - "mean":4.228136882129286, - "geometricmean":1.9285975498109995, - "sumoflogs":518.209740627951, - "skewness":8.377350653392202}, - { - "documentKey":"4115aac4-011c-11e4-b0ff-51bcbd7c379f", - "sumsq":25489.0, - "populationvariance":35.695641666666134, - "max":77.0, - "sum":1563.0, - "kurtosis":72.57030420433823, - "standarddeviation":5.979568021426876, - "n":600, - "quadraticmean":6.517796151051877, - "min":1.0, - 
"median":1.0, - "variance":35.75523372287092, - "mean":2.6050000000000004, - "geometricmean":1.5249529474773036, - "sumoflogs":253.1781332820801, - "skewness":7.70682353088895}]}]} -``` - -<a name="regexp"></a> - -### Regexp - -**Example** -Most frequent tokens containing only letters a-z and minimum length 5 with prefix *t* (words) for each listed document. - -**Regexp**<br/> -`[a-z]{5,}` - -**Request and response** -`fq=%7B%21mtas_cql+field%3D%22text%22+query%3D%22%5B%5D%22+++%7D&q=%2A%3A%2A&mtas=true&mtas.document=true&mtas.document.0.field=NLContent_mtas&mtas.document.0.prefix=t&mtas.document.0.key=list+of+words&mtas.document.0.type=n%2Csum%2Cmean&mtas.document.0.regexp=%5Ba-z%5D%7B5%2C%7D&mtas.document.0.number=5&fl=%2A&start=0&rows=2&wt=json&indent=true` - -```json -"mtas":{ - "document":[{ - "key":"list of words", - "list":[{ - "documentKey":"c0c4200c-1eee-11e5-b891-f48ce0be173a", - "list":[{ - "sum":471, - "key":"zijne"}, - { - "sum":317, - "key":"eenen"}, - { - "sum":304, - "key":"zegde"}, - { - "sum":249, - "key":"hebben"}, - { - "sum":229, - "key":"welke"}], - "mean":4.552402402402403, - "sum":30319, - "n":6660}, - { - "documentKey":"c0c453d8-1eee-11e5-b891-f48ce0be173a", - "list":[{ - "sum":348, - "key":"heeft"}, - { - "sum":243, - "key":"hebben"}, - { - "sum":199, - "key":"prins"}, - { - "sum":173, - "key":"vader"}, - { - "sum":161, - "key":"komen"}], - "mean":4.641632967456191, - "sum":24104, - "n":5193}]}]} -``` - -<a name="list"></a> - -### List - -**Example** -Statistics for a provided list of words for each listed document. 
- -**List**<br/> -`koe,paard,schaap,geit,kip` - -**Request and response** -`fq=%7B%21mtas_cql+field%3D%22text%22+query%3D%22%5Bt_lc%3D%5C%22koe%5C%22%7Ct_lc%3D%5C%22paard%5C%22%7Ct_lc%3D%5C%22schaap%5C%22%5D%22+++%7D&q=%2A%3A%2A&mtas=true&mtas.document=true&mtas.document.0.field=text&mtas.document.0.prefix=t_lc&mtas.document.0.key=list+of+words&mtas.document.0.type=n%2Csum%2Cmean&mtas.document.0.list=koe%2Cpaard%2Cschaap%2Cgeit%2Ckip&mtas.document.0.listRegexp=false&mtas.document.0.listExpand=false&mtas.document.0.number=100&fl=%2A&start=0&rows=2&wt=json&indent=true` - -```json -"mtas":{ - "document":[{ - "key":"list of words", - "list":[{ - "documentKey":"c0c46b7a-1eee-11e5-b891-f48ce0be173a", - "list":[{ - "sum":3, - "key":"paard"}, - { - "sum":2, - "key":"schaap"}], - "mean":2.5, - "sum":5, - "n":2}, - { - "documentKey":"c0c453d8-1eee-11e5-b891-f48ce0be173a", - "list":[{ - "sum":31, - "key":"paard"}, - { - "sum":1, - "key":"kip"}], - "mean":16.0, - "sum":32, - "n":2}]}]} -``` - -<a name="ignore"></a> - -### Ignore - -**Example** -Statistics for a provided list of regular expressions, ignoring another list of regular expressions for each listed document. 
- -**Regexp**<br/> -`[a-z]{7,}` - -**Ignore**<br/> -`[a-z]{10,}` - -**List**<br/> -`een.*,.*heid` - -**Ignore list**<br/> -`een.*heid,ee.*nheid` - -**Request and response** -`fq=%7B%21mtas_cql+field%3D%22text%22+query%3D%22%5Bt_lc%3D%5C%22eenheid%5C%22%5D%22+++%7D&q=%2A%3A%2A&mtas=true&mtas.document=true&mtas.document.0.field=text&mtas.document.0.prefix=t_lc&mtas.document.0.key=advanced+list+of+words&mtas.document.0.type=n%2Csum%2Cmean&mtas.document.0.regexp=%5Ba-z%5D%7B7%2C%7D&mtas.document.0.list=een.%2A%2C.%2Aheid&mtas.document.0.listRegexp=true&mtas.document.0.listExpand=true&mtas.document.0.listExpandNumber=3&mtas.document.0.ignoreRegexp=%5Ba-z%5D%7B10%2C%7D&mtas.document.0.ignoreList=een.%2Aheid%2Cee.%2Anheid&mtas.document.0.ignoreListRegexp=true&mtas.document.0.number=10&fl=text_numberOfPositions%2CNLCore_NLIdentification_nederlabID%2CNLProfile_name%2CNLTitle_title&start=0&rows=2&wt=json&indent=true` - -```json -"mtas":{ - "document":[{ - "key":"advanced list of words", - "list":[{ - "documentKey":"c0c41486-1eee-11e5-b891-f48ce0be173a", - "list":[{ - "sum":166, - "list":{ - "droefheid":{ - "sum":36}, - "godheid":{ - "sum":22}, - "waarheid":{ - "sum":22}}, - "key":".*heid"}, - { - "sum":93, - "list":{ - "eenigen":{ - "sum":46}, - "eensklaps":{ - "sum":32}, - "eenigste":{ - "sum":3}}, - "key":"een.*"}], - "mean":5.886363636363637, - "sum":259, - "n":44}, - { - "documentKey":"c0c453d8-1eee-11e5-b891-f48ce0be173a", - "list":[{ - "sum":36, - "list":{ - "afscheid":{ - "sum":12}, - "hoogheid":{ - "sum":4}, - "bezigheid":{ - "sum":3}}, - "key":".*heid"}, - { - "sum":24, - "list":{ - "eenvoudig":{ - "sum":15}, - "eenzame":{ - "sum":3}, - "eenmaal":{ - "sum":2}}, - "key":"een.*"}], - "mean":3.1578947368421053, - "sum":60, - "n":19}]}]} -``` - ---- - -**Lucene** - -To get statistics on used terms for the listed documents [directly in Lucene](installation_lucene.html), *ComponentDocument* together with the provided *collect* method can be used. 
diff --git a/src/site/markdown/search_query_facet.md b/src/site/markdown/search_query_facet.md deleted file mode 100644 index aad2b43..0000000 --- a/src/site/markdown/search_query_facet.md +++ /dev/null @@ -1,12 +0,0 @@ -# Facets - -Mtas can produce facets on metadata for Mtas queries. To get this information, in Solr requests, besides the parameter to enable [Mtas queries](search_query.html), the following parameter should be provided. - -| Parameter | Value | Obligatory | -|-----------------------|--------|-------------| -| mtas.facet | true | yes | - - -**Lucene** - -To produce facets on metadata [directly in Lucene](installation_lucene.html), *ComponentFacet* together with the provided *collect* method can be used. \ No newline at end of file diff --git a/src/site/markdown/search_query_group.md b/src/site/markdown/search_query_group.md deleted file mode 100644 index 3f7454a..0000000 --- a/src/site/markdown/search_query_group.md +++ /dev/null @@ -1,11 +0,0 @@ -# Grouping - -Mtas can group results for Mtas queries within the (filtered) set of documents. To get this information, in Solr requests, besides the parameter to enable [Mtas queries](search_query.html), the following parameter should be provided. - -| Parameter | Value | Obligatory | -|-----------------------|--------|-------------| -| mtas.group | true | yes | - -**Lucene** - -To group results [directly in Lucene](installation_lucene.html), *ComponentGroup* together with the provided *collect* method can be used. diff --git a/src/site/markdown/search_query_kwic.md b/src/site/markdown/search_query_kwic.md deleted file mode 100644 index eb35c80..0000000 --- a/src/site/markdown/search_query_kwic.md +++ /dev/null @@ -1,326 +0,0 @@ -# Kwic - -Mtas can produce keywords in context (kwic) for Mtas queries within the listed documents. To get this information, in Solr requests, besides the parameter to enable [Mtas queries](search_query.html), the following parameter should be provided. 
- -| Parameter | Value | Obligatory | -|-----------------------|--------|-------------| -| mtas.kwic | true | yes | - -Keyword in context results on multiple spans can be produced within the same request. To distinguish them, a unique identifier has to be provided for each of the required kwics. - -| Parameter | Value | Info | Obligatory | -|-------------------------------------------------|--------------|--------------------------------|-------------| -| mtas.kwic.\<identifier\>.key | \<string\> | key used in response | no | -| mtas.kwic.\<identifier\>.field | \<string\> | Mtas field | yes | -| mtas.kwic.\<identifier\>.query.type | \<string\> | query language: [cql](search_cql.html) | yes | -| mtas.kwic.\<identifier\>.query.value | \<string\> | query: [cql](search_cql.html) | yes | -| mtas.kwic.\<identifier\>.query.prefix | \<string\> | default prefix | no | -| mtas.kwic.\<identifier\>.query.ignore | \<string\> | ignore query: [cql](search_cql.html) | no | -| mtas.kwic.\<identifier\>.query.maximumIgnoreLength | \<integer\> | maximum number of succeeding occurrences to ignore | no | -| mtas.kwic.\<identifier\>.prefix | \<string\> | comma separated list of prefixes | no | -| mtas.kwic.\<identifier\>.number | \<double\> | maximum number for selection of items for each document | no | -| mtas.kwic.\<identifier\>.start | \<double\> | offset for selection of items for each document | no | -| mtas.kwic.\<identifier\>.left | \<double\> | number of positions left of hit | no | -| mtas.kwic.\<identifier\>.right | \<double\> | number of positions right of hit | no | -| mtas.kwic.\<identifier\>.output | \<string\> | "token" or "hit" | no | - - -## Variables - -The query may contain one or more variables, and the value(s) of these variables have to be defined - -| Parameter | Value | Info | Obligatory | -|-------------------------------------------------|--------------|--------------------------------|-------------| -| mtas.kwic.\<identifier\>.query.variable\<identifier 
variable\>.name | \<string\> | name of variable | yes | -| mtas.kwic.\<identifier\>.query.variable\<identifier variable\>.value | \<string\> | comma separated list of values | yes | - ---- - -## Examples -1. [Token](#token) : List of tokens with prefix *t*, *pos* and *s* for adjectives followed by a noun -2. [Hit](#hit) : List of hits with prefix *t*, *pos* and *s* for articles followed by an adjective and a noun -3. [Left and Right](#left-and-right) : List of tokens with prefix *t* and *s* for sentences starting with an article, expanded to the left and the right ---- - -<a name="token"></a> - -### Token - -**Example** -Keyword in context with output type *token* and prefixes *t*, *pos* and *s* for adjectives followed by a noun - -**CQL** -`[pos="ADJ"][pos="N"]` - -**Request and response** -`fq=%7B%21mtas_cql+field%3D%22text%22+query%3D%22%5Bpos%3D%5C%22ADJ%5C%22%5D%5Bpos%3D%5C%22N%5C%22%5D%22+++%7D&q=%2A%3A%2A&mtas=true&mtas.kwic=true&mtas.kwic.0.field=text&mtas.kwic.0.query.type=cql&mtas.kwic.0.query.value=%5Bpos%3D%22ADJ%22%5D%5Bpos%3D%22N%22%5D&mtas.kwic.0.key=adjective%2Bnoun&mtas.kwic.0.prefix=t%2Cpos%2Cs&mtas.kwic.0.output=token&mtas.kwic.0.number=2&mtas.kwic.0.start=0&mtas.kwic.0.left=0&mtas.kwic.0.right=0&fl=%2A&start=0&rows=1&wt=json&indent=true` - -```json -"mtas":{ - "kwic":[{ - "key":"adjective+noun", - "list":[{ - "documentKey":"61d2a1b3-9068-4815-ba4d-3370e5a809d7", - "documentTotal":31, - "documentMinPosition":0, - "documentMaxPosition":673, - "list":[{ - "startPosition":0, - "endPosition":1, - "tokens":[{ - "mtasId":8, - "prefix":"t", - "value":"fusiebedrijf", - "positionStart":1, - "positionEnd":1, - "parentMtasId":81}, - { - "mtasId":15, - "prefix":"pos", - "value":"N", - "positionStart":1, - "positionEnd":1}, - { - "mtasId":81, - "prefix":"s", - "value":"", - "positionStart":0, - "positionEnd":8, - "parentMtasId":82}, - { - "mtasId":0, - "prefix":"t", - "value":"Nieuw", - "positionStart":0, - "positionEnd":0, - "parentMtasId":81}, - { - 
"mtasId":5, - "prefix":"pos", - "value":"ADJ", - "positionStart":0, - "positionEnd":0}]}, - { - "startPosition":5, - "endPosition":6, - "tokens":[{ - "mtasId":45, - "prefix":"t", - "value":"Belgische", - "positionStart":5, - "positionEnd":5, - "parentMtasId":81}, - { - "mtasId":51, - "prefix":"pos", - "value":"ADJ", - "positionStart":5, - "positionEnd":5}, - { - "mtasId":55, - "prefix":"t", - "value":"energiemarkt", - "positionStart":6, - "positionEnd":6, - "parentMtasId":81}, - { - "mtasId":62, - "prefix":"pos", - "value":"N", - "positionStart":6, - "positionEnd":6}, - { - "mtasId":81, - "prefix":"s", - "value":"", - "positionStart":0, - "positionEnd":8, - "parentMtasId":82}]}]}]}]} -``` - - -<a name="hit"></a> - -### Hit - -**Example** -Keyword in context with output type *hit* and prefixes *t*, *pos* and *s* for articles followed by an adjective and a noun - -**CQL** -`[pos="LID"][pos="ADJ"][pos="N"]` - -**Request and response** -`fq=%7B%21mtas_cql+field%3D%22text%22+query%3D%22%5Bpos%3D%5C%22LID%5C%22%5D%5Bpos%3D%5C%22ADJ%5C%22%5D%5Bpos%3D%5C%22N%5C%22%5D%22+++%7D&q=%2A%3A%2A&mtas=true&mtas.kwic=true&mtas.kwic.0.field=text&mtas.kwic.0.query.type=cql&mtas.kwic.0.query.value=%5Bpos%3D%22LID%22%5D%5Bpos%3D%22ADJ%22%5D%5Bpos%3D%22N%22%5D&mtas.kwic.0.key=article%2Badjective%2Bnoun&mtas.kwic.0.prefix=t%2Cpos%2Cs&mtas.kwic.0.output=hit&mtas.kwic.0.number=2&mtas.kwic.0.start=0&mtas.kwic.0.left=0&mtas.kwic.0.right=0&fl=%2A&start=0&rows=1&wt=json&indent=true` - -```json -"mtas":{ - "kwic":[{ - "key":"article+adjective+noun", - "list":[{ - "documentKey":"61d2a1b3-9068-4815-ba4d-3370e5a809d7", - "documentTotal":21, - "documentMinPosition":0, - "documentMaxPosition":673, - "list":[{ - "hit":{ - "92":[["t", - "De"], - ["pos", - "LID"], - ["s", - null]], - "93":[["t", - "nieuwe"], - ["pos", - "ADJ"], - ["s", - null]], - "94":[["t", - "fusiegroep"], - ["pos", - "N"], - ["s", - null]]}}, - { - "hit":{ - "106":[["t", - "De"], - ["pos", - "LID"], - ["s", - null]], - "107":[["t", 
- "Belgische"], - ["pos", - "ADJ"], - ["s", - null]], - "108":[["t", - "regering"], - ["pos", - "N"], - ["s", - null]]}}]}]}]} -``` - ---- - -<a name="left-and-right"></a> - -### Left and Right - -**Example** -Keyword in context with output type *token* and prefixes *t* and *s* for sentences starting with an article, expanded two positions to the left and one position to the right - -**CQL** -`<s>[pos="LID"]` - -**Request and response** -`fq=%7B%21mtas_cql+field%3D%22text%22+query%3D%22%3Cs%3E%5Bpos%3D%5C%22LID%5C%22%5D%22+++%7D&q=%2A%3A%2A&mtas=true&mtas.kwic=true&mtas.kwic.0.field=text&mtas.kwic.0.query.type=cql&mtas.kwic.0.query.value=%3Cs%3E%5Bpos%3D%22LID%22%5D&mtas.kwic.0.key=sentence+starting+with+article&mtas.kwic.0.prefix=t%2Cs&mtas.kwic.0.output=token&mtas.kwic.0.number=2&mtas.kwic.0.start=0&mtas.kwic.0.left=2&mtas.kwic.0.right=1&fl=%2A&start=0&rows=1&wt=json&indent=true` - -```json -"mtas":{ - "kwic":[{ - "key":"sentence starting with article", - "list":[{ - "documentKey":"61d2a1b3-9068-4815-ba4d-3370e5a809d7", - "documentTotal":10, - "documentMinPosition":0, - "documentMaxPosition":673, - "list":[{ - "startPosition":14, - "endPosition":14, - "tokens":[{ - "mtasId":136, - "prefix":"t", - "value":"fusiegroep", - "positionStart":15, - "positionEnd":15, - "parentMtasId":295}, - { - "mtasId":295, - "prefix":"s", - "value":"", - "positionStart":14, - "positionEnd":36, - "parentMtasId":417}, - { - "mtasId":128, - "prefix":"t", - "value":"De", - "positionStart":14, - "positionEnd":14, - "parentMtasId":295}, - { - "mtasId":113, - "prefix":"t", - "value":"afslanking", - "positionStart":13, - "positionEnd":13, - "parentMtasId":126}, - { - "mtasId":107, - "prefix":"t", - "value":"tot", - "positionStart":12, - "positionEnd":12, - "parentMtasId":126}, - { - "mtasId":126, - "prefix":"s", - "value":"", - "positionStart":9, - "positionEnd":13, - "parentMtasId":127}]}, - { - "startPosition":92, - "endPosition":92, - "tokens":[{ - "mtasId":729, - "prefix":"t", - 
"value":".", - "positionStart":91, - "positionEnd":91, - "parentMtasId":737}, - { - "mtasId":746, - "prefix":"t", - "value":"nieuwe", - "positionStart":93, - "positionEnd":93, - "parentMtasId":853}, - { - "mtasId":738, - "prefix":"t", - "value":"De", - "positionStart":92, - "positionEnd":92, - "parentMtasId":853}, - { - "mtasId":853, - "prefix":"s", - "value":"", - "positionStart":92, - "positionEnd":105, - "parentMtasId":1114}, - { - "mtasId":723, - "prefix":"t", - "value":"Parijs", - "positionStart":90, - "positionEnd":90, - "parentMtasId":737}, - { - "mtasId":737, - "prefix":"s", - "value":"", - "positionStart":59, - "positionEnd":91, - "parentMtasId":1114}]}]}]}]} -``` - ---- - -##Lucene - -To use keywords in context [directly in Lucene](installation_lucene.html), *ComponentKwic* together with the provided *collect* method can be used. diff --git a/src/site/markdown/search_query_list.md b/src/site/markdown/search_query_list.md deleted file mode 100644 index 1561660..0000000 --- a/src/site/markdown/search_query_list.md +++ /dev/null @@ -1,347 +0,0 @@ -#List - -Mtas can retrieve list of hits for Mtas queries within the (filtered) set of documents. To get this information, in Solr requests, besides the parameter to enable [Mtas queries](search_query.html), the following parameter should be provided. - -| Parameter | Value | Obligatory | -|-----------------------|--------|-------------| -| mtas.list | true | yes | - -List results on multiple spans can be produced within the same request. To distinguish them, a unique identifier has to be provided for each of the required lists. 
- -| Parameter | Value | Info | Obligatory | -|-------------------------------------------------|--------------|--------------------------------|-------------| -| mtas.list.\<identifier\>.key | \<string\> | key used in response | no | -| mtas.list.\<identifier\>.field | \<string\> | Mtas field | yes | -| mtas.list.\<identifier\>.query.type | \<string\> | query language: [cql](search_cql.html) | yes | -| mtas.list.\<identifier\>.query.value | \<string\> | query: [cql](search_cql.html) | yes | -| mtas.list.\<identifier\>.query.prefix | \<string\> | default prefix | no | -| mtas.list.\<identifier\>.query.ignore | \<string\> | ignore query: [cql](search_cql.html) | no | -| mtas.list.\<identifier\>.query.maximumIgnoreLength | \<integer\> | maximum number of succeeding occurrences to ignore | no | -| mtas.list.\<identifier\>.prefix | \<string\> | comma separated list of prefixes | no | -| mtas.list.\<identifier\>.number | \<double\> | maximum number of items in list | no | -| mtas.list.\<identifier\>.start | \<double\> | offset for selection of items in list | no | -| mtas.list.\<identifier\>.left | \<double\> | number of positions left of hit | no | -| mtas.list.\<identifier\>.right | \<double\> | number of positions right of hit | no | -| mtas.list.\<identifier\>.output | \<string\> | "token" or "hit" | no | - -## Variables - -The query may contain one or more variables, and the value(s) of these variables have to be defined - -| Parameter | Value | Info | Obligatory | -|-------------------------------------------------|--------------|--------------------------------|-------------| -| mtas.list.\<identifier\>.query.variable\<identifier variable\>.name | \<string\> | name of variable | yes | -| mtas.list.\<identifier\>.query.variable\<identifier variable\>.value | \<string\> | comma separated list of values | yes | - ---- - -## Examples -1. [Token](#token) : List of tokens with prefix *t*, *pos* and *s* for adjectives followed by a noun -2. 
[Hit](#hit) : List of hits with prefix *t*, *pos* and *s* for articles followed by an adjective and a noun -3. [Left and Right](#left-and-right) : List of tokens with prefix *t* and *s* for sentences starting with an article, expanded to the left and the right ---- - -<a name="token"></a> - -### Token - -**Example** -List with output type *token* and prefixes *t*, *pos* and *s* for adjectives followed by a noun - -**CQL** -`[pos="ADJ"][pos="N"]` - -**Request and response** -`fq=%7B%21mtas_cql+field%3D%22text%22+query%3D%22%5Bpos%3D%5C%22ADJ%5C%22%5D%5Bpos%3D%5C%22N%5C%22%5D%22+++%7D&q=%2A%3A%2A&mtas=true&mtas.list=true&mtas.list.0.field=text&mtas.list.0.query.type=cql&mtas.list.0.query.value=%5Bpos%3D%22ADJ%22%5D%5Bpos%3D%22N%22%5D&mtas.list.0.key=adjective%2Bnoun&mtas.list.0.prefix=t%2Cpos%2Cs&mtas.list.0.output=token&mtas.list.0.number=2&mtas.list.0.start=0&mtas.list.0.left=0&mtas.list.0.right=0&fl=%2A&rows=0&wt=json&indent=true` - -```json -"mtas":{ - "list":[{ - "key":"adjective+noun", - "number":2, - "list":[{ - "documentKey":"44e5620c-011c-11e4-b0ff-51bcbd7c379f", - "documentHitPosition":0, - "documentHitTotal":239, - "documentMinPosition":0, - "documentMaxPosition":6385, - "startPosition":29, - "endPosition":30, - "tokens":[{ - "mtasId":191, - "prefix":"t", - "value":"beknopte", - "positionStart":29, - "positionEnd":29, - "parentMtasId":337}, - { - "mtasId":197, - "prefix":"pos", - "value":"ADJ", - "positionStart":29, - "positionEnd":29}, - { - "mtasId":199, - "prefix":"t", - "value":"levensschets", - "positionStart":30, - "positionEnd":30, - "parentMtasId":337}, - { - "mtasId":204, - "prefix":"pos", - "value":"N", - "positionStart":30, - "positionEnd":30}, - { - "mtasId":337, - "prefix":"s", - "value":"", - "positionStart":7, - "positionEnd":49, - "parentMtasId":1152}]}, - { - "documentKey":"44e5620c-011c-11e4-b0ff-51bcbd7c379f", - "documentHitPosition":1, - "documentHitTotal":239, - "documentMinPosition":0, - "documentMaxPosition":6385, - 
"startPosition":56, - "endPosition":57, - "tokens":[{ - "mtasId":380, - "prefix":"t", - "value":"gebied", - "positionStart":57, - "positionEnd":57, - "parentMtasId":610}, - { - "mtasId":387, - "prefix":"pos", - "value":"N", - "positionStart":57, - "positionEnd":57}, - { - "mtasId":373, - "prefix":"t", - "value":"velerlei", - "positionStart":56, - "positionEnd":56, - "parentMtasId":610}, - { - "mtasId":378, - "prefix":"pos", - "value":"ADJ", - "positionStart":56, - "positionEnd":56}, - { - "mtasId":610, - "prefix":"s", - "value":"", - "positionStart":50, - "positionEnd":90, - "parentMtasId":1152}]}]}]} -``` - - -<a name="hit"></a> - -### Hit - -**Example** -List with output type *hit* and prefixes *t*, *pos* and *s* for articles followed by an adjective and a noun - -**CQL** -`[pos="LID"][pos="ADJ"][pos="N"]` - -**Request and response** -`fq=%7B%21mtas_cql+field%3D%22text%22+query%3D%22%5Bpos%3D%5C%22LID%5C%22%5D%5Bpos%3D%5C%22ADJ%5C%22%5D%5Bpos%3D%5C%22N%5C%22%5D%22+++%7D&q=%2A%3A%2A&mtas=true&mtas.list=true&mtas.list.0.field=text&mtas.list.0.query.type=cql&mtas.list.0.query.value=%5Bpos%3D%22LID%22%5D%5Bpos%3D%22ADJ%22%5D%5Bpos%3D%22N%22%5D&mtas.list.0.key=article%2Badjective%2Bnoun&mtas.list.0.prefix=t%2Cpos%2Cs&mtas.list.0.output=hit&mtas.list.0.number=2&mtas.list.0.start=0&mtas.list.0.left=0&mtas.list.0.right=0&fl=%2A&rows=0&wt=json&indent=true` - -```json -"mtas":{ - "list":[{ - "key":"article+adjective+noun", - "number":2, - "list":[{ - "documentKey":"44e5620c-011c-11e4-b0ff-51bcbd7c379f", - "documentHitPosition":0, - "documentHitTotal":80, - "documentMinPosition":0, - "documentMaxPosition":6385, - "startPosition":210, - "endPosition":212, - "hit":{ - "210":[["t", - "het"], - ["pos", - "LID"], - ["s", - null]], - "211":[["t", - "Middelbaar"], - ["pos", - "ADJ"], - ["s", - null]], - "212":[["t", - "Onderwijs"], - ["pos", - "N"], - ["s", - null]]}}, - { - "documentKey":"44e5620c-011c-11e4-b0ff-51bcbd7c379f", - "documentHitPosition":1, - "documentHitTotal":80, - 
"documentMinPosition":0, - "documentMaxPosition":6385, - "startPosition":237, - "endPosition":239, - "hit":{ - "237":[["t", - "het"], - ["pos", - "LID"], - ["s", - null]], - "238":[["t", - "Middelbaar"], - ["pos", - "ADJ"], - ["s", - null]], - "239":[["t", - "Onderwijs"], - ["pos", - "N"], - ["s", - null]]}}]}]} -``` - ---- - -<a name="left-and-right"></a> - -### Left and Right - -**Example** -List with output type *token* and prefixes *t* and *s* for sentences starting with an article, expanded two positions to the left and one position to the right - -**CQL** -`<s>[pos="LID"]` - -**Request and response** -`fq=%7B%21mtas_cql+field%3D%22text%22+query%3D%22%3Cs%3E%5Bpos%3D%5C%22LID%5C%22%5D%22+++%7D&q=%2A%3A%2A&mtas=true&mtas.list=true&mtas.list.0.field=text&mtas.list.0.query.type=cql&mtas.list.0.query.value=%3Cs%3E%5Bpos%3D%22LID%22%5D&mtas.list.0.key=sentence+starting+with+article&mtas.list.0.prefix=t%2Cs&mtas.list.0.output=token&mtas.list.0.number=2&mtas.list.0.start=0&mtas.list.0.left=2&mtas.list.0.right=1&fl=%2A&rows=0&wt=json&indent=true` - -```json -"mtas":{ - "list":[{ - "key":"sentence starting with article", - "number":2, - "list":[{ - "documentKey":"44e5620c-011c-11e4-b0ff-51bcbd7c379f", - "documentHitPosition":0, - "documentHitTotal":18, - "documentMinPosition":0, - "documentMaxPosition":6385, - "startPosition":378, - "endPosition":378, - "tokens":[{ - "mtasId":2534, - "prefix":"t", - "value":"leven", - "positionStart":379, - "positionEnd":379, - "parentMtasId":2914}, - { - "mtasId":2517, - "prefix":"t", - "value":".", - "positionStart":377, - "positionEnd":377, - "parentMtasId":2526}, - { - "mtasId":2527, - "prefix":"t", - "value":"Het", - "positionStart":378, - "positionEnd":378, - "parentMtasId":2914}, - { - "mtasId":2914, - "prefix":"s", - "value":"", - "positionStart":378, - "positionEnd":433, - "parentMtasId":2915}, - { - "mtasId":2512, - "prefix":"t", - "value":"Landbouwkundige", - "positionStart":376, - "positionEnd":376, - "parentMtasId":2526}, 
- { - "mtasId":2526, - "prefix":"s", - "value":"", - "positionStart":307, - "positionEnd":377, - "parentMtasId":2915}]}, - { - "documentKey":"44e5620c-011c-11e4-b0ff-51bcbd7c379f", - "documentHitPosition":1, - "documentHitTotal":18, - "documentMinPosition":0, - "documentMaxPosition":6385, - "startPosition":878, - "endPosition":878, - "tokens":[{ - "mtasId":5794, - "prefix":"t", - "value":"De", - "positionStart":878, - "positionEnd":878, - "parentMtasId":5999}, - { - "mtasId":5801, - "prefix":"t", - "value":"eerzucht", - "positionStart":879, - "positionEnd":879, - "parentMtasId":5999}, - { - "mtasId":5999, - "prefix":"s", - "value":"", - "positionStart":878, - "positionEnd":908, - "parentMtasId":6305}, - { - "mtasId":5779, - "prefix":"t", - "value":"bewaarheid", - "positionStart":876, - "positionEnd":876, - "parentMtasId":5792}, - { - "mtasId":5786, - "prefix":"t", - "value":".", - "positionStart":877, - "positionEnd":877, - "parentMtasId":5792}, - { - "mtasId":5792, - "prefix":"s", - "value":"", - "positionStart":857, - "positionEnd":877, - "parentMtasId":5793}]}]}]} -``` - ---- - -**Lucene** - -To get a list of hits [directly in Lucene](installation_lucene.html), *ComponentList* together with the provided *collect* method can be used. diff --git a/src/site/markdown/search_query_prefix.md b/src/site/markdown/search_query_prefix.md deleted file mode 100644 index 4d56d20..0000000 --- a/src/site/markdown/search_query_prefix.md +++ /dev/null @@ -1,73 +0,0 @@ -#Prefix - -Mtas can produce a list of available prefixes. To get this information, in Solr requests, besides the parameter to enable [Mtas queries](search_query.html), the following parameter should be provided. - -| Parameter | Value | Obligatory | -|-----------------------|--------|-------------| -| mtas.stats.prefix | true | yes | - -Information for multiple fields can be produced within the same request. To distinguish them, a unique identifier has to be provided for each of the required statistics. 
The list of available prefixes is independent of any restriction in the document set, and also prefixes of deleted documents can be taken into account when the core hasn't been optimized. - -| Parameter | Value | Info | Obligatory | -|-------------------------------------------------|--------------|--------------------------------|-------------| -| mtas.stats.prefix.\<identifier\>.key | \<string\> | key used in response | no | -| mtas.stats.prefix.\<identifier\>.field | \<string\> | Mtas field | yes | - -The *key* is added to the response and may be used to distinguish between multiple lists, and should therefore be unique. The response will contain three lists: prefixes strictly used for single position tokens, prefixes (also) used for multiple position tokens and prefixes used for multiple non-adjacent positions. Notice that the last list will always be a subset of the second list. - -## Examples -1. [Basic](#basic) : list of available prefixes. - -<a name="basic"></a> - -### Basic - -**Example** -List of available prefixes. - -**Request and response** -`q=*%3A*&mtas=true&mtas.prefix=true&mtas.prefix.0.field=text&mtas.prefix.0.key=example+-+basic&rows=0&wt=json&indent=true` - -``` json -"mtas":{ - "prefix":[{ - "key":"example - basic", - "singlePosition":["feat.buiging", - "feat.conjtype", - "feat.dial", - "feat.genus", - "feat.getal", - "feat.getal-n", - "feat.graad", - "feat.head", - "feat.lwtype", - "feat.naamval", - "feat.npagr", - "feat.ntype", - "feat.numtype", - "feat.pdtype", - "feat.persoon", - "feat.positie", - "feat.pvagr", - "feat.pvtijd", - "feat.spectype", - "feat.status", - "feat.vwtype", - "feat.vztype", - "feat.wvorm", - "lemma", - "morpheme", - "pos", - "t", - "t_lc"], - "multiplePosition":["div", - "entity", - "head", - "p", - "s"], - "setPosition":["entity"]}]} -``` - -**Lucene** - -To get a list of prefixes [directly in Lucene](installation_lucene.html), *ComponentPrefix* together with the provided *collect* method can be used.
\ No newline at end of file diff --git a/src/site/markdown/search_query_stats.md b/src/site/markdown/search_query_stats.md deleted file mode 100644 index 0456640..0000000 --- a/src/site/markdown/search_query_stats.md +++ /dev/null @@ -1,10 +0,0 @@ -#Statistics - -To get statistics in Solr requests, besides the parameter to enable [Mtas queries](search_query.html), the following parameter should be used. - -| Parameter | Value | Obligatory | -|-------------|--------|-------------| -| mtas.stats | true | yes | - -Using this parameter, it is possible to add statistics on [positions](search_query_stats_positions.html), [tokens](search_query_stats_tokens.html) and [spans](search_query_stats_spans.html) to the response on a request. - diff --git a/src/site/markdown/search_query_stats_positions.md b/src/site/markdown/search_query_stats_positions.md deleted file mode 100644 index 34c56ba..0000000 --- a/src/site/markdown/search_query_stats_positions.md +++ /dev/null @@ -1,142 +0,0 @@ -#Statistics - positions - -To get statistics on the number of positions within a set of documents in Solr requests, besides the parameter to enable [statistics](search_query_stats.html), the following parameter should be provided. - -| Parameter | Value | Obligatory | -|-----------------------|--------|-------------| -| mtas.stats.positions | true | yes | - -Multiple statistics on positions can be produced within the same request. -To distinguish them, a unique identifier has to be provided for -each of the required statistics. 
- -| Parameter | Value | Info | Obligatory | -|-------------------------------------------------|--------------|--------------------------------|-------------| -| mtas.stats.positions.\<identifier\>.key | \<string\> | key used in response | no | -| mtas.stats.positions.\<identifier\>.field | \<string\> | Mtas field | yes | -| mtas.stats.positions.\<identifier\>.type | \<string\> | required [type of statistics](search_stats.html) | no | -| mtas.stats.positions.\<identifier\>.minimum | \<double\> | minimum number of positions | no | -| mtas.stats.positions.\<identifier\>.maximum | \<double\> | maximum number of positions | no | - -The *key* is added to the response and may be used to distinguish between multiple statistics on positions, and should therefore be unique. The optional *minimum* and *maximum* can be used to focus only on documents satisfying a condition on the number of positions. - ---- - -## Examples -1. [Basic](#basic) : basic statistics on the number of positions. -2. [Minimum and maximum](#minimum-and-maximum) : statistics on the number of positions with restrictions on this number. -3. [Subset](#subset) : statistics on the number of positions within a subset of documents. - ---- - -<a name="basic"></a> - -### Basic - -**Example** -Total and average number of positions and the number of documents. - -**Request and response** -`q=*%3A*&rows=0&mtas=true&mtas.stats=true&mtas.stats.positions=true&mtas.stats.positions.0.field=text&mtas.stats.positions.0.key=example - basic&mtas.stats.positions.0.type=sum,mean,n&wt=json&indent=true` - -``` json -"mtas":{ - "stats":{ - "positions":[{ - "key":"example - basic", - "mean":244.26537188929916, - "sum":504361094, - "n":2064808}]}} -``` - -<a name="minimum-and-maximum"></a> - -### Minimum and maximum - -**Example** -Full statistics on positions for documents with a minimum of 100 positions, for documents with a maximum of 200 positions, and for documents with between 100 and 200 positions. 
- -**Request and response** -`q=*%3A*&rows=0&mtas=true&mtas.stats=true&mtas.stats.positions=true&mtas.stats.positions.0.field=text&mtas.stats.positions.0.key=example - minimum&mtas.stats.positions.0.type=all&mtas.stats.positions.0.minimum=100&mtas.stats.positions.1.field=text&mtas.stats.positions.1.key=example - maximum&mtas.stats.positions.1.type=all&mtas.stats.positions.1.maximum=200&mtas.stats.positions.2.field=text&mtas.stats.positions.2.key=example - minimum and maximum&mtas.stats.positions.2.type=all&mtas.stats.positions.2.minimum=100&mtas.stats.positions.2.maximum=200&wt=json&indent=true` - -``` json -"mtas":{ - "stats":{ - "positions":[{ - "key":"example - minimum", - "sumsq":4.407777345501E12, - "populationvariance":4021377.043206717, - "max":419252.0, - "sum":4.53494907E8, - "kurtosis":7589.040501278469, - "standarddeviation":2005.3380969650148, - "n":1047253, - "quadraticmean":2051.5590305379797, - "min":100.0, - "median":232.0, - "variance":4021380.883139267, - "mean":433.0328077360544, - "geometricmean":269.1549624469481, - "sumoflogs":5859681.392265234, - "skewness":70.39565176567714}, - { - "key":"example - maximum", - "sumsq":1.2589493055E10, - "populationvariance":2516.516960673755, - "max":200.0, - "sum":1.14146849E8, - "kurtosis":-0.5513713934014715, - "standarddeviation":50.164914844725146, - "n":1462493, - "quadraticmean":92.78060994263417, - "min":0.0, - "median":68.0, - "variance":2516.5186813785253, - "mean":78.04950109162947, - "geometricmean":0.0, - "sumoflogs":"-Infinity", - "skewness":0.6202671670124106}, - { - "key":"example - minimum and maximum", - "sumsq":9.370630488E9, - "populationvariance":832.9926334704653, - "max":200.0, - "sum":6.3280662E7, - "kurtosis":-1.0893405044786282, - "standarddeviation":28.861644194831847, - "n":444938, - "quadraticmean":145.12246855142547, - "min":100.0, - "median":139.0, - "variance":832.9945056290709, - "mean":142.22355024745016, - "geometricmean":139.3394542837307, - "sumoflogs":2196620.2289446634, 
- "skewness":0.31081665704505534}]}} -``` - -<a name="subset"></a> - -### Subset - -**Example** -Total and average number of positions and the number of documents for a subset of documents. - -**Request and response** -`q=text:koe&rows=0&mtas=true&mtas.stats=true&mtas.stats.positions=true&mtas.stats.positions.0.field=text&mtas.stats.positions.0.key=example - subset&mtas.stats.positions.0.type=sum,mean,n&wt=json&indent=true` - -``` json -"mtas":{ - "stats":{ - "positions":[{ - "key":"example - subset", - "mean":5265.321033210332, - "sum":14269020, - "n":2710}]}} -``` - ---- - -##Lucene - -To use statistics on the number of positions [directly in Lucene](installation_lucene.html), *ComponentPosition* together with the provided *collect* method can be used. diff --git a/src/site/markdown/search_query_stats_spans.md b/src/site/markdown/search_query_stats_spans.md deleted file mode 100644 index 4af2ba0..0000000 --- a/src/site/markdown/search_query_stats_spans.md +++ /dev/null @@ -1,416 +0,0 @@ -#Statistics - spans - -To get statistics on the occurrence of a span within a set of documents in Solr requests, besides the parameter to enable [statistics](search_query_stats.html), the following parameter should be provided. - -| Parameter | Value | Obligatory | -|-----------------------|--------|-------------| -| mtas.stats.spans | true | yes | - -Multiple statistics on the occurrence of a span can be produced within the same request. To distinguish them, a unique identifier has to be provided for each of the required statistics. Furthermore, statistics for the occurrence of multiple spans can be produced. Spans are described by a query, and to distinguish multiple spans, also a query identifier has to be provided. 
- -| Parameter | Value | Info | Obligatory | -|-------------------------------------------------|--------------|--------------------------------|-------------| -| mtas.stats.spans.\<identifier\>.key | \<string\> | key used in response | no | -| mtas.stats.spans.\<identifier\>.field | \<string\> | Mtas field | yes | -| mtas.stats.spans.\<identifier\>.query.\<identifier query\>.type | \<string\> | query language: [cql](search_cql.html) | yes | -| mtas.stats.spans.\<identifier\>.query.\<identifier query\>.value | \<string\> | query: [cql](search_cql.html) | yes | -| mtas.stats.spans.\<identifier\>.query.\<identifier query\>.prefix | \<string\> | default prefix | no | -| mtas.stats.spans.\<identifier\>.query.\<identifier query\>.ignore | \<string\> | ignore query: [cql](search_cql.html) | no | -| mtas.stats.spans.\<identifier\>.query.\<identifier query\>.maximumIgnoreLength | \<integer\> | maximum number of succeeding occurrences to ignore | no | -| mtas.stats.spans.\<identifier\>.type | \<string\> | required [type of statistics](search_stats.html) | no | -| mtas.stats.spans.\<identifier\>.minimum | \<double\> | minimum number of occurrences span | no | -| mtas.stats.spans.\<identifier\>.maximum | \<double\> | maximum number of occurrences span | no | - -The *key* is added to the response and may be used to distinguish between multiple statistics on the occurrence of spans, and should therefore be unique. The optional *minimum* and *maximum* can be used to focus only on documents satisfying a condition on the number of occurrences of the spans. When multiple queries are provided, the provided boundary will hold on the sum of occurrences of the resulting spans. 
- -## Variables - -The query may contain one or more variables, and the value(s) of these variables have to be defined - -| Parameter | Value | Info | Obligatory | -|-------------------------------------------------|--------------|--------------------------------|-------------| -| mtas.stats.spans.\<identifier\>.query.\<identifier query\>.variable\<identifier variable\>.name | \<string\> | name of variable | yes | -| mtas.stats.spans.\<identifier\>.query.\<identifier query\>.variable\<identifier variable\>.value | \<string\> | comma separated list of values | yes | - -## Functions - -To compute statistics for values based on the occurrence of one or multiple spans, optionally [functions](search_functions.html) can be added. The parameters for these functions are the number of occurrences *$q0*, *$q1*, ... for each span and the number of positions *$n* in a document. Statistics on the value computed for each document in the set are added to the response. - -| Parameter | Value | Info | Obligatory | -|-------------------------------------------------|--------------|--------------------------------|-------------| -| mtas.stats.spans.\<identifier\>.function.\<identifier function\>.key | \<string\> | key used in response | no | -| mtas.stats.spans.\<identifier\>.function.\<identifier function\>.expression | \<string\> | see [functions](search_functions.html) | yes | -| mtas.stats.spans.\<identifier\>.function.\<identifier function\>.type | \<string\> | required [type of statistics](search_stats.html) | no | - -Again, the *key* is added to the response and may be used to distinguish between multiple functions, and should therefore be unique. - ---- - -## Examples -1. [Basic](#basic) : basic statistics on the occurrence of a word. -2. [Minimum and Maximum](#minimum-and-maximum) : statistics on the occurrence of a word with restrictions on the number of occurrences. -3. [Subset](#subset) : statistics on the occurrence of a word within a subset of documents. -4. 
[Multiple](#multiple) : statistics on the occurrence of multiple words. -5. [Prefix](#prefix) : default prefix for query -6. [Ignore](#ignore) : query with ignore -7. [Ignore and maximumIgnoreLength](#ignore-and-maximumignorelength) : query with ignore and maximumIgnoreLength -8. [Functions](#functions) : statistics using functions. -9. [Multiple and Functions](#multiple-and-functions) : statistics using functions on the occurrence of multiple words. - ---- - -<a name="basic"></a> - -### Basic - -**Example** -Total and average number of occurrences of the word "de" and the number of documents. - -**CQL** -`[t="de"]` - -**Request and response** -`q=*%3A*&mtas=true&mtas.stats=true&mtas.stats.spans=true&mtas.stats.spans.0.field=text&mtas.stats.spans.0.query.0.type=cql&mtas.stats.spans.0.query.0.value=%5Bt%3D%22de%22%5D&mtas.stats.spans.0.key=example - basic&mtas.stats.spans.0.type=n%2Csum%2Cmean&rows=0&wt=json&indent=true` - -``` json -"mtas":{ - "stats":{ - "spans":[{ - "key":"example - basic", - "mean":10.488239100197209, - "sum":21656200, - "n":2064808}]}} -``` - -<a name="minimum-and-maximum"></a> - -### Minimum and Maximum - -**Example** -Full statistics on the number of occurrences of the word "de" for documents with a minimum of 100 occurrences, for documents with a maximum of 200 occurrences, and for documents with between 100 and 200 occurrences.
- -**CQL** -`[t="de"]` - -**Request and response** -`q=*%3A*&mtas=true&mtas.stats=true&mtas.stats.spans=true&mtas.stats.spans.0.field=text&mtas.stats.spans.0.query.0.type=cql&mtas.stats.spans.0.query.0.value=[t%3D"de"]&mtas.stats.spans.0.key=example - minimum&mtas.stats.spans.0.type=all&mtas.stats.spans.0.minimum=100&mtas.stats.spans.1.field=text&mtas.stats.spans.1.query.0.type=cql&mtas.stats.spans.1.query.0.value=[t%3D"de"]&mtas.stats.spans.1.key=example - maximum&mtas.stats.spans.1.type=all&mtas.stats.spans.1.maximum=200&mtas.stats.spans.2.field=text&mtas.stats.spans.2.query.0.type=cql&mtas.stats.spans.2.query.0.value=[t%3D"de"]&mtas.stats.spans.2.key=example - minimum and maximum&mtas.stats.spans.2.type=all&mtas.stats.spans.2.minimum=100&mtas.stats.spans.2.maximum=200&rows=0&wt=json&indent=true` - -``` json -"mtas":{ - "stats":{ - "spans":[{ - "key":"example - minimum", - "sumsq":8.697655383E9, - "populationvariance":419224.862744871, - "max":18192.0, - "sum":4531747.0, - "kurtosis":164.01633761739456, - "standarddeviation":647.4937185426337, - "n":18030, - "quadraticmean":694.5495506941058, - "min":100.0, - "median":136.0, - "variance":419248.1155521673, - "mean":251.3448141985584, - "geometricmean":160.50112302303313, - "sumoflogs":91561.76594051626, - "skewness":10.552060273112971}, - { - "key":"example - maximum", - "sumsq":7.37391079E8, - "populationvariance":271.8217238864797, - "max":200.0, - "sum":1.9102393E7, - "kurtosis":31.734626574581217, - "standarddeviation":16.487020826545898, - "n":2061623, - "quadraticmean":18.91229851589547, - "min":0.0, - "median":4.0, - "variance":271.82185573495815, - "mean":9.265706193615522, - "geometricmean":0.0, - "sumoflogs":"-Infinity", - "skewness":4.741031505227169}, - { - "key":"example - minimum and maximum", - "sumsq":2.73698488E8, - "populationvariance":684.3248008017308, - "max":200.0, - "sum":1977940.0, - "kurtosis":-0.47377181206297303, - "standarddeviation":26.16048359466255, - "n":14845, - 
"quadraticmean":135.78321834689768, - "min":100.0, - "median":127.0, - "variance":684.3709019066084, - "mean":133.23947457056252, - "geometricmean":130.83072059647412, - "sumoflogs":72353.10901272473, - "skewness":0.7177265003819447}]}} -``` - -<a name="subset"></a> - -### Subset - -**Example** -Total and average number of occurrences of the word "de" and the number of documents for a subset of documents. - -**CQL** -`[t="de"]` - -**Request and response** -`q=text:koe&rows=0&mtas=true&mtas.stats=true&mtas.stats.tokens=true&mtas.stats.tokens.0.field=text&mtas.stats.tokens.0.key=example - subset&mtas.stats.tokens.0.type=sum,mean,n&wt=json&indent=true` - -``` json -"mtas":{ - "stats":{ - "tokens":[{ - "key":"example - subset", - "mean":42901.60996309963, - "sum":116263363, - "n":2710}]}} -``` - -<a name="multiple"></a> - -### Multiple - -**Example** -Total and average number of occurrences of the word "de" and "het", and the number of documents. - -**CQL** -1. combined cql: `[t="de"|t="het"]` -2. combined regexp: `[t="(de|het)"]` -3. 
two queries: `[t="de"]` `[t="het"]` - -**Request and response** -`q=*%3A*&mtas=true&mtas.stats=true&mtas.stats.spans=true&mtas.stats.spans.0.field=text&mtas.stats.spans.0.query.0.type=cql&mtas.stats.spans.0.query.0.value=[t%3D"de"|t%3D"het"]&mtas.stats.spans.0.key=multiple+-+combined+cql&mtas.stats.spans.0.type=n%2Csum%2Cmean&mtas.stats.spans.1.field=text&mtas.stats.spans.1.query.0.type=cql&mtas.stats.spans.1.query.0.value=[t%3D"(de|het)"]&mtas.stats.spans.1.key=multiple+-+combined+regexp&mtas.stats.spans.1.type=n%2Csum%2Cmean&mtas.stats.spans.2.field=text&mtas.stats.spans.2.query.0.type=cql&mtas.stats.spans.2.query.0.value=[t%3D"de"]&mtas.stats.spans.2.query.1.type=cql&mtas.stats.spans.2.query.1.value=[t%3D"het"]&mtas.stats.spans.2.key=multiple+-+two+queries&mtas.stats.spans.2.type=n%2Csum%2Cmean&rows=0&wt=json&indent=true` - -``` json -"mtas":{ - "stats":{ - "spans":[{ - "key":"multiple - combined cql", - "mean":15.178130848001365, - "sum":31339926, - "n":2064808}, - { - "key":"multiple - combined regexp", - "mean":15.178130848001365, - "sum":31339926, - "n":2064808}, - { - "key":"multiple - two queries", - "mean":15.178130848001365, - "sum":31339926, - "n":2064808}]}} -``` - -<a name="prefix"></a> - -### Prefix - -**Example** -Total and average number of occurrences of the word "de" followed by an adjective. 
- -**CQL** -`"de" [pos="ADJ"]` - -**Request and response** -`q=*%3A*&mtas=true&mtas.stats=true&mtas.stats.spans=true&mtas.stats.spans.0.field=text&mtas.stats.spans.0.query.0.type=cql&mtas.stats.spans.0.query.0.value="de" [pos%3D"ADJ"]&mtas.stats.spans.0.query.0.prefix=t_lc&mtas.stats.spans.0.key=example - prefix&mtas.stats.spans.0.type=n%2Csum%2Cmean&rows=0&wt=json&indent=true` - -``` json -"mtas":{ - "stats":{ - "spans":[{ - "key":"example - prefix", - "mean":2.1725308115815127, - "sum":4485859, - "n":2064808}]}} -``` - -<a name="ignore"></a> - -### Ignore - -**Example** -Total and average number of occurrences of an article followed by a noun, ignoring adjectives. - -**CQL** -`[pos="LID"][pos="N"]` - -**Ignore** -`[pos="ADJ"]` - - -**Request and response** -`q=*%3A*&mtas=true&mtas.stats=true&mtas.stats.spans=true&mtas.stats.spans.0.field=text&mtas.stats.spans.0.query.0.type=cql&mtas.stats.spans.0.query.0.value=[t_lc%3D"de"]&mtas.stats.spans.0.key=functions+-+de&mtas.stats.spans.0.type=n%2Csum%2Cmean&mtas.stats.spans.0.function.0.expression=%24q0%2F%24n&mtas.stats.spans.0.function.0.key=relative+frequency&mtas.stats.spans.0.function.0.type=mean%2Cstandarddeviation%2Cdistribution(start%3D0%2Cend%3D0.1%2Cnumber%3D10)&mtas.stats.spans.0.function.1.expression=%24n&mtas.stats.spans.0.function.1.key=number+of+words&mtas.stats.spans.0.function.1.type=n%2Csum&rows=0&wt=json&indent=true` - -``` json -"mtas":{ - "stats":{ - "spans":[{ - "key":"functions - de", - "mean":12.352043386116287, - "sum":25504598, - "n":2064808, - "functions":{ - "number of words":{ - "sum":504361094, - "n":2064808}, - "relative frequency":{ - "distribution(start=0,end=0.1,number=10)":{ - "[0.000,0.010)":390003, - "[0.010,0.020)":120903, - "[0.020,0.030)":173830, - "[0.030,0.040)":209994, - "[0.040,0.050)":245098, - "[0.050,0.060)":253528, - "[0.060,0.070)":218325, - "[0.070,0.080)":163982, - "[0.080,0.090)":115929, - "[0.090,0.100)":77207}, - "mean":0.04538673326024501, - "errorList":{"division by 
zero":1039}, - "standarddeviation":0.03284884758453086, - "errorNumber":1039}}}]}} -``` - -<a name="ignore-and-maximumignorelength"></a> - -### Ignore and maximumIgnoreLength - -<a name="functions"></a> - -### Functions - -**Example** -Statistics for the relative frequency of the word "de" and the total number of words in documents containing this word. - -**CQL** -`[t="de"]` - -**Functions** -`$q0/$n` -`$n` - -**Request and response** -`q=*%3A*&mtas=true&mtas.stats=true&mtas.stats.spans=true&mtas.stats.spans.0.field=text&mtas.stats.spans.0.query.0.type=cql&mtas.stats.spans.0.query.0.value=[t_lc%3D"de"]&mtas.stats.spans.0.key=functions+-+de&mtas.stats.spans.0.type=n%2Csum%2Cmean&mtas.stats.spans.0.function.0.expression=%24q0%2F%24n&mtas.stats.spans.0.function.0.key=relative+frequency&mtas.stats.spans.0.function.0.type=mean%2Cstandarddeviation%2Cdistribution(start%3D0%2Cend%3D0.1%2Cnumber%3D10)&mtas.stats.spans.0.function.1.expression=%24n&mtas.stats.spans.0.function.1.key=number+of+words&mtas.stats.spans.0.function.1.type=n%2Csum&rows=0&wt=json&indent=true` - -``` json -"mtas":{ - "stats":{ - "spans":[{ - "key":"functions - de", - "mean":12.352043386116287, - "sum":25504598, - "n":2064808, - "functions":{ - "number of words":{ - "sum":504361094, - "n":2064808}, - "relative frequency":{ - "distribution(start=0,end=0.1,number=10)":{ - "[0.000,0.010)":390003, - "[0.010,0.020)":120903, - "[0.020,0.030)":173830, - "[0.030,0.040)":209994, - "[0.040,0.050)":245098, - "[0.050,0.060)":253528, - "[0.060,0.070)":218325, - "[0.070,0.080)":163982, - "[0.080,0.090)":115929, - "[0.090,0.100)":77207}, - "mean":0.04538673326024501, - "errorList":{"division by zero":1039}, - "standarddeviation":0.03284884758453086, - "errorNumber":1039}}}]}} -``` - -<a name="multiple-and-functions"></a> - -### Multiple and Functions - -**Example** -Statistics for the absolute and relative frequency of the words "de", "het" and "een", for *part of speech* type "LID" and the total number of words in 
documents containing this word. - -**CQL** -`[t="de"]` -`[t="het"]` -`[t="een"]` -`[pos="LID"]` - -**Functions** -`$q0/$n` -`$q1/$n` -`$q2/$n` -`$q3/$n` -`$q0/$q3` -`$q1/$q3` -`$q2/$q3` -`($q0+$q1+$q2)/$q3` - -**Request and response** -`q=*%3A*&mtas=true&mtas.stats=true&mtas.stats.spans=true&mtas.stats.spans.0.field=text&mtas.stats.spans.0.query.0.type=cql&mtas.stats.spans.0.query.0.value=[t_lc%3D"de"]&mtas.stats.spans.0.query.1.type=cql&mtas.stats.spans.0.query.1.value=[t_lc%3D"het"]&mtas.stats.spans.0.query.2.type=cql&mtas.stats.spans.0.query.2.value=[t_lc%3D"een"]&mtas.stats.spans.0.query.3.type=cql&mtas.stats.spans.0.query.3.value=[pos%3D"LID"]&mtas.stats.spans.0.key=multiple+and+functions+-+de%2Bhet%2Been+and+LID&mtas.stats.spans.0.type=n&mtas.stats.spans.0.minimum=1&mtas.stats.spans.0.function.0.expression=%24q0&mtas.stats.spans.0.function.0.key=de+-+absolute&mtas.stats.spans.0.function.0.type=n%2Csum&mtas.stats.spans.0.function.1.expression=%24q1&mtas.stats.spans.0.function.1.key=het+-+absolute&mtas.stats.spans.0.function.1.type=n%2Csum&mtas.stats.spans.0.function.2.expression=%24q2&mtas.stats.spans.0.function.2.key=een+-+absolute&mtas.stats.spans.0.function.2.type=n%2Csum&mtas.stats.spans.0.function.3.expression=%24q3&mtas.stats.spans.0.function.3.key=LID+-+absolute&mtas.stats.spans.0.function.3.type=n%2Csum&mtas.stats.spans.0.function.4.expression=%24q0%2F%24n&mtas.stats.spans.0.function.4.key=de+-+relative+to+positions&mtas.stats.spans.0.function.4.type=n%2Cmean&mtas.stats.spans.0.function.5.expression=%24q1%2F%24n&mtas.stats.spans.0.function.5.key=het+-+relative+to+positions&mtas.stats.spans.0.function.5.type=n%2Cmean&mtas.stats.spans.0.function.6.expression=%24q2%2F%24n&mtas.stats.spans.0.function.6.key=een+-+relative+to+positions&mtas.stats.spans.0.function.6.type=n%2Cmean&mtas.stats.spans.0.function.7.expression=%24q3%2F%24n&mtas.stats.spans.0.function.7.key=LID+-+relative+to+positions&mtas.stats.spans.0.function.7.type=n%2Cmean&mtas.stats.spans.0.func
tion.8.expression=%24q0%2F%24q3&mtas.stats.spans.0.function.8.key=de+-+relative+to+LID&mtas.stats.spans.0.function.8.type=n%2Cmean&mtas.stats.spans.0.function.9.expression=%24q1%2F%24q3&mtas.stats.spans.0.function.9.key=het+-+relative+to+LID&mtas.stats.spans.0.function.9.type=n%2Cmean&mtas.stats.spans.0.function.10.expression=%24q2%2F%24q3&mtas.stats.spans.0.function.10.key=een+-+relative+to+LID&mtas.stats.spans.0.function.10.type=n%2Cmean&mtas.stats.spans.0.function.11.expression=(%24q0%2B%24q1%2B%24q2)%2F%24q3&mtas.stats.spans.0.function.11.key=de%2Bhet%2Been+-+relative+to+LID&mtas.stats.spans.0.function.11.type=n%2Cmean&rows=0&wt=json&indent=true` - -``` json -"mtas":{ - "stats":{ - "spans":[{ - "key":"multiple and functions - de+het+een and LID", - "n":1890377, - "functions":{ - "een - relative to LID":{ - "mean":0.26177400695591124, - "errorList":{"division by zero":24175}, - "n":1890377, - "errorNumber":24175}, - "LID - absolute":{ - "sum":44077220, - "n":1890377}, - "de+het+een - relative to LID":{ - "mean":1.0864079360130154, - "errorList":{"division by zero":24175}, - "n":1890377, - "errorNumber":24175}, - "het - relative to LID":{ - "mean":0.2740826070638114, - "errorList":{"division by zero":24175}, - "n":1890377, - "errorNumber":24175}, - "een - relative to positions":{ - "mean":0.021631171906706374, - "n":1890377}, - "een - absolute":{ - "sum":10620744, - "n":1890377}, - "het - relative to positions":{ - "mean":0.02235754528581941, - "n":1890377}, - "de - absolute":{ - "sum":25504598, - "n":1890377}, - "het - absolute":{ - "sum":11530937, - "n":1890377}, - "LID - relative to positions":{ - "mean":0.08693980190126971, - "n":1890377}, - "de - relative to LID":{ - "mean":0.5505513219945993, - "errorList":{"division by zero":24175}, - "n":1890377, - "errorNumber":24175}, - "de - relative to positions":{ - "mean":0.049574709134571515, - "n":1890377}}}]}} -``` - ---- - -##Lucene - -To use statistics on the occurrence of a span [directly in 
Lucene](installation_lucene.html), *ComponentSpan* together with the provided *collect* method can be used. \ No newline at end of file diff --git a/src/site/markdown/search_query_stats_tokens.md b/src/site/markdown/search_query_stats_tokens.md deleted file mode 100644 index 9c5eed6..0000000 --- a/src/site/markdown/search_query_stats_tokens.md +++ /dev/null @@ -1,142 +0,0 @@ -#Statistics - tokens - -To get statistics on the number of tokens within a set of documents in Solr requests, besides the parameter to enable [statistics](search_query_stats.html), the following parameter should be provided. - -| Parameter | Value | Obligatory | -|-----------------------|--------|-------------| -| mtas.stats.tokens | true | yes | - -Multiple statistics on tokens can be produced within the same request. -To distinguish them, a unique identifier has to be provided for -each of the required statistics. - -| Parameter | Value | Info | Obligatory | -|-------------------------------------------------|--------------|--------------------------------|-------------| -| mtas.stats.tokens.\<identifier\>.key | \<string\> | key used in response | no | -| mtas.stats.tokens.\<identifier\>.field | \<string\> | Mtas field | yes | -| mtas.stats.tokens.\<identifier\>.type | \<string\> | required [type of statistics](search_stats.html) | no | -| mtas.stats.tokens.\<identifier\>.minimum | \<double\> | minimum number of tokens | no | -| mtas.stats.tokens.\<identifier\>.maximum | \<double\> | maximum number of tokens | no | - -The *key* is added to the response and may be used to distinguish between multiple statistics on tokens, and should therefore be unique. The optional *minimum* and *maximum* can be used to focus only on documents satisfying a condition on the number of tokens. - ---- - -## Examples -1. [Basic](#basic) : basic statistics on the number of tokens. -2. [Minimum and maximum](#minimum-and-maximum) : statistics on the number of tokens with restrictions on this number. -3. 
[Subset](#subset) : statistics on the number of tokens within a subset of documents. - ---- - -<a name="basic"></a> - -### Basic - -**Example** -Total and average number of tokens and the number of documents. - -**Request and response** -`q=*%3A*&rows=0&mtas=true&mtas.stats=true&mtas.stats.tokens=true&mtas.stats.tokens.0.field=text&mtas.stats.tokens.0.key=example - basic&mtas.stats.tokens.0.type=sum,mean,n&wt=json&indent=true` - -``` json -"mtas":{ - "stats":{ - "tokens":[{ - "key":"example - basic", - "mean":1949.101406523028, - "sum":4024520177, - "n":2064808}]}} -``` - -<a name="minimum-and-maximum"></a> - -### Minimum and maximum - -**Example** -Full statistics on tokens for documents with a minimum of 500 tokens, for documents with a maximum of 1000 tokens, and for documents with between 500 and 1000 tokens. - -**Request and response** -`q=*%3A*&rows=0&mtas=true&mtas.stats=true&mtas.stats.tokens=true&mtas.stats.tokens.0.field=text&mtas.stats.tokens.0.key=example - minimum&mtas.stats.tokens.0.type=all&mtas.stats.tokens.0.minimum=500&mtas.stats.tokens.1.field=text&mtas.stats.tokens.1.key=example - maximum&mtas.stats.tokens.1.type=all&mtas.stats.tokens.1.maximum=1000&mtas.stats.tokens.2.field=text&mtas.stats.tokens.2.key=example - minimum and maximum&mtas.stats.tokens.2.type=all&mtas.stats.tokens.2.minimum=500&mtas.stats.tokens.2.maximum=1000&wt=json&indent=true` - -``` json -"mtas":{ - "stats":{ - "tokens":[{ - "key":"example - minimum", - "sumsq":2.91825668357275E14, - "populationvariance":2.022964435797023E8, - "max":3320612.0, - "sum":3.837278477E9, - "kurtosis":9580.99014557769, - "standarddeviation":14223.100544366072, - "n":1390207, - "quadraticmean":14488.452755067281, - "min":500.0, - "median":1359.0, - "variance":2.0229658909514648E8, - "mean":2760.2209433559033, - "geometricmean":1584.8982392362057, - "sumoflogs":1.0243428152831953E7, - "skewness":79.47215006871889}, - { - "key":"example - maximum", - "sumsq":3.33432806009E11, -
"populationvariance":65815.48228216589, - "max":1000.0, - "sum":5.49051031E8, - "kurtosis":-0.9495132030213522, - "standarddeviation":256.54539199058576, - "n":1178024, - "quadraticmean":532.0189410229931, - "min":0.0, - "median":441.0, - "variance":65815.53815160331, - "mean":466.07796700236827, - "geometricmean":0.0, - "sumoflogs":"-Infinity", - "skewness":0.2518109944817064}, - { - "key":"example - minimum and maximum", - "sumsq":2.70110872559E11, - "populationvariance":20021.06838039624, - "max":1000.0, - "sum":3.61809331E8, - "kurtosis":-1.0824803795579663, - "standarddeviation":141.49596513804715, - "n":503423, - "quadraticmean":732.4947329880449, - "min":500.0, - "median":704.0, - "variance":20021.108150347452, - "mean":718.6984523949043, - "geometricmean":704.889293672351, - "sumoflogs":3301468.553637138, - "skewness":0.2634725299866506}]}} -``` - -<a name="subset"></a> - -### Subset - -**Example** -Total and average number of tokens and the number of documents for a subset of documents. - -**Request and response** -`q=text:koe&rows=0&mtas=true&mtas.stats=true&mtas.stats.tokens=true&mtas.stats.tokens.0.field=text&mtas.stats.tokens.0.key=example - subset&mtas.stats.tokens.0.type=sum,mean,n&wt=json&indent=true` - -``` json -"mtas":{ - "stats":{ - "tokens":[{ - "key":"example - subset", - "mean":42901.60996309963, - "sum":116263363, - "n":2710}]}} -``` - ---- - -##Lucene - -To use statistics on the number of tokens [directly in Lucene](installation_lucene.html), *ComponentToken* together with the provided *collect* method can be used. \ No newline at end of file diff --git a/src/site/markdown/search_query_termvector.md b/src/site/markdown/search_query_termvector.md deleted file mode 100644 index 8d94b0b..0000000 --- a/src/site/markdown/search_query_termvector.md +++ /dev/null @@ -1,326 +0,0 @@ -#Termvector - -Mtas can produce termvectors for the set of documents satisfying the condition and/or filter. 
To get this information, in Solr requests, besides the parameter to enable [Mtas queries](search_query.html), the following parameter should be provided. - -| Parameter | Value | Obligatory | -|-----------------------|--------|-------------| -| mtas.termvector | true | yes | - -Multiple termvector results can be produced within the same request. To distinguish them, a unique identifier has to be provided for each of the required termvector results. - -| Parameter | Value | Info | Obligatory | -|-------------------------------------------------|--------------|--------------------------------|-------------| -| mtas.termvector.\<identifier\>.key | \<string\> | key used in response | no | -| mtas.termvector.\<identifier\>.field | \<string\> | Mtas field | yes | -| mtas.termvector.\<identifier\>.prefix | \<string\> | prefix | yes | -| mtas.termvector.\<identifier\>.number | \<double\> | number of terms in list | no | -| mtas.termvector.\<identifier\>.start | \<string\> | begin list after provided term, only if sorted on term | no | -| mtas.termvector.\<identifier\>.type | \<string\> | required [type of statistics](search_stats.html) | no | -| mtas.termvector.\<identifier\>.regexp | \<string\> | regular expression condition on term | no | -| mtas.termvector.\<identifier\>.ignoreRegexp | \<string\> | regular expression condition for terms that have to be ignored | no | -| mtas.termvector.\<identifier\>.sort.type | \<string\> | sort on term or [type of statistics](search_stats.html) | no | -| mtas.termvector.\<identifier\>.sort.direction | \<string\> | sort direction: asc or desc | no | - - -## Full - -When using distributed search, instead of applying the more efficient default algorithm where in two rounds lists of terms are collected and combined from the participating cores, another approach can also be used. Using the *full* option, the complete list of terms (matching all requirements) is collected from each of the participating cores, and combined afterwards.
This approach is likely to be less efficient when huge lists are involved, but necessary for example when results have to be sorted on specific statistics. - -| Parameter | Value | Info | Obligatory | -|-------------------------------------------------|--------------|--------------------------------|-------------| -| mtas.termvector.\<identifier\>.full | \<boolean\> | compute full list of terms | no | - -## List - -If a list of terms is provided, the termvector will be restricted to items from this list. These items may be configured to be interpreted as explicit terms or as regular expressions. - -| Parameter | Value | Info | Obligatory | -|-------------------------------------------------|--------------|--------------------------------|-------------| -| mtas.termvector.\<identifier\>.list | [\<string\>,...] | list of terms | yes | -| mtas.termvector.\<identifier\>.listRegexp | \<boolean\> | interpret items in provided list as regular expressions | no | - -Furthermore, a list of terms can be provided that should be ignored within the termvector. These items may also be configured to be interpreted as explicit terms or as regular expressions. - -| Parameter | Value | Info | Obligatory | -|-------------------------------------------------|--------------|--------------------------------|-------------| -| mtas.termvector.\<identifier\>.ignoreList | [\<string\>,...] | list of terms | yes | -| mtas.termvector.\<identifier\>.ignoreListRegexp | \<boolean\> | interpret items in provided ignoreList as regular expressions | no | - - -## Functions - -Besides the specified statistics on hits over the documents, also statistics on the computed value of functions on the number of hits and the total number of words over the documents can be provided. In the definition of such a function, the number of hits is referred to as *$q0*, and the number of words is referred to as $n. 
- -| Parameter | Value | Info | Obligatory | -|-------------------------------------------------|--------------|--------------------------------|-------------| -| mtas.termvector.\<identifier\>.function.\<identifier function\>.key | \<string\> | key used in response | no | -| mtas.termvector.\<identifier\>.function.\<identifier function\>.expression | \<string\> | definition of function | yes | -| mtas.termvector.\<identifier\>.function.\<identifier function\>.type | \<string\> | required [type of statistics](search_stats.html) | no | - - -Again, the key is added to the response and may be used to distinguish between multiple functions, and should therefore be unique within each specified termvector. - ---- - -## Examples -1. [Basic](#basic) : basic statistics on occurring part of speech -2. [Regexp](#regexp) : words of length 5 containing only characters a-z, sorted descending by number of hits -3. [Ignore](#ignore) : previous result, ignoring words ending with $-e$. -4. [List](#list) : termvector for provided list of words. -5. [Start](#start) : termvector for words containing only characters a-z sorted by term and > *koe*. -6. [Functions](#functions) : statistics on hits, relative frequency and total number of words in document for words containing only characters a-z. - ---- - -<a name="basic"></a> - -### Basic - -**Example** -Total and average number of occurrences of part of speech (pos). 
- -**Request and response** -`q=*%3A*&mtas=true&mtas.termvector=true&mtas.termvector.0.key=example - basic&mtas.termvector.0.field=text&mtas.termvector.0.prefix=pos&mtas.termvector.0.number=3&rows=0&wt=json&indent=true` - -``` json -"mtas":{ - "termvector":[{ - "key":"example - basic", - "list":[{ - "mean":200.22966889678833, - "sum":25797991, - "n":128842, - "key":"ADJ"}, - { - "mean":149.53835013602176, - "sum":18689303, - "n":124980, - "key":"BW"}, - { - "mean":459.93552395416265, - "sum":59963634, - "n":130374, - "key":"LET"}]}]} -``` - -<a name="regexp"></a> - -### Regexp - -**Example** -List of words with length 5 and containing only characters a-z, sorted descending by number of hits. - -**Regular expression** -`[a-z]{5}` - -**Request and response** -`q=*%3A*&mtas=true&mtas.termvector=true&mtas.termvector.0.key=example - regexp&mtas.termvector.0.field=text&mtas.termvector.0.prefix=t_lc&mtas.termvector.0.number=5&mtas.termvector.0.type=n,sum&mtas.termvector.0.regexp=[a-z]{5}&mtas.termvector.0.sort.type=sum&rows=0&wt=json&indent=true` - -``` json -"mtas":{ - "termvector":[{ - "key":"example - regexp", - "list":[{ - "sum":972687, - "n":94160, - "key":"heeft"}, - { - "sum":645227, - "n":84306, - "key":"wordt"}, - { - "sum":436038, - "n":82453, - "key":"onder"}, - { - "sum":391488, - "n":40512, - "key":"zijne"}, - { - "sum":314539, - "n":62316, - "key":"welke"}]}]} -``` - -<a name="ignore"></a> - -### Ignore - -**Example** -List of words with length 5 and containing only characters a-z, sorted descending by number of hits, ignoring all words ending with $-e$. 
- -**Regular expressions** -`[a-z]{5}` -`.*e` - -**Request and response** -`q=*%3A*&mtas=true&mtas.termvector=true&mtas.termvector.0.key=example - ignore&mtas.termvector.0.field=text&mtas.termvector.0.prefix=t_lc&mtas.termvector.0.number=5&mtas.termvector.0.type=n,sum&mtas.termvector.0.regexp=[a-z]{5}&mtas.termvector.0.ignoreRegexp=.*e&mtas.termvector.0.sort.type=sum&rows=0&wt=json&indent=true` - -``` json -"mtas":{ - "termvector":[{ - "key":"example - ignore", - "list":[{ - "sum":972687, - "n":94160, - "key":"heeft"}, - { - "sum":645227, - "n":84306, - "key":"wordt"}, - { - "sum":436038, - "n":82453, - "key":"onder"}, - { - "sum":304620, - "n":60555, - "key":"leven"}, - { - "sum":297160, - "n":58263, - "key":"waren"}]}]} -``` - -<a name="list"></a> - -### List - -**Example** -Termvector for provided list of words. - -**List** -`koe,paard,schaap,geit,kip` - -**Request and response** -`q=*%3A*&mtas=true&mtas.termvector=true&mtas.termvector.0.key=example - list&mtas.termvector.0.field=text&mtas.termvector.0.prefix=t_lc&mtas.termvector.0.list=koe,paard,schaap,geit,kip&mtas.termvector.0.type=n,sum&rows=0&wt=json&indent=true` - -``` json -"mtas":{ - "termvector":[{ - "key":"example - list", - "list":[{ - "sum":1128, - "n":683, - "key":"geit"}, - { - "sum":1410, - "n":864, - "key":"kip"}, - { - "sum":4432, - "n":2344, - "key":"koe"}, - { - "sum":15478, - "n":7436, - "key":"paard"}, - { - "sum":2154, - "n":1591, - "key":"schaap"}]}]} -``` - -<a name="start"></a> - -### Start - -**Example** -Termvector for words containing only characters a-z sorted by term and > *koe*.
- -**Request and response** -`q=*%3A*&mtas=true&mtas.termvector=true&mtas.termvector.0.key=example - start&mtas.termvector.0.field=text&mtas.termvector.0.prefix=t_lc&mtas.termvector.0.regexp=[a-z]*&mtas.termvector.0.number=5&mtas.termvector.0.start=koe&rows=0&wt=json&indent=true` - -``` json -"mtas":{ - "termvector":[{ - "key":"example - start", - "list":[{ - "mean":2.0, - "sum":2, - "n":1, - "key":"koea"}, - { - "mean":1.0, - "sum":1, - "n":1, - "key":"koeaan"}, - { - "mean":1.0, - "sum":2, - "n":2, - "key":"koeachtig"}, - { - "mean":1.5, - "sum":3, - "n":2, - "key":"koeachtige"}, - { - "mean":1.0, - "sum":2, - "n":2, - "key":"koeala"}]}]} -``` - - -<a name="functions"></a> - -### Functions - -**Example** -List of words containing only characters a-z, sorted descending by number of hits, with statistics on hits, relative frequency and total number of words in document. - -**Regular expression** -`[a-z]*` - -**Functions** -`$q0/$n` -`$n` - -**Request and response** -`q=*%3A*&mtas=true&mtas.termvector=true&mtas.termvector.0.key=example - list&mtas.termvector.0.field=text&mtas.termvector.0.prefix=t_lc&mtas.termvector.0.regexp=[a-z]*&mtas.termvector.0.sort.type=sum&mtas.termvector.0.type=n,sum&mtas.termvector.0.function.0.expression=%24q0%2F%24n&mtas.termvector.0.function.0.key=relative+frequency&mtas.termvector.0.function.0.type=n%2Cmean&mtas.termvector.0.function.1.expression=%24n&mtas.termvector.0.function.1.key=total+number+of+words&mtas.termvector.0.function.1.type=n%2Csum&mtas.termvector.0.number=3&rows=0&wt=json&indent=true` - -``` json -"mtas":{ - "termvector":[{ - "key":"example - list", - "list":[{ - "sum":15975272, - "n":127444, - "functions":{ - "total number of words":{ - "sum":391924648, - "n":127444}, - "relative frequency":{ - "mean":0.040967994034336694, - "n":127444}}, - "key":"de"}, - { - "sum":10565895, - "n":126197, - "functions":{ - "total number of words":{ - "sum":391190126, - "n":126197}, - "relative frequency":{ - "mean":0.028072930308247233, 
- "n":126197}}, - "key":"van"}, - { - "sum":8798835, - "n":125415, - "functions":{ - "total number of words":{ - "sum":391306760, - "n":125415}, - "relative frequency":{ - "mean":0.02376864203286862, - "n":125415}}, - "key":"en"}]}]} -``` - -**Lucene** - -To use termvectors [directly in Lucene](installation_lucene.html), *ComponentTermvector* together with the provided *collect* method can be used. - - diff --git a/src/site/markdown/search_sharding.md b/src/site/markdown/search_sharding.md index accd3ec..12fc035 100644 --- a/src/site/markdown/search_sharding.md +++ b/src/site/markdown/search_sharding.md @@ -1,6 +1,6 @@ #Sharding -All [Mtas queries](search_query.html) support sharding. +All [Mtas queries](search_component.html) support sharding. **Example** diff --git a/src/site/markdown/search_stats.md b/src/site/markdown/search_stats.md index c9bfca8..3802097 100644 --- a/src/site/markdown/search_stats.md +++ b/src/site/markdown/search_stats.md @@ -1,7 +1,7 @@ #Type of statistics -Mtas can produce several type of statistics, e.g. for [positions](search_query_stats_positions.html), -[tokens](search_query_stats_tokens.html) or [spans](search_query_stats_spans.html). +Mtas can produce several types of statistics, e.g. for [positions](search_component_stats_positions.html), +[tokens](search_component_stats_tokens.html) or [spans](search_component_stats_spans.html). In general, statistics of type *basic* will require less resources than statistics of type *advanced*, whereas statistics of type *advanced* will require less than these of type *full*.
If multiple diff --git a/src/site/site.xml b/src/site/site.xml index 821ceb2..07ee26c 100644 --- a/src/site/site.xml +++ b/src/site/site.xml @@ -39,20 +39,27 @@ </item> <item name="Search" href="search.html" collapse="true"> <item name="Configuration" href="search_configuration.html"/> - <item name="Query" href="search_query.html"> - <item name="Statistics" href="search_query_stats.html" collapse="true"> - <item name="Positions" href="search_query_stats_positions.html"/> - <item name="Tokens" href="search_query_stats_tokens.html"/> - <item name="Spans" href="search_query_stats_spans.html"/> + <item name="Component" href="search_component.html"> + <item name="Statistics" href="search_component_stats.html" collapse="true"> + <item name="Positions" href="search_component_stats_positions.html"/> + <item name="Tokens" href="search_component_stats_tokens.html"/> + <item name="Spans" href="search_component_stats_spans.html"/> </item> - <item name="Kwic" href="search_query_kwic.html"/> - <item name="List" href="search_query_list.html"/> - <item name="Document" href="search_query_document.html"/> - <item name="Termvector" href="search_query_termvector.html"/> - <item name="Facet" href="search_query_facet.html"/> - <item name="Group" href="search_query_group.html"/> - <item name="Prefix" href="search_query_prefix.html"/> + <item name="Kwic" href="search_component_kwic.html"/> + <item name="List" href="search_component_list.html"/> + <item name="Document" href="search_component_document.html"/> + <item name="Termvector" href="search_component_termvector.html"/> + <item name="Facet" href="search_component_facet.html"/> + <item name="Group" href="search_component_group.html"/> + <item name="Prefix" href="search_component_prefix.html"/> + <item name="Collection" href="search_component_collection.html"/> </item> + <item name="Parser" href="search_parser.html"> + <item name="CQL query" href="search_parser_cql.html"/> + <item name="Join query" href="search_parser_join.html"/> + 
</item> + <item name="Handler" href="search_handler.html"> + </item> <item name="Type of statistics" href="search_stats.html"/> <item name="Functions" href="search_functions.html"/> <item name="CQL" href="search_cql.html"/> -- libgit2 0.22.2