Commit 66ab2eb54584e06f72ed88a460c037e1c3864ec0

Authored by Matthijs Brouwer
1 parent 655fdd7f

fix ignore

docker/Dockerfile
... ... @@ -33,6 +33,7 @@ RUN apt-get update && apt-get install -y lsof software-properties-common python-
33 33 && tar xzf solr-6.3.0.tgz solr-6.3.0/bin/install_solr_service.sh --strip-components=2 \
34 34 && bash ./install_solr_service.sh solr-6.3.0.tgz && rm install_solr_service.sh && rm -rf solr-6.3.0.tgz \
35 35 && service apache2 stop \
  36 +&& echo "ServerName localhost" | tee /etc/apache2/conf-available/fqdn.conf \
36 37 && a2enmod proxy \
37 38 && a2enmod proxy_http \
38 39 && a2enmod proxy_ajp \
... ... @@ -43,6 +44,7 @@ RUN apt-get update && apt-get install -y lsof software-properties-common python-
43 44 && a2enmod proxy_connect \
44 45 && a2enmod proxy_html \
45 46 && a2enmod xml2enc \
  47 +&& a2enconf fqdn \
46 48 && sed -i '/<\/VirtualHost>/ i ProxyPass /solr http://localhost:8983/solr\nProxyPassReverse /solr http://localhost:8983/solr' /etc/apache2/sites-enabled/000-default.conf \
47 49 && rm -rf /var/www/html/* \
48 50 && mkdir /var/www/html/demo \
... ... @@ -71,8 +73,12 @@ RUN apt-get update && apt-get install -y lsof software-properties-common python-
71 73 && cp -rp demo1 demo2 \
72 74 && cp data/schemaFull.xml demo2/conf/schema.xml \
73 75 && echo "name=demo2" > demo2/core.properties\
  76 +&& cp -rp demo1 demo3 \
  77 +&& cp data/schemaFull.xml demo3/conf/schema.xml \
  78 +&& echo "name=demo3" > demo3/core.properties\
74 79 && mv demo1 /var/solr/data/ \
75   -&& mv demo2 /var/solr/data/
  80 +&& mv demo2 /var/solr/data/ \
  81 +&& mv demo3 /var/solr/data/
76 82  
77 83 CMD bash -C '/start.sh'; 'bash'
78 84  
... ...
docker/site/example_demo3.html 0 → 100644
  1 +<!DOCTYPE html>
  2 +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
  3 + <head>
  4 + <meta charset="UTF-8" />
  5 + <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  6 + <title>Multi Tier Annotation Search</title>
  7 + <script type="text/javascript" src="./js/jquery-3.1.1.min.js"></script>
  8 + <script type="text/javascript" src="./js/solr.js"></script>
  9 + <link rel="stylesheet" type="text/css" href="css/style.css">
  10 + </head>
  11 + <body>
  12 +
  13 + <h1>Multi Tier Annotation Search - example demo3</h1>
  14 +
  15 + <div>
  16 + Go to <a href="index.html">main page</a>.
  17 + </div>
  18 +
  19 + <hr noshade />
  20 +
  21 + <h3>Create index</h3>
  22 + Post to /solr/demo3/update
  23 + <div class="solr" data-type="json" data-url="/solr/demo3/update?wt=json&commitWithin=1000">
  24 + <div class="post"><textarea data-autoresize></textarea></div>
  25 + <input class="button post" type="button" value="create index" />
  26 + <input class="button reset" type="button" value="reset" />
  27 + <div class="output"></div>
  28 + <div class="error"></div>
  29 + </div>
  30 +
  31 + <hr noshade />
  32 +
  33 + <h3>Empty index</h3>
  34 + Post to /solr/demo3/update
  35 + <div class="solr" data-type="json" data-url="/solr/demo3/update?wt=json&commitWithin=1000">
  36 + <div class="post"><textarea data-autoresize>{
  37 + "delete": {
  38 + "query": "*:*"
  39 + }
  40 +}</textarea></div>
  41 + <input class="button post" type="button" value="delete index" />
  42 + <input class="button reset" type="button" value="reset" />
  43 + <div class="output"></div>
  44 + <div class="error"></div>
  45 + </div>
  46 +
  47 + <hr noshade />
  48 +
  49 + <h3>Query</h3>
  50 +
  51 + <div>
  52 + description
  53 + <div class="solr" data-type="post" data-url="/solr/demo3/select?indent=true&wt=json">
  54 + <div class="post"><textarea data-autoresize></textarea></div>
  55 + <input class="button post" type="button" value="post" />
  56 + <input class="button reset" type="button" value="reset" />
  57 + <div class="output"></div>
  58 + <div class="error"></div>
  59 + </div>
  60 + </div>
  61 +
  62 + <br />
  63 +
  64 + <hr noshade />
  65 +
  66 + <div>
  67 + Go to <a href="index.html">main page</a>.
  68 + </div>
  69 +
  70 + </body>
  71 +</html>
... ...
docker/site/index.html
... ... @@ -16,6 +16,7 @@
16 16 <li>Go directly to <a target="_blank" href="/solr/">Solr</a></li>
17 17 <li>Go directly to <a target="_blank" href="/solr/#/demo1">Solr - demo1</a></li>
18 18 <li>Go directly to <a target="_blank" href="/solr/#/demo2">Solr - demo2</a></li>
  19 + <li>Go directly to <a target="_blank" href="/solr/#/demo3">Solr - demo3</a></li>
19 20 </ul>
20 21  
21 22 <h3>Available folia</h3>
... ... @@ -25,10 +26,18 @@
25 26 <li><a target="_blank" href="/demo/folia-samples/beets3.xml">/demo/folia-samples/beets3.xml</a> - Nicolaas Beets, <em>Varen en Rijden</em></li>
26 27 </ul>
27 28  
  29 + <h3>Available iso-tei</h3>
  30 + <ul>
  31 + <li><a target="_blank" href="/demo/isotei-samples/file1.xml">/demo/isotei-samples/file1.xml</a> - file 1</li>
  32 + <li><a target="_blank" href="/demo/isotei-samples/file2.xml">/demo/isotei-samples/file2.xml</a> - file 2</li>
  33 + <li><a target="_blank" href="/demo/isotei-samples/file3.xml">/demo/isotei-samples/file3.xml</a> - file 3</li>
  34 + </ul>
  35 +
28 36 <h3>Examples</h3>
29 37 <ul>
30 38 <li><a href="example_demo1.html">Example demo1</a></li>
31 39 <li><a href="example_demo2.html">Example demo2</a></li>
  40 + <li><a href="example_demo3.html">Example demo3</a></li>
32 41 </ul>
33 42  
34 43 <h3>More information</h3>
... ...
src/docker/Dockerfile
... ... @@ -33,6 +33,7 @@ RUN apt-get update && apt-get install -y lsof software-properties-common python-
33 33 && tar xzf solr-${currentDevelopmentVersion}.tgz solr-${currentDevelopmentVersion}/bin/install_solr_service.sh --strip-components=2 \
34 34 && bash ./install_solr_service.sh solr-${currentDevelopmentVersion}.tgz && rm install_solr_service.sh && rm -rf solr-${currentDevelopmentVersion}.tgz \
35 35 && service apache2 stop \
  36 +&& echo "ServerName localhost" | tee /etc/apache2/conf-available/fqdn.conf \
36 37 && a2enmod proxy \
37 38 && a2enmod proxy_http \
38 39 && a2enmod proxy_ajp \
... ... @@ -43,6 +44,7 @@ RUN apt-get update && apt-get install -y lsof software-properties-common python-
43 44 && a2enmod proxy_connect \
44 45 && a2enmod proxy_html \
45 46 && a2enmod xml2enc \
  47 +&& a2enconf fqdn \
46 48 && sed -i '/<\/VirtualHost>/ i ProxyPass /solr http://localhost:8983/solr\nProxyPassReverse /solr http://localhost:8983/solr' /etc/apache2/sites-enabled/000-default.conf \
47 49 && rm -rf /var/www/html/* \
48 50 && mkdir /var/www/html/demo \
... ... @@ -71,8 +73,12 @@ RUN apt-get update && apt-get install -y lsof software-properties-common python-
71 73 && cp -rp demo1 demo2 \
72 74 && cp data/schemaFull.xml demo2/conf/schema.xml \
73 75 && echo "name=demo2" > demo2/core.properties\
  76 +&& cp -rp demo1 demo3 \
  77 +&& cp data/schemaFull.xml demo3/conf/schema.xml \
  78 +&& echo "name=demo3" > demo3/core.properties\
74 79 && mv demo1 /var/solr/data/ \
75   -&& mv demo2 /var/solr/data/
  80 +&& mv demo2 /var/solr/data/ \
  81 +&& mv demo3 /var/solr/data/
76 82  
77 83 CMD bash -C '/start.sh'; 'bash'
78 84  
... ...
src/mtas/search/spans/MtasSpanSequenceSpans.java
... ... @@ -50,7 +50,7 @@ public class MtasSpanSequenceSpans extends Spans implements MtasSpans {
50 50 docId = -1;
51 51 queueSpans = new ArrayList<QueueItem>();
52 52 queueMatches = new ArrayList<Match>();
53   - for (MtasSpanSequenceQuerySpans sequenceSpans : setSequenceSpans) {
  53 + for (MtasSpanSequenceQuerySpans sequenceSpans : setSequenceSpans) {
54 54 queueSpans.add(new QueueItem(sequenceSpans));
55 55 }
56 56 ignoreItem = new MtasIgnoreItem(ignoreSpans, maximumIgnoreLength);
... ... @@ -166,7 +166,9 @@ public class MtasSpanSequenceSpans extends Spans implements MtasSpans {
166 166 allItemsOptional = false;
167 167 }
168 168 if (!item.noMoreDocs) {
169   - if (newDocId == null) {
  169 + if(item.sequenceSpans.spans==null) {
  170 + spanDocId = NO_MORE_DOCS;
  171 + } else if (newDocId == null) {
170 172 spanDocId = item.sequenceSpans.spans.nextDoc();
171 173 } else {
172 174 if (!item.sequenceSpans.optional) {
... ... @@ -270,7 +272,9 @@ public class MtasSpanSequenceSpans extends Spans implements MtasSpans {
270 272 } else {
271 273 Integer spanDocId, newDocId = target;
272 274 for (QueueItem item : queueSpans) {
273   - if (item.sequenceSpans.spans.docID() < newDocId) {
  275 + if(item.sequenceSpans.spans==null) {
  276 + spanDocId = NO_MORE_DOCS;
  277 + } else if (item.sequenceSpans.spans.docID() < newDocId) {
274 278 spanDocId = item.sequenceSpans.spans.advance(newDocId);
275 279 if (spanDocId.equals(NO_MORE_DOCS)) {
276 280 item.noMoreDocs = true;
... ... @@ -362,7 +366,7 @@ public class MtasSpanSequenceSpans extends Spans implements MtasSpans {
362 366 QueueItem item = queueSpans.get(i);
363 367 // if span is optional, check docId
364 368 if (!item.sequenceSpans.optional
365   - || (item.sequenceSpans.spans.docID() == docId)) {
  369 + || (item.sequenceSpans.spans!=null && item.sequenceSpans.spans.docID() == docId)) {
366 370 // compute minimum startPosition until next non-optional item
367 371 // used as lower boundary on endPosition next
368 372 minStartPositionNext = null;
... ...
src/site/markdown/search_cql.md
... ... @@ -16,7 +16,13 @@ For each field containing Mtas tokenized text, every token is associated with a
16 16  
17 17 The optional postfix associated with a token can be queried within CQL by providing a *value*. This is a regular expression, the supported syntax is documented in the RegExp class provided by Lucene. By using a [termvector query](search_query_termvector.html), for each [prefix](#prefix) a list of postfix values can be produced.
18 18  
19   -<a name="#cql"></a>
  19 +<a name="variable"></a>
  20 +
  21 +#### Variable
  22 +
  23 +The optional postfix associated with a token can also be queried within CQL by providing a *variable*. Each variable may occur only once in a CQL query, and its values should be provided as a comma-separated list together with this query. Each provided variable has to occur in the query.
  24 +
  25 +<a name="cql"></a>
20 26  
21 27 ## CQL
22 28  
... ... @@ -28,8 +34,8 @@ The optional postfix associated with a token can be queried within CQL by provid
28 34  
29 35 | Syntax | Description | Example |
30 36 |---------------------------------------|----------------------------------|--------------|
31   -| [cql](#cql)**{** \<number\> **}** | Matches provided number of occurrence from [cql](#cql)| `[pos="ADJ"]{2}` |
32   -| [cql](#cql)**{** \<number\> , \<number\>**}** | Matches each number between provided start and end of occurrence from [cql](#cql)| `[pos="ADJ"]{2,3}` |
  37 +| [cql](#cql) **{** \<number\> **}** | Matches the provided number of occurrences of [cql](#cql) | `[pos="ADJ"]{2}` |
  38 +| [cql](#cql) **{** \<number\> , \<number\>**}** | Matches any number of occurrences of [cql](#cql) between the provided minimum and maximum | `[pos="ADJ"]{2,3}` |
33 39  
34 40  
35 41  
... ... @@ -39,14 +45,17 @@ The optional postfix associated with a token can be queried within CQL by provid
39 45 | **\(** [cql](#cql) **\) !within \(** [cql](#cql) **\)** | Matches cql expression not within another cql expression | `([t="de"]) !within (<s/>)` |
40 46 | **\(** [cql](#cql) **\) containing \(** [cql](#cql) **\)** | Matches cql expression containing another cql expression | `(<s/>) containing ([t="de"])` |
41 47 | **\(** [cql](#cql) **\) !containing \(** [cql](#cql) **\)** | Matches cql expression not containing another cql expression | `(<s/>) !containing ([t="de"])` |
  48 +| **\(** [cql](#cql) **\) intersecting \(** [cql](#cql) **\)** | Matches cql expression intersecting another cql expression | `(<s/>) intersecting (<div/>)` |
  49 +| **\(** [cql](#cql) **\) !intersecting \(** [cql](#cql) **\)** | Matches cql expression not intersecting another cql expression | `(<s/>) !intersecting (<div/>)` |
42 50  
43   -<a name="#token"></a>
  51 +<a name="token"></a>
44 52  
45 53 ## Token
46 54  
47 55 | Syntax | Description | Example |
48 56 |-------------------------------------|-------------------------------------------------|---------|
49 57 | **\[ \]** | Matches each single position token | `[]` |
  58 +| **"** [value](#value) **"** | Matches a single position token with condition defined by a basic [single-position-expression](#single-position-expression), where the prefix is the default prefix provided with the query | `"de"` |
50 59 | **\[** [single-position-expression](#single-position-expression) **\]** | Matches a single position token with condition defined by a [single-position-expression](#single-position-expression) | `[t="de"]` |
51 60  
52 61 <a name="single-position-expression"></a>
... ... @@ -56,6 +65,7 @@ The optional postfix associated with a token can be queried within CQL by provid
56 65 | Expression | Syntax | Example |
57 66 |-------------|---------------------------------------------|---------|
58 67 | basic | [prefix](#prefix) **= \"**[value](#value)**\"** | `t="de"` |
  68 +| variable | [prefix](#prefix) **= $**[variable-name] | `t=$1` |
59 69 | not | **\!** [single-position-expression](#single-position-expression) | `!t="de"` |
60 70 | and | **\(** [single-position-expression](#single-position-expression) **\&** [single-position-expression](#single-position-expression) **\&** ... **\)** | `t="de" & pos="LID"`|
61 71 | or | **\(** [single-position-expression](#single-position-expression) **\|** [single-position-expression](#single-position-expression) **\|** ... **\)** | `t="de" | t="het"` |
... ... @@ -63,7 +73,7 @@ The optional postfix associated with a token can be queried within CQL by provid
63 73 | range | **\#** \<position\> **-** \<position\> | `#100-110` |
64 74  
65 75  
66   -<a name="#multi-position"></a>
  76 +<a name="multi-position"></a>
67 77  
68 78 ## Multi-position
69 79  
... ... @@ -84,7 +94,7 @@ The optional postfix associated with a token can be queried within CQL by provid
84 94 | basic | [prefix](#prefix) **= \"**[value](#value)**\"** |
85 95  
86 96  
87   -<a name="#sequence"></a>
  97 +<a name="sequence"></a>
88 98  
89 99 ## Sequence
90 100  
... ...
src/site/markdown/search_query_stats_spans.md
... ... @@ -13,13 +13,25 @@ Multiple statistics on the occurrence of a span can be produced within the same
13 13 | mtas.stats.spans.\<identifier\>.key | \<string\> | key used in response | no |
14 14 | mtas.stats.spans.\<identifier\>.field | \<string\> | mtas field | yes |
15 15 | mtas.stats.spans.\<identifier\>.query.\<identifier query\>.type | \<string\> | query language: [cql](search_cql.html) | yes |
16   -| mtas.stats.spans.\<identifier\>.query.\<identifier query\>.value | \<string\> | query: [cql](search_cql.html) | yes |
  16 +| mtas.stats.spans.\<identifier\>.query.\<identifier query\>.value | \<string\> | query: [cql](search_cql.html) | yes |
  17 +| mtas.stats.spans.\<identifier\>.query.\<identifier query\>.prefix | \<string\> | default prefix | no |
  18 +| mtas.stats.spans.\<identifier\>.query.\<identifier query\>.ignore | \<string\> | ignore query: [cql](search_cql.html) | no |
  19 +| mtas.stats.spans.\<identifier\>.query.\<identifier query\>.maximumIgnoreLength | \<integer\> | maximum number of succeeding occurrences to ignore | no |
17 20 | mtas.stats.spans.\<identifier\>.type | \<string\> | required [type of statistics](search_stats.html) | no |
18 21 | mtas.stats.spans.\<identifier\>.minimum | \<double\> | minimum number of occurrences span | no |
19 22 | mtas.stats.spans.\<identifier\>.maximum | \<double\> | maximum number of occurrences span | no |
20 23  
21 24 The *key* is added to the response and may be used to distinguish between multiple statistics on the occurrence of spans, and should therefore be unique. The optional *minimum* and *maximum* can be used to focus only on documents satisfying a condition on the number of occurrences of the spans. When multiple queries are provided, the provided boundary will hold on the sum of occurrences of the resulting spans.
22 25  
  26 +## Variables
  27 +
  28 +The query may contain one or more variables, and the value(s) of these variables have to be defined using the following parameters:
  29 +
  30 +| Parameter | Value | Info | Obligatory |
  31 +|-------------------------------------------------|--------------|--------------------------------|-------------|
  32 +| mtas.stats.spans.\<identifier\>.query.\<identifier query\>.variable.\<identifier variable\>.name | \<string\> | name of variable | yes |
  33 +| mtas.stats.spans.\<identifier\>.query.\<identifier query\>.variable.\<identifier variable\>.value | \<string\> | comma-separated list of values | yes |
  34 +
23 35 ## Functions
24 36  
25 37 To compute statistics for values based on the occurrence of one or multiple spans, optionally [functions](search_functions.html) can be added. The parameters for these functions are the number of occurrences *$q0*, *$q1*, ... for each span and the number of positions *$n* in a document. Statistics on the value computed for each document in the set are added to the response.
... ... @@ -39,8 +51,11 @@ Again, the *key* is added to the response and may be used to distinguish between
39 51 2. [Minimum and Maximum](#minimum-and-maximum) : statistics on the occurrence of a word with restrictions on the number of occurrences.
40 52 3. [Subset](#subset) : statistics on the occurrence of a word within a subset of documents.
41 53 4. [Multiple](#multiple) : statistics on the occurrence of multiple words.
42   -5. [Functions](#functions) : statistics using functions.
43   -6. [Multiple and Functions](#multiple-and-functions) : statistics using functions on the occurrence of multiple words.
  54 +5. [Prefix](#prefix) : default prefix for query.
  55 +6. [Ignore](#ignore) : query with ignore.
  56 +7. [Ignore and maximumIgnoreLength](#ignore-and-maximum-ignore-length) : query with ignore and maximumIgnoreLength.
  57 +8. [Functions](#functions) : statistics using functions.
  58 +9. [Multiple and Functions](#multiple-and-functions) : statistics using functions on the occurrence of multiple words.
44 59  
45 60 ---
46 61  
... ... @@ -194,6 +209,80 @@ Total and average number of occurrences of the word "de" and "het", and the numb
194 209 "n":2064808}]}}}
195 210 ```
196 211  
  212 +<a name="prefix"></a>
  213 +
  214 +### Prefix
  215 +
  216 +**Example**
  217 +Total and average number of occurrences of the word "de" followed by an adjective.
  218 +
  219 +**CQL**
  220 +`"de" [pos="ADJ"]`
  221 +
  222 +**Request and response**
  223 +`q=*%3A*&mtas=true&mtas.stats=true&mtas.stats.spans=true&mtas.stats.spans.0.field=text&mtas.stats.spans.0.query.0.type=cql&mtas.stats.spans.0.query.0.value="de" [pos%3D"ADJ"]&mtas.stats.spans.0.query.0.prefix=t_lc&mtas.stats.spans.0.key=example - prefix&mtas.stats.spans.0.type=n%2Csum%2Cmean&rows=0&wt=json&indent=true`
  224 +
  225 +``` json
  226 +"mtas":{
  227 + "stats":{
  228 + "spans":[{
  229 + "key":"example - prefix",
  230 + "mean":2.1725308115815127,
  231 + "sum":4485859,
  232 + "n":2064808}]}}
  233 +```
  234 +
  235 +<a name="ignore"></a>
  236 +
  237 +### Ignore
  238 +
  239 +**Example**
  240 +Total and average number of occurrences of an article followed by a noun, ignoring adjectives.
  241 +
  242 +**CQL**
  243 +`[pos="LID"][pos="N"]`
  244 +
  245 +**Ignore**
  246 +`[pos="ADJ"]`
  247 +
  248 +
  249 +**Request and response**
  250 +`q=*%3A*&mtas=true&mtas.stats=true&mtas.stats.spans=true&mtas.stats.spans.0.field=text&mtas.stats.spans.0.query.0.type=cql&mtas.stats.spans.0.query.0.value=[t_lc%3D"de"]&mtas.stats.spans.0.key=functions+-+de&mtas.stats.spans.0.type=n%2Csum%2Cmean&mtas.stats.spans.0.function.0.expression=%24q0%2F%24n&mtas.stats.spans.0.function.0.key=relative+frequency&mtas.stats.spans.0.function.0.type=mean%2Cstandarddeviation%2Cdistribution(start%3D0%2Cend%3D0.1%2Cnumber%3D10)&mtas.stats.spans.0.function.1.expression=%24n&mtas.stats.spans.0.function.1.key=number+of+words&mtas.stats.spans.0.function.1.type=n%2Csum&rows=0&wt=json&indent=true`
  251 +
  252 +``` json
  253 +"mtas":{
  254 + "stats":{
  255 + "spans":[{
  256 + "key":"functions - de",
  257 + "mean":12.34790062804871,
  258 + "sum":25496044,
  259 + "n":2064808,
  260 + "functions":{
  261 + "number of words":{
  262 + "sum":337230767,
  263 + "n":2064808},
  264 + "relative frequency":{
  265 + "distribution(start=0,end=0.1,number=10)":{
  266 + "[0.000,0.010)":950500,
  267 + "[0.010,0.020)":80369,
  268 + "[0.020,0.030)":115695,
  269 + "[0.030,0.040)":139752,
  270 + "[0.040,0.050)":162877,
  271 + "[0.050,0.060)":168598,
  272 + "[0.060,0.070)":145493,
  273 + "[0.070,0.080)":109117,
  274 + "[0.080,0.090)":77214,
  275 + "[0.090,0.100)":51243},
  276 + "mean":0.030196372045937097,
  277 + "errorList":{"division by zero":691633},
  278 + "standarddeviation":0.03428066513492476,
  279 + "errorNumber":691633}}}]}}
  280 +```
  281 +
  282 +<a name="ignore-and-maximum-ignore-length"></a>
  283 +
  284 +### Ignore and maximumIgnoreLength
  285 +
197 286 <a name="functions"></a>
198 287  
199 288 ### Functions
... ...