Commit 19861e34e145fb7155939887fda2749c1311b80c

Authored by Matthijs Brouwer
1 parent bce3e1cc

repair distance for distributed setup

src/main/java/mtas/codec/util/CodecComponent.java
... ... @@ -2082,7 +2082,7 @@ public class CodecComponent {
2082 2082 public Double maximum;
2083 2083  
2084 2084 /** The parameters. */
2085   - Map<String, String> parameters;
  2085 + public Map<String, String> parameters;
2086 2086  
2087 2087 /** The distance. */
2088 2088 public transient Distance distance = null;
... ...
src/main/java/mtas/solr/handler/component/util/MtasSolrComponentTermvector.java
... ... @@ -32,6 +32,7 @@ import mtas.codec.util.CodecUtil;
32 32 import mtas.codec.util.CodecComponent.ComponentField;
33 33 import mtas.codec.util.CodecComponent.ComponentFields;
34 34 import mtas.codec.util.CodecComponent.ComponentTermVector;
  35 +import mtas.codec.util.CodecComponent.SubComponentDistance;
35 36 import mtas.codec.util.CodecComponent.SubComponentFunction;
36 37 import mtas.codec.util.collector.MtasDataCollector;
37 38 import mtas.codec.util.collector.MtasDataItemNumberComparator;
... ... @@ -477,10 +478,16 @@ public class MtasSolrComponentTermvector
477 478 + NAME_MTAS_TERMVECTOR_DISTANCE_BASE);
478 479 sreq.params.remove(PARAM_MTAS_TERMVECTOR + "." + key + "."
479 480 + NAME_MTAS_TERMVECTOR_DISTANCE + "." + distanceKey + "."
480   - + NAME_MTAS_TERMVECTOR_DISTANCE_PARAMETER);
481   - sreq.params.remove(PARAM_MTAS_TERMVECTOR + "." + key + "."
482   - + NAME_MTAS_TERMVECTOR_DISTANCE + "." + distanceKey + "."
483 481 + NAME_MTAS_TERMVECTOR_DISTANCE_MAXIMUM);
  482 + Set<String> distanceParameters = MtasSolrResultUtil
  483 + .getIdsFromParameters(rb.req.getParams(), PARAM_MTAS_TERMVECTOR
  484 + + "." + key + "." + NAME_MTAS_TERMVECTOR_DISTANCE + "." + distanceKey + "."
  485 + + NAME_MTAS_TERMVECTOR_DISTANCE_PARAMETER);
  486 + for(String distanceParameter : distanceParameters) {
  487 + sreq.params.remove(PARAM_MTAS_TERMVECTOR
  488 + + "." + key + "." + NAME_MTAS_TERMVECTOR_DISTANCE + "." + distanceKey + "."
  489 + + NAME_MTAS_TERMVECTOR_DISTANCE_PARAMETER+"."+distanceParameter);
  490 + }
484 491 }
485 492 sreq.params.remove(PARAM_MTAS_TERMVECTOR + "." + key + "."
486 493 + NAME_MTAS_TERMVECTOR_REGEXP);
... ... @@ -1058,6 +1065,49 @@ public class MtasSolrComponentTermvector
1058 1065 + NAME_MTAS_TERMVECTOR_TYPE,
1059 1066 tv.subComponentFunction.type);
1060 1067 }
  1068 + if (tv.distances != null) {
  1069 + int distanceCounter = 0;
  1070 + for (SubComponentDistance distance : tv.distances) {
  1071 + paramsNewRequest.add(
  1072 + PARAM_MTAS_TERMVECTOR + "." + termvectorCounter + "."
  1073 + + NAME_MTAS_TERMVECTOR_DISTANCE + "."
  1074 + + distanceCounter + "."
  1075 + + NAME_MTAS_TERMVECTOR_DISTANCE_TYPE,
  1076 + distance.type);
  1077 + paramsNewRequest.add(
  1078 + PARAM_MTAS_TERMVECTOR + "." + termvectorCounter + "."
  1079 + + NAME_MTAS_TERMVECTOR_DISTANCE + "."
  1080 + + distanceCounter + "."
  1081 + + NAME_MTAS_TERMVECTOR_DISTANCE_BASE,
  1082 + distance.base);
  1083 + if(distance.key!=null) {
  1084 + paramsNewRequest.add(
  1085 + PARAM_MTAS_TERMVECTOR + "." + termvectorCounter + "."
  1086 + + NAME_MTAS_TERMVECTOR_DISTANCE + "."
  1087 + + distanceCounter + "."
  1088 + + NAME_MTAS_TERMVECTOR_DISTANCE_KEY,
  1089 + distance.key);
  1090 + }
  1091 + if(distance.maximum!=null) {
  1092 + paramsNewRequest.add(
  1093 + PARAM_MTAS_TERMVECTOR + "." + termvectorCounter + "."
  1094 + + NAME_MTAS_TERMVECTOR_DISTANCE + "."
  1095 + + distanceCounter + "."
  1096 + + NAME_MTAS_TERMVECTOR_DISTANCE_MAXIMUM,
  1097 + String.valueOf(distance.maximum));
  1098 + }
  1099 + if(distance.parameters!=null) {
  1100 + for(Entry<String,String> parameter : distance.parameters.entrySet()) {
  1101 + paramsNewRequest.add(
  1102 + PARAM_MTAS_TERMVECTOR + "." + termvectorCounter + "."
  1103 + + NAME_MTAS_TERMVECTOR_DISTANCE + "."
  1104 + + distanceCounter + "."
  1105 + + NAME_MTAS_TERMVECTOR_DISTANCE_PARAMETER + "." +parameter.getKey(),
  1106 + parameter.getValue());
  1107 + }
  1108 + }
  1109 + }
  1110 + }
1061 1111 if (tv.functions != null) {
1062 1112 int functionCounter = 0;
1063 1113 for (SubComponentFunction function : tv.functions) {
... ... @@ -1162,6 +1212,49 @@ public class MtasSolrComponentTermvector
1162 1212 + NAME_MTAS_TERMVECTOR_TYPE,
1163 1213 tv.subComponentFunction.type);
1164 1214 }
  1215 + if (tv.distances != null) {
  1216 + int distanceCounter = 0;
  1217 + for (SubComponentDistance distance : tv.distances) {
  1218 + paramsNewRequest.add(
  1219 + PARAM_MTAS_TERMVECTOR + "." + termvectorCounter + "."
  1220 + + NAME_MTAS_TERMVECTOR_DISTANCE + "."
  1221 + + distanceCounter + "."
  1222 + + NAME_MTAS_TERMVECTOR_DISTANCE_TYPE,
  1223 + distance.type);
  1224 + paramsNewRequest.add(
  1225 + PARAM_MTAS_TERMVECTOR + "." + termvectorCounter + "."
  1226 + + NAME_MTAS_TERMVECTOR_DISTANCE + "."
  1227 + + distanceCounter + "."
  1228 + + NAME_MTAS_TERMVECTOR_DISTANCE_BASE,
  1229 + distance.base);
  1230 + if(distance.key!=null) {
  1231 + paramsNewRequest.add(
  1232 + PARAM_MTAS_TERMVECTOR + "." + termvectorCounter + "."
  1233 + + NAME_MTAS_TERMVECTOR_DISTANCE + "."
  1234 + + distanceCounter + "."
  1235 + + NAME_MTAS_TERMVECTOR_DISTANCE_KEY,
  1236 + distance.key);
  1237 + }
  1238 + if(distance.maximum!=null) {
  1239 + paramsNewRequest.add(
  1240 + PARAM_MTAS_TERMVECTOR + "." + termvectorCounter + "."
  1241 + + NAME_MTAS_TERMVECTOR_DISTANCE + "."
  1242 + + distanceCounter + "."
  1243 + + NAME_MTAS_TERMVECTOR_DISTANCE_MAXIMUM,
  1244 + String.valueOf(distance.maximum));
  1245 + }
  1246 + if(distance.parameters!=null) {
  1247 + for(Entry<String,String> parameter : distance.parameters.entrySet()) {
  1248 + paramsNewRequest.add(
  1249 + PARAM_MTAS_TERMVECTOR + "." + termvectorCounter + "."
  1250 + + NAME_MTAS_TERMVECTOR_DISTANCE + "."
  1251 + + distanceCounter + "."
  1252 + + NAME_MTAS_TERMVECTOR_DISTANCE_PARAMETER + "." +parameter.getKey(),
  1253 + parameter.getValue());
  1254 + }
  1255 + }
  1256 + }
  1257 + }
1165 1258 if (tv.functions != null) {
1166 1259 int functionCounter = 0;
1167 1260 for (SubComponentFunction function : tv.functions) {
... ...
src/site/markdown/search_component_termvector.md
... ... @@ -46,6 +46,26 @@ Furthermore, a list of terms can be provided that should be ignored within the
46 46 | mtas.termvector.\<identifier\>.ignoreList | [\<string\>,...] | list of terms | yes |
47 47 | mtas.termvector.\<identifier\>.ignoreListRegexp | \<boolean\> | interpret items in provided ignoreList as regular expressions | no |
48 48  
  49 +## Distances
  50 +
  51 +For each term in the termvector, the distance to a predefined `base` term can be computed. Two distance `type` values are available: [Levenshtein](https://en.wikipedia.org/wiki/Levenshtein_distance) and [Damerau–Levenshtein](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance), each with configurable parameters that define the weight of the relevant operations.
  52 +
  53 +| Parameter | Value | Info | Obligatory |
  54 +|-------------------------------------------------|--------------|--------------------------------|-------------|
  55 +| mtas.termvector.\<identifier\>.distance.\<identifier distance\>.key | \<string\> | key used in response | no |
  56 +| mtas.termvector.\<identifier\>.distance.\<identifier distance\>.type | \<string\> | type of distance | yes |
  57 +| mtas.termvector.\<identifier\>.distance.\<identifier distance\>.base | \<string\> | base term for distance | yes |
  58 +| mtas.termvector.\<identifier\>.distance.\<identifier distance\>.maximum | \<double\> | restrict termvector to terms whose distance does not exceed the provided maximum | no |
  59 +| mtas.termvector.\<identifier\>.distance.\<identifier distance\>.parameter.* | \<string\> | type dependent parameters | no |
  60 +
  61 +The available type dependent additional parameters for type `levenshtein` and `damerau-levenshtein` are
  62 +
  63 +| Type | Type dependent parameter | Value | Info | Default |
  64 +|-----|-------------------------------------------------|--------------|--------------------------------|-------------|
  65 +| levenshtein, damerau-levenshtein | deletionDistance | \<double\> | distance for a deletion | 1.0 |
  66 +| levenshtein, damerau-levenshtein | insertionDistance | \<double\> | distance for an insertion | 1.0 |
  67 +| levenshtein, damerau-levenshtein | replaceDistance | \<double\> | distance for a replacement | 1.0 |
  68 +| damerau-levenshtein | transpositionDistance | \<double\> | distance for a transposition | 1.0 |
49 69  
50 70 ## Functions
51 71  
... ... @@ -68,7 +88,8 @@ Again, the key is added to the response and may be used to distinguish between m
68 88 3. [Ignore](#ignore) : previous result, ignoring words ending with $-e$.
69 89 4. [List](#list) : termvector for provided list of words.
70 90 5. [Start](#start) : termvector for words containing only characters a-z sorted by term and &gt; *koe*.
71   -6. [Functions](#functions) : statistics on hits, relative frequency and total number of words in document for words containing only characters a-z.
  91 +6. [Distances](#distances) : termvector for words with a Levenshtein distance of 1 or less, sorted descending by frequency.
  92 +7. [Functions](#functions) : statistics on hits, relative frequency and total number of words in document for words containing only characters a-z.
72 93  
73 94 ---
74 95  
... ... @@ -262,6 +283,18 @@ Termvector for words containing only characters a-z sorted by term and &gt; *koe
262 283 "key":"koeala"}]}]}
263 284 ```
264 285  
  286 +<a name="distances"></a>
  287 +
  288 +### Distances
  289 +
  290 +**Example**
  292 +List of words with a Levenshtein distance of at most 1 from `regering`, sorted descending by number of hits.
  292 +
  293 +**Request and response**
  294 +``
  295 +
  296 +``` json
  297 +```
265 298  
266 299 <a name="functions"></a>
267 300  
... ...