Commit 19861e34e145fb7155939887fda2749c1311b80c
1 parent
bce3e1cc
repair distance for distributed setup
Showing
3 changed files
with
131 additions
and
5 deletions
src/main/java/mtas/codec/util/CodecComponent.java
... | ... | @@ -2082,7 +2082,7 @@ public class CodecComponent { |
2082 | 2082 | public Double maximum; |
2083 | 2083 | |
2084 | 2084 | /** The parameters. */ |
2085 | - Map<String, String> parameters; | |
2085 | + public Map<String, String> parameters; | |
2086 | 2086 | |
2087 | 2087 | /** The distance. */ |
2088 | 2088 | public transient Distance distance = null; |
... | ... |
src/main/java/mtas/solr/handler/component/util/MtasSolrComponentTermvector.java
... | ... | @@ -32,6 +32,7 @@ import mtas.codec.util.CodecUtil; |
32 | 32 | import mtas.codec.util.CodecComponent.ComponentField; |
33 | 33 | import mtas.codec.util.CodecComponent.ComponentFields; |
34 | 34 | import mtas.codec.util.CodecComponent.ComponentTermVector; |
35 | +import mtas.codec.util.CodecComponent.SubComponentDistance; | |
35 | 36 | import mtas.codec.util.CodecComponent.SubComponentFunction; |
36 | 37 | import mtas.codec.util.collector.MtasDataCollector; |
37 | 38 | import mtas.codec.util.collector.MtasDataItemNumberComparator; |
... | ... | @@ -477,10 +478,16 @@ public class MtasSolrComponentTermvector |
477 | 478 | + NAME_MTAS_TERMVECTOR_DISTANCE_BASE); |
478 | 479 | sreq.params.remove(PARAM_MTAS_TERMVECTOR + "." + key + "." |
479 | 480 | + NAME_MTAS_TERMVECTOR_DISTANCE + "." + distanceKey + "." |
480 | - + NAME_MTAS_TERMVECTOR_DISTANCE_PARAMETER); | |
481 | - sreq.params.remove(PARAM_MTAS_TERMVECTOR + "." + key + "." | |
482 | - + NAME_MTAS_TERMVECTOR_DISTANCE + "." + distanceKey + "." | |
483 | 481 | + NAME_MTAS_TERMVECTOR_DISTANCE_MAXIMUM); |
482 | + Set<String> distanceParameters = MtasSolrResultUtil | |
483 | + .getIdsFromParameters(rb.req.getParams(), PARAM_MTAS_TERMVECTOR | |
484 | + + "." + key + "." + NAME_MTAS_TERMVECTOR_DISTANCE + "." + distanceKey + "." | |
485 | + + NAME_MTAS_TERMVECTOR_DISTANCE_PARAMETER); | |
486 | + for(String distanceParameter : distanceParameters) { | |
487 | + sreq.params.remove(PARAM_MTAS_TERMVECTOR | |
488 | + + "." + key + "." + NAME_MTAS_TERMVECTOR_DISTANCE + "." + distanceKey + "." | |
489 | + + NAME_MTAS_TERMVECTOR_DISTANCE_PARAMETER+"."+distanceParameter); | |
490 | + } | |
484 | 491 | } |
485 | 492 | sreq.params.remove(PARAM_MTAS_TERMVECTOR + "." + key + "." |
486 | 493 | + NAME_MTAS_TERMVECTOR_REGEXP); |
... | ... | @@ -1058,6 +1065,49 @@ public class MtasSolrComponentTermvector |
1058 | 1065 | + NAME_MTAS_TERMVECTOR_TYPE, |
1059 | 1066 | tv.subComponentFunction.type); |
1060 | 1067 | } |
1068 | + if (tv.distances != null) { | |
1069 | + int distanceCounter = 0; | |
1070 | + for (SubComponentDistance distance : tv.distances) { | |
1071 | + paramsNewRequest.add( | |
1072 | + PARAM_MTAS_TERMVECTOR + "." + termvectorCounter + "." | |
1073 | + + NAME_MTAS_TERMVECTOR_DISTANCE + "." | |
1074 | + + distanceCounter + "." | |
1075 | + + NAME_MTAS_TERMVECTOR_DISTANCE_TYPE, | |
1076 | + distance.type); | |
1077 | + paramsNewRequest.add( | |
1078 | + PARAM_MTAS_TERMVECTOR + "." + termvectorCounter + "." | |
1079 | + + NAME_MTAS_TERMVECTOR_DISTANCE + "." | |
1080 | + + distanceCounter + "." | |
1081 | + + NAME_MTAS_TERMVECTOR_DISTANCE_BASE, | |
1082 | + distance.base); | |
1083 | + if(distance.key!=null) { | |
1084 | + paramsNewRequest.add( | |
1085 | + PARAM_MTAS_TERMVECTOR + "." + termvectorCounter + "." | |
1086 | + + NAME_MTAS_TERMVECTOR_DISTANCE + "." | |
1087 | + + distanceCounter + "." | |
1088 | + + NAME_MTAS_TERMVECTOR_DISTANCE_KEY, | |
1089 | + distance.key); | |
1090 | + } | |
1091 | + if(distance.maximum!=null) { | |
1092 | + paramsNewRequest.add( | |
1093 | + PARAM_MTAS_TERMVECTOR + "." + termvectorCounter + "." | |
1094 | + + NAME_MTAS_TERMVECTOR_DISTANCE + "." | |
1095 | + + distanceCounter + "." | |
1096 | + + NAME_MTAS_TERMVECTOR_DISTANCE_MAXIMUM, | |
1097 | + String.valueOf(distance.maximum)); | |
1098 | + } | |
1099 | + if(distance.parameters!=null) { | |
1100 | + for(Entry<String,String> parameter : distance.parameters.entrySet()) { | |
1101 | + paramsNewRequest.add( | |
1102 | + PARAM_MTAS_TERMVECTOR + "." + termvectorCounter + "." | |
1103 | + + NAME_MTAS_TERMVECTOR_DISTANCE + "." | |
1104 | + + distanceCounter + "." | |
1105 | + + NAME_MTAS_TERMVECTOR_DISTANCE_PARAMETER + "." +parameter.getKey(), | |
1106 | + parameter.getValue()); | |
1107 | + } | |
1108 | + } | |
1109 | + } | |
1110 | + } | |
1061 | 1111 | if (tv.functions != null) { |
1062 | 1112 | int functionCounter = 0; |
1063 | 1113 | for (SubComponentFunction function : tv.functions) { |
... | ... | @@ -1162,6 +1212,49 @@ public class MtasSolrComponentTermvector |
1162 | 1212 | + NAME_MTAS_TERMVECTOR_TYPE, |
1163 | 1213 | tv.subComponentFunction.type); |
1164 | 1214 | } |
1215 | + if (tv.distances != null) { | |
1216 | + int distanceCounter = 0; | |
1217 | + for (SubComponentDistance distance : tv.distances) { | |
1218 | + paramsNewRequest.add( | |
1219 | + PARAM_MTAS_TERMVECTOR + "." + termvectorCounter + "." | |
1220 | + + NAME_MTAS_TERMVECTOR_DISTANCE + "." | |
1221 | + + distanceCounter + "." | |
1222 | + + NAME_MTAS_TERMVECTOR_DISTANCE_TYPE, | |
1223 | + distance.type); | |
1224 | + paramsNewRequest.add( | |
1225 | + PARAM_MTAS_TERMVECTOR + "." + termvectorCounter + "." | |
1226 | + + NAME_MTAS_TERMVECTOR_DISTANCE + "." | |
1227 | + + distanceCounter + "." | |
1228 | + + NAME_MTAS_TERMVECTOR_DISTANCE_BASE, | |
1229 | + distance.base); | |
1230 | + if(distance.key!=null) { | |
1231 | + paramsNewRequest.add( | |
1232 | + PARAM_MTAS_TERMVECTOR + "." + termvectorCounter + "." | |
1233 | + + NAME_MTAS_TERMVECTOR_DISTANCE + "." | |
1234 | + + distanceCounter + "." | |
1235 | + + NAME_MTAS_TERMVECTOR_DISTANCE_KEY, | |
1236 | + distance.key); | |
1237 | + } | |
1238 | + if(distance.maximum!=null) { | |
1239 | + paramsNewRequest.add( | |
1240 | + PARAM_MTAS_TERMVECTOR + "." + termvectorCounter + "." | |
1241 | + + NAME_MTAS_TERMVECTOR_DISTANCE + "." | |
1242 | + + distanceCounter + "." | |
1243 | + + NAME_MTAS_TERMVECTOR_DISTANCE_MAXIMUM, | |
1244 | + String.valueOf(distance.maximum)); | |
1245 | + } | |
1246 | + if(distance.parameters!=null) { | |
1247 | + for(Entry<String,String> parameter : distance.parameters.entrySet()) { | |
1248 | + paramsNewRequest.add( | |
1249 | + PARAM_MTAS_TERMVECTOR + "." + termvectorCounter + "." | |
1250 | + + NAME_MTAS_TERMVECTOR_DISTANCE + "." | |
1251 | + + distanceCounter + "." | |
1252 | + + NAME_MTAS_TERMVECTOR_DISTANCE_PARAMETER + "." +parameter.getKey(), | |
1253 | + parameter.getValue()); | |
1254 | + } | |
1255 | + } | |
1256 | + } | |
1257 | + } | |
1165 | 1258 | if (tv.functions != null) { |
1166 | 1259 | int functionCounter = 0; |
1167 | 1260 | for (SubComponentFunction function : tv.functions) { |
... | ... |
src/site/markdown/search_component_termvector.md
... | ... | @@ -46,6 +46,26 @@ Furthermore, a list of terms can be provided that should be ignored within the |
46 | 46 | | mtas.termvector.\<identifier\>.ignoreList | [\<string\>,...] | list of terms | yes | |
47 | 47 | | mtas.termvector.\<identifier\>.ignoreListRegexp | \<boolean\> | interpret items in provided ignoreList as regular expressions | no | |
48 | 48 | |
49 | +## Distances | |
50 | + | |
51 | +For each term in the termvector, the distance to a predefined `base` term can be computed. Two types of distance are available, selected with the `type` parameter: [Levenshtein](https://en.wikipedia.org/wiki/Levenshtein_distance) and [Damerau–Levenshtein](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance), each with configurable parameters to define the weight of the relevant operations. | |
52 | + | |
53 | +| Parameter | Value | Info | Obligatory | | |
54 | +|-------------------------------------------------|--------------|--------------------------------|-------------| | |
55 | +| mtas.termvector.\<identifier\>.distance.\<identifier distance\>.key | \<string\> | key used in response | no | | |
56 | +| mtas.termvector.\<identifier\>.distance.\<identifier distance\>.type | \<string\> | type of distance | yes | | |
57 | +| mtas.termvector.\<identifier\>.distance.\<identifier distance\>.base | \<string\> | base term for distance | yes | | |
58 | +| mtas.termvector.\<identifier\>.distance.\<identifier distance\>.maximum | \<double\> | restrict termvector to terms with a distance of at most the provided maximum | no | |
59 | +| mtas.termvector.\<identifier\>.distance.\<identifier distance\>.parameter.* | \<string\> | type dependent parameters | no | | |
60 | + | |
61 | +The additional type-dependent parameters available for the types `levenshtein` and `damerau-levenshtein` are: | |
62 | + | |
63 | +| Type | Type dependent parameter | Value | Info | Default | | |
64 | +|-----|-------------------------------------------------|--------------|--------------------------------|-------------| | |
65 | +| levenshtein, damerau-levenshtein | deletionDistance | \<double\> | distance for a deletion | 1.0 | | |
66 | +| levenshtein, damerau-levenshtein | insertionDistance | \<double\> | distance for an insertion | 1.0 | | |
67 | +| levenshtein, damerau-levenshtein | replaceDistance | \<double\> | distance for a replacement | 1.0 | | |
68 | +| damerau-levenshtein | transpositionDistance | \<double\> | distance for a transposition | 1.0 | | |
49 | 69 | |
50 | 70 | ## Functions |
51 | 71 | |
... | ... | @@ -68,7 +88,8 @@ Again, the key is added to the response and may be used to distinguish between m |
68 | 88 | 3. [Ignore](#ignore) : previous result, ignoring words ending with $-e$. |
69 | 89 | 4. [List](#list) : termvector for provided list of words. |
70 | 90 | 5. [Start](#start) : termvector for words containing only characters a-z sorted by term and > *koe*. |
71 | -6. [Functions](#functions) : statistics on hits, relative frequency and total number of words in document for words containing only characters a-z. | |
91 | +6. [Distances](#distances) : termvector for words with a Levenshtein distance of 1 or less from *regering*, sorted descending by frequency. | |
92 | +7. [Functions](#functions) : statistics on hits, relative frequency and total number of words in document for words containing only characters a-z. | |
72 | 93 | |
73 | 94 | --- |
74 | 95 | |
... | ... | @@ -262,6 +283,18 @@ Termvector for words containing only characters a-z sorted by term and &gt; *koe |
262 | 283 | "key":"koeala"}]}]} |
263 | 284 | ``` |
264 | 285 | |
286 | +<a name="distances"></a> | |
287 | + | |
288 | +### Distances | |
289 | + | |
290 | +**Example** | |
291 | +List of words with a Levenshtein distance of at most 1 from `regering`, sorted descending by number of hits. | |
292 | + | |
293 | +**Request and response** | |
294 | +`` | |
295 | + | |
296 | +``` json | |
297 | +``` | |
265 | 298 | |
266 | 299 | <a name="functions"></a> |
267 | 300 | |
... | ... |