ENIAMmstFeatures.ml
2.04 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
open Xstd
open ENIAMmstModel
open ENIAMtokenizerTypes
open ENIAM_LCGtypes
(* Data bundle handed to the edge-scoring functions of this module. *)
type disamb_info = {
(* Array of nodes. Not read by any function visible in this part of the file. *)
tree: node array;
(* Token records, fetched with [ExtArray.get] using a node's [id] as the index;
   [create_dist_str] compares the [beg] field of the parent's and child's entry. *)
tokens: token_env ExtArray.t;
(* Map keyed by a node's [id] yielding a pair of ints. [create_dist_str] binds
   the pair as (lp, rp) / (lc, rc) - presumably left and right positions of
   the node's span; TODO confirm against the code that fills this map. *)
paths: (int * int) IntMap.t
}
(* Builds the direction/distance suffix for the edge [parent] -> [child].

   The result has the form "&<dir>&<bucket>" where:
   - <dir> is "RA" when the parent's token has a greater [beg] than the
     child's token, and "LA" otherwise;
   - the distance is [parent left - child right] for "RA" and
     [child left - parent right] for "LA", the (left, right) pairs being the
     entries of [data.paths] for the two node ids;
   - <bucket> is "10" for a distance above 10, "5" for a distance above 5, and
     the decimal form of [distance - 1] otherwise (so a distance of 0 gives
     "-1").

   The empty string is returned when the distance is negative, and also when
   any lookup raises: every exception is deliberately swallowed here. *)
let create_dist_str (data: disamb_info) (parent: node) (child: node) =
  (* Coarse bucketing of a non-negative distance. *)
  let bucket gap =
    if gap > 10 then "10"
    else if gap > 5 then "5"
    else string_of_int (gap - 1) in
  try
    let (parent_left, parent_right) = IntMap.find data.paths parent.id in
    let (child_left, child_right) = IntMap.find data.paths child.id in
    let parent_tok = ExtArray.get data.tokens parent.id in
    let child_tok = ExtArray.get data.tokens child.id in
    (* Direction label and distance are chosen together from one test. *)
    let (label, gap) =
      if parent_tok.beg > child_tok.beg then
        ("RA", parent_left - child_right)
      else
        ("LA", child_left - parent_right) in
    if gap < 0 then ""
    else "&" ^ label ^ "&" ^ bucket gap
  with
    _ -> ""
(* Threads [fv] through every function of [features], first element first:
   [apply_features [f; g] fv] is [g (f fv)]. An empty list returns [fv]
   unchanged. *)
let apply_features features fv =
  List.fold_left (fun acc feature -> feature acc) fv features
(* Placeholder: no linear features are generated. Every argument except [fv]
   is ignored and [fv] is returned unchanged. *)
let add_linear_features _f_type (_obs: string array) _first _second _distStr fv =
  fv
(* Adds the two-observation feature templates to the feature vector [fv].

   Twelve feature strings are built from the two features of the first item
   ([item1F1], [item1F2]) and of the second item ([item2F1], [item2F2]), each
   prepended with [prefix]. When [distStr] is non-empty, every feature is
   followed by a second variant suffixed with "*" and [distStr]; when
   [distStr] is empty only the plain features are added.

   Features are registered with [MST_Model.add_feature] in list order, each
   plain feature immediately followed by its distance variant (if any).
   Returns the extended feature vector. *)
let add_two_obs_features prefix item1F1 item1F2 item2F1 item2F2 distStr fv =
  let add_diststr str =
    (* Structural comparison is required here: [!=] is physical inequality
       and holds for an empty [distStr] as well, which produced spurious
       features ending in "*". *)
    if distStr <> "" then
      [str; str^"*"^distStr]
    else
      [str] in
  let flist = List.map ((^) prefix) [
    "2FF1="^item1F1;
    "2FF1="^item1F1^" "^item1F2;
    "2FF1="^item1F1^" "^item1F2^" "^item2F2;
    "2FF1="^item1F1^" "^item1F2^" "^item2F2^" "^item2F1;
    "2FF2="^item1F1^" "^item2F1;
    "2FF3="^item1F1^" "^item2F2;
    "2FF4="^item1F2^" "^item2F1^" "^item2F2;
    "2FF5="^item1F2^" "^item2F2;
    "2FF6="^item2F1^" "^item2F2;
    "2FF7="^item1F2;
    "2FF8="^item2F1;
    "2FF9="^item2F2;
  ] in
  let funs =
    List.map MST_Model.add_feature (List.flatten (List.map add_diststr flist)) in
  apply_features funs fv
(* Scores the dependency edge [parent] -> [child].

   Starting from [MST_Model.empty_fv], the two-observation features over the
   lemma and pos of both nodes are added under the "HCC" prefix, together with
   the direction/distance string from [create_dist_str]; the resulting feature
   vector is passed to [MST_Model.score_fv], whose result is returned. *)
let score_edge (data: disamb_info) (parent: node) (child: node) =
  let distance_tag = create_dist_str data parent child in
  MST_Model.empty_fv
  |> add_two_obs_features "HCC"
       parent.lemma parent.pos child.lemma child.pos distance_tag
  |> MST_Model.score_fv