ENIAM_MWE.ml
11.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
(*
* ENIAMsubsyntax: MWE, abbreviation and sentence detecion for Polish
* Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
* Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
*
* This library is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*)
open Xstd
open ENIAMsubsyntaxTypes
open ENIAMtokenizerTypes
type sel = V of string | S of string | G
type t =
L of string * string * sel list
| O of string
| D of string * string
let process_interp lemma interp =
match Xstring.split ":" interp with
cat :: interp -> L(lemma,cat,Xlist.map interp (function
"$c" -> S "c"
| "$n" -> S "n"
| "$g" -> S "g"
| "$d" -> S "d"
| "$C" -> S "C"
| "_" -> G
| s -> if String.get s 0 = '$' then failwith ("process_interp: " ^ s) else V s))
| _ -> failwith "process_interp"
let load_mwe_dict filename dict =
File.fold_tab filename dict (fun dict -> function
[orths; lemma; interp] ->
let orths = Xstring.split " " orths in
if orths = [] then failwith "load_mwe_dict" else
let s = List.hd orths in
let orths = Xlist.map orths (fun s -> O s) in
let lemma,cat,interp = match process_interp lemma interp with
L(lemma,cat,interp) -> lemma,cat,interp
| _ -> failwith "load_mwe_dict2" in
StringMap.add_inc dict s [orths,lemma,cat,interp] (fun l -> (orths,lemma,cat,interp) :: l)
| l -> failwith ("load_mwe_dict '" ^ String.concat "\t" l ^ "'"))
let process_orth = function
[Lexer.T lemma; Lexer.B("(",")",[Lexer.T interp])] -> process_interp lemma interp
| [Lexer.T orth] -> O orth
| [Lexer.B("{","}",l); Lexer.B("(",")",[Lexer.T interp])] -> process_interp (Lexer.string_of_token_list l) interp
| [Lexer.B("{","}",l)] -> O(Lexer.string_of_token_list l)
| tokens -> failwith ("process_orth: " ^ Lexer.string_of_token_list tokens)
let load_mwe_dict2 filename (dict,dict2) =
File.fold_tab filename (dict,dict2) (fun (dict,dict2) -> function
[orths; lemma] ->
(* print_endline (orths ^ "\t" ^ lemma); *)
let tokens = Lexer.split "(\\|)\\|{\\|}\\| " orths in
(* print_endline ("load_dict2 1: " ^ Lexer.string_of_token_list tokens); *)
let tokens = Lexer.find_brackets ["{","}";"(",")"] [] tokens in
(* print_endline ("load_dict2 2: " ^ Lexer.string_of_token_list tokens); *)
let orths = List.rev (Xlist.rev_map (Lexer.split_symbol (Lexer.T " ") [] tokens) process_orth) in
let tokens = Lexer.split "(\\|)\\|{\\|}" lemma in
(* print_endline ("load_dict2 3: " ^ Lexer.string_of_token_list tokens); *)
let tokens = Lexer.find_brackets ["{","}";"(",")"] [] tokens in
(* print_endline ("load_dict2 4: " ^ Lexer.string_of_token_list tokens); *)
let lemma,cat,interp = match process_orth tokens with
L(lemma,cat,interp) -> lemma,cat,interp
| _ -> failwith "load_mwe_dict2" in
if orths = [] then failwith "load_mwe_dict2" else
(match List.hd orths with
L(s,_,_) -> dict, StringMap.add_inc dict2 s [orths,lemma,cat,interp] (fun l -> (orths,lemma,cat,interp) :: l)
| O s -> StringMap.add_inc dict s [orths,lemma,cat,interp] (fun l -> (orths,lemma,cat,interp) :: l), dict2
| D _ -> failwith "load_mwe_dict2")
| l -> failwith ("load_mwe_dict2 '" ^ String.concat "\t" l ^ "'"))
let mwe_dict,mwe_dict2 =
let dict = File.catch_no_file (load_mwe_dict brev_filename) StringMap.empty in
let dict = File.catch_no_file (load_mwe_dict fixed_filename) dict in
let dict = File.catch_no_file (load_mwe_dict mwe_filename) dict in
let dict,dict2 = File.catch_no_file (load_mwe_dict2 sejf_filename) (dict,StringMap.empty) in
let dict,dict2 = File.catch_no_file (load_mwe_dict2 sejfek_filename) (dict,dict2) in
let dict,dict2 = File.catch_no_file (load_mwe_dict2 sawa_filename) (dict,dict2) in
let dict,dict2 = File.catch_no_file (load_mwe_dict2 mwe2_filename) (dict,dict2) in
dict,dict2
let get_orths paths =
IntMap.fold paths StringSet.empty (fun orths _ map ->
IntMap.fold map orths (fun orths _ l ->
TokenEnvSet.fold l orths (fun orths t ->
StringSet.add orths (ENIAMtokens.get_orth t.token))))
let get_lemmas paths =
IntMap.fold paths StringSet.empty (fun orths _ map ->
IntMap.fold map orths (fun orths _ l ->
TokenEnvSet.fold l orths (fun orths t ->
StringSet.add orths (ENIAMtokens.get_lemma t.token))))
let get_intnum_orths paths =
IntMap.fold paths StringMap.empty (fun orths _ map ->
IntMap.fold map orths (fun orths _ l ->
TokenEnvSet.fold l orths (fun orths t ->
match t.token with
Dig(lemma,"intnum") -> StringMap.add_inc orths (ENIAMtokens.get_orth t.token) (StringSet.singleton lemma) (fun set -> StringSet.add set lemma)
| _ -> orths)))
let preselect orths lemmas rules l =
Xlist.fold l rules (fun rules (match_list,lemma,cat,interp) ->
let b = Xlist.fold match_list true (fun b -> function
O s -> StringSet.mem orths s && b
| L(s,_,_) -> StringSet.mem lemmas s && b
| D(s,_) -> failwith "preselect") in
if b then (match_list,lemma,cat,interp) :: rules else rules)
let preselect_dict orths lemmas dict rules =
StringSet.fold orths rules (fun rules orth ->
try
preselect orths lemmas rules (StringMap.find dict orth)
with Not_found -> rules)
let preselect_dict2 orths lemmas dict2 rules =
StringSet.fold lemmas rules (fun rules lemma ->
try
preselect orths lemmas rules (StringMap.find dict2 lemma)
with Not_found -> rules)
let add_ordnum_rules orths rules =
StringMap.fold orths rules (fun rules orth lemmas ->
StringSet.fold lemmas rules (fun rules lemma ->
(* Printf.printf "%s %s\n%!" orth lemma; *)
([D(orth,"intnum");O "."],lemma,"ordnum",[]) :: rules))
let select_rules paths mwe_dict mwe_dict2 =
let orths = get_orths paths in
let lemmas = get_lemmas paths in
let intnum_orths = get_intnum_orths paths in
let rules = preselect_dict orths lemmas mwe_dict [] in
let rules = preselect_dict2 orths lemmas mwe_dict2 rules in
let rules = add_ordnum_rules intnum_orths rules in
rules
let rec check_interp sels = function
[],[] -> true
| s :: interp, ["_"] :: interp2 -> check_interp sels (interp,interp2)
| V s :: interp, l2 :: interp2 -> if Xlist.mem l2 s then check_interp sels (interp,interp2) else false
| S s :: interp, l2 :: interp2 ->
(try
let l = Xlist.assoc sels s in
let b = Xlist.fold l false (fun b s -> Xlist.mem l2 s || b) in
if b then check_interp sels (interp,interp2) else false
with Not_found -> check_interp sels (interp,interp2))
| G :: interp, l2 :: interp2 -> check_interp sels (interp,interp2)
| _ -> failwith "check_interp"
let rec get_sels sels = function
[],[] -> sels
| s :: interp, ["_"] :: interp2 -> get_sels sels (interp,interp2)
| V s :: interp, l2 :: interp2 -> get_sels sels (interp,interp2)
| S s :: interp, l2 :: interp2 ->
(try
let l = Xlist.assoc sels s in
let sels = List.remove_assoc s sels in
let l = Xlist.fold l [] (fun l s -> if Xlist.mem l2 s then s :: l else l) in
get_sels ((s,l) :: sels) (interp,interp2)
with Not_found -> get_sels ((s,l2) :: sels) (interp,interp2))
| G :: interp, l2 :: interp2 -> get_sels sels (interp,interp2)
| _ -> failwith "get_sels"
let rec match_path_rec map found (t:token_env) sels rev = function
[] -> (t :: rev, sels) :: found
| s :: l ->
let map2 = try IntMap.find map t.next with Not_found -> IntMap.empty in
let found2 = IntMap.fold map2 [] (fun found2 _ l ->
TokenEnvSet.fold l found2 (fun found2 new_t ->
match s,new_t.token with
O s, token -> if ENIAMtokens.get_orth token = s then (new_t,sels) :: found2 else found2
| L(s,cat,interp), Lemma(s2,cat2,interps2) ->
Xlist.fold interps2 found2 (fun found2 interp2 ->
if s=s2 && cat=cat2 && check_interp sels (interp,interp2) then
(new_t,get_sels sels (interp,interp2)) :: found2 else found2)
| D(s,cat), Dig(s2,cat2) -> if s=s2 && cat=cat2 then (new_t,sels) :: found2 else found2
| _ -> found2)) in
Xlist.fold found2 found (fun found (new_t,sels) -> match_path_rec map found new_t sels (t :: rev) l)
let match_path map = function
[] -> failwith "match_path"
| s :: l ->
let found = IntMap.fold map [] (fun found i map2 ->
IntMap.fold map2 found (fun found j l ->
TokenEnvSet.fold l found (fun found t ->
match s,t.token with
O s, token -> if ENIAMtokens.get_orth token = s then (t,[]) :: found else found
| L(s,cat,interp), Lemma(s2,cat2,interps2) ->
Xlist.fold interps2 found (fun found interp2 ->
if s=s2 && cat=cat2 && check_interp [] (interp,interp2) then
(t,get_sels [] (interp,interp2)) :: found else found)
| D(s,cat), Dig(s2,cat2) -> if s=s2 && cat=cat2 then (t,[]) :: found else found
| _ -> found))) in
Xlist.fold found [] (fun found (t,sels) -> match_path_rec map found t sels [] l)
let concat_orths l =
let s = String.concat "" (Xlist.map l (fun t -> t.orth ^ (if t.beg+t.len=t.next then "" else " "))) in
let n = Xstring.size s in
if String.get s (n-1) = ' ' then String.sub s 0 (n-1) else s
let create_token (matching:token_env list) sels lemma cat interp = (* FIXME: problem z nazwami własnymi *)
let l = List.rev matching in
let beg = (List.hd l).beg in
let t = List.hd matching in
let len = t.beg + t.len - beg in
{empty_token_env with
orth=concat_orths l;
beg=beg;
len=len;
next=t.next;
token=Lemma(lemma,cat,[Xlist.map interp (function
S s -> (try Xlist.assoc sels s with Not_found -> ["_"])
| V s -> Xstring.split "\\." s
| G -> ["_"])]);
weight=0.; (* FIXME: dodać wagi do konkretnych reguł i uwzględnić wagi maczowanych tokenów *)
attrs=ENIAMtokens.merge_attrs l}
let add_token paths t =
let map = try IntMap.find paths t.beg with Not_found -> IntMap.empty in
let map = IntMap.add_inc map t.next (TokenEnvSet.singleton t) (fun set -> TokenEnvSet.add set t) in
IntMap.add paths t.beg map
let apply_rule paths (match_list,lemma,cat,interp) =
(* print_endline ("apply_rule: " ^ lemma); *)
let matchings_found = match_path paths match_list in
Xlist.fold matchings_found paths (fun paths (matching,sels) ->
try
let token = create_token matching sels lemma cat interp in
add_token paths token
with Not_found -> paths)
let process (paths,last) =
let paths = Xlist.fold paths IntMap.empty add_token in
let rules = select_rules paths mwe_dict mwe_dict2 in
let paths = Xlist.fold rules paths apply_rule in
let rules = select_rules paths mwe_dict mwe_dict2 in
let paths = Xlist.fold rules paths apply_rule in
let rules = select_rules paths mwe_dict mwe_dict2 in
let paths = Xlist.fold rules paths apply_rule in
let rules = select_rules paths mwe_dict mwe_dict2 in
let paths = Xlist.fold rules paths apply_rule in
let paths = IntMap.fold paths [] (fun paths _ map ->
IntMap.fold map paths (fun paths _ l ->
TokenEnvSet.fold l paths (fun paths t ->
t :: paths))) in
ENIAMpaths.sort (paths,last)