ENIAM_MWE.ml
18.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
(*
* ENIAMsubsyntax: MWE, abbreviation and sentence detecion for Polish
* Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
* Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
*
* This library is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*)
open Xstd
open ENIAMsubsyntaxTypes
open ENIAMtokenizerTypes
type sel = V of string | S of string | G
type t =
L of string * string * sel list
| O of string
| D of string * string
| I of string
let process_interp lemma interp =
match Xstring.split ":" interp with
cat :: interp -> L(lemma,cat,Xlist.map interp (function
"$c" -> S "c"
| "$n" -> S "n"
| "$g" -> S "g"
| "$d" -> S "d"
| "$C" -> S "C"
| "_" -> G
| s -> if String.get s 0 = '$' then failwith ("process_interp: " ^ s) else V s))
| _ -> failwith "process_interp"
let load_mwe_dict filename dict =
File.fold_tab filename dict (fun dict -> function
[orths; lemma; interp] ->
let orths = Xstring.split " " orths in
if orths = [] then failwith "load_mwe_dict" else
let s = List.hd orths in
let orths = Xlist.map orths (fun s -> O s) in
let lemma,cat,interp = match process_interp lemma interp with
L(lemma,cat,interp) -> lemma,cat,interp
| _ -> failwith "load_mwe_dict2" in
StringMap.add_inc dict s [orths,lemma,cat,interp] (fun l -> (orths,lemma,cat,interp) :: l)
| l -> failwith ("load_mwe_dict '" ^ String.concat "\t" l ^ "'"))
(*<<<<<<< HEAD
let load_mwe_dict () =
let dict = load_dict StringMap.empty brev_filename in
let dict = try load_dict dict fixed_filename with _ -> (prerr_endline ("ENIAMsubsyntax file " ^ fixed_filename ^ " not found"); dict) in
(* let dict = load_dict dict complete_entries_filename in*)
let dict = load_dict dict mwe_filename in
dict
let mwe_dict = ref (StringMap.empty : (string * string * string) list StringMap.t)
let preselect_dict orths dict =
StringSet.fold orths [] (fun rules orth ->
try
let l = StringMap.find dict orth in
Xlist.fold l rules (fun rules (orth,lemma,interp) ->
(* print_endline ("preselect_dict: " ^ orth); *)
let match_list = Str.split (Str.regexp " ") orth in
let b = Xlist.fold match_list true (fun b s ->
(* if not (StringSet.mem orths s) then print_endline s; *)
StringSet.mem orths s && b) in
if b then (match_list,lemma,interp) :: rules else rules)
with Not_found -> rules)*)
let process_orth = function
[Lexer.T lemma; Lexer.B("(",")",[Lexer.T interp])] -> process_interp lemma interp
| [Lexer.T orth] -> O orth
| [Lexer.B("{","}",l); Lexer.B("(",")",[Lexer.T interp])] -> process_interp (Lexer.string_of_token_list l) interp
| [Lexer.B("{","}",l)] -> O(Lexer.string_of_token_list l)
| tokens -> failwith ("process_orth: " ^ Lexer.string_of_token_list tokens)
let load_mwe_dict2 filename (dict,dict2) =
File.fold_tab filename (dict,dict2) (fun (dict,dict2) -> function
[orths; lemma] ->
(* print_endline (orths ^ "\t" ^ lemma); *)
let tokens = Lexer.split "(\\|)\\|{\\|}\\| " orths in
(* print_endline ("load_dict2 1: " ^ Lexer.string_of_token_list tokens); *)
let tokens = Lexer.find_brackets ["{","}";"(",")"] [] tokens in
(* print_endline ("load_dict2 2: " ^ Lexer.string_of_token_list tokens); *)
let orths = List.rev (Xlist.rev_map (Lexer.split_symbol (Lexer.T " ") [] tokens) process_orth) in
let tokens = Lexer.split "(\\|)\\|{\\|}" lemma in
(* print_endline ("load_dict2 3: " ^ Lexer.string_of_token_list tokens); *)
let tokens = Lexer.find_brackets ["{","}";"(",")"] [] tokens in
(* print_endline ("load_dict2 4: " ^ Lexer.string_of_token_list tokens); *)
let lemma,cat,interp = match process_orth tokens with
L(lemma,cat,interp) -> lemma,cat,interp
| _ -> failwith "load_mwe_dict2" in
if orths = [] then failwith "load_mwe_dict2" else
(match List.hd orths with
L(s,_,_) -> dict, StringMap.add_inc dict2 s [orths,lemma,cat,interp] (fun l -> (orths,lemma,cat,interp) :: l)
| O s -> StringMap.add_inc dict s [orths,lemma,cat,interp] (fun l -> (orths,lemma,cat,interp) :: l), dict2
| _ -> failwith "load_mwe_dict2")
| l -> failwith ("load_mwe_dict2 '" ^ String.concat "\t" l ^ "'"))
let load_mwe_dicts () =
let dict = File.catch_no_file (load_mwe_dict brev_filename) StringMap.empty in
let dict = File.catch_no_file (load_mwe_dict fixed_filename) dict in
let dict = File.catch_no_file (load_mwe_dict mwe_filename) dict in
let dict,dict2 = File.catch_no_file (load_mwe_dict2 sejf_filename) (dict,StringMap.empty) in
let dict,dict2 = File.catch_no_file (load_mwe_dict2 sejfek_filename) (dict,dict2) in
(* let dict,dict2 = File.catch_no_file (load_mwe_dict2 sawa_filename) (dict,dict2) in *)
let dict,dict2 = File.catch_no_file (load_mwe_dict2 sawa_sort_filename) (dict,dict2) in
let dict,dict2 = File.catch_no_file (load_mwe_dict2 sawa_ulice_filename) (dict,dict2) in
let dict,dict2 = File.catch_no_file (load_mwe_dict2 sawa_dzielnice_filename) (dict,dict2) in
let dict,dict2 = File.catch_no_file (load_mwe_dict2 mwe2_filename) (dict,dict2) in
dict,dict2
let mwe_dict = ref (StringMap.empty : (t list * string * string * sel list) list StringMap.t)
let mwe_dict2 = ref (StringMap.empty : (t list * string * string * sel list) list StringMap.t)
let get_orths paths =
IntMap.fold paths StringSet.empty (fun orths _ map ->
IntMap.fold map orths (fun orths _ l ->
TokenEnvSet.fold l orths (fun orths t ->
StringSet.add orths (ENIAMtokens.get_orth t.token))))
let get_lemmas paths =
IntMap.fold paths StringSet.empty (fun orths _ map ->
IntMap.fold map orths (fun orths _ l ->
TokenEnvSet.fold l orths (fun orths t ->
StringSet.add orths (ENIAMtokens.get_lemma t.token))))
let get_intnum_orths paths =
IntMap.fold paths StringMap.empty (fun orths _ map ->
IntMap.fold map orths (fun orths _ l ->
TokenEnvSet.fold l orths (fun orths t ->
match t.token with
Dig(lemma,"intnum") -> StringMap.add_inc orths (ENIAMtokens.get_orth t.token) (StringSet.singleton lemma) (fun set -> StringSet.add set lemma)
| _ -> orths)))
let get_intnum_orths paths =
IntMap.fold paths StringMap.empty (fun orths _ map ->
IntMap.fold map orths (fun orths _ l ->
TokenEnvSet.fold l orths (fun orths t ->
match t.token with
Dig(lemma,"intnum") -> StringMap.add_inc orths (ENIAMtokens.get_orth t.token) (StringSet.singleton lemma) (fun set -> StringSet.add set lemma)
| _ -> orths)))
let get_year_orths paths =
IntMap.fold paths StringSet.empty (fun orths _ map ->
IntMap.fold map orths (fun orths _ l ->
TokenEnvSet.fold l orths (fun orths t ->
match t.token with
Dig(lemma,"year") -> StringSet.add orths lemma
| _ -> orths)))
let get_single_letter_orths paths =
IntMap.fold paths StringSet.empty (fun orths _ map ->
IntMap.fold map orths (fun orths _ l ->
TokenEnvSet.fold l orths (fun orths t ->
match t.token with
SmallLetter lemma -> (*if lemma <> "g" then*) StringSet.add orths lemma (*else orths*) (* FIXME: !!!! *)
| CapLetter(lemma,_) -> StringSet.add orths lemma
| _ -> orths)))
let preselect orths lemmas rules l =
Xlist.fold l rules (fun rules (match_list,lemma,cat,interp) ->
(* print_endline ("preselect: " ^ lemma); *)
let b = Xlist.fold match_list true (fun b -> function
O s -> StringSet.mem orths s && b
| L(s,_,_) -> StringSet.mem lemmas s && b
| _ -> failwith "preselect") in
if b then (Xlist.size match_list > 1,match_list,lemma,cat,interp) :: rules else rules)
let preselect_dict orths lemmas dict rules =
StringSet.fold orths rules (fun rules orth ->
try
(* print_endline ("preselect_dict: " ^ orth); *)
preselect orths lemmas rules (StringMap.find dict orth)
with Not_found -> rules)
let preselect_dict2 orths lemmas dict2 rules =
StringSet.fold lemmas rules (fun rules lemma ->
try
preselect orths lemmas rules (StringMap.find dict2 lemma)
with Not_found -> rules)
let add_ordnum_rules orths rules =
StringMap.fold orths rules (fun rules orth lemmas ->
StringSet.fold lemmas rules (fun rules lemma ->
(* Printf.printf "%s %s\n%!" orth lemma; *)
(false,[D(orth,"intnum");O "."],lemma,"ordnum",[]) :: rules))
let add_quot_rule rules =
(false,[I "„x";I "<sentence>"; I "<clause>"],"„","Interp",[]) :: rules
let add_building_number_rules dig_orths letter_orths rules =
StringSet.fold dig_orths rules (fun rules dig1 ->
let rules = StringSet.fold letter_orths rules (fun rules letter1 ->
(true,[D(dig1,"year");O letter1],dig1^letter1,"building-number",[]) :: rules) in
StringSet.fold dig_orths rules (fun rules dig2 ->
let rules = (true,[D(dig1,"year");O "/";D(dig2,"year")],dig1^"/"^dig2,"building-number",[]) :: rules in
let rules = StringSet.fold letter_orths rules (fun rules letter1 ->
(true,[D(dig1,"year");O letter1;O "/";D(dig2,"year")],dig1^letter1^"/"^dig2,"building-number",[]) ::
(true,[D(dig1,"year");O "/";D(dig2,"year");O letter1],dig1^"/"^dig2^letter1,"building-number",[]) :: rules) in
StringSet.fold dig_orths rules (fun rules dig3 ->
let rules = (true,[D(dig1,"year");O "/";D(dig2,"year");O "/";D(dig3,"year")],dig1^"/"^dig2^"/"^dig3,"building-number",[]) :: rules in
let rules = StringSet.fold letter_orths rules (fun rules letter1 ->
(true,[D(dig1,"year");O letter1;O "/";D(dig2,"year");O "/";D(dig3,"year")],dig1^letter1^"/"^dig2^"/"^dig3,"building-number",[]) ::
(true,[D(dig1,"year");O "/";D(dig2,"year");O letter1;O "/";D(dig3,"year")],dig1^"/"^dig2^letter1^"/"^dig3,"building-number",[]) :: rules) in
rules)))
let select_rules paths mwe_dict mwe_dict2 =
let orths = get_orths paths in
(* print_endline ("ENIAM_MWE.select_rules 1 orths=[" ^ String.concat ";" (StringSet.to_list orths) ^ "]"); *)
let lemmas = get_lemmas paths in
let intnum_orths = get_intnum_orths paths in
(* let year_orths = get_year_orths paths in *)
(* let letter_orths = get_single_letter_orths paths in *)
let rules = preselect_dict orths lemmas mwe_dict [] in
(* print_endline ("ENIAM_MWE.select_rules 1 |rules|=" ^ string_of_int (Xlist.size rules)); *)
(* Xlist.iter rules (fun (is_mwe,match_list,lemma,cat,interp) -> print_endline lemma); *)
let rules = preselect_dict2 orths lemmas mwe_dict2 rules in
(* print_endline ("ENIAM_MWE.select_rules 2 |rules|=" ^ string_of_int (Xlist.size rules)); *)
let rules = add_ordnum_rules intnum_orths rules in
(* print_endline ("ENIAM_MWE.select_rules 3 |rules|=" ^ string_of_int (Xlist.size rules)); *)
let rules = add_quot_rule rules in
(* print_endline ("ENIAM_MWE.select_rules 4 |rules|=" ^ string_of_int (Xlist.size rules)); *)
(* let rules = add_building_number_rules year_orths letter_orths rules in *) (* FIXME !!!! *)
(* print_endline ("ENIAM_MWE.select_rules 5 |rules|=" ^ string_of_int (Xlist.size rules) ^ " |year_orths|=" ^ string_of_int (StringSet.size year_orths) ^ " |letter_orths|=" ^ string_of_int (StringSet.size letter_orths)); *)
rules
let rec check_interp sels = function
[],[] -> true
| s :: interp, ["_"] :: interp2 -> check_interp sels (interp,interp2)
| V s :: interp, l2 :: interp2 -> if Xlist.mem l2 s then check_interp sels (interp,interp2) else false
| S s :: interp, l2 :: interp2 ->
(try
let l = Xlist.assoc sels s in
let b = Xlist.fold l false (fun b s -> Xlist.mem l2 s || b) in
if b then check_interp sels (interp,interp2) else false
with Not_found -> check_interp sels (interp,interp2))
| G :: interp, l2 :: interp2 -> check_interp sels (interp,interp2)
| _ -> failwith "check_interp"
let rec get_sels sels = function
[],[] -> sels
| s :: interp, ["_"] :: interp2 -> get_sels sels (interp,interp2)
| V s :: interp, l2 :: interp2 -> get_sels sels (interp,interp2)
| S s :: interp, l2 :: interp2 ->
(try
let l = Xlist.assoc sels s in
let sels = List.remove_assoc s sels in
let l = Xlist.fold l [] (fun l s -> if Xlist.mem l2 s then s :: l else l) in
get_sels ((s,l) :: sels) (interp,interp2)
with Not_found -> get_sels ((s,l2) :: sels) (interp,interp2))
| G :: interp, l2 :: interp2 -> get_sels sels (interp,interp2)
| _ -> failwith "get_sels"
let rec match_path_rec map found (t:token_env) sels rev = function
[] -> (t :: rev, sels) :: found
| s :: l ->
let map2 = try IntMap.find map t.next with Not_found -> IntMap.empty in
let found2 = IntMap.fold map2 [] (fun found2 _ l ->
TokenEnvSet.fold l found2 (fun found2 new_t ->
match s,new_t.token with
O s, token -> if ENIAMtokens.get_orth token = s then (new_t,sels) :: found2 else found2
| L(s,cat,interp), Lemma(s2,cat2,interps2) ->
Xlist.fold interps2 found2 (fun found2 interp2 ->
if s=s2 && cat=cat2 && check_interp sels (interp,interp2) then
(new_t,get_sels sels (interp,interp2)) :: found2 else found2)
| D(s,cat), Dig(s2,cat2) -> if s=s2 && cat=cat2 then (new_t,sels) :: found2 else found2
| I s, Interp s2 -> if s=s2 then (new_t,sels) :: found2 else found2
(* | SL, SmallLetter _ -> (new_t,sels) :: found
| SL, CapLetter _ -> (new_t,sels) :: found *)
| _ -> found2)) in
Xlist.fold found2 found (fun found (new_t,sels) -> match_path_rec map found new_t sels (t :: rev) l)
let match_path map = function
[] -> failwith "match_path"
| s :: l ->
let found = IntMap.fold map [] (fun found i map2 ->
IntMap.fold map2 found (fun found j l ->
TokenEnvSet.fold l found (fun found t ->
match s,t.token with
O s, token -> if ENIAMtokens.get_orth token = s then (t,[]) :: found else found
| L(s,cat,interp), Lemma(s2,cat2,interps2) ->
Xlist.fold interps2 found (fun found interp2 ->
if s=s2 && cat=cat2 && check_interp [] (interp,interp2) then
(t,get_sels [] (interp,interp2)) :: found else found)
| D(s,cat), Dig(s2,cat2) -> if s=s2 && cat=cat2 then (t,[]) :: found else found
| I s, Interp s2 -> if s=s2 then (t,[]) :: found else found
(* | SL, SmallLetter _ -> (t,[]) :: found
| SL, CapLetter _ -> (t,[]) :: found *)
| _ -> found))) in
Xlist.fold found [] (fun found (t,sels) -> match_path_rec map found t sels [] l)
let concat_orths l =
let s = String.concat "" (Xlist.map l (fun t -> t.orth ^ (if t.beg+t.len=t.next then "" else " "))) in
let n = Xstring.size s in
if String.get s (n-1) = ' ' then String.sub s 0 (n-1) else s
let create_token is_mwe (matching:token_env list) sels lemma cat interp = (* FIXME: problem z nazwami własnymi *)
let l = List.rev matching in
let beg = (List.hd l).beg in
let t = List.hd matching in
let len = t.beg + t.len - beg in
{empty_token_env with
orth=concat_orths l;
beg=beg;
len=len;
next=t.next;
token=if cat = "Interp" then Interp lemma else
Lemma(lemma,cat,[Xlist.map interp (function
S s -> (try Xlist.assoc sels s with Not_found -> ["_"])
| V s -> Xstring.split "\\." s
| G -> ["_"])]);
weight=0.; (* FIXME: dodać wagi do konkretnych reguł i uwzględnić wagi maczowanych tokenów *)
attrs=(if is_mwe then [MWE ]else []) @ ENIAMtokens.merge_attrs l}
let add_token paths t =
let map = try IntMap.find paths t.beg with Not_found -> IntMap.empty in
let map = IntMap.add_inc map t.next (TokenEnvSet.singleton t) (fun set -> TokenEnvSet.add set t) in
IntMap.add paths t.beg map
let apply_rule paths (is_mwe,match_list,lemma,cat,interp) =
(* print_endline ("apply_rule: " ^ lemma); *)
let matchings_found = match_path paths match_list in
Xlist.fold matchings_found paths (fun paths (matching,sels) ->
try
let token = create_token is_mwe matching sels lemma cat interp in
add_token paths token
with Not_found -> paths)
let count_path_size paths =
IntMap.fold paths 0 (fun n _ map2 ->
IntMap.fold map2 n (fun n _ set ->
TokenEnvSet.size set + n))
let process (paths,last) =
(* print_endline ("ENIAM_MWE.process 1 |paths|=" ^ string_of_int (Xlist.size paths)); *)
let paths = Xlist.fold paths IntMap.empty add_token in
(* print_endline ("ENIAM_MWE.process 2 |paths|=" ^ string_of_int (count_path_size paths)); *)
let rules = select_rules paths !mwe_dict !mwe_dict2 in
(* print_endline ("ENIAM_MWE.process 3 |rules|=" ^ string_of_int (Xlist.size rules)); *)
let paths = Xlist.fold rules paths apply_rule in
(* print_endline ("ENIAM_MWE.process 4 |paths|=" ^ string_of_int (count_path_size paths)); *)
let rules = select_rules paths !mwe_dict !mwe_dict2 in
(* print_endline ("ENIAM_MWE.process 5 |rules|=" ^ string_of_int (Xlist.size rules)); *)
let paths = Xlist.fold rules paths apply_rule in
(* print_endline ("ENIAM_MWE.process 6 |paths|=" ^ string_of_int (count_path_size paths)); *)
let rules = select_rules paths !mwe_dict !mwe_dict2 in
(* print_endline ("ENIAM_MWE.process 7 |rules|=" ^ string_of_int (Xlist.size rules)); *)
let paths = Xlist.fold rules paths apply_rule in
(* print_endline "ENIAM_MWE.process 8"; *)
let rules = select_rules paths !mwe_dict !mwe_dict2 in
(* print_endline "ENIAM_MWE.process 9"; *)
let paths = Xlist.fold rules paths apply_rule in
(* print_endline "ENIAM_MWE.process 10"; *)
let paths = IntMap.fold paths [] (fun paths _ map ->
IntMap.fold map paths (fun paths _ l ->
TokenEnvSet.fold l paths (fun paths t ->
t :: paths))) in
(* print_endline "ENIAM_MWE.process 11"; *)
ENIAMpaths.sort (paths,last)