ENIAM_MWE.ml
12.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
(*
* ENIAMsubsyntax: MWE, abbreviation and sentence detecion for Polish
* Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
* Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
*
* This library is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*)
open Xstd
open ENIAMsubsyntaxTypes
open ENIAMtokenizerTypes
let load_dict dict filename =
File.fold_tab filename dict (fun dict -> function
[orth; lemma; interp] ->
let s = List.hd (Str.split_delim (Str.regexp " ") orth) in
StringMap.add_inc dict s [orth,lemma,interp] (fun l -> (orth,lemma,interp) :: l)
| l -> failwith ("load_mwe_dict '" ^ String.concat "\t" l ^ "'"))
let mwe_dict =
let dict = load_dict StringMap.empty brev_filename in
let dict = try load_dict dict fixed_filename with _ -> (prerr_endline ("ENIAMsubsyntax file " ^ fixed_filename ^ " not found"); dict) in
(* let dict = load_dict dict complete_entries_filename in*)
let dict = load_dict dict mwe_filename in
dict
let preselect_dict orths dict =
StringSet.fold orths [] (fun rules orth ->
try
let l = StringMap.find dict orth in
Xlist.fold l rules (fun rules (orth,lemma,interp) ->
(* print_endline ("preselect_dict: " ^ orth); *)
let match_list = Str.split (Str.regexp " ") orth in
let b = Xlist.fold match_list true (fun b s ->
(* if not (StringSet.mem orths s) then print_endline s; *)
StringSet.mem orths s && b) in
if b then (match_list,lemma,interp) :: rules else rules)
with Not_found -> rules)
(*
type matching = {
prefix: tokens list;
matched: token_record list;
suffix: tokens list;
pattern: pat list;
command: token_record list -> token;
last: int
}
let rec find_abr_pattern_tail matchings found = function
[] -> found
| token :: l ->
let matchings,found = Xlist.fold matchings ([],found) (fun (matchings,found) matching ->
match matching.pattern with
[pat] ->
let matchings = if token.beg <= matching.last then matching :: matchings else matchings in
if PrePatterns.match_token (pat,token.token) && token.beg = matching.last then
matchings, {matching with matched = token :: matching.matched; last=token.next; pattern=[]} :: found else
matchings, found
| pat :: pattern ->
let matchings = if token.beg <= matching.last then matching :: matchings else matchings in
if PrePatterns.match_token (pat,token.token) && token.beg = matching.last then
{matching with matched = token :: matching.matched; last=token.next; pattern=pattern} :: matchings, found else
matchings, found
| [] -> matchings, matching :: found) in
if matchings = [] then found else find_abr_pattern_tail matchings found l
let rec find_abr_pattern all_matchings found = function
token :: l ->
let matchings = Xlist.fold all_matchings [] (fun matchings matching ->
match matching.pattern with
pat :: pattern ->
(if PrePatterns.match_token (pat,token.token) then
[{matching with matched = token :: matching.matched; last=token.next; pattern=pattern}] else []) @ matchings
| _ -> failwith "find_abr_pattern: ni") in
let found = if matchings = [] then found else find_abr_pattern_tail matchings found l in
find_abr_pattern all_matchings found l
| [] -> found
let rec make_abr_orth = function
[] -> ""
| [t] -> t.orth
| t :: l -> if t.beg + t.len = t.next then t.orth ^ (make_abr_orth l) else t.orth ^ " " ^ (make_abr_orth l)
let find_abr_patterns patterns tokens =
let found = find_abr_pattern (Xlist.map patterns (fun pattern ->
{prefix=[]; matched=[]; suffix=[]; pattern=pattern; command=(fun _ -> Symbol ""); last=0})) [] tokens in
Xlist.rev_map found (fun matching ->
let t1 = List.hd (List.rev matching.matched) in
let t2 = List.hd matching.matched in
t1.beg,
t2.beg + t2.len - t1.beg,
t2.next,
make_abr_orth (List.rev matching.matched))
let split_interp line gloss interp =
if interp = "xxx" then [gloss, "xxx"] else
Xlist.map (Str.split (Str.regexp " ") interp) (fun s ->
match Str.split (Str.regexp "|") s with
[lemma;interp] -> lemma, interp
| _ -> failwith ("bad brev entry: " ^ line))
let load_brev_dict () =
let lines = File.load_lines "data/brev_20151215.tab" in
List.rev (Xlist.rev_map lines (fun line ->
match Str.split_delim (Str.regexp "\t") line with
[_; orth; gloss; interp; _] -> Str.split (Str.regexp " ") orth, split_interp line gloss interp
| [_; orth; gloss; interp] -> Str.split (Str.regexp " ") orth, split_interp line gloss interp
| _ -> failwith ("load_brev_dict: " ^ line)))
let parse_lemma lemma =
if lemma = ":" then lemma,"" else
match Str.split (Str.regexp ":") lemma with
[x] -> x,""
| [x;y] -> x,y
| _ -> failwith ("parse_lemma: " ^ lemma)
let make_orths orth beg len lexeme_postags_list =
let n = Xlist.size lexeme_postags_list in
let orth_list =
if n = 1 then [orth,beg,len] else
List.rev (Int.fold 1 n [] (fun l i ->
(orth ^ "_" ^ string_of_int i,
(if i=1 then beg else beg+len-n+i-1),
if i=1 then len-n+1 else 1) :: l)) in
List.rev (Xlist.fold (List.combine orth_list lexeme_postags_list) [] (fun orth_list ((orth,beg,len),(lemma,postags)) ->
(orth, fst (parse_lemma lemma), ENIAMtokens.parse_postags postags, beg, len) :: orth_list))
let brev_dict = load_brev_dict ()
(* FIXME: trzeba zmienić reprezentację skrótów nazw własnych: przenieść do mwe,
Gdy skrót jest częścią nazwy własnej powinien być dalej przetwarzalny *)
let process_brev paths (*tokens*) = paths
(* let paths = Xlist.fold brev_dict paths (fun paths (pattern,lexeme_postags_list) ->
let matchings_found = find_abr_patterns [Xlist.map pattern (fun pat -> O pat)] tokens in
Xlist.fold matchings_found paths (fun paths (beg,len,next,orth) ->
let orths = make_orths orth beg len lexeme_postags_list in
ENIAMpaths.add_path paths beg next orths)) in
paths*)
let rec preselect_mwe_dict_token set = function
SmallLetter orth -> StringSet.add set orth
| CapLetter(orth,lc) -> StringSet.add set orth
| AllSmall orth -> StringSet.add set orth
| AllCap(orth,lc,lc2) -> StringSet.add set orth
| FirstCap(orth,lc,_,_) -> StringSet.add set orth
| SomeCap orth -> StringSet.add set orth
| Symbol orth -> StringSet.add set orth
| Dig(v,"dig") -> StringSet.add set v
| Other2 orth -> StringSet.add set orth
| _ -> set
let rec preselect_mwe_dict_tokens set = function
Token t -> preselect_mwe_dict_token set t.token
| Seq l -> Xlist.fold l set preselect_mwe_dict_tokens
| Variant l -> Xlist.fold l set preselect_mwe_dict_tokens
let preselect_mwe_dict mwe_dict tokens =
let set = Xlist.fold tokens StringSet.empty preselect_mwe_dict_tokens in
let set = StringSet.fold set StringSet.empty (fun set orth ->
try
let l = StringMap.find mwe_dict orth in
Xlist.fold l set StringSet.add
with Not_found -> set) in
(* StringSet.iter set print_endline; *)
StringSet.fold set [] (fun l s ->
match Str.split_delim (Str.regexp "\t") s with
[lemma; interp; sense] ->
(match Str.split_delim (Str.regexp ":") interp with
orths :: tags -> (Str.split (Str.regexp " ") orths, lemma, String.concat ":" tags, sense) :: l
| _ -> failwith "preselect_mwe_dict")
| _ -> failwith "preselect_mwe_dict")
let simplify_lemma lemma =
match Str.split (Str.regexp "-") lemma with
[x;"1"] -> x
| [x;"2"] -> x
| [x;"3"] -> x
| [x;"4"] -> x
| [x;"5"] -> x
| _ -> lemma
let mwe_dict = load_mwe_dict ()
let process_mwe paths (*tokens*) = paths
(* let mwe_dict = preselect_mwe_dict mwe_dict tokens in
let paths = Xlist.fold mwe_dict paths (fun paths (pattern,lexeme,interp,sense) ->
let matchings_found = find_abr_patterns [Xlist.map pattern (fun pat -> O pat)] tokens in
Xlist.fold matchings_found paths (fun paths (beg,len,next,orth) ->
let orths = make_orths orth beg len [simplify_lemma lexeme,interp] in
ENIAMpaths.add_path paths beg next orths)) in
paths*)
*)
let get_orths paths =
IntMap.fold paths StringSet.empty (fun orths _ map ->
IntMap.fold map orths (fun orths _ l ->
Xlist.fold l orths (fun orths t ->
StringSet.add orths (ENIAMtokens.get_orth t.token))))
let get_intnum_orths paths =
IntMap.fold paths StringMap.empty (fun orths _ map ->
IntMap.fold map orths (fun orths _ l ->
Xlist.fold l orths (fun orths t ->
match t.token with
Dig(lemma,"intnum") -> StringMap.add_inc orths (ENIAMtokens.get_orth t.token) (StringSet.singleton lemma) (fun set -> StringSet.add set lemma)
| _ -> orths)))
let rec match_path_rec map found (t:token_env) rev = function
[] -> (t :: rev) :: found
| s :: l ->
let map2 = try IntMap.find map t.next with Not_found -> IntMap.empty in
let found2 = IntMap.fold map2 [] (fun found2 _ l ->
Xlist.fold l found2 (fun found2 new_t ->
if ENIAMtokens.get_orth new_t.token = s then new_t :: found2 else found2)) in
Xlist.fold found2 found (fun found new_t -> match_path_rec map found new_t (t :: rev) l)
let match_path map = function
[] -> failwith "match_path"
| s :: l ->
let found = IntMap.fold map [] (fun found i map2 ->
IntMap.fold map2 found (fun found j l ->
Xlist.fold l found (fun found t ->
if ENIAMtokens.get_orth t.token = s then t :: found else found))) in
Xlist.fold found [] (fun found t -> match_path_rec map found t [] l)
let concat_orths l =
let s = String.concat "" (Xlist.map l (fun t -> t.orth ^ (if t.beg+t.len=t.next then "" else " "))) in
let n = Xstring.size s in
if String.get s (n-1) = ' ' then String.sub s 0 (n-1) else s
let create_token (matching:token_env list) lemma interp = (* FIXME: problem z nazwami własnymi *)
let l = List.rev matching in
let beg = (List.hd l).beg in
let t = List.hd matching in
let len = t.beg + t.len - beg in
{empty_token_env with
orth=concat_orths l;
beg=beg;
len=len;
next=t.next;
token=ENIAMtokens.make_lemma (lemma,interp);
weight=0.; (* FIXME: dodać wagi do konkretnych reguł i uwzględnić wagi maczowanych tokenów *)
attrs=ENIAMtokens.merge_attrs l}
let add_token paths t =
let map = try IntMap.find paths t.beg with Not_found -> IntMap.empty in
let map = IntMap.add_inc map t.next [t] (fun l -> t :: l) in
IntMap.add paths t.beg map
let apply_rule paths (match_list,lemma,interp) =
(* print_endline ("apply_rule: " ^ lemma); *)
let matchings_found = match_path paths match_list in
Xlist.fold matchings_found paths (fun paths matching ->
try
let token = create_token matching lemma interp in
add_token paths token
with Not_found -> paths)
(* FIXME: reguły dla ordnum powinny maczować część mowy a nie tylko orth *)
let add_ordnum_rules rules paths =
let orths = get_intnum_orths paths in
StringMap.fold orths rules (fun rules orth lemmas ->
StringSet.fold lemmas rules (fun rules lemma ->
(* Printf.printf "%s %s\n%!" orth lemma; *)
([orth;"."],lemma,"ordnum") :: rules))
let process (paths,last) =
let paths = Xlist.fold paths IntMap.empty add_token in
let orths = get_orths paths in
let rules = preselect_dict orths mwe_dict in
let rules = add_ordnum_rules rules paths in
let paths = Xlist.fold rules paths apply_rule in
let paths = IntMap.fold paths [] (fun paths _ map ->
IntMap.fold map paths (fun paths _ l ->
Xlist.fold l paths (fun paths t ->
t :: paths))) in
ENIAMpaths.sort (paths,last)