ENIAMsubsyntax.ml
(*
 * ENIAMsubsyntax: MWE, abbreviation and sentence detection for Polish
* Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
* Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
*
* This library is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*)
open ENIAMsubsyntaxTypes
open ENIAMtokenizerTypes
open Xstd
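
(* [load_lemma_frequencies filename map] loads a tab-separated file with lines
   of the form count<TAB>lemma<TAB>category (blank lines and lines starting
   with '#' are skipped) and stores log10(count + 1) under the key
   "lemma<TAB>category". *)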
let load_lemma_frequencies filename map =
  let l = Str.split_delim (Str.regexp "\n") (File.load_file filename) in
  Xlist.fold l map (fun map line ->
    if String.length line = 0 then map else
    if String.get line 0 = '#' then map else
    match Str.split_delim (Str.regexp "\t") line with
      [count; lemma; cat] -> StringMap.add map (lemma ^ "\t" ^ cat) (log10 (float_of_string count +. 1.))
    | _ -> failwith ("load_lemma_frequencies: " ^ line))

let lemma_frequencies = File.catch_no_file (load_lemma_frequencies lemma_frequencies_filename) StringMap.empty
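
(* [modify_weights] adjusts each token's weight: attributes recorded during
   lemmatization (e.g. "token not found", "lemma not validated") incur fixed
   penalties, while lemmas found in [lemma_frequencies] receive their
   log-frequency as a bonus. *)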
let modify_weights paths =
  List.rev (Xlist.fold paths [] (fun paths t ->
    let w = Xlist.fold t.attrs t.weight (fun w -> function
        "token not found" -> w -. 25.
      | "lemma not validated" -> w -. 20.
      | "notvalidated proper" -> w -. 1.
      | "lemmatized as lowercase" -> w -. 0.1
      | _ -> w) in
    let w = match t.token with
        Lemma(lemma,cat,_) -> (try w +. StringMap.find lemma_frequencies (lemma ^ "\t" ^ cat) with Not_found -> w)
      | Proper(lemma,cat,_,_) -> (try w +. StringMap.find lemma_frequencies (lemma ^ "\t" ^ cat) with Not_found -> w)
      | _ -> w in
    {t with weight = w} :: paths))
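
(* [translate_digs] rewrites numeric tokens produced by the tokenizer ([Dig],
   [RomanDig] and [Compound] dates, hours and intervals) into [Lemma] or
   [Proper] tokens, attaching a semantic category such as "rok", "godzina",
   "data" or "interwał". Unhandled numeric tokens are reported via [failwith]. *)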
let translate_digs paths =
  Xlist.map paths (fun t ->
    match t.token with
      Dig(lemma,"dig") -> t
    | Dig(lemma,"intnum") -> {t with token=Lemma(lemma,"intnum",[[]])}
    | Dig(lemma,"realnum") -> {t with token=Lemma(lemma,"realnum",[[]])}
    | Dig(lemma,"year") -> {t with token=Proper(lemma,"year",[[]],["rok"])}
    | Dig(lemma,"month") -> t (*{t with token=Proper(lemma,"month",[[]],["miesiąc"])}*)
    | Dig(lemma,"hour") -> {t with token=Proper(lemma,"hour",[[]],["godzina"])}
    | Dig(lemma,"day") -> {t with token=Proper(lemma,"day",[[]],["dzień"])}
    | Dig(lemma,"minute") -> t (*{t with token=Proper(lemma,"minute",[[]],["minuta"])}*)
    | Dig(lemma,"2dig") -> t
    | Dig(lemma,"3dig") -> t
    | Dig(lemma,"pref3dig") -> t
    | RomanDig(lemma,"roman") -> {t with token=Lemma(lemma,"roman",[[]]); attrs=t.attrs}
    | RomanDig(lemma,"month") -> t (*{t with token=Proper(lemma,"symbol",[[]],["month"]); attrs="roman" :: t.attrs}*)
    | Dig(lemma,"ordnum") -> {t with token=Lemma(lemma,"ordnum",[[]])}
    | Compound("date",[Dig(d,"day");Dig(m,"month");Dig(y,"year")]) -> {t with token=Proper(d ^ "." ^ m ^ "." ^ y,"date",[[]],["data"])}
    | Compound("date",[Dig(d,"day");RomanDig(m,"month");Dig(y,"year")]) -> {t with token=Proper(d ^ "." ^ m ^ "." ^ y,"date",[[]],["data"])}
    | Compound("date",[Dig(d,"day");Dig(m,"month");Dig(y,"2dig")]) -> {t with token=Proper(d ^ "." ^ m ^ "." ^ y,"date",[[]],["data"])}
    | Compound("date",[Dig(d,"day");RomanDig(m,"month");Dig(y,"2dig")]) -> {t with token=Proper(d ^ "." ^ m ^ "." ^ y,"date",[[]],["data"])}
    | Compound("day-month",[Dig(d,"day");Dig(m,"month")]) -> {t with token=Proper(d ^ "." ^ m,"day-month",[[]],["data"])}
    | Compound("hour-minute",[Dig(h,"hour");Dig(m,"minute")]) -> {t with token=Proper(h ^ ":" ^ m,"hour-minute",[[]],["godzina"])}
    | Compound("match-result",[Dig(x,"intnum");Dig(y,"intnum")]) -> {t with token=Proper(x ^ ":" ^ y,"match-result",[[]],["rezultat"])}
    | Compound("intnum-interval",[Dig(x,"intnum");Dig(y,"intnum")]) -> {t with token=Lemma(x ^ "-" ^ y,"intnum-interval",[[]])}
    | Compound("roman-interval",[RomanDig(x,"roman");RomanDig(y,"roman")]) -> {t with token=Lemma(x ^ "-" ^ y,"roman-interval",[[]]); attrs=t.attrs}
    | Compound("realnum-interval",[Dig(x,"realnum");Dig(y,"realnum")]) -> {t with token=Lemma(x ^ "-" ^ y,"realnum-interval",[[]])}
    | Compound("realnum-interval",[Dig(x,"intnum");Dig(y,"realnum")]) -> {t with token=Lemma(x ^ "-" ^ y,"realnum-interval",[[]])}
    | Compound("realnum-interval",[Dig(x,"realnum");Dig(y,"intnum")]) -> {t with token=Lemma(x ^ "-" ^ y,"realnum-interval",[[]])}
    | Compound("date-interval",[Compound("date",[Dig(d1,"day");Dig(m1,"month");Dig(y1,"year")]);
                                Compound("date",[Dig(d2,"day");Dig(m2,"month");Dig(y2,"year")])]) ->
        {t with token=Proper(d1 ^ "." ^ m1 ^ "." ^ y1 ^ "-" ^ d2 ^ "." ^ m2 ^ "." ^ y2,"date-interval",[[]],["interwał"])}
    | Compound("day-month-interval",[Compound("day-month",[Dig(d1,"day");Dig(m1,"month")]);
                                     Compound("day-month",[Dig(d2,"day");Dig(m2,"month")])]) ->
        {t with token=Proper(d1 ^ "." ^ m1 ^ "-" ^ d2 ^ "." ^ m2,"day-month-interval",[[]],["interwał"])}
    | Compound("day-interval",[Dig(d1,"day");Dig(d2,"day")]) -> {t with token=Proper(d1 ^ "-" ^ d2,"day-interval",[[]],["interwał"])}
    | Compound("month-interval",[Dig(m1,"month");Dig(m2,"month")]) -> {t with token=Proper(m1 ^ "-" ^ m2,"month-interval",[[]],["interwał"])}
    | Compound("month-interval",[RomanDig(m1,"month");RomanDig(m2,"month")]) -> {t with token=Proper(m1 ^ "-" ^ m2,"month-interval",[[]],["interwał"]); attrs="roman" :: t.attrs}
    | Compound("year-interval",[Dig(y1,"year");Dig(y2,"year")]) -> {t with token=Proper(y1 ^ "-" ^ y2,"year-interval",[[]],["interwał"])}
    | Compound("year-interval",[Dig(y1,"year");Dig(y2,"2dig")]) -> {t with token=Proper(y1 ^ "-" ^ y2,"year-interval",[[]],["interwał"])}
    | Compound("hour-minute-interval",[Compound("hour-minute",[Dig(h1,"hour");Dig(m1,"minute")]);
                                       Compound("hour-minute",[Dig(h2,"hour");Dig(m2,"minute")])]) ->
        {t with token=Proper(h1 ^ ":" ^ m1 ^ "-" ^ h2 ^ ":" ^ m2,"hour-minute-interval",[[]],["interwał"])}
    | Compound("hour-interval",[Dig(h1,"hour");Dig(h2,"hour")]) -> {t with token=Proper(h1 ^ "-" ^ h2,"hour-interval",[[]],["interwał"])}
    | Compound("minute-interval",[Dig(m1,"minute");Dig(m2,"minute")]) -> t (*{t with token=Proper(m1 ^ "-" ^ m2,"minute-interval",[[]],["interwał"])}*)
    | Dig(lemma,"url") -> {t with token=Proper(lemma,"url",[[]],["url"])}
    | Dig(lemma,"email") -> {t with token=Proper(lemma,"email",[[]],["email"])}
    | Dig(lemma,"html-tag") -> {t with token=Lemma(lemma,"html-tag",[[]])}
    | Dig(cat,_) -> failwith ("translate_digs: Dig " ^ cat)
    | RomanDig(cat,_) -> failwith ("translate_digs: RomanDig " ^ cat)
    | Compound _ as t -> failwith ("translate_digs: " ^ ENIAMtokens.string_of_token t)
    | _ -> t)
(**********************************************************************************)
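
(* The interpretation of a token is a list of alternatives, each alternative a
   list of tag positions, each position a list of values. The modules and the
   [tree] type below arrange alternatives into a trie keyed by tag positions,
   which lets [combine_interp] merge alternatives that differ in a single
   position into one alternative with a multi-valued position. *)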
module OrderedStringList = struct
  type t = string list
  let compare x y = compare (Xlist.sort x compare) (Xlist.sort y compare)
end

module OrderedStringListList = struct
  type t = string list list
  let compare x y = compare (Xlist.sort x compare) (Xlist.sort y compare)
end

module StringListMap = Xmap.Make(OrderedStringList)
module StringListListMap = Xmap.Make(OrderedStringListList)
module StringListListSet = Xset.Make(OrderedStringListList)

type tree = T of tree StringListMap.t | S of StringSet.t

let single_tags = function
    [_] :: _ -> true
  | _ -> false

let rec make_tree interp =
  if single_tags interp then S (StringSet.of_list (List.flatten (List.flatten interp))) else
  let map = Xlist.fold interp StringListMap.empty (fun map tags ->
    StringListMap.add_inc map (List.hd tags) [List.tl tags] (fun l -> (List.tl tags) :: l)) in
  T(StringListMap.map map make_tree)

let is_s_tree map =
  StringListListMap.fold map false (fun b _ -> function
      S _ -> true
    | T _ -> b)

let rec fold_tree_rec rev s f = function
    S set -> f s (List.rev rev) set
  | T map -> StringListMap.fold map s (fun s tag tree ->
      fold_tree_rec (tag :: rev) s f tree)

let fold_tree tree s f = fold_tree_rec [] s f tree

let rec combine_interps_rec map =
  if is_s_tree map then
    StringListListMap.fold map [] (fun interp tail_tags -> function
        S tag -> ((Xlist.sort (StringSet.to_list tag) compare) :: tail_tags) :: interp
      | _ -> failwith "combine_interps_rec")
  else
    let map = StringListListMap.fold map StringListListMap.empty (fun map tail_tags tree ->
      fold_tree tree map (fun map head_tags tag ->
        StringListListMap.add_inc map ((Xlist.sort (StringSet.to_list tag) compare) :: tail_tags) [head_tags] (fun l -> head_tags :: l))) in
    combine_interps_rec (StringListListMap.map map make_tree)

let combine_interp interp =
  let map = StringListListMap.add StringListListMap.empty [] (make_tree interp) in
  combine_interps_rec map
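
(* [combine_pos] lists the parts of speech whose interpretations are compacted
   with [combine_interp]; for the remaining parts of speech [combine_interps]
   only removes duplicate interpretations. "ppron12" interpretations with four
   tag positions are first padded with a fifth "_" position. For illustration,
   [[["sg"];["nom"];["m1"]]; [["sg"];["gen"];["m1"]]] combines into
   [[["sg"];["gen";"nom"];["m1"]]]. *)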
let combine_pos = StringSet.of_list ["subst"; "depr"; "ppron12"; "ppron3"; "siebie"; "adj"; "num"; "ger"; "praet"; "fin"; "impt"; "imps"; "pcon"; "ppas"; "pact";
  "inf"; "bedzie"; "aglt"; "winien"; "pant"; "prep"]

let combine_interps paths =
  List.rev (Xlist.rev_map paths (fun t ->
    match t.token with
      Lemma(lemma,pos,interp) ->
        (* Printf.printf "%s %s %s\n" lemma pos (PreTokenizer.string_of_interps interp); *)
        let interp =
          if pos = "ppron12" then Xlist.map interp (fun tags -> if Xlist.size tags = 4 then tags @ [["_"]] else tags)
          else interp in
        let interp =
          if StringSet.mem combine_pos pos then combine_interp interp
          else StringListListSet.to_list (StringListListSet.of_list interp) in
        {t with token=Lemma(lemma,pos,interp)}
    | Proper(lemma,pos,interp,cat) ->
        (* Printf.printf "%s %s %s\n" lemma pos (PreTokenizer.string_of_interps interp); *)
        let interp =
          if pos = "ppron12" then Xlist.map interp (fun tags -> if Xlist.size tags = 4 then tags @ [["_"]] else tags)
          else interp in
        let interp =
          if StringSet.mem combine_pos pos then combine_interp interp
          else StringListListSet.to_list (StringListListSet.of_list interp) in
        {t with token=Proper(lemma,pos,interp,cat)}
    | _ -> t))
(**********************************************************************************)
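
(* [select_tokens] keeps only the token kinds consumed by later processing
   stages ([Interp], [Other] and non-"brev" [Lemma]/[Proper] tokens);
   abbreviation components (pos "brev") and all remaining token kinds are
   dropped from the graph. *)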
let select_tokens paths =
  List.rev (Xlist.fold paths [] (fun paths t ->
    match t.token with
      (* RomanDig(v,cat) -> {t with token=Lemma(v,cat,[[]])} :: paths
    | Interp orth -> {t with token=Lemma(orth,"interp",[[]])} :: paths
    | Dig(value,cat) -> {t with token=Lemma(value,cat,[[]])} :: paths
    | Other2 orth -> {t with token=Lemma(orth,"unk",[[]])} :: paths
    | Lemma(lemma,cat,interp) -> t :: paths
    | Proper _ -> failwith "select_tokens"
    | Compound _ -> t :: paths *)
      (* RomanDig(v,cat) -> t :: paths *)
    | Interp orth -> t :: paths
      (* | Dig(value,cat) -> t :: paths *)
    | Other orth -> t :: paths
    | Lemma(lemma,pos,interp) -> if pos = "brev" then paths else t :: paths
    | Proper(lemma,pos,interp,cat) -> if pos = "brev" then paths else t :: paths
      (* | Compound _ -> t :: paths *)
    | _ -> paths))
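
(* Proper-name lexicon: each tab-separated line maps a lemma to a
   '|'-separated list of semantic types; entries from the three configured
   files are merged into a single [StringMap] (missing files are ignored). *)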
let load_proper_name proper = function
    [lemma; types] ->
      let types = Str.split (Str.regexp "|") types in
      StringMap.add_inc proper lemma types (fun types2 -> types @ types2)
  | l -> failwith ("proper_names: " ^ String.concat " " l)

let load_proper_names filename proper =
  File.fold_tab filename proper load_proper_name

let proper_names =
  let proper = File.catch_no_file (load_proper_names proper_names_filename) StringMap.empty in
  let proper = File.catch_no_file (load_proper_names proper_names_filename2) proper in
  let proper = File.catch_no_file (load_proper_names proper_names_filename3) proper in
  proper

let remove l s =
  Xlist.fold l [] (fun l t ->
    if s = t then l else t :: l)
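
(* [find_proper_names] turns a [Lemma] token into a [Proper] token when its
   lemma occurs in [proper_names] (removing the "notvalidated proper"
   attribute), or with an empty type list when lemmatization flagged it as
   "notvalidated proper". *)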
let find_proper_names t =
  match t.token with
    Lemma(lemma,pos,interp) ->
      if StringMap.mem proper_names lemma then
        {t with token=Proper(lemma,pos,interp,StringMap.find proper_names lemma);
                attrs=remove t.attrs "notvalidated proper"} else
      if Xlist.mem t.attrs "notvalidated proper" then
        {t with token=Proper(lemma,pos,interp,[])}
      else t
  | _ -> t
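
(* [parse] runs the subsyntax pipeline for a single paragraph: tokenization,
   conversion into a token graph, lemmatization, MWE recognition, proper-name
   detection, weight adjustment, numeral/date translation, interpretation
   compaction and token selection. The commented-out stages (sense and valence
   assignment) are handled by later layers of the system. *)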
let parse query =
  let l = ENIAMtokenizer.parse query in
  (* print_endline "a6"; *)
  let paths = ENIAMpaths.translate_into_paths l in
  (* print_endline "a7"; *)
  let paths = ENIAMpaths.lemmatize paths in
  (* print_endline "a8"; *)
  let paths,_ = ENIAM_MWE.process paths in
  (* print_endline "a12"; *)
  (* let paths = find_proper_names paths in *)
  let paths = List.rev (Xlist.rev_map paths find_proper_names) in
  (* print_endline "a13"; *)
  let paths = modify_weights paths in
  let paths = translate_digs paths in
  (* let paths = assign_senses paths in
  (* print_endline "a14"; *)
     let paths = assign_valence paths in *)
  (* print_endline "a15"; *)
  let paths = combine_interps paths in
  (* print_endline "a16"; *)
  (* let paths = disambiguate_senses paths in
     let paths = assign_simplified_valence paths in
     let paths = PreSemantics.assign_semantics paths in *)
  (* print_endline "a16"; *)
  let paths = select_tokens paths in
  (* print_endline "a17"; *)
  (* print_endline "a18"; *)
  paths (*, next_id*)
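
(* [parse_text_tokens tokens query] splits the query into non-empty paragraphs
   (on newlines and carriage returns), parses each of them and splits it into
   sentences; paragraph identifiers are numbered from 1 unless there is only
   one paragraph, in which case the prefix is empty. *)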
let parse_text_tokens tokens query =
  (* print_endline ("parse_text_tokens: " ^ query); *)
  let paragraphs = Xstring.split "\n\\|\r" query in
  let paragraphs = List.rev (Xlist.fold paragraphs [] (fun l -> function "" -> l | s -> s :: l)) in
  let n = if Xlist.size paragraphs = 1 then 0 else 1 in
  let paragraphs,_ = Xlist.fold paragraphs ([],n) (fun (paragraphs,n) paragraph ->
    let paths = parse paragraph in
    (* print_endline "parse_text 1"; *)
    let pid = if n = 0 then "" else string_of_int n ^ "_" in
    let sentences = ENIAMsentences.split_into_sentences pid paragraph tokens paths in
    (AltParagraph[Raw,RawParagraph paragraph; Struct,StructParagraph sentences]) :: paragraphs, n+1) in
  AltText[Raw,RawText query; Struct,StructText(List.rev paragraphs)], tokens
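
(* [parse_text] is the top-level entry point: it allocates a fresh token array
   and delegates to [parse_text_tokens]. *)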
let parse_text query =
  (* print_endline ("parse_text: " ^ query); *)
  let tokens = ExtArray.make 100 empty_token_env in
  let _ = ExtArray.add tokens empty_token_env in (* id=0 is reserved for pro; FIXME: is this still valid? *)
  parse_text_tokens tokens query
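
(* Example usage, a minimal sketch: the sample sentence is hypothetical and the
   call assumes the resource files configured in ENIAMsubsyntaxTypes are
   available (missing ones are tolerated via [File.catch_no_file]).

     let text, tokens = parse_text "Ala ma kota." in
     (* [text] pairs the raw query with its paragraph/sentence structure;
        [tokens] is the ExtArray of token environments that the structure
        indexes into. *)
*)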