ENIAMsubsyntax.ml
12.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
(*
* ENIAMsubsyntax: MWE, abbreviation and sentence detecion for Polish
* Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
* Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
*
* This library is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*)
open ENIAMtokenizerTypes
open ENIAMsubsyntaxTypes
open Xstd
let load_lemma_frequencies filename =
let l = Str.split_delim (Str.regexp "\n") (File.load_file filename) in
Xlist.fold l StringMap.empty (fun map line ->
if String.length line = 0 then map else
if String.get line 0 = '#' then map else
match Str.split_delim (Str.regexp "\t") line with
[count; lemma; cat] -> StringMap.add map (lemma ^ "\t" ^ cat) (log10 (float_of_string count +. 1.))
| _ -> failwith ("load_lemma_frequencies: " ^ line))
let lemma_frequencies = load_lemma_frequencies lemma_frequencies_filename
let modify_weights paths =
List.rev (Xlist.fold paths [] (fun paths t ->
let w = Xlist.fold t.attrs t.weight (fun w -> function
"token not found" -> w -. 25.
| "lemma not validated"-> w -. 20.
| "notvalidated proper" -> w -. 1.
| "lemmatized as lowercase" -> w -. 0.1
| _ -> w) in
let w = match t.token with
Lemma(lemma,cat,_) -> (try w +. StringMap.find lemma_frequencies (lemma ^ "\t" ^ cat) with Not_found -> w)
| Proper(lemma,cat,_,_) -> (try w +. StringMap.find lemma_frequencies (lemma ^ "\t" ^ cat) with Not_found -> w)
| _ -> w in
{t with weight = w} :: paths))
let translate_digs paths =
Xlist.map paths (fun t ->
match t.token with
Dig(lemma,"dig") -> t
| Dig(lemma,"intnum") -> {t with token=Lemma(lemma,"intnum",[[]])}
| Dig(lemma,"realnum") -> {t with token=Lemma(lemma,"realnum",[[]])}
| Dig(lemma,"year") -> {t with token=Proper(lemma,"year",[[]],["rok"])}
| Dig(lemma,"month") -> t (*{t with token=Proper(lemma,"month",[[]],["miesiąc"])}*)
| Dig(lemma,"hour") -> {t with token=Proper(lemma,"hour",[[]],["godzina"])}
| Dig(lemma,"day") -> {t with token=Proper(lemma,"day",[[]],["dzień"])}
| Dig(lemma,"minute") -> t (*{t with token=Proper(lemma,"minute",[[]],["minuta"])}*)
| Dig(lemma,"2dig") -> t
| Dig(lemma,"3dig") -> t
| Dig(lemma,"pref3dig") -> t
| RomanDig(lemma,"roman") -> {t with token=Lemma(lemma,"roman",[[]]); attrs=t.attrs}
| RomanDig(lemma,"month") -> t (*{t with token=Proper(lemma,"symbol",[[]],["month"]); attrs="roman" :: t.attrs}*)
| Dig(lemma,"ordnum") -> {t with token=Lemma(lemma,"ordnum",[[]])}
| Compound("date",[Dig(d,"day");Dig(m,"month");Dig(y,"year")]) -> {t with token=Proper(d ^ "." ^ m ^ "." ^ y,"date",[[]],["data"])}
| Compound("date",[Dig(d,"day");RomanDig(m,"month");Dig(y,"year")]) -> {t with token=Proper(d ^ "." ^ m ^ "." ^ y,"date",[[]],["data"])}
| Compound("date",[Dig(d,"day");Dig(m,"month");Dig(y,"2dig")]) -> {t with token=Proper(d ^ "." ^ m ^ "." ^ y,"date",[[]],["data"])}
| Compound("date",[Dig(d,"day");RomanDig(m,"month");Dig(y,"2dig")]) -> {t with token=Proper(d ^ "." ^ m ^ "." ^ y,"date",[[]],["data"])}
| Compound("day-month",[Dig(d,"day");Dig(m,"month")]) -> {t with token=Proper(d ^ "." ^ m,"day-month",[[]],["data"])}
| Compound("hour-minute",[Dig(h,"hour");Dig(m,"minute")]) -> {t with token=Proper(h ^ ":" ^ m,"hour-minute",[[]],["godzina"])}
| Compound("match-result",[Dig(x,"intnum");Dig(y,"intnum")]) -> {t with token=Proper(x ^ ":" ^ y,"match-result",[[]],["rezultat"])}
| Compound("intnum-interval",[Dig(x,"intnum");Dig(y,"intnum")]) -> {t with token=Lemma(x ^ "-" ^ y,"intnum-interval",[[]])}
| Compound("roman-interval",[RomanDig(x,"roman");RomanDig(y,"roman")]) -> {t with token=Lemma(x ^ "-" ^ y,"roman-interval",[[]]); attrs=t.attrs}
| Compound("realnum-interval",[Dig(x,"realnum");Dig(y,"realnum")]) -> {t with token=Lemma(x ^ "-" ^ y,"realnum-interval",[[]])}
| Compound("realnum-interval",[Dig(x,"intnum");Dig(y,"realnum")]) -> {t with token=Lemma(x ^ "-" ^ y,"realnum-interval",[[]])}
| Compound("realnum-interval",[Dig(x,"realnum");Dig(y,"intnum")]) -> {t with token=Lemma(x ^ "-" ^ y,"realnum-interval",[[]])}
| Compound("date-interval",[Compound("date",[Dig(d1,"day");Dig(m1,"month");Dig(y1,"year")]);
Compound("date",[Dig(d2,"day");Dig(m2,"month");Dig(y2,"year")])]) -> {t with token=Proper(d1 ^ "." ^ m1 ^ "." ^ y1 ^ "-" ^ d2 ^ "." ^ m2 ^ "." ^ y2,"date-interval",[[]],["interwał"])}
| Compound("day-month-interval",[Compound("day-month",[Dig(d1,"day");Dig(m1,"month")]);
Compound("day-month",[Dig(d2,"day");Dig(m2,"month")])]) -> {t with token=Proper(d1 ^ "." ^ m1 ^ "-" ^ d2 ^ "." ^ m2,"day-month-interval",[[]],["interwał"])}
| Compound("day-interval",[Dig(d1,"day");Dig(d2,"day")]) -> {t with token=Proper(d1 ^ "-" ^ d2,"day-interval",[[]],["interwał"])}
| Compound("month-interval",[Dig(m1,"month");Dig(m2,"month")]) -> {t with token=Proper(m1 ^ "-" ^ m2,"month-interval",[[]],["interwał"])}
| Compound("month-interval",[RomanDig(m1,"month");RomanDig(m2,"month")]) -> {t with token=Proper(m1 ^ "-" ^ m2,"month-interval",[[]],["interwał"]); attrs="roman" :: t.attrs}
| Compound("year-interval",[Dig(y1,"year");Dig(y2,"year")]) -> {t with token=Proper(y1 ^ "-" ^ y2,"year-interval",[[]],["interwał"])}
| Compound("year-interval",[Dig(y1,"year");Dig(y2,"2dig")]) -> {t with token=Proper(y1 ^ "-" ^ y2,"year-interval",[[]],["interwał"])}
| Compound("hour-minute-interval",[Compound("hour-minute",[Dig(h1,"hour");Dig(m1,"minute")]);Compound("hour-minute",[Dig(h2,"hour");Dig(m2,"minute")])]) ->
{t with token=Proper(h1 ^ ":" ^ m1 ^ "-" ^ h2 ^ ":" ^ m2,"hour-minute-interval",[[]],["interwał"])}
| Compound("hour-interval",[Dig(h1,"hour");Dig(h2,"hour")]) -> {t with token=Proper(h1 ^ "-" ^ h2,"hour-interval",[[]],["interwał"])}
| Compound("minute-interval",[Dig(m1,"minute");Dig(m2,"minute")]) -> t (*{t with token=Proper(m1 ^ "-" ^ m2,"minute-interval",[[]],["interwał"])}*)
| Dig(lemma,"url") -> {t with token=Proper(lemma,"url",[[]],["url"])}
| Dig(lemma,"email") -> {t with token=Proper(lemma,"email",[[]],["email"])}
| Dig(lemma,"html-tag") -> {t with token=Lemma(lemma,"html-tag",[[]])}
| Dig(cat,_) -> failwith ("translate_digs: Dig " ^ cat)
| RomanDig(cat,_) -> failwith ("translate_digs: Romandig " ^ cat)
| Compound(cat,_) as t -> failwith ("translate_digs: " ^ ENIAMtokens.string_of_token t)
| _ -> t)
(**********************************************************************************)
module OrderedStringList = struct
type t = string list
let compare x y = compare (Xlist.sort x compare) (Xlist.sort y compare)
end
module OrderedStringListList = struct
type t = string list list
let compare x y = compare (Xlist.sort x compare) (Xlist.sort y compare)
end
module StringListMap = Xmap.Make(OrderedStringList)
module StringListListMap = Xmap.Make(OrderedStringListList)
module StringListListSet = Xset.Make(OrderedStringListList)
type tree = T of tree StringListMap.t | S of StringSet.t
let single_tags = function
[_] :: _ -> true
| _ -> false
let rec make_tree interp =
if single_tags interp then S (StringSet.of_list (List.flatten (List.flatten interp))) else
let map = Xlist.fold interp StringListMap.empty (fun map tags ->
StringListMap.add_inc map (List.hd tags) [List.tl tags] (fun l -> (List.tl tags) :: l)) in
T(StringListMap.map map make_tree)
let is_s_tree map =
StringListListMap.fold map false (fun b _ -> function
S _ -> true
| T _ -> b)
let rec fold_tree_rec rev s f = function
S set -> f s (List.rev rev) set
| T map -> StringListMap.fold map s (fun s tag tree ->
fold_tree_rec (tag :: rev) s f tree)
let fold_tree tree s f = fold_tree_rec [] s f tree
let rec combine_interps_rec map =
if is_s_tree map then
StringListListMap.fold map [] (fun interp tail_tags -> function
S tag -> ((Xlist.sort (StringSet.to_list tag) compare) :: tail_tags) :: interp
| _ -> failwith "combine_interps_rec")
else
let map = StringListListMap.fold map StringListListMap.empty (fun map tail_tags tree ->
fold_tree tree map (fun map head_tags tag ->
StringListListMap.add_inc map ((Xlist.sort (StringSet.to_list tag) compare) :: tail_tags) [head_tags] (fun l -> head_tags :: l))) in
combine_interps_rec (StringListListMap.map map make_tree)
let combine_interp interp =
let map = StringListListMap.add StringListListMap.empty [] (make_tree interp) in
combine_interps_rec map
let combine_pos = StringSet.of_list ["subst"; "depr"; "ppron12"; "ppron3"; "siebie"; "adj"; "num"; "ger"; "praet"; "fin"; "impt"; "imps"; "pcon"; "ppas"; "pact";
"inf"; "bedzie"; "aglt"; "winien"; "pant"; "prep"]
let combine_interps paths =
List.rev (Xlist.rev_map paths (fun t ->
match t.token with
Lemma(lemma,pos,interp) ->
(* Printf.printf "%s %s %s\n" lemma pos (PreTokenizer.string_of_interps interp); *)
let interp =
if pos = "ppron12" then Xlist.map interp (fun tags -> if Xlist.size tags = 4 then tags @ [["_"]] else tags)
else interp in
let interp =
if StringSet.mem combine_pos pos then combine_interp interp else
StringListListSet.to_list (StringListListSet.of_list interp) in
{t with token=Lemma(lemma,pos,interp)}
| _ -> t))
(**********************************************************************************)
let select_tokens paths =
List.rev (Xlist.fold paths [] (fun paths t ->
match t.token with
(* RomanDig(v,cat) -> {t with token=Lemma(v,cat,[[]])} :: paths
| Interp orth -> {t with token=Lemma(orth,"interp",[[]])} :: paths
| Dig(value,cat) -> {t with token=Lemma(value,cat,[[]])} :: paths
| Other2 orth -> {t with token=Lemma(orth,"unk",[[]])} :: paths
| Lemma(lemma,cat,interp) -> t :: paths
| Proper _ -> failwith "select_tokens"
| Compound _ -> t :: paths*)
(* RomanDig(v,cat) -> t :: paths *)
| Interp orth -> t :: paths
(* | Dig(value,cat) -> t :: paths *)
| Other orth -> t :: paths
| Lemma(lemma,pos,interp) -> if pos = "brev" then paths else t :: paths
| Proper(lemma,pos,interp,cat) -> t :: paths
(* | Compound _ -> t :: paths *)
| _ -> paths))
let parse query =
let l = ENIAMtokenizer.parse query in
(* print_endline "a6"; *)
let paths = ENIAMpaths.translate_into_paths l in
(* print_endline "a7"; *)
let paths = ENIAMpaths.lemmatize paths in
(* print_endline "a8"; *)
let paths,_ = ENIAM_MWE.process paths in
(* print_endline "a12"; *)
(* let paths = find_proper_names paths in*)
(* print_endline "a13"; *)
let paths = modify_weights paths in
let paths = translate_digs paths in
(* let paths = assign_senses paths in
(* print_endline "a14"; *)
let paths = assign_valence paths in*)
(* print_endline "a15"; *)
let paths = combine_interps paths in
(* print_endline "a16"; *)
(* let paths = disambiguate_senses paths in
let paths = assign_simplified_valence paths in
let paths = PreSemantics.assign_semantics paths in*)
(* print_endline "a16"; *)
let paths = select_tokens paths in
(* print_endline "a17"; *)
(* print_endline "a18"; *)
paths(*, next_id*)
let parse_text query =
(* print_endline ("parse_text: " ^ query); *)
let tokens = ExtArray.make 100 empty_token in
let _ = ExtArray.add tokens empty_token in (* id=0 jest zarezerwowane dla pro; FIXME: czy to jest jeszcze aktualne? *)
let paragraphs = Xstring.split "\n\\|\r" query in
let paragraphs = List.rev (Xlist.fold paragraphs [] (fun l -> function "" -> l | s -> s :: l)) in
let n = if Xlist.size paragraphs = 1 then 0 else 1 in
let paragraphs,_ = Xlist.fold paragraphs ([],n) (fun (paragraphs,n) paragraph ->
let paths = parse paragraph in
(* print_endline "parse_text 1"; *)
let pid = if n = 0 then "" else string_of_int n ^ "_" in
let sentences = ENIAMsentences.split_into_sentences pid paragraph tokens paths in
(AltParagraph[Raw,RawParagraph paragraph; Struct,StructParagraph sentences]) :: paragraphs, n+1) in
AltText[Raw,RawText query; Struct,StructText(List.rev paragraphs)], tokens