(* ENIAMtokenizer.ml — 9.5 KB.
   (A code-viewer line-number gutter was scraped into this file during
   extraction; it has been removed so the file is valid OCaml again.) *)
(*
* ENIAMtokenizer, a tokenizer for Polish
* Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
* Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
*
* This library is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*)
open Xstd
open ENIAMtokenizerTypes
open ENIAMinflexion
(** [load_set filename set] reads [filename] line by line and adds each
    line (as a whole string) to the string set [set]. *)
let load_set filename set =
  let lines = File.load_lines filename in
  Xlist.fold lines set StringSet.add
(* Initialize the tokenizer's global state: the inflexion engine, acronym
   and URL patterns, and the [known_lemmata]/[known_orths] sets.
   Load order matters: defaults first, then user files, then theory
   directories — later loads extend the accumulated sets.
   Missing files are tolerated via [File.catch_no_file]. *)
let initialize () =
ENIAMinflexion.initialize ();
ENIAMacronyms.mte_patterns := ENIAMacronyms.load_mte_patterns ();
ENIAMurl.top_level_domains := ENIAMurl.load_top_level_domains ();
(* Base dictionaries; an absent file yields the empty set. *)
known_lemmata := File.catch_no_file (load_set known_lemmata_filename) StringSet.empty;
known_orths := File.catch_no_file (load_set known_orths_filename) StringSet.empty;
(* User-supplied additions extend (never replace) the base sets. *)
known_lemmata := File.catch_no_file (load_set user_known_lemmata_filename) !known_lemmata;
known_orths := File.catch_no_file (load_set user_known_orths_filename) !known_orths;
(* Each theory directory may contribute its own known_lemmata.tab /
   known_orths.tab; all of them are folded into the global sets. *)
known_lemmata :=
Xlist.fold !theories_paths !known_lemmata (fun set path ->
File.catch_no_file (load_set (path ^ "/known_lemmata.tab")) set);
known_orths :=
Xlist.fold !theories_paths !known_orths (fun set path ->
File.catch_no_file (load_set (path ^ "/known_orths.tab")) set);
()
(** Render tokens as a string; thin alias for
    {!ENIAMtokens.string_of_tokens}. *)
let string_of x = ENIAMtokens.string_of_tokens x
(** Placeholder ontological lookup: reports only whether [lemma] is in
    [known_lemmata]; the remaining flags and the "X" category are stubs. *)
let get_ontological_category lemma pos tags =
  let in_lexicon = StringSet.mem !known_lemmata lemma in
  [in_lexicon, false, false, "X", tags] (* FIXME: todo *)
(* let lemmatize_strings l =
let l1,l2 = Xlist.fold l ([],[]) (fun (l1,l2) (s,obs,lem) ->
let l = ENIAMinflexion.get_interpretations s in
Xlist.fold l (l1,l2) (fun (l1,l2) i ->
if i.ENIAMinflexion.status = ENIAMinflexion.LemmaVal ||
i.ENIAMinflexion.status = ENIAMinflexion.LemmaAlt then (i,obs,lem) :: l1, l2 else
if StringSet.mem !known_lemmata i.ENIAMinflexion.lemma then (i,obs,lem) :: l1, l2 else l1, (i,obs,lem) :: l2)) in
if l1 = [] then l2,false else l1,true *)
(** Fallback priority for unvalidated analyses: effectively rejected
    ([max_int]) when an agglutinative suffix is present, the observed and
    lemma cases differ, or the entry is an Aux2 star; otherwise 3. *)
let calculate_priority2 star has_agl_suf equal_case =
  let rejected =
    has_agl_suf || not equal_case || star = ENIAMmorphologyTypes.Aux2 in
  if rejected then max_int else 3
(** Rank an analysis: 1 = best (lexicon-confirmed), 2 = SGJP-validated
    only, otherwise deferred to {!calculate_priority2}. The decision tree
    below reproduces the original 6-tuple match clause for clause. *)
let calculate_priority is_in_lexicon lemma_in_sgjp is_validated_by_sgjp has_no_sgjp_tag star has_poss_ndm_tag has_agl_suf equal_case =
  if is_in_lexicon then
    if lemma_in_sgjp then
      (* Lemma known to SGJP: accept when validated, or when the tag set
         is explicitly outside SGJP's coverage. *)
      if is_validated_by_sgjp || has_no_sgjp_tag then 1
      else calculate_priority2 star has_agl_suf equal_case
    else if star = ENIAMmorphologyTypes.Ndm then
      (* Indeclinable entry: trust it only when an ndm tag is possible. *)
      if has_poss_ndm_tag then 1
      else calculate_priority2 star has_agl_suf equal_case
    else 1
  else if is_validated_by_sgjp then 2
  else calculate_priority2 star has_agl_suf equal_case
(** Fold the ontological readings of [lemma] into the running best list:
    keep only analyses at the (numerically) lowest priority seen so far.
    Returns the updated [(best_prior, best_l)] pair. *)
let lemmatize_strings2 lemma pos tags best_prior best_l is_validated_by_sgjp star has_agl_suf has_equal_cases =
  (* Invariant over the fold below, so look it up once. *)
  let lemma_in_sgjp = StringSet.mem !ENIAMinflexion.lemmata lemma in
  Xlist.fold (get_ontological_category lemma pos tags) (best_prior, best_l)
    (fun (prior0, acc) (is_in_lexicon, has_no_sgjp_tag, has_poss_ndm_tag, cat, tags) ->
      let prior =
        calculate_priority is_in_lexicon lemma_in_sgjp is_validated_by_sgjp
          has_no_sgjp_tag star has_poss_ndm_tag has_agl_suf has_equal_cases in
      if prior > prior0 then prior0, acc
      else if prior < prior0 then prior, [(*obs,lem,*) lemma, pos, tags, cat]
      else prior0, ((*obs,lem,*) lemma, pos, tags, cat) :: acc)
(** Lemmatize every (string, observed-size, lemma-size) triple in [l]:
    each string's morphological interpretations are parsed and ranked,
    accumulating the best-priority analyses (1000 = "nothing yet"). *)
let lemmatize_strings has_agl_suf l =
  Xlist.fold l (1000, []) (fun acc (s, obs, lem) ->
    Xlist.fold (get_interpretations s) acc (fun acc i ->
      let validated = i.status = LemmaVal || i.status = LemmaAlt in
      (* Tags produced by the morphological analyser are assumed to be
         well-formed and free of "_". *)
      Xlist.fold (ENIAMtagset.parse i.interp) acc
        (fun (best_prior, best_l) (pos, tags) ->
          lemmatize_strings2 i.lemma pos tags best_prior best_l
            validated i.star has_agl_suf (obs = lem))))
(* Lemmatize a single token. For letter/word tokens, each triple appears
   to be (string variant, observed letter size, lemma letter size) — the
   several variants per token cover the upper-case, first-capital and
   lower-case renderings of the same orth. NOTE(review): the exact
   pairing of size codes (SL/CL/AS/AC/FC/SC) with variants is assumed
   from the constructor names — confirm against ENIAMtokenizerTypes.
   Already-lemmatized tokens re-rank each of their tag sets; anything
   else gets the "no analysis" result (1000, []). *)
let lemmatize_token has_agl_suf = function
SmallLetter(uc,lc) -> lemmatize_strings has_agl_suf [uc,(SL : letter_size),(CL : letter_size);lc,SL,SL]
| CapLetter(uc,lc) -> lemmatize_strings has_agl_suf [uc,(CL : letter_size),(CL : letter_size);lc,CL,SL]
| AllSmall(uc,fc,lc) -> lemmatize_strings has_agl_suf [uc,(AS : letter_size),AC;fc,AS,FC;lc,AS,AS]
| AllCap(uc,fc,lc) -> lemmatize_strings has_agl_suf [uc,(AC : letter_size),AC;fc,AC,FC;lc,AC,AS]
| FirstCap(uc,fc,lc) -> lemmatize_strings has_agl_suf [uc,(FC : letter_size),AC;fc,FC,FC;lc,FC,AS]
| SomeCap(uc,orth,lc) -> lemmatize_strings has_agl_suf [uc,(SC : letter_size),AC;orth,SC,SC;lc,SC,AS]
| Lemma(lemma,pos,interp) ->
(* A pre-existing lemma is treated as validated (true) with a neutral
   star; each tag set competes for the best priority. *)
Xlist.fold interp (1000,[]) (fun (best_prior,best_l) tags ->
lemmatize_strings2 lemma pos tags best_prior best_l true ENIAMmorphologyTypes.Star has_agl_suf true)
| t -> 1000,[]
let is_known_orth = function
SmallLetter(uc,lc) -> true
| CapLetter(uc,lc) -> true
| AllSmall(uc,fc,lc) -> StringSet.mem !known_orths uc || StringSet.mem !known_orths fc || StringSet.mem !known_orths lc
| AllCap(uc,fc,lc) -> StringSet.mem !known_orths uc || StringSet.mem !known_orths fc || StringSet.mem !known_orths lc
| FirstCap(uc,fc,lc) -> StringSet.mem !known_orths uc || StringSet.mem !known_orths fc || StringSet.mem !known_orths lc
| SomeCap(uc,orth,lc) -> StringSet.mem !known_orths uc || StringSet.mem !known_orths orth || StringSet.mem !known_orths lc
| Lemma _ -> false
| t -> true
(* Recursively lemmatize a token tree. Returns a triple
   (kept, lemmatized, priority):
   - [kept]: tokens passed through unlemmatized (known orths),
   - [lemmatized]: a Variant/Seq of lemma analyses,
   - [priority]: best priority found (1000 = none). *)
let rec lemmatize_rec = function
Token t ->
(* print_endline ("lemmatize_rec: " ^ t.orth ^ " " ^ ENIAMtokens.string_of_token t.token); *)
(*let l,b = lemmatize_token (Xlist.mem t.attrs HasAglSuffix) t.token in
if Xlist.mem t.attrs HasAglSuffix && not b then Variant[],false else
let t = {t with attrs=Xlist.remove_all t.attrs HasAglSuffix} in
let attrs = if b then t.attrs else LemmNotVal :: t.attrs in
let l = List.flatten (Xlist.rev_map l (fun (i,obs,lem) ->
if (not b) && obs <> lem then [] else
Xlist.rev_map (ENIAMtagset.parse i.ENIAMinflexion.interp) (fun (pos,tags) -> (* zakładam, że tagi generowane przez analizator morfologiczny są poprawne i nie mają _ *)
Token{t with token=Lemma(i.ENIAMinflexion.lemma,pos,[tags]); attrs=attrs}))) in (* FIXME obs,lem,i.ENIAMinflexion.star,i.ENIAMinflexion.freq,i.ENIAMinflexion.tags *)
if is_known_orth t.token then Variant(Token t :: l), true else Variant l, b*)
let has_agl_suf = Xlist.mem t.attrs HasAglSuffix in
let prior,l = lemmatize_token has_agl_suf t.token in
(* The HasAglSuffix marker is consumed here; it must not survive
   into the emitted lemma tokens. *)
let t = {t with attrs=Xlist.remove_all t.attrs HasAglSuffix} in
(* Priority > 2 means no lexicon/SGJP confirmation: mark as such. *)
let attrs = if prior <= 2 then t.attrs else LemmNotVal :: t.attrs in
let l = Xlist.rev_map l (fun ((*obs,lem,*)lemma,pos,tags,cat) ->
Token{t with token=Lemma(lemma,pos,[tags]); attrs=attrs}) in
(* A known orth without an agglutinative suffix is also kept verbatim. *)
if is_known_orth t.token && not has_agl_suf then [Token t],Variant l,prior else [],Variant l,prior
| Seq l ->
(* A sequence survives only if every element yields something; an
   empty element aborts via Not_found. prior=0 marks "all elements
   were kept verbatim", in which case the Seq itself is kept. *)
(try
let l,prior = Xlist.fold l ([],0) (fun (l,prior) t ->
let t1,t2,prior2 = lemmatize_rec t in
match t1, t2 with
[], Variant[] -> raise Not_found
| [t], Variant[] -> t :: l, prior
| [], _ -> t2 :: l, max prior prior2
| _ -> failwith "lemmatize_rec") in
if prior = 0 then [Seq(List.rev l)], Variant[], 1000
else [], Seq(List.rev l), prior
with Not_found -> [],Variant[],1000)
(* if t1 <>
if t = Variant[] then raise Not_found else
t :: l, b && b2) in
Seq(List.rev l), b
with Not_found -> Variant[],false)*)
| Variant l ->
(* Keep only the alternatives at the best (lowest) priority; kept
   verbatim tokens from every alternative are always accumulated. *)
let l1,l2,prior = Xlist.fold l ([],[],1000) (fun (l1,l2,best_prior) t ->
let t1,t2,prior = lemmatize_rec t in
if prior > best_prior then t1 @ l1,l2,best_prior else
if prior < best_prior then t1 @ l1,[t2],prior else
t1 @ l1,t2 :: l2,best_prior) in
l1,Variant l2,prior
(* let l1,l2 = Xlist.fold l ([],[]) (fun (l1,l2) t ->
let t,b = lemmatize_rec t in
if t = Variant[] then l1,l2 else
if b then t :: l1, l2 else l1, t :: l2) in
if l1 = [] then Variant l2,false else Variant l1,true *)
(** Lemmatize each top-level token, merging its lemma analyses and any
    verbatim-kept tokens into a single [Variant]; list order preserved. *)
let lemmatize l =
  List.rev (Xlist.rev_map l (fun t ->
    let kept, analyses, _prior = lemmatize_rec t in
    Variant (analyses :: kept)))
(** Full tokenization pipeline for a UTF-8 [query]: classify characters,
    tokenize, normalize, apply replacement and abbreviation patterns,
    drop spaces, lemmatize, and normalize once more. *)
let parse query =
  query
  |> Xunicode.classified_chars_of_utf8_string
  |> ENIAMtokens.tokenize
  |> ENIAMpatterns.normalize_tokens []
  |> ENIAMpatterns.find_replacement_patterns
  |> ENIAMpatterns.remove_spaces []
  |> ENIAMpatterns.find_abr_patterns ENIAMacronyms.abr_patterns
  (* |> ENIAMpatterns.find_abr_patterns ENIAMacronyms.query_patterns *)
  |> ENIAMpatterns.normalize_tokens []
  |> lemmatize
  |> ENIAMpatterns.normalize_tokens []
  (* Disabled stages (as in the original):
     interpunction processing + normalization, and query-boundary
     management via ENIAMpatterns.manage_query_boundaries. *)