validateMorphology.ml
10.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
(*
* ENIAM_NKJP, an interface for National Corpus of Polish (NKJP).
* Copyright (C) 2017 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
* Copyright (C) 2017 Institute of Computer Science Polish Academy of Sciences
*
* This library is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*)
open ENIAMtokenizerTypes
open Xstd
open ValidateTokenizer
let rec select_interp = function (* przejście z m1 do m1.p1 *)
"n" :: l,["n1"] :: ll -> ["n1"] :: (select_interp (l,ll))
| "n" :: l,["n2"] :: ll -> ["n2"] :: (select_interp (l,ll))
| "n" :: l,["p2"] :: ll -> ["p2"] :: (select_interp (l,ll))
| "n" :: l,["p3"] :: ll -> ["p3"] :: (select_interp (l,ll))
| "n" :: l,["n1";"n2"] :: ll -> ["n1";"n2"] :: (select_interp (l,ll))
| "n" :: l,["m1";"m2";"m3";"f";"n1";"n2";"p1";"p2";"p3"] :: ll -> ["n1";"n2";"p2";"p3"] :: (select_interp (l,ll))
| "n" :: l,["m1";"m2";"m3";"f";"n1";"n2";"p1";"p2"] :: ll -> ["n1";"n2";"p2"] :: (select_interp (l,ll))
| "n" :: l,["m1";"m2";"m3";"n1";"n2"] :: ll -> ["n1";"n2"] :: (select_interp (l,ll))
| "n" :: l,["m1";"m2";"m3";"f";"n1";"n2"] :: ll -> ["n1";"n2"] :: (select_interp (l,ll))
| "n" :: l,["m2";"m3";"f";"n1";"n2";"p2";"p3"] :: ll -> ["n1";"n2";"p2";"p3"] :: (select_interp (l,ll))
| "m1" :: l,["_"] :: ll -> ["m1"] :: (select_interp (l,ll))
| "m2" :: l,["_"] :: ll -> ["m2"] :: (select_interp (l,ll))
| "m3" :: l,["_"] :: ll -> ["m3"] :: (select_interp (l,ll))
| "f" :: l,["_"] :: ll -> ["f"] :: (select_interp (l,ll))
| "n" :: l,["_"] :: ll -> ["n1";"n2";"p2";"p3"] :: (select_interp (l,ll))
| a :: l,al :: ll -> if Xlist.mem al a then [a] :: (select_interp (l,ll)) else raise Not_found
| [],[] -> []
| _ -> raise Not_found
let lowercase s = function
AllSmall _ -> s
| SmallLetter _ -> s
| FirstCap(_,_,c,l) ->
if Xstring.check_prefix c s then
l ^ Xstring.cut_prefix c s
else failwith ("lowercase: " ^ s ^ " " ^ c)
| t -> failwith ("lowercase: " ^ ENIAMtokens.string_of_token t)
let match_lemmatize stats t lemma cat interp =
if cat = "brev" then stats else
if t.token = Symbol "." then stats else
let l = ENIAMpaths.lemmatize_token t in
let l2 = Xlist.fold l [] (fun l2 t2 ->
match t2.token with
Lemma(lemma2,cat2,interp2) -> if lemma = lemma2 || lemma = lowercase lemma2 t.token then t2 :: l2 else l2
| Proper(lemma2,cat2,interp2,_) -> if lemma = lemma2 || lemma = lowercase lemma2 t.token then t2 :: l2 else l2
| _ -> t2 :: l2) in
if l2 = [] then StringQMap.add stats ("no lemma: " ^ t.orth ^ " " ^ lemma) else
let l3 = Xlist.fold l2 [] (fun l3 t ->
match t.token with
Lemma(lemma2,cat2,interp2) -> if cat = cat2 then t :: l3 else l3
| Proper(lemma2,cat2,interp2,_) -> if cat = cat2 then t :: l3 else l3
| _ -> t :: l3) in
if l3 = [] then StringQMap.add stats ("no cat: " ^ t.orth ^ " " ^ lemma ^ " " ^ cat) else
let l4 = Xlist.fold l3 [] (fun l4 t ->
match t.token with
Lemma(lemma2,cat2,interp2) ->
let interp2 = Xlist.fold interp2 [] (fun interp2 interp3 -> try select_interp (interp,interp3) :: interp2 with Not_found -> interp2) in
if interp2 = [] then l4 else (
if Xlist.size interp2 > 1 then print_endline "match_lemmatize: multiple interp";
(try {t with token=Lemma(lemma2,cat2,interp2)} :: l4 with Not_found -> l4))
| Proper(lemma2,cat2,interp2,sense2) ->
let interp2 = Xlist.fold interp2 [] (fun interp2 interp3 -> try select_interp (interp,interp3) :: interp2 with Not_found -> interp2) in
if interp2 = [] then l4 else (
if Xlist.size interp2 > 1 then print_endline "match_lemmatize: multiple interp";
(try {t with token=Proper(lemma2,cat2,interp2,sense2)} :: l4 with Not_found -> l4))
| _ -> t :: l4) in
match l4 with
[] -> StringQMap.add stats ("no interp: " ^ t.orth ^ " " ^ lemma ^ " " ^ cat ^ " " ^ String.concat ":" interp ^ "\n" ^ String.concat "\n" (Xlist.map l ENIAMtokens.string_of_token_env))
(* | [{token=AllSmall _}] -> StringQMap.add stats ("no Lemma: " ^ t.orth ^ " " ^ lemma ^ " " ^ cat ^ " " ^ String.concat ":" interp ^ "\n" ^ String.concat "\n" (Xlist.map l ENIAMtokens.string_of_token_env)) *)
| [{token=AllSmall _}] -> StringQMap.add stats "no Lemma"
| [{token=FirstCap _}] -> StringQMap.add stats ("no Lemma 2: " ^ t.orth ^ " " ^ lemma ^ " " ^ cat ^ " " ^ String.concat ":" interp ^ "\n" ^ String.concat "\n" (Xlist.map l ENIAMtokens.string_of_token_env))
(* | [{token=FirstCap _}] -> StringQMap.add stats "no Lemma2" *)
| [{token=Lemma _};{token=AllSmall _}] -> stats
| [{token=Lemma _};{token=SmallLetter _}] -> stats
| [{token=Lemma _};{token=FirstCap _}] -> stats
| l -> StringQMap.add stats ("multiple interp: " ^ t.orth ^ " " ^ lemma ^ " " ^ cat ^ "\n" ^ String.concat "\n" (Xlist.map l ENIAMtokens.string_of_token_env))
let is_lemmatizable = function
| AllSmall _ -> true
| SmallLetter _ -> true
| FirstCap _ -> true
| AllCap _ -> true
| CapLetter _ -> true
| SomeCap _ -> true
| t -> false
let validate_token stats = function
AT(t,[sent,orth,lemma,"brev",interp]) -> StringQMap.add stats "brev"
| AT(t,l(*[sent,orth,lemma,cat,interp]*)) ->
if is_lemmatizable t.token then
StringQMap.add stats "lemmatizable" else StringQMap.add stats "non lemmatizable"
(*match_lemmatize stats t lemma cat interp*)
(* | AT(_,l) as t -> StringQMap.add stats ("validate_token: " ^ string_of_atoken t)*)
| AV(tl,l) as t -> StringQMap.add stats ("validate_token: " ^ string_of_atoken t)
| AR(stat,tl,l) as t -> StringQMap.add stats ("validate_token: " ^ string_of_atoken t)
(* | _ -> StringQMap.add stats "validate_token: ni" *)
let validate_morphology stats name typ channel entries =
prerr_endline name;
Xlist.fold entries stats (fun stats (id_div,has_ne,paragraphs) ->
Xlist.fold paragraphs stats (fun stats (paragraph,sentences) ->
let paragraph,tokens = annotate name sentences in
(* print_endline paragraph; *)
(*let s = "W Specjalnym Ośrodku Szkolno-Wychowawczym" in
if String.length paragraph >= String.length s && String.sub paragraph 0 (String.length s) = s then*)
Xlist.fold tokens stats validate_token
(*else stats*)))
let selection = StringSet.of_list [(*"Rzeczpospolita";"200-4-000014";"040-2-000007";"120-2-900126";"120-2-910000001";"120-2-910000002";"120-4-900005";
"620-3-010001110";"620-3-010001449";"620-3-010001622";"620-3-010001727";
"620-3-010001731";"620-3-010001741";"620-3-010001854";"711-3-010000051";"711-3-010000056";
"711-3-010000079";"720-3-010000217";"720-3-010000335";"720-3-010000341";"forumowisko.pl_18535";"forumowisko.pl_424";"";"";"";"";"";"";"";"";"";"";"";"";"";"";"";"";"";*)
(*"040-2-000001";"040-2-000007";"040-4-000000103";"120-2-000003";"120-2-000007";"120-2-000009";"120-2-000010";"120-2-900017";"120-2-900041";"120-2-900044";"120-2-900083";
"120-2-900092";"120-2-900094";"120-2-900123";"120-2-910000011";"120-4-900000001";"120-4-900008";"120-4-900010";"130-3-900001";"130-3-910001";"130-5-000000267";
"130-5-000000406";"130-5-000000817";"130-5-000001188";"130-5-000001274";"130-5-000001338";"130-5-000001628";"130-5-000001742";"200-1-000011";"200-1-000026";"200-2-000078";
"200-2-000173";"200-2-000175";"200-4-000000307";"200-4-000000316";"310-2-000007";"320-2-000000094";"320-2-000034";"320-2-000064";"320-3-000226";"330-2-000000030";
"330-2-000000033";"330-2-000000200";"330-2-000000213";"330-2-000003";"330-2-000013";"620-3-010000057";"620-3-010000838";"620-3-010001103";"620-3-010001107";"620-3-010001108";
"620-3-010001109";"620-3-010001125";"620-3-010001274";"620-3-010001448";"620-3-010001732";"620-3-010001772";"711-3-010000021";"712-1-900003";"712-1-900004";"720-3-000071";
"720-3-010000323";"DP1999";"DP2002";"DP2003";"EkspressWieczorny";"forumowisko.pl_20218";"forumowisko.pl_42911";"forumowisko.pl_724";"GazetaGoleniowska";"GazetaTczewska";
"NIE";"SuperExpress";"TrybunaSlaska";*)
(* "120-2-000009";"120-2-000010";"120-2-000012";"120-2-900019";"120-2-900041";"120-2-900044";"120-2-900092";"120-2-900123";"120-2-910000011";"120-4-900000001";"120-4-900001";
"120-4-900008";"130-3-900001";"130-5-000000267";"130-5-000000817";"130-5-000001188";"130-5-000001274";"130-5-000001628";"130-5-000001635";"130-5-000001742";"200-1-000011";
"200-2-000078";"200-2-000181";"200-4-000000314";"200-4-000026";"200-4-000059";"310-2-000007";"320-2-000000087";"320-2-000000094";"320-2-000034";"330-2-000013";"620-3-010000057";
"620-3-010000099";"620-3-010000838";"620-3-010000839";"620-3-010001729";"620-3-010001743";"620-3-010001853";"620-3-010001873";"620-3-010001895";"711-3-010000021";"720-3-000071";
"720-3-010000323";"720-3-010000337";"DP2000";"EkspressWieczorny";"forumowisko.pl_12517";"forumowisko.pl_20218";"forumowisko.pl_42911";"GazetaTczewska";"SuperExpress" *)
(* "120-2-900092";"120-4-900000001";"120-4-900008";"130-3-900001";"200-2-000078";"200-4-000059";"330-2-000013";"720-3-000071";"720-3-010000337";"EkspressWieczorny" *)
(* "110-4-000000102";"120-2-000006";"120-2-900032";"120-2-900035";"130-3-900005";"130-3-910001";
"130-5-000000507";"130-5-000000765";"130-5-000001156";"200-2-000191";"330-2-000000030";
"620-3-010000835";"620-3-010001772";"DP1999";"GazetaGoleniowska";"GazetaMalborska";"KOT";
"KurierKwidzynski";"NIE";"Rzeczpospolita";"TrybunaSlaska" *)
(* "110-4-000000102";"120-2-000006";"120-2-900032";"130-5-000000507";"130-5-000001156";
"620-3-010000835";"GazetaGoleniowska";"KurierKwidzynski";"NIE";"Rzeczpospolita"; *)
"110-4-000000102";"KurierKwidzynski";
(* "620-3-010001496"; *)
]
let _ =
ENIAMtokenizer.initialize ();
ENIAMinflexion.initialize ();
let stats = ENIAM_NKJP.fold_selected ENIAM_NKJP.nkjp_path selection [] [] StringQMap.empty (fun stats (name,typ,channel,entries) ->
validate_morphology stats name typ channel entries) in
(* let stats = ENIAM_NKJP.fold ENIAM_NKJP.nkjp_path StringQMap.empty (fun stats (name,typ,channel,entries) ->
validate_segmentation stats name typ channel entries) in *)
let stats = StringQMap.fold stats [] (fun stats k v -> (v,k) :: stats) in
Xlist.iter (Xlist.sort stats compare) (fun (v,k) -> Printf.printf "%d\t%s\n" v k);
()