ENIAMwalRealizations.ml
13.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
(*
* ENIAMwalenty, an interface for Polish Valence Dictionary "Walenty".
* Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
* Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
*
* This library is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*)
open ENIAMwalTypes
open Xstd
let rec expand_schema_morf expands = function
PhraseAbbr(Advp "misc",[]) -> PhraseAbbr(Advp "misc",[])
| PhraseAbbr(Advp "mod",[]) -> PhraseAbbr(Advp "mod",[])
(* | PhraseAbbr(ComparP s,[]) -> PhraseAbbr(ComparP s,[Phrase(ComparNP(s,Str));Phrase(ComparPP(s))]) (* FIXME: albo do walTEI albo usunąć *) *)
| PhraseAbbr(abbr,[]) -> (try PhraseAbbr(abbr,AbbrMap.find expands abbr) with Not_found -> failwith "expand_schema_morf")
| PhraseAbbr(abbr,morfs) -> PhraseAbbr(abbr,Xlist.map morfs (expand_schema_morf expands))
| LexPhrase(pos_lex,(restr,schema)) -> LexPhrase(pos_lex,(restr,expand_schema expands schema))
(* | LexPhraseMode(mode,pos_lex,(restr,schema)) -> LexPhraseMode(mode,pos_lex,(restr,expand_schema expands schema)) *)
| morf -> morf
and expand_schema expands schema =
Xlist.map schema (fun s ->
{s with morfs=Xlist.map s.morfs (expand_schema_morf expands)})
(* let get_mode = function
Xp(m) -> m
| Advp(m) -> m
(* | ComparP prep -> "compar" *)
| Nonch -> "nonch"
| Distrp -> "distrp"
| Possp -> "possp" *)
let rec expand_subtypes_morf subtypes = function
PhraseComp(comp_morf,(ctype,comps)) ->
let comps = if comps = [] then (try CompMap.find subtypes ctype with Not_found -> failwith "expand_subtypes_schema") else comps in
Xlist.map comps (fun comp -> Phrase(match comp_morf with
Cp -> CP(ctype,comp)
| Ncp case -> NCP(case,ctype,comp)
| Prepncp(prep,case) -> PrepNCP(prep,case,ctype,comp)))
| LexPhrase(pos_lex,(restr,schema)) -> [LexPhrase(pos_lex,(restr,expand_subtypes subtypes schema))]
(* | LexPhraseMode(mode,pos_lex,(restr,schema)) -> [LexPhraseMode(mode,pos_lex,(restr,expand_subtypes subtypes schema))] *)
| PhraseAbbr(abbr,morfs) ->
(* let mode = get_mode abbr in *)
List.flatten (Xlist.map morfs (expand_subtypes_morf subtypes))
(* Xlist.map morfs (function
Phrase p -> PhraseMode(mode,p)
| PhraseMode(_,p) -> PhraseMode(mode,p)
| LexPhrase(pos_lex,r) -> LexPhraseMode(mode,pos_lex,r)
| LexRPhrase(pos_lex,r) -> LexRPhraseMode(mode,pos_lex,r)
| LexPhraseMode(m,pos_lex,r) -> LexPhraseMode(mode,pos_lex,r)
| LexRPhraseMode(m,pos_lex,r) -> LexRPhraseMode(mode,pos_lex,r)
| _ -> failwith "expand_subtypes_morf") *)
| E Null -> [E(NP(Str));E(NCP(Str,CompTypeUndef,CompUndef));E(CP(CompTypeUndef,CompUndef)); E(Or)]
| morf -> [morf]
and expand_subtypes subtypes schema =
Xlist.map schema (fun s ->
{s with morfs=List.flatten (Xlist.map s.morfs (expand_subtypes_morf subtypes))})
let expand_equivs_phrase equivs = function
| PrepNP(prep,case) -> Xlist.map (try StringMap.find equivs prep with Not_found -> [prep]) (fun prep -> PrepNP(prep,case))
| PrepAdjP(prep,case) -> Xlist.map (try StringMap.find equivs prep with Not_found -> [prep]) (fun prep -> PrepAdjP(prep,case))
| PrepNumP(prep,case) -> Xlist.map (try StringMap.find equivs prep with Not_found -> [prep]) (fun prep -> PrepNumP(prep,case))
| ComprepNP(prep) -> Xlist.map (try StringMap.find equivs prep with Not_found -> [prep]) (fun prep -> ComprepNP(prep))
(* | ComparNP(prep,case) -> Xlist.map (try StringMap.find equivs prep with Not_found -> [prep]) (fun prep -> ComparNP(prep,case)) *)
| ComparP(prep) -> Xlist.map (try StringMap.find equivs prep with Not_found -> [prep]) (fun prep -> ComparP(prep))
| CP(ctype,Comp comp) -> Xlist.map (try StringMap.find equivs comp with Not_found -> [comp]) (fun comp -> CP(ctype,Comp comp))
| NCP(case,ctype,Comp comp) -> Xlist.map (try StringMap.find equivs comp with Not_found -> [comp]) (fun comp -> NCP(case,ctype,Comp comp))
| PrepNCP(prep,case,ctype,Comp comp) -> List.flatten (
Xlist.map (try StringMap.find equivs comp with Not_found -> [comp]) (fun comp ->
Xlist.map (try StringMap.find equivs prep with Not_found -> [prep]) (fun prep ->
PrepNCP(prep,case,ctype,Comp comp))))
| phrase -> [phrase]
let rec expand_equivs_lex equivs = function
Lexeme s -> (try XOR(Xlist.map (StringMap.find equivs s) (fun s -> Lexeme s)) with Not_found -> Lexeme s)
| ORconcat l -> ORconcat(Xlist.map l (expand_equivs_lex equivs))
| ORcoord l -> ORcoord(Xlist.map l (expand_equivs_lex equivs))
| XOR l -> XOR(Xlist.map l (expand_equivs_lex equivs))
| Elexeme gender -> Elexeme gender
let rec expand_equivs_morf equivs = function
Phrase phrase -> Xlist.map (expand_equivs_phrase equivs phrase) (fun phrase -> Phrase phrase)
(* | PhraseMode(mode,phrase) -> Xlist.map (expand_equivs_phrase equivs phrase) (fun phrase -> PhraseMode(mode,phrase)) *)
| E phrase -> Xlist.map (expand_equivs_phrase equivs phrase) (fun phrase -> E phrase)
| LexPhrase(pos_lex,(restr,schema)) -> [LexPhrase(Xlist.map pos_lex (fun (pos,lex) -> pos, expand_equivs_lex equivs lex),(restr,expand_equivs_schema equivs schema))]
(* | LexRPhrase(pos_lex,(restr,schema)) -> [LexRPhrase(Xlist.map pos_lex (fun (pos,lex) -> pos, expand_equivs_lex equivs lex),(restr,expand_equivs_schema equivs schema))]
| LexPhraseMode(mode,pos_lex,(restr,schema)) -> [LexPhraseMode(mode,Xlist.map pos_lex (fun (pos,lex) -> pos, expand_equivs_lex equivs lex),(restr,expand_equivs_schema equivs schema))]
| LexRPhraseMode(mode,pos_lex,(restr,schema)) -> [LexRPhraseMode(mode,Xlist.map pos_lex (fun (pos,lex) -> pos, expand_equivs_lex equivs lex),(restr,expand_equivs_schema equivs schema))] *)
(* | PhraseAbbr(abbr,morfs) -> [PhraseAbbr(abbr,List.flatten (Xlist.map morfs (expand_equivs_morf equivs)))] *)
| morf -> failwith ("expand_equivs_morf: " ^ ENIAMwalStringOf.morf morf)
and expand_equivs_schema equivs schema =
Xlist.map schema (fun s ->
{s with morfs=List.flatten (Xlist.map s.morfs (expand_equivs_morf equivs))})
let rec load_realizations_rec (expands,subtypes,equivs) found rev = function
[] -> if rev <> [] || found <> [] then failwith "load_realizations_rec" else expands,subtypes,equivs
| [Str.Text s; Str.Delim "-->"] :: l -> load_realizations_rec (expands,subtypes,equivs) ((s,rev) :: found) [] l
| [Str.Delim " "; Str.Text s; Str.Delim "\t"; Str.Text t] :: l ->
load_realizations_rec (expands,subtypes,equivs) found ((s,t) :: rev) l
| [Str.Delim " "; Str.Text s] :: l ->
load_realizations_rec (expands,subtypes,equivs) found ((s,"") :: rev) l
| [Str.Delim "% "; Str.Text "Phrase types expand:"] :: l -> load_realizations_rec (found,subtypes,equivs) [] rev l
| [Str.Delim "% "; Str.Text "Attributes subtypes:"] :: l -> load_realizations_rec (expands,found,equivs) [] rev l
| [Str.Delim "% "; Str.Text "Attributes equivalents:"] :: l -> load_realizations_rec (expands,subtypes,found) [] rev l
(* | [Str.Delim "% "; Str.Text s] :: l -> print_endline s; load_realizations_rec found rev l *)
| [] :: l -> load_realizations_rec (expands,subtypes,equivs) found rev l
| _ -> failwith "load_realizations_rec"
let rec get_lexemes = function
Lexeme s -> [s]
| ORconcat l -> List.flatten (Xlist.map l get_lexemes)
| ORcoord l -> List.flatten (Xlist.map l get_lexemes)
| XOR l -> List.flatten (Xlist.map l get_lexemes)
| Elexeme gender -> failwith "get_lexemes"
let find_comprep_reqs compreps =
Xlist.fold compreps StringMap.empty (fun comprep_reqs (s,l) ->
let l = Xlist.map l (function
LexPhrase(pos_lex,_) -> Xlist.fold pos_lex StringSet.empty (fun set -> function
_,Lexeme s -> StringSet.add set s
| _ -> set)
(* | LexPhraseMode(_,pos_lex,_) -> Xlist.fold pos_lex StringSet.empty (fun set -> function
_,Lexeme s -> StringSet.add set s
| _ -> set) *)
| morf -> failwith ("find_compreps_reqs: " ^ ENIAMwalStringOf.morf morf)) in
if l = [] then failwith "find_compreps_reqs";
StringMap.add comprep_reqs s (StringSet.to_list (Xlist.fold (List.tl l) (List.hd l) StringSet.union)))
let create_comprep_dict compreps =
Xlist.fold compreps StringMap.empty (fun compreps (s,l) ->
Xlist.fold l compreps (fun compreps -> function
LexPhrase([PREP _,_;SUBST _,lex],_) as morf ->
let lexemes = get_lexemes lex in
Xlist.fold lexemes compreps (fun compreps lexeme ->
StringMap.add_inc compreps lexeme ["subst",(s,morf)] (fun l -> ("subst",(s,morf)) :: l))
(* | LexPhraseMode("misc",[ADV grad,lex],restr) ->
let morf = LexPhrase([ADV grad,lex],restr) in
let lexemes = get_lexemes lex in
Xlist.fold lexemes compreps (fun compreps lexeme ->
StringMap.add_inc compreps lexeme ["adv",(s,morf)] (fun l -> ("adv",(s,morf)) :: l)) *)
| LexPhrase([PREP _,_;NUM _,_;SUBST _,lex],_) as morf ->
let lexemes = get_lexemes lex in
Xlist.fold lexemes compreps (fun compreps lexeme ->
StringMap.add_inc compreps lexeme ["subst",(s,morf)] (fun l -> ("subst",(s,morf)) :: l))
| morf -> failwith ("create_comprep_dict: " ^ ENIAMwalStringOf.morf morf)))
let load_realizations () =
(* let lines = Str.split (Str.regexp "\n") (File.load_file realizations_filename) in
let lines = Xlist.rev_map lines (fun line -> Str.full_split (Str.regexp "% \\|-->\\| \\|\t") line) in
let expands,subtypes,equivs = load_realizations_rec ([],[],[]) [] [] lines in *)
let subtypes = Xlist.fold ENIAMwalTEI.subtypes CompMap.empty (fun subtypes -> function
"int",l -> CompMap.add subtypes Int (List.flatten (Xlist.map l (fun v -> snd(ENIAMwalTEI.parse_comp v))))
| "rel",l -> CompMap.add subtypes Rel (List.flatten (Xlist.map l (fun v -> snd(ENIAMwalTEI.parse_comp v))))
| _ -> failwith "load_realizations 1") in
let equivs = Xlist.fold ENIAMwalTEI.equivs StringMap.empty (fun equivs (k,l) -> StringMap.add equivs k (k :: l)) in
let expands,compreps = Xlist.fold ENIAMwalTEI.expands (AbbrMap.empty,[]) (fun (expands, compreps) (id,k,l) ->
match k with
PhraseAbbr(Advp m,[]) -> AbbrMap.add expands (Advp m) l, compreps
| PhraseAbbr(Nonch,[]) -> AbbrMap.add expands Nonch l, compreps
| PhraseAbbr(Xp m,[]) -> AbbrMap.add expands (Xp m) (List.flatten (Xlist.map l (function
PhraseAbbr(Advp m,[]) -> (try AbbrMap.find expands (Advp m) with Not_found -> [PhraseAbbr(Advp m,[])]) (* FIXME: zakładam, że advp się nie rozmnoży *)
| morf -> [morf]))), compreps
| Phrase(ComprepNP s) -> expands, (s, l) :: compreps
| PhraseAbbr(Distrp,[]) -> AbbrMap.add expands Distrp l, compreps
| PhraseAbbr(Possp,[]) -> AbbrMap.add expands Possp l, compreps
| _ -> failwith "load_realizations 2") in
let compreps = Xlist.map compreps (fun (s,morfs) ->
s, List.flatten (List.flatten (Xlist.map morfs (fun morf -> Xlist.map (expand_subtypes_morf subtypes (expand_schema_morf expands morf)) (expand_equivs_morf equivs))))) in
let comprep_reqs = find_comprep_reqs compreps in
(* let compreps = create_comprep_dict compreps in *)
expands(*,compreps*),comprep_reqs,subtypes,equivs
let expands(*,compreps*),comprep_reqs,subtypes,equivs = load_realizations ()
(* Wypisanie realizacji *)
(* let _ =
Xlist.iter ENIAMwalTEI.expands (fun (id,morf,l) ->
Printf.printf "%d %s:\n" id (ENIAMwalStringOf.morf morf);
Xlist.iter l (fun morf -> Printf.printf " %s\n" (ENIAMwalStringOf.morf morf))) *)
(* Wypisanie realizacji po przetworzeniu *)
(* let _ =
AbbrMap.iter expands (fun morf l ->
Printf.printf "%s:\n" (ENIAMwalStringOf.phrase_abbr morf);
Xlist.iter l (fun morf -> Printf.printf " %s\n" (ENIAMwalStringOf.morf morf))) *)
let has_realization = function
PhraseAbbr _ -> true
| PhraseComp _ -> true
| _ -> false
(* Wypisanie fraz, które podlegają rozwijaniu *)
(*let _ =
IntMap.iter ENIAMwalTEI.phrases (fun i morf ->
if has_realization morf then
Printf.printf "%4d %s\n" i (ENIAMwalStringOf.morf morf)) *)
let phrases =
IntMap.map ENIAMwalTEI.phrases (fun morf ->
let morf = expand_schema_morf expands morf in
let morfs = expand_subtypes_morf subtypes morf in
let morf = List.flatten (Xlist.map morfs (expand_equivs_morf equivs)) in
morf)
(* Wypisanie fraz, które podlegają rozwijaniu *)
(* let _ =
IntMap.iter phrases (fun i morf ->
if has_realization morf then
Printf.printf "%4d %s\n" i (ENIAMwalStringOf.morf morf)) *)
(* let test_phrases = [17088; 17133; 1642]
let _ =
Xlist.iter test_phrases (fun i ->
let m1 = IntMap.find ENIAMwalTEI.phrases i in
let m2 = IntMap.find phrases i in
Printf.printf "%4d %s\n" i (ENIAMwalStringOf.morf m1);
Printf.printf "%4d %s\n" i (ENIAMwalStringOf.morf m2)) *)