Blame view

walenty/ENIAMwalLex.ml 14.8 KB
Wojciech Jaworski authored
1
(*
Wojciech Jaworski authored
2
3
4
 *  ENIAMwalenty, a converter for Polish Valence Dictionary "Walenty".
 *  Copyright (C) 2016-2017 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
 *  Copyright (C) 2016-2017 Institute of Computer Science Polish Academy of Sciences
Wojciech Jaworski authored
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
 *
 *  This library is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *)
Wojciech Jaworski authored
20
open ENIAMwalTypes
Wojciech Jaworski authored
21
22
open Xstd
Wojciech Jaworski authored
23
let position morfs = {empty_position with morfs=morfs}
Wojciech Jaworski authored
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43

let rec split_elexeme = function
    Lexeme s -> [],[Lexeme s]
  | XOR l ->
    let genders,l = Xlist.fold l ([],[]) (fun (genders,lexs) lex ->
        let gender,lex = split_elexeme lex in
        gender @ genders, lex @ lexs) in
    genders,[XOR(List.rev l)]
  | ORconcat l ->
    let genders,l = Xlist.fold l ([],[]) (fun (genders,lexs) lex ->
        let gender,lex = split_elexeme lex in
        gender @ genders, lex @ lexs) in
    genders,[ORconcat(List.rev l)]
  | ORcoord l ->
    let genders,l = Xlist.fold l ([],[]) (fun (genders,lexs) lex ->
        let gender,lex = split_elexeme lex in
        gender @ genders, lex @ lexs) in
    genders,[ORcoord(List.rev l)]
  | Elexeme gender -> [gender],[]
Wojciech Jaworski authored
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
let rec get_lexemes = function
    Lexeme s -> [s]
  | ORconcat l -> List.flatten (Xlist.map l get_lexemes)
  | ORcoord l -> List.flatten (Xlist.map l get_lexemes)
  | XOR l -> List.flatten (Xlist.map l get_lexemes)
  | Elexeme gender -> failwith "get_lexemes"

let rec remove_list set = function
    [] -> []
  | s :: l -> if Xlist.mem set s then remove_list set l else s :: (remove_list set l)

let rec check_lexemes_morfs l = function
    LexPhrase(lexs,(_,schema)) ->
              let l = Xlist.fold lexs l (fun l (_,lex) ->
                  remove_list (get_lexemes lex) l) in
              check_lexemes_schema l schema
  | _ -> l

and check_lexemes_schema l schema =
  Xlist.fold schema l (fun l s ->
      Xlist.fold s.morfs l check_lexemes_morfs)

let add_refl_restr (restr,schema) =
    (match restr with
      Natr -> Ratr
    | Atr1 -> Atr
    | Atr -> Atr
    | Ratr1 -> Ratr
    | Ratr -> Ratr
    | Ratrs -> Ratrs
    | NoRestr -> failwith "add_refl_restr"),
    position [LexPhrase([QUB,Lexeme "się"],(Natr,[]))] :: schema
Wojciech Jaworski authored
77
78
79
80
81
82
83
84
85
86
let rec expand_lexicalizations_schema schema =
  Xlist.map schema (fun s ->
      {s with morfs=expand_lexicalizations_morfs s.morfs})

and expand_lexicalizations_morfs morfs = (* uproszczenie polegające na zezwoleniu na koordynację przy zwiększaniu ilości LexPhrase *)
  List.flatten (Xlist.map morfs (fun morf ->
      let morf = match morf with
          LexPhrase(pos_lex,(restr,schema)) -> LexPhrase(pos_lex,(restr,expand_lexicalizations_schema schema))
        | morf -> morf in
      match morf with
Wojciech Jaworski authored
87
(* | Phrase(PrepNumP(prep,case)) -> [LexPhrase([PREP case,Lexeme prep],(Ratrs,[position(*2*) [Phrase(NumP(case))]]))] *)
Wojciech Jaworski authored
88
      | Phrase(PrepNumP(prep,case)) -> [Phrase(PrepNP(prep,case))] (* FIXME: celowe uproszczenie *)
Wojciech Jaworski authored
89
90
      | LexPhrase([PREP pcase,plex;SUBST(n,c),slex],(Atr1,[{morfs=[LexPhrase([QUB,_],_)]} as s])) ->
        (*            print_endline (ENIAMwalStringOf.morf morf);  *)
Wojciech Jaworski authored
91
92
        [LexPhrase([PREP pcase,plex],(Ratrs,[position [LexPhrase([SUBST(n,c),slex],(Natr,[]))]]));
         LexPhrase([PREP pcase,plex],(Ratrs,[position [LexPhrase([SUBST(n,c),slex],(Natr,[]))];s(*{s with dir=Backward}*)]))]
Wojciech Jaworski authored
93
94
      | LexPhrase([PREP(pcase),plex;SUBST(n,c),slex],(Atr1,[{morfs=[LexPhrase([ADV _,_],_)]} as s])) ->
        (*            print_endline (ENIAMwalStringOf.morf morf);  *)
Wojciech Jaworski authored
95
96
        [LexPhrase([PREP pcase,plex],(Ratrs,[position [LexPhrase([SUBST(n,c),slex],(Natr,[]))]]));
         LexPhrase([PREP pcase,plex],(Ratrs,[position [LexPhrase([SUBST(n,c),slex],(Natr,[]))];s(*{s with dir=Backward}*)]))]
Wojciech Jaworski authored
97
98
      | LexPhrase([PREP pcase,plex;SUBST(n,c),slex],(Ratr1,[{morfs=[LexPhrase([ADV _,_],_)]} as s])) ->
        (*            print_endline (ENIAMwalStringOf.morf morf);  *)
Wojciech Jaworski authored
99
        [LexPhrase([PREP pcase,plex],(Ratrs,[position [LexPhrase([SUBST(n,c),slex],(Natr,[]))];s(*{s with dir=Backward}*)]))]
Wojciech Jaworski authored
100
      | LexPhrase([PREP pcase,plex;pos,lex],restr) ->
Wojciech Jaworski authored
101
        [LexPhrase([PREP pcase,plex],(Ratrs,[position [LexPhrase([pos,lex],restr)]]))]
Wojciech Jaworski authored
102
      | LexPhrase([PREP pcase,plex;NUM(c,g),nlex;pos,lex],restr) ->
Wojciech Jaworski authored
103
104
        let genders,lexs = split_elexeme lex in
        Xlist.map genders (fun gender ->
Wojciech Jaworski authored
105
            LexPhrase([PREP pcase,plex],(Ratrs,[position [LexPhrase([NUM(c,gender),nlex],(Ratrs,[(*num*)position [Phrase Null(*Pro*)]]))]]))) @ (*FIXME*)
Wojciech Jaworski authored
106
        Xlist.map lexs (fun lex ->
Wojciech Jaworski authored
107
108
            LexPhrase([PREP pcase,plex],(Ratrs,[position [LexPhrase([NUM(c,g),nlex],(Ratrs,[(*num*)position [LexPhrase([pos,lex],restr)]]))]])))
      | LexPhrase([NUM(c,g),nlex;pos,lex],restr) ->
Wojciech Jaworski authored
109
110
        let genders,lexs = split_elexeme lex in
        Xlist.map genders (fun gender ->
Wojciech Jaworski authored
111
            LexPhrase([NUM(c,gender),nlex],(Ratrs,[(*num*)position [Phrase Null(*Pro*)]]))) @
Wojciech Jaworski authored
112
        Xlist.map lexs (fun lex ->
Wojciech Jaworski authored
113
114
115
116
117
118
            LexPhrase([NUM(c,g),nlex],(Ratrs,[(*num*)position [LexPhrase([pos,lex],restr)]])))
      | LexPhrase([INF(a,n),lex;QUB,Lexeme "się"],restr) -> [LexPhrase([INF(a,n),lex],add_refl_restr restr)]
      | LexPhrase([COMP ctype,clex;pos,lex;QUB,Lexeme "się"],restr) ->
        if Xlist.size (check_lexemes_schema (get_lexemes clex) (snd restr)) = 0 then
          [LexPhrase([pos,lex],add_refl_restr restr)]
        else [LexPhrase([COMP ctype,clex],(Ratrs,[(*std*)position (*Forward*) [LexPhrase([pos,lex],add_refl_restr restr)]]))]
Wojciech Jaworski authored
119
      | LexPhrase([COMP ctype,clex;pos,lex],restr) ->
Wojciech Jaworski authored
120
121
122
        if Xlist.size (check_lexemes_schema (get_lexemes clex) (snd restr)) = 0 then
          [LexPhrase([pos,lex],restr)]
        else [LexPhrase([COMP ctype,clex],(Ratrs,[(*std*)position (*Forward*) [LexPhrase([pos,lex],restr)]]))]
Wojciech Jaworski authored
123
124
125
      | LexPhrase(_::_::_,_) -> failwith ("expand_lexicalizations_morfs: " ^ ENIAMwalStringOf.morf morf)
      | morf -> [morf]))
Wojciech Jaworski authored
126
127
128
129
130
131
let winien = StringSet.of_list ["winien"; "rad"; "powinien"; "nierad"; "niekontent"; "kontent"; "gotów"]
let pred = StringSet.of_list ["żal"; "śmiech"; "znać"; "wstyd"; "wolno"; "widać"; "wiadomo";
"warto"; "trzeba"; "trza"; "słychać"; "szkoda"; "strach"; "stać"; "sposób"; "potrzeba"; "pora";
"podobna"; "niewiada"; "niepodobno"; "niepodobna"; "można"; "lża"; "lza"; "dziw"; "dość"; "dosyć";
"czuć"; "czas"; "brak"]
Wojciech Jaworski authored
132
133
134
135
136
137
138
139
140
141
142
143
let get_pos lex = function
    SUBST _ ->
    (match lex with
       "ja" -> ["ppron12"]
     | "my" -> ["ppron12"]
     | "ty" -> ["ppron12"]
     | "wy" -> ["ppron12"]
     | "on" -> ["ppron3"]
     | "siebie" -> ["siebie"]
     | "się" -> ["qub"]
     | _ -> ["subst"])
  | PREP _ -> ["prep"]
Wojciech Jaworski authored
144
145
146
147
148
  | NUM _ ->
    (try
       let _ = int_of_string lex in
       ["intnum"]
     with _ -> ["num"])
Wojciech Jaworski authored
149
150
151
152
153
  | ADV _ -> ["adv"]
  | ADJ _ -> ["adj"]
  | GER _ -> ["ger"]
  | PPAS _ -> ["ppas"]
  | PACT _ -> ["pact"]
Wojciech Jaworski authored
154
155
156
157
  | PERS _ -> if lex = "być" then ["fin";"praet";"bedzie"] else
      if StringSet.mem winien lex then ["winien"] else
      if StringSet.mem pred lex then ["pred"] else
      ["fin";"praet"](*;"impt";"imps"*)
Wojciech Jaworski authored
158
159
160
161
162
  | INF _ -> ["inf"]
  | QUB -> ["qub"]
  | COMPAR -> ["compar"]
  | COMP _ -> ["comp"]
  | FIXED -> ["fixed"]
Wojciech Jaworski authored
163
164
165
166
167
168
169
170
171
172
173
174
175
176
  | _ -> failwith "get_pos"

let map_pos lemma = function
    SUBST(number,case) ->
    (match lemma with
       "ja" -> PPRON12(number,case)
     | "my" -> PPRON12(number,case)
     | "ty" -> PPRON12(number,case)
     | "wy" -> PPRON12(number,case)
     | "on" -> PPRON3(number,case)
     | "siebie" -> SIEBIE case
     | "się" -> QUB
     | _ -> SUBST(number,case))
  | p -> p
Wojciech Jaworski authored
177
Wojciech Jaworski authored
178
179
180
181
182
183
184
185
let lex_id_counter = ref 0

let get_lex_id () =
  incr lex_id_counter;
  !lex_id_counter

(* FIXME: to trzeba będzie poprawić przy unlike coordination *)
(* FIXME: słownik pos wywołuje redundancję *)
Wojciech Jaworski authored
186
(* FIXME: parametr refl z typu pos można przenieść do schematu *)
Wojciech Jaworski authored
187
188
189
190
191
let rec extract_lex_entries (morfs,entries) = function
    LexPhrase([pos,lex],(Natr,[])) ->
    let lexemes = get_lexemes lex in
    let entries = Xlist.fold lexemes entries (fun entries lemma ->
        Xlist.fold (get_pos lemma pos) entries (fun entries pos2 ->
Wojciech Jaworski authored
192
193
194
            (pos2,lemma,SimpleLexEntry(lemma,pos2)) :: entries)) in
            (* let entries = Xlist.fold lexemes entries (fun entries lemma ->
        Xlist.fold (get_pos lemma pos) entries (fun entries pos2 ->
Wojciech Jaworski authored
195
196
197
            let entries2 = try StringMap.find entries pos2 with Not_found -> StringMap.empty in
            let entry = SimpleLexEntry(lemma,pos2) in
            let entries2 = StringMap.add_inc entries2 lemma (EntrySet.singleton entry) (fun set -> EntrySet.add set entry) in
Wojciech Jaworski authored
198
199
            StringMap.add entries pos2 entries2)) in *)
    let morfs = Xlist.fold lexemes morfs (fun morfs lemma -> SimpleLexArg(lemma,map_pos lemma pos) :: morfs) in
Wojciech Jaworski authored
200
201
    morfs,entries
  | LexPhrase([pos,lex],(restr,schema)) ->
Wojciech Jaworski authored
202
    let id = get_lex_id () in
Wojciech Jaworski authored
203
204
205
206
    let lexemes = get_lexemes lex in
    let schema,entries = extract_lex_entries_schema entries schema in
    let entries = Xlist.fold lexemes entries (fun entries lemma ->
        Xlist.fold (get_pos lemma pos) entries (fun entries pos2 ->
Wojciech Jaworski authored
207
208
209
            (pos2,lemma,LexEntry(id,lemma,pos2,restr,schema)) :: entries)) in
    (* let entries = Xlist.fold lexemes entries (fun entries lemma ->
        Xlist.fold (get_pos lemma pos) entries (fun entries pos2 ->
Wojciech Jaworski authored
210
211
212
            let entries2 = try StringMap.find entries pos2 with Not_found -> StringMap.empty in
            let entry = LexEntry(id,lemma,pos2,restr,schema) in
            let entries2 = StringMap.add_inc entries2 lemma (EntrySet.singleton entry) (fun set -> EntrySet.add set entry) in
Wojciech Jaworski authored
213
214
            StringMap.add entries pos2 entries2)) in *)
    let morfs = Xlist.fold lexemes morfs (fun morfs lemma -> LexArg(id,lemma,map_pos lemma pos) :: morfs) in
Wojciech Jaworski authored
215
216
217
    morfs,entries
  | LexPhrase _ as morf -> failwith ("extract_lex_entries: " ^ ENIAMwalStringOf.morf morf)
  | morf -> morf :: morfs, entries
Wojciech Jaworski authored
218
Wojciech Jaworski authored
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
and extract_lex_entries_schema entries schema =
  let schema,entries = Xlist.fold schema ([],entries) (fun (schema,entries) p ->
      let morfs,entries = Xlist.fold p.morfs ([],entries) extract_lex_entries in
      {p with morfs=List.rev morfs} :: schema, entries) in
  List.rev schema, entries

let extract_lex_entries_comprepnp entries compreps =
  Xlist.fold compreps entries (fun entries (clemma,morfs) ->
      Xlist.fold morfs entries (fun entries -> function
            LexPhrase([pos,lex],(Natr,[])) -> failwith "extract_lex_entries_comprepnp"
          | LexPhrase([pos,lex],(restr,schema)) ->
            let lexemes = get_lexemes lex in
            let schema,entries = extract_lex_entries_schema entries schema in
            Xlist.fold lexemes entries (fun entries lemma ->
                Xlist.fold (get_pos lemma pos) entries (fun entries pos2 ->
Wojciech Jaworski authored
234
235
236
                    (pos2,lemma,ComprepNPEntry(clemma,restr,schema)) :: entries))
            (* Xlist.fold lexemes entries (fun entries lemma ->
                Xlist.fold (get_pos lemma pos) entries (fun entries pos2 ->
Wojciech Jaworski authored
237
238
239
                    let entries2 = try StringMap.find entries pos2 with Not_found -> StringMap.empty in
                    let entry = ComprepNPEntry(clemma,restr,schema) in
                    let entries2 = StringMap.add_inc entries2 lemma (EntrySet.singleton entry) (fun set -> EntrySet.add set entry) in
Wojciech Jaworski authored
240
                    StringMap.add entries pos2 entries2)) *)
Wojciech Jaworski authored
241
242
243
244
245
246
247
          | _ -> failwith "extract_lex_entries_comprepnp"))

let rec expand_restr valence lexeme pos = function
    SimpleLexEntry(lemma,pos2) -> [SimpleLexEntry(lemma,pos2)]
  (* | LexEntry(id,lemma,pos2,Natr,[]) -> [LexEntry(id,lemma,pos2,NoRestr,[])] *)
  | LexEntry(id,lemma,pos2,Natr,_) -> failwith "expand_restr"
  | LexEntry(id,lemma,pos2,restr,[]) ->
Wojciech Jaworski authored
248
    (* print_endline (lexeme ^ " " ^ pos); *)
Wojciech Jaworski authored
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
    [LexEntry(id,lemma,pos2,restr,[])] (* FIXME *)
(*    (*       print_endline "expand_restr"; *)
    let frames = try StringMap.find (StringMap.find valence lexeme) pos
      with Not_found -> failwith ("expand_restr:" ^ lexeme ^ " " ^ pos) in
    (*      Printf.printf "%s %s %d\n" lexeme pos (Xlist.size frames);
            Xlist.iter frames (fun frame -> print_endline (ENIAMwalStringOf.frame lexeme frame));
            print_endline "";*)
    (if restr = Atr || restr = Atr1 then [LexEntry(id,lemma,pos2,NoRestr,[])] else []) @
    (Xlist.fold frames [] (fun frames -> function
           Frame(_,schema) ->
           let schema = remove_pro_args schema in
           if schema = [] then frames else
             (expand_restr valence lexeme pos (LexEntry(id,lemma,pos2,restr,schema))) @ frames
         | _ -> frames))*)
  | LexEntry(id,lemma,pos2,Atr,schema) ->
    let schema = Xlist.map schema (fun p -> {p with morfs=Phrase Null :: p.morfs}) in
    [LexEntry(id,lemma,pos2,NoRestr,schema)]
  | LexEntry(id,lemma,pos2,Atr1,schema) ->
    LexEntry(id,lemma,pos2,NoRestr,[]) :: (Xlist.map schema (fun x -> LexEntry(id,lemma,pos2,NoRestr,[x])))
  | LexEntry(id,lemma,pos2,Ratr,schema) ->
    let schemas = Xlist.map (Xlist.multiply_list (Xlist.map schema (fun x -> [[x];[]]))) List.flatten in
    Xlist.fold schemas [] (fun schemas schema ->
        if schema = [] then schemas else LexEntry(id,lemma,pos2,NoRestr,schema) :: schemas)
  | LexEntry(id,lemma,pos2,Ratr1,schema) ->
    Xlist.map schema (fun x -> LexEntry(id,lemma,pos2,NoRestr,[x]))
  | LexEntry(id,lemma,pos2,Ratrs,schema) -> [LexEntry(id,lemma,pos2,NoRestr,schema)]
  | LexEntry(id,lemma,pos2,NoRestr,_) -> failwith "expand_restr"
  (* | ComprepNPEntry(lemma,Natr,[]) -> [ComprepNPEntry(lemma,NoRestr,[])] *)
  | ComprepNPEntry(lemma,Natr,_) -> failwith "expand_restr"
Wojciech Jaworski authored
278
  | ComprepNPEntry(lemma,restr,[]) as entry -> failwith ("expand_restr: " ^ ENIAMwalStringOf.lex_entry entry)
Wojciech Jaworski authored
279
280
281
282
283
284
285
286
287
288
289
290
291
  | ComprepNPEntry(lemma,Atr,schema) ->
    let schema = Xlist.map schema (fun p -> {p with morfs=Phrase Null :: p.morfs}) in
    [ComprepNPEntry(lemma,NoRestr,schema)]
  | ComprepNPEntry(lemma,Atr1,schema) ->
    ComprepNPEntry(lemma,NoRestr,[]) :: (Xlist.map schema (fun x -> ComprepNPEntry(lemma,NoRestr,[x])))
  | ComprepNPEntry(lemma,Ratr,schema) ->
    let schemas = Xlist.map (Xlist.multiply_list (Xlist.map schema (fun x -> [[x];[]]))) List.flatten in
    Xlist.fold schemas [] (fun schemas schema ->
        if schema = [] then schemas else ComprepNPEntry(lemma,NoRestr,schema) :: schemas)
  | ComprepNPEntry(lemma,Ratr1,schema) ->
    Xlist.map schema (fun x -> ComprepNPEntry(lemma,NoRestr,[x]))
  | ComprepNPEntry(lemma,Ratrs,schema) -> [ComprepNPEntry(lemma,NoRestr,schema)]
  | ComprepNPEntry(lemma,NoRestr,_) -> failwith "expand_restr"
Wojciech Jaworski authored
292
  (* | _ -> failwith "expand_restr" *)