(* * ENIAMwalenty, a converter for Polish Valence Dictionary "Walenty". * Copyright (C) 2016-2017 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl> * Copyright (C) 2016-2017 Institute of Computer Science Polish Academy of Sciences * * This library is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. *) open ENIAMwalTypes open Xstd let position morfs = {empty_position with morfs=morfs} let rec split_elexeme = function Lexeme s -> [],[Lexeme s] | XOR l -> let genders,l = Xlist.fold l ([],[]) (fun (genders,lexs) lex -> let gender,lex = split_elexeme lex in gender @ genders, lex @ lexs) in genders,[XOR(List.rev l)] | ORconcat l -> let genders,l = Xlist.fold l ([],[]) (fun (genders,lexs) lex -> let gender,lex = split_elexeme lex in gender @ genders, lex @ lexs) in genders,[ORconcat(List.rev l)] | ORcoord l -> let genders,l = Xlist.fold l ([],[]) (fun (genders,lexs) lex -> let gender,lex = split_elexeme lex in gender @ genders, lex @ lexs) in genders,[ORcoord(List.rev l)] | Elexeme gender -> [gender],[] let rec get_lexemes = function Lexeme s -> [s] | ORconcat l -> List.flatten (Xlist.map l get_lexemes) | ORcoord l -> List.flatten (Xlist.map l get_lexemes) | XOR l -> List.flatten (Xlist.map l get_lexemes) | Elexeme gender -> failwith "get_lexemes" let rec remove_list set = function [] -> [] | s :: l -> if Xlist.mem set s then remove_list set l else s :: (remove_list set l) let rec check_lexemes_morfs l = function LexPhrase(lexs,(_,schema)) -> let l = Xlist.fold lexs l (fun l (_,lex) -> remove_list (get_lexemes lex) l) in check_lexemes_schema l schema | _ -> l and check_lexemes_schema l schema = Xlist.fold schema l (fun l s -> Xlist.fold s.morfs l check_lexemes_morfs) let add_refl_restr (restr,schema) = (match restr with Natr -> Ratr | Atr1 -> Atr | Atr -> Atr | Ratr1 -> Ratr | Ratr -> Ratr | Ratrs -> Ratrs | NoRestr -> failwith "add_refl_restr"), position [LexPhrase([QUB,Lexeme "się"],(Natr,[]))] :: schema let rec expand_lexicalizations_schema schema = Xlist.map schema (fun s -> {s with morfs=expand_lexicalizations_morfs s.morfs}) and expand_lexicalizations_morfs morfs = (* uproszczenie polegające na zezwoleniu na koordynację przy zwiększaniu ilości LexPhrase *) List.flatten (Xlist.map morfs (fun morf -> let morf = match morf with LexPhrase(pos_lex,(restr,schema)) -> LexPhrase(pos_lex,(restr,expand_lexicalizations_schema schema)) | morf -> morf in match morf with (* | Phrase(PrepNumP(prep,case)) -> [LexPhrase([PREP case,Lexeme prep],(Ratrs,[position(*2*) [Phrase(NumP(case))]]))] *) | Phrase(PrepNumP(prep,case)) -> [Phrase(PrepNP(prep,case))] (* FIXME: celowe uproszczenie *) | LexPhrase([PREP pcase,plex;SUBST(n,c),slex],(Atr1,[{morfs=[LexPhrase([QUB,_],_)]} as s])) -> (* print_endline (ENIAMwalStringOf.morf morf); *) [LexPhrase([PREP pcase,plex],(Ratrs,[position [LexPhrase([SUBST(n,c),slex],(Natr,[]))]])); LexPhrase([PREP pcase,plex],(Ratrs,[position [LexPhrase([SUBST(n,c),slex],(Natr,[]))];s(*{s with dir=Backward}*)]))] | LexPhrase([PREP(pcase),plex;SUBST(n,c),slex],(Atr1,[{morfs=[LexPhrase([ADV _,_],_)]} as s])) -> (* print_endline (ENIAMwalStringOf.morf morf); *) [LexPhrase([PREP pcase,plex],(Ratrs,[position [LexPhrase([SUBST(n,c),slex],(Natr,[]))]])); LexPhrase([PREP pcase,plex],(Ratrs,[position [LexPhrase([SUBST(n,c),slex],(Natr,[]))];s(*{s with dir=Backward}*)]))] | LexPhrase([PREP pcase,plex;SUBST(n,c),slex],(Ratr1,[{morfs=[LexPhrase([ADV _,_],_)]} as s])) -> (* print_endline (ENIAMwalStringOf.morf morf); *) [LexPhrase([PREP pcase,plex],(Ratrs,[position [LexPhrase([SUBST(n,c),slex],(Natr,[]))];s(*{s with dir=Backward}*)]))] | LexPhrase([PREP pcase,plex;pos,lex],restr) -> [LexPhrase([PREP pcase,plex],(Ratrs,[position [LexPhrase([pos,lex],restr)]]))] | LexPhrase([PREP pcase,plex;NUM(c,g),nlex;pos,lex],restr) -> let genders,lexs = split_elexeme lex in Xlist.map genders (fun gender -> LexPhrase([PREP pcase,plex],(Ratrs,[position [LexPhrase([NUM(c,gender),nlex],(Ratrs,[(*num*)position [Phrase Null(*Pro*)]]))]]))) @ (*FIXME*) Xlist.map lexs (fun lex -> LexPhrase([PREP pcase,plex],(Ratrs,[position [LexPhrase([NUM(c,g),nlex],(Ratrs,[(*num*)position [LexPhrase([pos,lex],restr)]]))]]))) | LexPhrase([NUM(c,g),nlex;pos,lex],restr) -> let genders,lexs = split_elexeme lex in Xlist.map genders (fun gender -> LexPhrase([NUM(c,gender),nlex],(Ratrs,[(*num*)position [Phrase Null(*Pro*)]]))) @ Xlist.map lexs (fun lex -> LexPhrase([NUM(c,g),nlex],(Ratrs,[(*num*)position [LexPhrase([pos,lex],restr)]]))) | LexPhrase([INF(a,n),lex;QUB,Lexeme "się"],restr) -> [LexPhrase([INF(a,n),lex],add_refl_restr restr)] | LexPhrase([COMP ctype,clex;pos,lex;QUB,Lexeme "się"],restr) -> if Xlist.size (check_lexemes_schema (get_lexemes clex) (snd restr)) = 0 then [LexPhrase([pos,lex],add_refl_restr restr)] else [LexPhrase([COMP ctype,clex],(Ratrs,[(*std*)position (*Forward*) [LexPhrase([pos,lex],add_refl_restr restr)]]))] | LexPhrase([COMP ctype,clex;pos,lex],restr) -> if Xlist.size (check_lexemes_schema (get_lexemes clex) (snd restr)) = 0 then [LexPhrase([pos,lex],restr)] else [LexPhrase([COMP ctype,clex],(Ratrs,[(*std*)position (*Forward*) [LexPhrase([pos,lex],restr)]]))] | LexPhrase(_::_::_,_) -> failwith ("expand_lexicalizations_morfs: " ^ ENIAMwalStringOf.morf morf) | morf -> [morf])) let winien = StringSet.of_list ["winien"; "rad"; "powinien"; "nierad"; "niekontent"; "kontent"; "gotów"] let pred = StringSet.of_list ["żal"; "śmiech"; "znać"; "wstyd"; "wolno"; "widać"; "wiadomo"; "warto"; "trzeba"; "trza"; "słychać"; "szkoda"; "strach"; "stać"; "sposób"; "potrzeba"; "pora"; "podobna"; "niewiada"; "niepodobno"; "niepodobna"; "można"; "lża"; "lza"; "dziw"; "dość"; "dosyć"; "czuć"; "czas"; "brak"] let get_pos lex = function SUBST _ -> (match lex with "ja" -> ["ppron12"] | "my" -> ["ppron12"] | "ty" -> ["ppron12"] | "wy" -> ["ppron12"] | "on" -> ["ppron3"] | "siebie" -> ["siebie"] | "się" -> ["qub"] | _ -> ["subst"]) | PREP _ -> ["prep"] | NUM _ -> (try let _ = int_of_string lex in ["intnum"] with _ -> ["num"]) | ADV _ -> ["adv"] | ADJ _ -> ["adj"] | GER _ -> ["ger"] | PPAS _ -> ["ppas"] | PACT _ -> ["pact"] | PERS _ -> if lex = "być" then ["fin";"praet";"bedzie"] else if StringSet.mem winien lex then ["winien"] else if StringSet.mem pred lex then ["pred"] else ["fin";"praet"](*;"impt";"imps"*) | INF _ -> ["inf"] | QUB -> ["qub"] | COMPAR -> ["compar"] | COMP _ -> ["comp"] | FIXED -> ["fixed"] | _ -> failwith "get_pos" let map_pos lemma = function SUBST(number,case) -> (match lemma with "ja" -> PPRON12(number,case) | "my" -> PPRON12(number,case) | "ty" -> PPRON12(number,case) | "wy" -> PPRON12(number,case) | "on" -> PPRON3(number,case) | "siebie" -> SIEBIE case | "się" -> QUB | _ -> SUBST(number,case)) | p -> p let lex_id_counter = ref 0 let get_lex_id () = incr lex_id_counter; !lex_id_counter (* FIXME: to trzeba będzie poprawić przy unlike coordination *) (* FIXME: słownik pos wywołuje redundancję *) (* FIXME: parametr refl z typu pos można przenieść do schematu *) let rec extract_lex_entries (morfs,entries) = function LexPhrase([pos,lex],(Natr,[])) -> let lexemes = get_lexemes lex in let entries = Xlist.fold lexemes entries (fun entries lemma -> Xlist.fold (get_pos lemma pos) entries (fun entries pos2 -> (pos2,lemma,SimpleLexEntry(lemma,pos2)) :: entries)) in (* let entries = Xlist.fold lexemes entries (fun entries lemma -> Xlist.fold (get_pos lemma pos) entries (fun entries pos2 -> let entries2 = try StringMap.find entries pos2 with Not_found -> StringMap.empty in let entry = SimpleLexEntry(lemma,pos2) in let entries2 = StringMap.add_inc entries2 lemma (EntrySet.singleton entry) (fun set -> EntrySet.add set entry) in StringMap.add entries pos2 entries2)) in *) let morfs = Xlist.fold lexemes morfs (fun morfs lemma -> SimpleLexArg(lemma,map_pos lemma pos) :: morfs) in morfs,entries | LexPhrase([pos,lex],(restr,schema)) -> let id = get_lex_id () in let lexemes = get_lexemes lex in let schema,entries = extract_lex_entries_schema entries schema in let entries = Xlist.fold lexemes entries (fun entries lemma -> Xlist.fold (get_pos lemma pos) entries (fun entries pos2 -> (pos2,lemma,LexEntry(id,lemma,pos2,restr,schema)) :: entries)) in (* let entries = Xlist.fold lexemes entries (fun entries lemma -> Xlist.fold (get_pos lemma pos) entries (fun entries pos2 -> let entries2 = try StringMap.find entries pos2 with Not_found -> StringMap.empty in let entry = LexEntry(id,lemma,pos2,restr,schema) in let entries2 = StringMap.add_inc entries2 lemma (EntrySet.singleton entry) (fun set -> EntrySet.add set entry) in StringMap.add entries pos2 entries2)) in *) let morfs = Xlist.fold lexemes morfs (fun morfs lemma -> LexArg(id,lemma,map_pos lemma pos) :: morfs) in morfs,entries | LexPhrase _ as morf -> failwith ("extract_lex_entries: " ^ ENIAMwalStringOf.morf morf) | morf -> morf :: morfs, entries and extract_lex_entries_schema entries schema = let schema,entries = Xlist.fold schema ([],entries) (fun (schema,entries) p -> let morfs,entries = Xlist.fold p.morfs ([],entries) extract_lex_entries in {p with morfs=List.rev morfs} :: schema, entries) in List.rev schema, entries let extract_lex_entries_comprepnp entries compreps = Xlist.fold compreps entries (fun entries (clemma,morfs) -> Xlist.fold morfs entries (fun entries -> function LexPhrase([pos,lex],(Natr,[])) -> failwith "extract_lex_entries_comprepnp" | LexPhrase([pos,lex],(restr,schema)) -> let lexemes = get_lexemes lex in let schema,entries = extract_lex_entries_schema entries schema in Xlist.fold lexemes entries (fun entries lemma -> Xlist.fold (get_pos lemma pos) entries (fun entries pos2 -> (pos2,lemma,ComprepNPEntry(clemma,restr,schema)) :: entries)) (* Xlist.fold lexemes entries (fun entries lemma -> Xlist.fold (get_pos lemma pos) entries (fun entries pos2 -> let entries2 = try StringMap.find entries pos2 with Not_found -> StringMap.empty in let entry = ComprepNPEntry(clemma,restr,schema) in let entries2 = StringMap.add_inc entries2 lemma (EntrySet.singleton entry) (fun set -> EntrySet.add set entry) in StringMap.add entries pos2 entries2)) *) | _ -> failwith "extract_lex_entries_comprepnp")) let rec expand_restr valence lexeme pos = function SimpleLexEntry(lemma,pos2) -> [SimpleLexEntry(lemma,pos2)] (* | LexEntry(id,lemma,pos2,Natr,[]) -> [LexEntry(id,lemma,pos2,NoRestr,[])] *) | LexEntry(id,lemma,pos2,Natr,_) -> failwith "expand_restr" | LexEntry(id,lemma,pos2,restr,[]) -> (* print_endline (lexeme ^ " " ^ pos); *) [LexEntry(id,lemma,pos2,restr,[])] (* FIXME *) (* (* print_endline "expand_restr"; *) let frames = try StringMap.find (StringMap.find valence lexeme) pos with Not_found -> failwith ("expand_restr:" ^ lexeme ^ " " ^ pos) in (* Printf.printf "%s %s %d\n" lexeme pos (Xlist.size frames); Xlist.iter frames (fun frame -> print_endline (ENIAMwalStringOf.frame lexeme frame)); print_endline "";*) (if restr = Atr || restr = Atr1 then [LexEntry(id,lemma,pos2,NoRestr,[])] else []) @ (Xlist.fold frames [] (fun frames -> function Frame(_,schema) -> let schema = remove_pro_args schema in if schema = [] then frames else (expand_restr valence lexeme pos (LexEntry(id,lemma,pos2,restr,schema))) @ frames | _ -> frames))*) | LexEntry(id,lemma,pos2,Atr,schema) -> let schema = Xlist.map schema (fun p -> {p with morfs=Phrase Null :: p.morfs}) in [LexEntry(id,lemma,pos2,NoRestr,schema)] | LexEntry(id,lemma,pos2,Atr1,schema) -> LexEntry(id,lemma,pos2,NoRestr,[]) :: (Xlist.map schema (fun x -> LexEntry(id,lemma,pos2,NoRestr,[x]))) | LexEntry(id,lemma,pos2,Ratr,schema) -> let schemas = Xlist.map (Xlist.multiply_list (Xlist.map schema (fun x -> [[x];[]]))) List.flatten in Xlist.fold schemas [] (fun schemas schema -> if schema = [] then schemas else LexEntry(id,lemma,pos2,NoRestr,schema) :: schemas) | LexEntry(id,lemma,pos2,Ratr1,schema) -> Xlist.map schema (fun x -> LexEntry(id,lemma,pos2,NoRestr,[x])) | LexEntry(id,lemma,pos2,Ratrs,schema) -> [LexEntry(id,lemma,pos2,NoRestr,schema)] | LexEntry(id,lemma,pos2,NoRestr,_) -> failwith "expand_restr" (* | ComprepNPEntry(lemma,Natr,[]) -> [ComprepNPEntry(lemma,NoRestr,[])] *) | ComprepNPEntry(lemma,Natr,_) -> failwith "expand_restr" | ComprepNPEntry(lemma,restr,[]) as entry -> failwith ("expand_restr: " ^ ENIAMwalStringOf.lex_entry entry) | ComprepNPEntry(lemma,Atr,schema) -> let schema = Xlist.map schema (fun p -> {p with morfs=Phrase Null :: p.morfs}) in [ComprepNPEntry(lemma,NoRestr,schema)] | ComprepNPEntry(lemma,Atr1,schema) -> ComprepNPEntry(lemma,NoRestr,[]) :: (Xlist.map schema (fun x -> ComprepNPEntry(lemma,NoRestr,[x]))) | ComprepNPEntry(lemma,Ratr,schema) -> let schemas = Xlist.map (Xlist.multiply_list (Xlist.map schema (fun x -> [[x];[]]))) List.flatten in Xlist.fold schemas [] (fun schemas schema -> if schema = [] then schemas else ComprepNPEntry(lemma,NoRestr,schema) :: schemas) | ComprepNPEntry(lemma,Ratr1,schema) -> Xlist.map schema (fun x -> ComprepNPEntry(lemma,NoRestr,[x])) | ComprepNPEntry(lemma,Ratrs,schema) -> [ComprepNPEntry(lemma,NoRestr,schema)] | ComprepNPEntry(lemma,NoRestr,_) -> failwith "expand_restr" (* | _ -> failwith "expand_restr" *)