(* * ENIAM: Categorial Syntactic-Semantic Parser for Polish * Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl> * Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. *) open Xstd open PreTypes let load_lu names filename = let l = Str.split_delim (Str.regexp "\n") (File.load_file filename) in Xlist.fold l (StringMap.empty,names) (fun (lu,names) line -> if String.length line = 0 then lu,names else if String.get line 0 = '#' then lu,names else match Str.split_delim (Str.regexp "\t") line with [id; lemma; variant] -> StringMap.add_inc lu lemma [id,variant] (fun l -> (id,variant) :: l), StringMap.add_inc names id (lemma ^ " " ^ variant) (fun _ -> failwith "load_lu") | _ -> failwith ("load_lu: " ^ line)) let noun_lu,lu_names = load_lu StringMap.empty Paths.rzeczownik_filename let verb_lu,lu_names = load_lu lu_names Paths.czasownik_filename let adj_lu,lu_names = load_lu lu_names Paths.przymiotnik_filename let load_synsets filename = let l = Str.split_delim (Str.regexp "\n") (File.load_file filename) in Xlist.fold l (StringMap.empty,StringMap.empty) (fun (syn,names) line -> if String.length line = 0 then (syn,names) else if String.get line 0 = '#' then (syn,names) else match Str.split_delim (Str.regexp "\t") line with [syn_id; lu_ids] -> let lu_ids = Str.split_delim (Str.regexp " ") lu_ids in let syn = Xlist.fold lu_ids syn (fun syn lu_id -> StringMap.add_inc syn lu_id syn_id (fun _ -> failwith ("load_synsets 1: " ^ lu_id))) in let lu_id = try List.hd lu_ids with _ -> failwith ("load_synsets 2: " ^ syn_id) in let name = try StringMap.find lu_names lu_id with Not_found -> "syn_id: " ^ syn_id in (* nieznane synsety są z en wordnetu *) let names = StringMap.add_inc names syn_id name (fun _ -> failwith ("load_synsets 4: " ^ syn_id)) in syn,names | _ -> failwith ("load_synsets 5: " ^ line)) let synsets, syn_names = load_synsets Paths.synsets_filename let load_hipero filename = let l = Str.split_delim (Str.regexp "\n") (File.load_file filename) in Xlist.fold l StringMap.empty (fun hip line -> if String.length line = 0 then hip else if String.get line 0 = '#' then hip else match Str.split_delim (Str.regexp "\t") line with [id; ids] -> let ids = Str.split_delim (Str.regexp " ") ids in StringMap.add_inc hip id ids (fun _ -> failwith "load_hipero") | _ -> failwith ("load_hipero: " ^ line)) let hipero = load_hipero Paths.hipero_filename let rec get_lu_id variant = function (id,v) :: l -> if variant = v then id else get_lu_id variant l | [] -> failwith "get_lu_id" let lu_id_of_sense sense = let lemma,variant = match List.rev (Str.split (Str.regexp " ") sense) with (* [lemma] -> lemma,"" *) | variant :: l -> String.concat " " (List.rev l), variant | _ -> failwith "lu_id_of_sense 1" in if variant = "" then lemma else let l = try StringMap.find noun_lu lemma with Not_found -> failwith ("lu_id_of_sense 2: " ^ lemma) in get_lu_id variant l let load_predef hipero filename = let l = Str.split_delim (Str.regexp "\n") (File.load_file filename) in Xlist.fold l (hipero,StringSet.empty) (fun (hipero,predef) line -> if String.length line = 0 then hipero,predef else if String.get line 0 = '#' then hipero,predef else match Str.split_delim (Str.regexp "\t") line with id :: senses -> let hipero = Xlist.fold senses hipero (fun hipero sense -> if StringSet.mem predef sense then StringMap.add_inc hipero sense [id] (fun l -> id :: l) else let lu_id = lu_id_of_sense sense in let syn_id = try StringMap.find synsets lu_id with Not_found -> lu_id in StringMap.add_inc hipero syn_id [id] (fun l -> id :: l)) in hipero, StringSet.add predef id | _ -> failwith ("load_predef: " ^ line)) let hipero,predef = load_predef hipero Paths.predef_filename let rec get_hipero_rec found id = if StringSet.mem found id then found else let found = StringSet.add found id in let l = try StringMap.find hipero id with Not_found -> [] in Xlist.fold l found get_hipero_rec let get_hipero lu_id = let syn_id = StringMap.find synsets lu_id in StringSet.to_list (get_hipero_rec StringSet.empty syn_id) let synset_name id = if StringSet.mem predef id then id else try StringMap.find syn_names id with Not_found -> failwith "synset_name" let rename_sense sense = let lu_id = lu_id_of_sense sense in StringMap.find synsets lu_id let load_proper_classes filename = let l = Str.split_delim (Str.regexp "\n") (File.load_file filename) in Xlist.fold l StringMap.empty (fun map line -> if String.length line = 0 then map else if String.get line 0 = '#' then map else match Str.split_delim (Str.regexp "\t") line with id :: senses -> let senses = Xlist.map senses (fun sense -> match List.rev (Str.split (Str.regexp " ") sense) with weight :: l -> String.concat " " (List.rev l), (try float_of_string weight with _ -> failwith "load_proper_classes 2") | _ -> failwith "load_proper_classes 4") in let senses = Xlist.map senses (fun (sense,weight) -> let sense = if sense = "antroponim 1" then "nazwa własna 1" else sense in let sense = if sense = "godzina 4" then "godzina 3" else sense in (* print_endline sense; *) let lu_id = lu_id_of_sense sense in sense,Xlist.map (get_hipero lu_id) synset_name,weight) in StringMap.add_inc map id senses (fun _ -> failwith ("load_proper_classes 3: " ^ id)) | _ -> failwith ("load_proper_classes: " ^ line)) let proper_classes = load_proper_classes Paths.proper_classes_filename let simplify_pos = function "subst" -> "noun" | "depr" -> "noun" | "adj" -> "adj" | "adja" -> "adj" | "adjc" -> "adj" | "adjp" -> "adj" | "ger" -> "verb" | "pact" -> "verb" | "ppas" -> "verb" | "fin" -> "verb" | "bedzie" -> "verb" | "praet" -> "verb" | "winien" -> "verb" | "impt" -> "verb" | "imps" -> "verb" | "inf" -> "verb" | "pcon" -> "verb" | "pant" -> "verb" | "pred" -> "verb" | s -> s let assign_senses (paths,last) = (* FIXME: sensy zawierające 'się' *) List.rev (Xlist.rev_map paths (fun t -> match t.token with Lemma(lemma,pos,_) -> (*if pos = "ppron12" || pos = "ppron3" || pos = "siebie" then {t with senses=[lemma,["0"],0.]} else*) (* FIXME: ustalić co z zaimkami *) let lu = match simplify_pos pos with "noun" -> noun_lu | "adj" -> adj_lu | "verb" -> verb_lu | _ -> StringMap.empty in if StringMap.mem lu lemma then let l = StringMap.find lu lemma in let l = Xlist.rev_map l (fun (id,variant) -> lemma ^ " " ^ variant, Xlist.map (get_hipero id) synset_name, log10 (1. /. (try float_of_string variant with _ -> 3.))) in {t with senses=l} else t | Proper(_,_,_,senses) -> {t with senses=List.flatten (Xlist.rev_map senses (fun sense -> try StringMap.find proper_classes sense with Not_found -> failwith ("assign_senses: " ^ sense)))} | _ -> t)), last