inflexion.ml 13.9 KB
(*
 *  ENIAM: Categorial Syntactic-Semantic Parser for Polish
 *  Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
 *  Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *)

open Xstd
open Printf

let load_alt filename =
  let l = Str.split_delim (Str.regexp "\n") (File.load_file filename) in
  List.rev (Xlist.fold l [] (fun l line -> 
    if String.length line = 0 then l else
    if String.get line 0 = '#' then l else
    match Str.split_delim (Str.regexp "\t") line with
      [orth; lemma; interp] -> (orth,lemma,interp) :: l
    | _ -> failwith ("load_alt: " ^ line)))
    
let load_dict filename =
  let l = Str.split_delim (Str.regexp "\n") (File.load_file filename) in
  List.rev (Xlist.fold l [] (fun l line -> 
    if String.length line = 0 then l else
    if String.get line 0 = '#' then l else
    match Str.split_delim (Str.regexp "\t") line with
      [stem; lemma_suf2; rule_names] -> (stem,lemma_suf2,Str.split (Str.regexp " ") rule_names) :: l
    | _ -> failwith ("load_dict: " ^ line)))

let load_rules filename =
  let l = Str.split_delim (Str.regexp "\n") (File.load_file filename) in
  List.rev (Xlist.fold l [] (fun l line -> 
    if String.length line = 0 then l else
    if String.get line 0 = '#' then l else
    match Str.split_delim (Str.regexp "\t") line with
      [rule_name; quantity; lemma_suf; orth_suf; interp] -> (rule_name,int_of_string quantity,lemma_suf,orth_suf,interp) :: l
    | _ -> failwith ("load_rules: " ^ line)))

let make_rules_map rules =
  Xlist.fold rules StringMap.empty (fun rules (rule_name,quantity,lemma_suf,orth_suf,interp) ->
    let rules2 = try StringMap.find rules orth_suf with Not_found -> StringMap.empty in
    let rules2 = StringMap.add rules2 rule_name (lemma_suf,interp) in
    StringMap.add rules orth_suf rules2)
  
module OrderedChar = struct

  type t = char
  
  let compare = compare
  
end

module CharMap = Xmap.Make(OrderedChar)
     
type char_tree = N of char_tree CharMap.t * (string * string list) list * (string * string) list 
  (* następne możliwości * (lemma_suf2 * lista reguł) list * lista alt *)
  
type char_tree_suf = M of char_tree_suf CharMap.t * (string * int * string * string) list 
  (* następne możliwości * (rule_name * lemma_suf * interp) list *)

let empty_char_tree = N(CharMap.empty,[],[])
let empty_char_tree_suf = M(CharMap.empty,[])
  
let rec add_path_dict stem i n lemma_suf2 rule_names (N(map,rules,alts)) = 
  if i = n then N(map,(lemma_suf2,rule_names) :: rules,alts) else
  let tree = try CharMap.find map (String.get stem i) with Not_found -> empty_char_tree in
  let tree = add_path_dict stem (i+1) n lemma_suf2 rule_names tree in
  N(CharMap.add map (String.get stem i) tree,rules,alts)
  
let rec add_path_alt stem i n lemma interp (N(map,rules,alts)) = 
  if i = n then N(map,rules,(lemma,interp) :: alts) else
  let tree = try CharMap.find map (String.get stem i) with Not_found -> empty_char_tree in
  let tree = add_path_alt stem (i+1) n lemma interp tree in
  N(CharMap.add map (String.get stem i) tree,rules,alts)  
  
let make_char_tree dict alt =
  let tree = Xlist.fold dict empty_char_tree (fun tree (stem,lemma_suf2,rule_names) ->
    add_path_dict stem 0 (String.length stem) lemma_suf2 rule_names tree) in
  Xlist.fold alt tree (fun tree (orth,lemma,interp) ->
    add_path_alt orth 0 (String.length orth) lemma interp tree)
    
let rec add_path_rules rule_name quantity orth_suf i lemma_suf interp (M(map,rules)) = 
  if i = -1 then M(map,(rule_name,quantity,lemma_suf,interp) :: rules) else
  let tree = try CharMap.find map (String.get orth_suf i) with Not_found -> empty_char_tree_suf in
  let tree = add_path_rules rule_name quantity orth_suf (i-1) lemma_suf interp tree in
  M(CharMap.add map (String.get orth_suf i) tree,rules)
  
let make_char_tree_suf rules =
  let tree = Xlist.fold rules empty_char_tree_suf (fun tree (rule_name,quantity,lemma_suf,orth_suf,interp) ->
    add_path_rules rule_name quantity orth_suf (String.length orth_suf - 1) lemma_suf interp tree) in
  tree
    
let rec find_char_tree_rec i n orth (N(map,rules,alts)) =
  if i = n then [orth,"",rules,alts] else
  let l = try find_char_tree_rec (i+1) n orth (CharMap.find map (String.get orth i)) with Not_found -> [] in
  (String.sub orth 0 i,String.sub orth i (n-i),rules,[]) :: l  
    
let find_char_tree tree rules orth =
  let l = find_char_tree_rec 0 (String.length orth) orth tree in
  Xlist.fold l [] (fun found (stem,suf,rule_names,alts) ->
    let rules2 = try StringMap.find rules suf with Not_found -> StringMap.empty in
    let found = alts @ found in
    Xlist.fold rule_names found (fun found (lemma_suf2,rule_names2) ->
      Xlist.fold rule_names2 found (fun found rule_name ->
        try 
          let lemma_suf,interp = StringMap.find rules2 rule_name in
          let lemma = if lemma_suf2 = "" then stem ^ lemma_suf else stem ^ lemma_suf ^ ":" ^ lemma_suf2 in
          (lemma,interp) :: found
        with Not_found -> found)))
      
let rec find_char_tree_suf_rec i orth (M(map,rules)) =
  if i = 0 then Xlist.fold rules [] (fun l (rule_name,quantity,lemma_suf,interp) -> ("", rule_name, quantity, lemma_suf, interp, i) :: l) else
  let l = try find_char_tree_suf_rec (i-1) orth (CharMap.find map (String.get orth (i-1))) with Not_found -> [] in
  Xlist.fold rules l (fun l (rule_name, quantity, lemma_suf,interp) -> 
    (String.sub orth 0 i, rule_name, quantity, String.sub orth 0 i ^ lemma_suf, interp, i) :: l)
    
let find_char_tree_suf rules_tree stem_map alt_map orth =
  let alt_l = Xlist.rev_map (try StringMap.find alt_map orth with Not_found -> []) (fun (lemma,interp) -> lemma,interp,1,[]) in
  let l = find_char_tree_suf_rec (String.length orth) orth rules_tree in
  let found = Xlist.fold l alt_l (fun found (stem,rule_name,quantity,lemma,interp,i) ->
    try 
      let rule_names = StringMap.find stem_map stem in
      if StringSet.mem rule_names rule_name then (lemma,interp,1,[]) :: found else found (* FIXME: czy na pewno nie dodawać reguł niepasujących? to powoduje że lemat tak samo brzmiący a mający inną odmianę nie zostanie rozpoznany *)
    with Not_found -> if quantity < 100 || (String.length orth = i && stem = lemma) then found else (lemma,interp,quantity,["lemma not validated"]) :: found) in (* FIXME: ucięcie żadkich reguł powinno być inaczej sterowane *)
(*   if found = [] then [orth,"unk",1,["token not found"]] else  *)
  let found = (orth,"unk",1,["token not found"]) :: found in 
  let valid = Xlist.fold found [] (fun valid -> function
      lemma,interp,quantity,[] -> (lemma,interp,quantity,[]) :: valid
    | _ -> valid) in
  if valid = [] then found else valid
      
let prepare_inflexion alt_filename dict_filename rules_filename =   
  let alt = load_alt alt_filename in
  let dict = load_dict dict_filename in
  let rules = load_rules rules_filename in
  let tree = make_char_tree dict alt in
  let rules = make_rules_map rules in
  tree,rules

let tree,rules = 
(*   prepare_inflexion (morfeusz_path ^ Paths.alt_adj) (morfeusz_path ^ Paths.dict_adj) (morfeusz_path ^ Paths.rules_adj)  *)
(*   prepare_inflexion (morfeusz_path ^ Paths.alt_all) (morfeusz_path ^ Paths.dict_all) (morfeusz_path ^ Paths.rules_all)   *)
  empty_char_tree,StringMap.empty

let make_alt_map alt =
  Xlist.fold alt StringMap.empty (fun alt_map (orth,lemma,interp) ->
    StringMap.add_inc alt_map orth [lemma,interp] (fun l -> (lemma,interp) :: l))
  
let prepare_inflexion_suf alt_filename dict_filename rules_filename =   
  let alt = load_alt alt_filename in
  let rules = load_rules rules_filename in
  let rules_tree = make_char_tree_suf rules in
  let alt_map = make_alt_map alt in
  let dict = load_dict dict_filename in
  let stem_map = Xlist.fold dict StringMap.empty (fun stem_map (stem,lemma_suf2,rule_names) ->
    StringMap.add_inc stem_map stem (StringSet.of_list rule_names) (fun set -> Xlist.fold rule_names set StringSet.add)) in
  alt_map,rules_tree,stem_map
  
let alt_map,rules_tree,stem_map = 
  prepare_inflexion_suf (Paths.sgjp_path ^ Paths.alt_all) (Paths.sgjp_path ^ Paths.dict_all) (Paths.sgjp_path ^ Paths.rules_all)  

let check_prefix pat s =
  let n = String.length pat in
  if n > String.length s then false else
  String.sub s 0 n = pat
 
let cut_prefix pat s = 
  let i = String.length pat in
  let n = String.length s in
  if i >= n then "" else
  try String.sub s i (n-i) with _ -> failwith ("cut_prefix: " ^ s ^ " " ^ string_of_int i)
 
let check_sufix pat s =
  let n = String.length pat in
  let m = String.length s in
  if n > m then false else
  String.sub s (m-n) n = pat
 
let cut_sufix pat s = 
  let i = String.length pat in
  let n = String.length s in
  try String.sub s 0 (n-i) with _ -> failwith ("cut_sufix: " ^ s)
 
let rec select_interp_sufix pat = function
    [] -> []
  | (lemma,interp) :: l -> if check_sufix pat interp then (lemma,interp) :: (select_interp_sufix pat l) else select_interp_sufix pat l
  
let rec select_interp_sufix_suf pat = function
    [] -> []
  | (lemma,interp,quantity,attrs) :: l -> if check_sufix pat interp then (lemma,interp,quantity,attrs) :: (select_interp_sufix_suf pat l) else select_interp_sufix_suf pat l
  
let rec remove_interp_sufix pat = function
    [] -> []
  | (lemma,interp) :: l -> if check_sufix pat interp then remove_interp_sufix pat l else (lemma,interp) :: (remove_interp_sufix pat l)
 
let rec remove_interp_sufix_suf pat = function
    [] -> []
  | (lemma,interp,quantity,attrs) :: l -> 
       if interp = "adv:sup" then (lemma,interp,quantity,attrs) :: (remove_interp_sufix_suf pat l) else (* FIXME: zaślepka, wymaga poprawienia algorytmu generowania słowników *)
       if check_sufix pat interp then remove_interp_sufix_suf pat l else (lemma,interp,quantity,attrs) :: (remove_interp_sufix_suf pat l)
 
let get_interpretations orth = 
  (if check_prefix "naj" orth then select_interp_sufix ":sup" (find_char_tree tree rules (cut_prefix "naj" orth)) else []) @
  (if check_prefix "nie" orth then select_interp_sufix ":neg" (find_char_tree tree rules (cut_prefix "nie" orth)) else []) @
  (remove_interp_sufix ":neg" (remove_interp_sufix ":sup" (find_char_tree tree rules orth)))
    
let get_interpretations_suf orth = (* FIXME: nie działa dla adv:sup pisanych z wielkiej litery np Najdoskonalej Najlepiej *)
  if orth = "siebie" then ["siebie","siebie:acc.gen",1,[]] else
  if orth = "sobie" then ["siebie","siebie:dat.loc",1,[]] else
  if orth = "sobą" then ["siebie","siebie:inst",1,[]] else
  (if check_prefix "naj" orth then select_interp_sufix_suf ":sup" (find_char_tree_suf rules_tree stem_map alt_map (cut_prefix "naj" orth)) else []) @
  (if check_prefix "nie" orth then select_interp_sufix_suf ":neg" (find_char_tree_suf rules_tree stem_map alt_map (cut_prefix "nie" orth)) else []) @
  (if check_prefix "Naj" orth then select_interp_sufix_suf ":sup" (find_char_tree_suf rules_tree stem_map alt_map (cut_prefix "Naj" orth)) else []) @
  (if check_prefix "Nie" orth then select_interp_sufix_suf ":neg" (find_char_tree_suf rules_tree stem_map alt_map (cut_prefix "Nie" orth)) else []) @
  (remove_interp_sufix_suf ":neg" (remove_interp_sufix_suf ":sup" (find_char_tree_suf rules_tree stem_map alt_map orth)))
    
(* Testy *) 
    
let print_interpretations l =
  Xlist.iter (Xlist.sort l compare) (fun (lemma,interp) ->
    printf "%s\t%s\n" lemma interp)
    
(*let _ =
  let l = get_interpretations "życzliwą" in
  print_interpretations l;
  let l = get_interpretations "żyźniejszego" in
  print_interpretations l;
  let l = get_interpretations "zwiśli" in
  print_interpretations l;
  let l = get_interpretations "najzieleńsza" in
  print_interpretations l;
  let l = get_interpretations "najtandetniejsza" in
  print_interpretations l;
  let l = get_interpretations "nieżelazny" in
  print_interpretations l;
  ()*)
  
(*let sgjp_filename = "sgjp-20151020.tab"
let polimorf_filename = "polimorf-20151020.tab"

let _ =
  let interp_sel = Morf.load_interp_sel "data/interps.tab" in
  print_endline "loading sgjp";  
  let sgjp = Morf.load_tab (morfeusz_path ^ sgjp_filename) in  
  print_endline "loading polimorf";  
  let polimorf = Morf.load_tab (morfeusz_path ^ polimorf_filename) in
  print_endline "merging";  
  let dicts = Morf.merge_dicts [sgjp;polimorf] in
  let adj_interp_sel = StringMap.find interp_sel "adj" in
  let adj_sup_interp_sel = StringMap.find interp_sel "adj-sup" in
(*   let dicts = Morf.remove_prefix dicts "naj" adj_sup_interp_sel in *)
  print_endline "preparing queries";   
  let queries = StringMap.fold dicts StringMap.empty (fun queries lemma interps ->
    let interps = Morf.select_interps interps (adj_interp_sel @ adj_sup_interp_sel) in
    StringMap.fold interps queries (fun queries interp orths ->
      Xlist.fold orths queries (fun queries orth ->
        let s = lemma ^ "\t" ^ interp in
        StringMap.add_inc queries orth (StringSet.singleton s) (fun set -> StringSet.add set s)))) in
  print_endline "testing";   
  StringMap.iter queries (fun orth set ->
    let set = Xlist.fold (get_interpretations orth) set (fun set (lemma,interp) ->
      let s = lemma ^ "\t" ^ interp in
      if StringSet.mem set s then StringSet.remove set s else (
      printf "excessing interpretation: %s\t%s" orth s;
      set)) in
    if StringSet.is_empty set then () else
    StringSet.iter set (fun s ->
      printf "lacking interpretation: %s\t%s" orth s))*)