preWordnet.ml 7.83 KB
(*
 *  ENIAM: Categorial Syntactic-Semantic Parser for Polish
 *  Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
 *  Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *)

open Xstd
open PreTypes

let load_lu names filename = 
  let l = Str.split_delim (Str.regexp "\n") (File.load_file filename) in
  Xlist.fold l (StringMap.empty,names) (fun (lu,names) line -> 
    if String.length line = 0 then lu,names else
    if String.get line 0 = '#' then lu,names else
    match Str.split_delim (Str.regexp "\t") line with
      [id; lemma; variant] -> 
        StringMap.add_inc lu lemma [id,variant] (fun l -> (id,variant) :: l),
        StringMap.add_inc names id (lemma ^ " " ^ variant) (fun _ -> failwith "load_lu")
    | _ -> failwith ("load_lu: " ^ line))

let noun_lu,lu_names = load_lu StringMap.empty Paths.rzeczownik_filename
let verb_lu,lu_names = load_lu lu_names Paths.czasownik_filename
let adj_lu,lu_names = load_lu lu_names Paths.przymiotnik_filename

let load_synsets filename = 
  let l = Str.split_delim (Str.regexp "\n") (File.load_file filename) in
  Xlist.fold l (StringMap.empty,StringMap.empty) (fun (syn,names) line -> 
    if String.length line = 0 then (syn,names) else
    if String.get line 0 = '#' then (syn,names) else
    match Str.split_delim (Str.regexp "\t") line with
      [syn_id; lu_ids] -> 
        let lu_ids = Str.split_delim (Str.regexp " ") lu_ids in
        let syn = Xlist.fold lu_ids syn (fun syn lu_id -> 
          StringMap.add_inc syn lu_id syn_id (fun _ -> failwith ("load_synsets 1: " ^ lu_id))) in
        let lu_id = try List.hd lu_ids with _ ->  failwith ("load_synsets 2: " ^ syn_id) in
        let name = try StringMap.find lu_names lu_id with Not_found -> "syn_id: " ^ syn_id in (* nieznane synsety są z en wordnetu *)
        let names = StringMap.add_inc names syn_id name (fun _ -> failwith ("load_synsets 4: " ^ syn_id)) in
        syn,names
    | _ -> failwith ("load_synsets 5: " ^ line))

let synsets, syn_names = load_synsets Paths.synsets_filename
    
let load_hipero filename = 
  let l = Str.split_delim (Str.regexp "\n") (File.load_file filename) in
  Xlist.fold l StringMap.empty (fun hip line -> 
    if String.length line = 0 then hip else
    if String.get line 0 = '#' then hip else
    match Str.split_delim (Str.regexp "\t") line with
      [id; ids] -> 
        let ids = Str.split_delim (Str.regexp " ") ids in
        StringMap.add_inc hip id ids (fun _ -> failwith "load_hipero")
    | _ -> failwith ("load_hipero: " ^ line))   
    
let hipero = load_hipero Paths.hipero_filename
    
let rec get_lu_id variant = function
    (id,v) :: l -> if variant = v then id else get_lu_id variant l
  | [] -> failwith "get_lu_id"

let lu_id_of_sense sense =
  let lemma,variant = 
    match List.rev (Str.split (Str.regexp " ") sense) with
(*       [lemma] -> lemma,"" *)
    | variant :: l -> String.concat " " (List.rev l), variant
    | _ -> failwith "lu_id_of_sense 1" in
  if variant = "" then lemma else
  let l = try StringMap.find noun_lu lemma with Not_found -> failwith ("lu_id_of_sense 2: " ^ lemma) in
  get_lu_id variant l
  
let load_predef hipero filename = 
  let l = Str.split_delim (Str.regexp "\n") (File.load_file filename) in
  Xlist.fold l (hipero,StringSet.empty) (fun (hipero,predef) line -> 
    if String.length line = 0 then hipero,predef else
    if String.get line 0 = '#' then hipero,predef else
    match Str.split_delim (Str.regexp "\t") line with
      id :: senses -> 
        let hipero = Xlist.fold senses hipero (fun hipero sense ->
          if StringSet.mem predef sense then StringMap.add_inc hipero sense [id] (fun l -> id :: l) else
          let lu_id = lu_id_of_sense sense in
          let syn_id = try StringMap.find synsets lu_id with Not_found -> lu_id in
          StringMap.add_inc hipero syn_id [id] (fun l -> id :: l)) in
        hipero, StringSet.add predef id
    | _ -> failwith ("load_predef: " ^ line))   
    
let hipero,predef = load_predef hipero Paths.predef_filename
   
let rec get_hipero_rec found id =
  if StringSet.mem found id then found else
  let found = StringSet.add found id in
  let l = try StringMap.find hipero id with Not_found -> [] in
  Xlist.fold l found get_hipero_rec
  
let get_hipero lu_id =   
  let syn_id = StringMap.find synsets lu_id in
  StringSet.to_list (get_hipero_rec StringSet.empty syn_id)
  
let synset_name id =
  if StringSet.mem predef id then id else
  try StringMap.find syn_names id with Not_found -> failwith "synset_name"
  
let rename_sense sense =
  let lu_id = lu_id_of_sense sense in
  StringMap.find synsets lu_id
  
let load_proper_classes filename =
  let l = Str.split_delim (Str.regexp "\n") (File.load_file filename) in
  Xlist.fold l StringMap.empty (fun map line -> 
    if String.length line = 0 then map else
    if String.get line 0 = '#' then map else
    match Str.split_delim (Str.regexp "\t") line with
      id :: senses -> 
        let senses = Xlist.map senses (fun sense ->
          match List.rev (Str.split (Str.regexp " ") sense) with
            weight :: l -> String.concat " " (List.rev l), (try float_of_string weight with _ -> failwith "load_proper_classes 2")
          | _ -> failwith "load_proper_classes 4") in
        let senses = Xlist.map senses (fun (sense,weight) ->
          let sense = if sense = "antroponim 1" then "nazwa własna 1" else sense in
          let sense = if sense = "godzina 4" then "godzina 3" else sense in
(*           print_endline sense; *)
          let lu_id = lu_id_of_sense sense in
          sense,Xlist.map (get_hipero lu_id) synset_name,weight) in
        StringMap.add_inc map id senses (fun _ -> failwith ("load_proper_classes 3: " ^ id))
    | _ -> failwith ("load_proper_classes: " ^ line))   
    
let proper_classes = load_proper_classes Paths.proper_classes_filename
    
let simplify_pos = function
    "subst" -> "noun"
  | "depr" -> "noun"
  | "adj" -> "adj"
  | "adja" -> "adj"
  | "adjc" -> "adj"
  | "adjp" -> "adj"
  | "ger" -> "verb"
  | "pact" -> "verb"
  | "ppas" -> "verb"
  | "fin" -> "verb"
  | "bedzie" -> "verb"
  | "praet" -> "verb"
  | "winien" -> "verb"
  | "impt" -> "verb"
  | "imps" -> "verb"
  | "inf" -> "verb"
  | "pcon" -> "verb"
  | "pant" -> "verb"
  | "pred" -> "verb"
  | s -> s

let assign_senses (paths,last) = (* FIXME: sensy zawierające 'się' *)
  List.rev (Xlist.rev_map paths (fun t ->
    match t.token with
      Lemma(lemma,pos,_) -> 
        (*if pos = "ppron12" || pos = "ppron3" || pos = "siebie" then {t with senses=[lemma,["0"],0.]} else*) (* FIXME: ustalić co z zaimkami *)
        let lu = match simplify_pos pos with
            "noun" -> noun_lu
          | "adj" -> adj_lu
          | "verb" -> verb_lu
          | _ -> StringMap.empty in        
        if StringMap.mem lu lemma then
          let l = StringMap.find lu lemma in
          let l = Xlist.rev_map l (fun (id,variant) -> 
            lemma ^ " " ^ variant, Xlist.map (get_hipero id) synset_name, log10 (1. /. (try float_of_string variant with _ -> 3.))) in
          {t with senses=l}
        else t
    | Proper(_,_,_,senses) ->  
        {t with senses=List.flatten (Xlist.rev_map senses (fun sense ->
          try StringMap.find proper_classes sense with Not_found -> failwith ("assign_senses: " ^ sense)))}
    | _ -> t)), last