ENIAMplWordnet.ml 8.04 KB
(*
 *  ENIAMplWordnet,  an interface for "Słowosieć", a Polish Wordnet.
 *  Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
 *  Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
 *
 *  This library is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *)

let resource_path =
  try Sys.getenv "ENIAM_RESOURCE_PATH"
  with Not_found -> "/usr/share/eniam"

let rzeczownik_filename = resource_path ^ "/plWordnet/rzeczownik.tab"
let czasownik_filename = resource_path ^ "/plWordnet/czasownik.tab"
let przymiotnik_filename = resource_path ^ "/plWordnet/przymiotnik.tab"
let synsets_filename = resource_path ^ "/plWordnet/synsets.tab"
let hipero_filename = resource_path ^ "/plWordnet/hipero.tab"
let predef_filename = resource_path ^ "/plWordnet/predef_prefs.tab"
let proper_classes_filename = resource_path ^ "/plWordnet/proper_classes.tab"


open Xstd
(* open PreTypes *)

let load_lu names filename =
  let l = Str.split_delim (Str.regexp "\n") (File.load_file filename) in
  Xlist.fold l (StringMap.empty,names) (fun (lu,names) line ->
    if String.length line = 0 then lu,names else
    if String.get line 0 = '#' then lu,names else
    match Str.split_delim (Str.regexp "\t") line with
      [id; lemma; variant] ->
        StringMap.add_inc lu lemma [id,variant] (fun l -> (id,variant) :: l),
        StringMap.add_inc names id (lemma ^ " " ^ variant) (fun _ -> failwith "load_lu")
    | _ -> failwith ("load_lu: " ^ line))

let noun_lu,lu_names = load_lu StringMap.empty rzeczownik_filename
let verb_lu,lu_names = load_lu lu_names czasownik_filename
let adj_lu,lu_names = load_lu lu_names przymiotnik_filename

let load_synsets filename =
  let l = Str.split_delim (Str.regexp "\n") (File.load_file filename) in
  Xlist.fold l (StringMap.empty,StringMap.empty) (fun (syn,names) line ->
    if String.length line = 0 then (syn,names) else
    if String.get line 0 = '#' then (syn,names) else
    match Str.split_delim (Str.regexp "\t") line with
      [syn_id; lu_ids] ->
        let lu_ids = Str.split_delim (Str.regexp " ") lu_ids in
        let syn = Xlist.fold lu_ids syn (fun syn lu_id ->
          StringMap.add_inc syn lu_id syn_id (fun _ -> failwith ("load_synsets 1: " ^ lu_id))) in
        let lu_id = try List.hd lu_ids with _ ->  failwith ("load_synsets 2: " ^ syn_id) in
        let name = try StringMap.find lu_names lu_id with Not_found -> "syn_id: " ^ syn_id in (* nieznane synsety są z en wordnetu *)
        let names = StringMap.add_inc names syn_id name (fun _ -> failwith ("load_synsets 4: " ^ syn_id)) in
        syn,names
    | _ -> failwith ("load_synsets 5: " ^ line))

let synsets, syn_names = load_synsets synsets_filename

let load_hipero filename =
  let l = Str.split_delim (Str.regexp "\n") (File.load_file filename) in
  Xlist.fold l StringMap.empty (fun hip line ->
    if String.length line = 0 then hip else
    if String.get line 0 = '#' then hip else
    match Str.split_delim (Str.regexp "\t") line with
      [id; ids] ->
        let ids = Str.split_delim (Str.regexp " ") ids in
        StringMap.add_inc hip id ids (fun _ -> failwith "load_hipero")
    | _ -> failwith ("load_hipero: " ^ line))

let hipero = load_hipero hipero_filename

let rec get_lu_id variant = function
    (id,v) :: l -> if variant = v then id else get_lu_id variant l
  | [] -> failwith "get_lu_id"

let lu_id_of_sense sense =
  let lemma,variant =
    match List.rev (Str.split (Str.regexp " ") sense) with
(*       [lemma] -> lemma,"" *)
    | variant :: l -> String.concat " " (List.rev l), variant
    | _ -> failwith "lu_id_of_sense 1" in
  if variant = "" then lemma else
  let l = try StringMap.find noun_lu lemma with Not_found -> failwith ("lu_id_of_sense 2: " ^ lemma) in
  get_lu_id variant l

let load_predef hipero filename =
  let l = Str.split_delim (Str.regexp "\n") (File.load_file filename) in
  Xlist.fold l (hipero,StringSet.empty) (fun (hipero,predef) line ->
    if String.length line = 0 then hipero,predef else
    if String.get line 0 = '#' then hipero,predef else
    match Str.split_delim (Str.regexp "\t") line with
      id :: senses ->
        let hipero = Xlist.fold senses hipero (fun hipero sense ->
          if StringSet.mem predef sense then StringMap.add_inc hipero sense [id] (fun l -> id :: l) else
          let lu_id = lu_id_of_sense sense in
          let syn_id = try StringMap.find synsets lu_id with Not_found -> lu_id in
          StringMap.add_inc hipero syn_id [id] (fun l -> id :: l)) in
        hipero, StringSet.add predef id
    | _ -> failwith ("load_predef: " ^ line))

let hipero,predef = load_predef hipero predef_filename

let rec get_hipero_rec found id =
  if StringSet.mem found id then found else
  let found = StringSet.add found id in
  let l = try StringMap.find hipero id with Not_found -> [] in
  Xlist.fold l found get_hipero_rec

let get_hipero lu_id =
  let syn_id = StringMap.find synsets lu_id in
  StringSet.to_list (get_hipero_rec StringSet.empty syn_id)

let synset_name id =
  if StringSet.mem predef id then id else
  try StringMap.find syn_names id with Not_found -> failwith "synset_name"

let rename_sense sense =
  let lu_id = lu_id_of_sense sense in
  StringMap.find synsets lu_id

let load_proper_classes filename =
  let l = Str.split_delim (Str.regexp "\n") (File.load_file filename) in
  Xlist.fold l StringMap.empty (fun map line ->
    if String.length line = 0 then map else
    if String.get line 0 = '#' then map else
    match Str.split_delim (Str.regexp "\t") line with
      id :: senses ->
        let senses = Xlist.map senses (fun sense ->
          match List.rev (Str.split (Str.regexp " ") sense) with
            weight :: l -> String.concat " " (List.rev l), (try float_of_string weight with _ -> failwith "load_proper_classes 2")
          | _ -> failwith "load_proper_classes 4") in
        let senses = Xlist.map senses (fun (sense,weight) ->
          let sense = if sense = "antroponim 1" then "nazwa własna 1" else sense in
          let sense = if sense = "godzina 4" then "godzina 3" else sense in
(*           print_endline sense; *)
          let lu_id = lu_id_of_sense sense in
          sense,Xlist.map (get_hipero lu_id) synset_name,weight) in
        StringMap.add_inc map id senses (fun _ -> failwith ("load_proper_classes 3: " ^ id))
    | _ -> failwith ("load_proper_classes: " ^ line))

let proper_classes = load_proper_classes proper_classes_filename

let simplify_pos = function
    "subst" -> "noun"
  | "depr" -> "noun"
  | "adj" -> "adj"
  | "adja" -> "adj"
  | "adjc" -> "adj"
  | "adjp" -> "adj"
  | "ger" -> "verb"
  | "pact" -> "verb"
  | "ppas" -> "verb"
  | "fin" -> "verb"
  | "bedzie" -> "verb"
  | "praet" -> "verb"
  | "winien" -> "verb"
  | "impt" -> "verb"
  | "imps" -> "verb"
  | "inf" -> "verb"
  | "pcon" -> "verb"
  | "pant" -> "verb"
  | "pred" -> "verb"
  | s -> s

let find_senses lemma pos =
  (*if pos = "ppron12" || pos = "ppron3" || pos = "siebie" then {t with senses=[lemma,["0"],0.]} else*) (* FIXME: ustalić co z zaimkami *)
  let lu = match simplify_pos pos with
      "noun" -> noun_lu
    | "adj" -> adj_lu
    | "verb" -> verb_lu
    | _ -> StringMap.empty in
  if StringMap.mem lu lemma then
    let l = StringMap.find lu lemma in
    Xlist.rev_map l (fun (id,variant) ->
      lemma ^ " " ^ variant, Xlist.map (get_hipero id) synset_name, log10 (1. /. (try float_of_string variant with _ -> 3.)))
  else []

let find_proper_senses senses =
  List.flatten (Xlist.rev_map senses (fun sense ->
    try StringMap.find proper_classes sense with Not_found -> failwith ("find_proper_senses: " ^ sense)))