ENIAMsubsyntax.ml 14.4 KB
(*
 *  ENIAMsubsyntax: MWE, abbreviation and sentence detecion for Polish
 *  Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
 *  Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
 *
 *  This library is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *)

open ENIAMsubsyntaxTypes
open ENIAMtokenizerTypes
open Xstd

let load_lemma_frequencies filename map =
  let l = Str.split_delim (Str.regexp "\n") (File.load_file filename) in
  Xlist.fold l map (fun map line ->
    if String.length line = 0 then map else
    if String.get line 0 = '#' then map else
    match Str.split_delim (Str.regexp "\t") line with
      [count; lemma; cat] -> StringMap.add map (lemma ^ "\t" ^ cat) (log10 (float_of_string count +. 1.))
    | _ -> failwith ("load_lemma_frequencies: " ^ line))

let lemma_frequencies = File.catch_no_file (load_lemma_frequencies lemma_frequencies_filename) StringMap.empty

let modify_weights paths =
  List.rev (Xlist.fold paths [] (fun paths t ->
    let w = Xlist.fold t.attrs t.weight (fun w -> function
        "token not found" -> w -. 25.
      | "lemma not validated"-> w -. 20.
      | "notvalidated proper" -> w -. 1.
      | "lemmatized as lowercase" -> w -. 0.1
      | _ -> w) in
    let w = match t.token with
        Lemma(lemma,cat,_) -> (try w +. StringMap.find lemma_frequencies (lemma ^ "\t" ^ cat) with Not_found -> w)
      | Proper(lemma,cat,_,_) -> (try w +. StringMap.find lemma_frequencies (lemma ^ "\t" ^ cat) with Not_found -> w)
      | _ -> w in
    {t with weight = w} :: paths))

let translate_digs paths =
  Xlist.map paths (fun t ->
    match t.token with
      Dig(lemma,"dig") -> t
    | Dig(lemma,"intnum") -> {t with token=Lemma(lemma,"intnum",[[]])}
    | Dig(lemma,"realnum") -> {t with token=Lemma(lemma,"realnum",[[]])}
    | Dig(lemma,"year") -> {t with token=Proper(lemma,"year",[[]],["rok"])}
    | Dig(lemma,"month") -> t (*{t with token=Proper(lemma,"month",[[]],["miesiąc"])}*)
    | Dig(lemma,"hour") -> {t with token=Proper(lemma,"hour",[[]],["godzina"])}
    | Dig(lemma,"day") -> {t with token=Proper(lemma,"day",[[]],["dzień"])}
    | Dig(lemma,"minute") -> t (*{t with token=Proper(lemma,"minute",[[]],["minuta"])}*)
    | Dig(lemma,"2dig") -> t
    | Dig(lemma,"3dig") -> t
    | Dig(lemma,"pref3dig") -> t
    | RomanDig(lemma,"roman") -> {t with token=Lemma(lemma,"roman",[[]]); attrs=t.attrs}
    | RomanDig(lemma,"month") -> t (*{t with token=Proper(lemma,"symbol",[[]],["month"]); attrs="roman" :: t.attrs}*)
    | Dig(lemma,"ordnum") -> {t with token=Lemma(lemma,"ordnum",[[]])}
    | Compound("date",[Dig(d,"day");Dig(m,"month");Dig(y,"year")]) -> {t with token=Proper(d ^ "." ^ m ^ "." ^ y,"date",[[]],["data"])}
    | Compound("date",[Dig(d,"day");RomanDig(m,"month");Dig(y,"year")]) -> {t with token=Proper(d ^ "." ^ m ^ "." ^ y,"date",[[]],["data"])}
    | Compound("date",[Dig(d,"day");Dig(m,"month");Dig(y,"2dig")]) -> {t with token=Proper(d ^ "." ^ m ^ "." ^ y,"date",[[]],["data"])}
    | Compound("date",[Dig(d,"day");RomanDig(m,"month");Dig(y,"2dig")]) -> {t with token=Proper(d ^ "." ^ m ^ "." ^ y,"date",[[]],["data"])}
    | Compound("day-month",[Dig(d,"day");Dig(m,"month")]) -> {t with token=Proper(d ^ "." ^ m,"day-month",[[]],["data"])}
    | Compound("hour-minute",[Dig(h,"hour");Dig(m,"minute")]) -> {t with token=Proper(h ^ ":" ^ m,"hour-minute",[[]],["godzina"])}
    | Compound("match-result",[Dig(x,"intnum");Dig(y,"intnum")]) -> {t with token=Proper(x ^ ":" ^ y,"match-result",[[]],["rezultat"])}
    | Compound("intnum-interval",[Dig(x,"intnum");Dig(y,"intnum")]) -> {t with token=Lemma(x ^ "-" ^ y,"intnum-interval",[[]])}
    | Compound("roman-interval",[RomanDig(x,"roman");RomanDig(y,"roman")]) -> {t with token=Lemma(x ^ "-" ^ y,"roman-interval",[[]]); attrs=t.attrs}
    | Compound("realnum-interval",[Dig(x,"realnum");Dig(y,"realnum")]) -> {t with token=Lemma(x ^ "-" ^ y,"realnum-interval",[[]])}
    | Compound("realnum-interval",[Dig(x,"intnum");Dig(y,"realnum")]) -> {t with token=Lemma(x ^ "-" ^ y,"realnum-interval",[[]])}
    | Compound("realnum-interval",[Dig(x,"realnum");Dig(y,"intnum")]) -> {t with token=Lemma(x ^ "-" ^ y,"realnum-interval",[[]])}
    | Compound("date-interval",[Compound("date",[Dig(d1,"day");Dig(m1,"month");Dig(y1,"year")]);
        Compound("date",[Dig(d2,"day");Dig(m2,"month");Dig(y2,"year")])]) -> {t with token=Proper(d1 ^ "." ^ m1 ^ "." ^ y1 ^ "-" ^ d2 ^ "." ^ m2 ^ "." ^ y2,"date-interval",[[]],["interwał"])}
    | Compound("day-month-interval",[Compound("day-month",[Dig(d1,"day");Dig(m1,"month")]);
        Compound("day-month",[Dig(d2,"day");Dig(m2,"month")])]) -> {t with token=Proper(d1 ^ "." ^ m1 ^ "-" ^ d2 ^ "." ^ m2,"day-month-interval",[[]],["interwał"])}
    | Compound("day-interval",[Dig(d1,"day");Dig(d2,"day")]) -> {t with token=Proper(d1 ^ "-" ^ d2,"day-interval",[[]],["interwał"])}
    | Compound("month-interval",[Dig(m1,"month");Dig(m2,"month")]) -> {t with token=Proper(m1 ^ "-" ^ m2,"month-interval",[[]],["interwał"])}
    | Compound("month-interval",[RomanDig(m1,"month");RomanDig(m2,"month")]) -> {t with token=Proper(m1 ^ "-" ^ m2,"month-interval",[[]],["interwał"]); attrs="roman" :: t.attrs}
    | Compound("year-interval",[Dig(y1,"year");Dig(y2,"year")]) -> {t with token=Proper(y1 ^ "-" ^ y2,"year-interval",[[]],["interwał"])}
    | Compound("year-interval",[Dig(y1,"year");Dig(y2,"2dig")]) -> {t with token=Proper(y1 ^ "-" ^ y2,"year-interval",[[]],["interwał"])}
    | Compound("hour-minute-interval",[Compound("hour-minute",[Dig(h1,"hour");Dig(m1,"minute")]);Compound("hour-minute",[Dig(h2,"hour");Dig(m2,"minute")])]) ->
       {t with token=Proper(h1 ^ ":" ^ m1 ^ "-" ^ h2 ^ ":" ^ m2,"hour-minute-interval",[[]],["interwał"])}
    | Compound("hour-interval",[Dig(h1,"hour");Dig(h2,"hour")]) -> {t with token=Proper(h1 ^ "-" ^ h2,"hour-interval",[[]],["interwał"])}
    | Compound("minute-interval",[Dig(m1,"minute");Dig(m2,"minute")]) -> t (*{t with token=Proper(m1 ^ "-" ^ m2,"minute-interval",[[]],["interwał"])}*)
    | Dig(lemma,"url") -> {t with token=Proper(lemma,"url",[[]],["url"])}
    | Dig(lemma,"email") -> {t with token=Proper(lemma,"email",[[]],["email"])}
    | Dig(lemma,"html-tag") -> {t with token=Lemma(lemma,"html-tag",[[]])}
    | Dig(cat,_) -> failwith ("translate_digs: Dig " ^ cat)
    | RomanDig(cat,_) -> failwith ("translate_digs: Romandig " ^ cat)
    | Compound(cat,_) as t -> failwith ("translate_digs: " ^ ENIAMtokens.string_of_token t)
    | _ -> t)

(**********************************************************************************)

module OrderedStringList = struct

  type t = string list

  let compare x y = compare (Xlist.sort x compare) (Xlist.sort y compare)

end

module OrderedStringListList = struct

  type t = string list list

  let compare x y = compare (Xlist.sort x compare) (Xlist.sort y compare)

end

module StringListMap = Xmap.Make(OrderedStringList)
module StringListListMap = Xmap.Make(OrderedStringListList)
module StringListListSet = Xset.Make(OrderedStringListList)

type tree = T of tree StringListMap.t | S of StringSet.t

let single_tags = function
    [_] :: _ -> true
  | _ -> false

let rec make_tree interp =
  if single_tags interp then S (StringSet.of_list (List.flatten (List.flatten interp))) else
  let map = Xlist.fold interp StringListMap.empty (fun map tags ->
    StringListMap.add_inc map (List.hd tags) [List.tl tags] (fun l -> (List.tl tags) :: l)) in
  T(StringListMap.map map make_tree)

let is_s_tree map =
  StringListListMap.fold map false (fun b _ -> function
      S _ -> true
    | T _ -> b)

let rec fold_tree_rec rev s f = function
    S set -> f s (List.rev rev) set
  | T map -> StringListMap.fold map s (fun s tag tree ->
       fold_tree_rec (tag :: rev) s f tree)

let fold_tree tree s f = fold_tree_rec [] s f tree

let rec combine_interps_rec map =
  if is_s_tree map then
    StringListListMap.fold map [] (fun interp tail_tags -> function
        S tag -> ((Xlist.sort (StringSet.to_list tag) compare) :: tail_tags) :: interp
      | _ -> failwith "combine_interps_rec")
  else
    let map = StringListListMap.fold map StringListListMap.empty (fun map tail_tags tree ->
      fold_tree tree map (fun map head_tags tag ->
        StringListListMap.add_inc map ((Xlist.sort (StringSet.to_list tag) compare) :: tail_tags) [head_tags] (fun l -> head_tags :: l))) in
    combine_interps_rec (StringListListMap.map map make_tree)

let combine_interp interp =
  let map = StringListListMap.add StringListListMap.empty [] (make_tree interp) in
  combine_interps_rec map

let combine_pos = StringSet.of_list ["subst"; "depr"; "ppron12"; "ppron3"; "siebie"; "adj"; "num"; "ger"; "praet"; "fin"; "impt"; "imps"; "pcon"; "ppas"; "pact";
  "inf"; "bedzie"; "aglt"; "winien"; "pant"; "prep"]

let combine_interps paths =
  List.rev (Xlist.rev_map paths (fun t ->
    match t.token with
      Lemma(lemma,pos,interp) ->
(*         Printf.printf "%s %s %s\n" lemma pos (PreTokenizer.string_of_interps interp); *)
        let interp =
          if pos = "ppron12" then Xlist.map interp (fun tags -> if Xlist.size tags = 4 then tags @ [["_"]] else tags)
          else interp in
        let interp =
          if StringSet.mem combine_pos pos then combine_interp interp else
          StringListListSet.to_list (StringListListSet.of_list interp) in
        {t with token=Lemma(lemma,pos,interp)}
    | Proper(lemma,pos,interp,cat) ->
      (*         Printf.printf "%s %s %s\n" lemma pos (PreTokenizer.string_of_interps interp); *)
      let interp =
        if pos = "ppron12" then Xlist.map interp (fun tags -> if Xlist.size tags = 4 then tags @ [["_"]] else tags)
        else interp in
      let interp =
        if StringSet.mem combine_pos pos then combine_interp interp else
          StringListListSet.to_list (StringListListSet.of_list interp) in
      {t with token=Proper(lemma,pos,interp,cat)}
    | _ -> t))

(**********************************************************************************)

let select_tokens paths =
  List.rev (Xlist.fold paths [] (fun paths t ->
    match t.token with
(*      RomanDig(v,cat) -> {t with token=Lemma(v,cat,[[]])} :: paths
    | Interp orth -> {t with token=Lemma(orth,"interp",[[]])} :: paths
    | Dig(value,cat) -> {t with token=Lemma(value,cat,[[]])} :: paths
    | Other2 orth -> {t with token=Lemma(orth,"unk",[[]])} :: paths
    | Lemma(lemma,cat,interp) -> t :: paths
    | Proper _ -> failwith "select_tokens"
    | Compound _ -> t :: paths*)
(*       RomanDig(v,cat) -> t :: paths *)
    | Interp orth -> t :: paths
(*     | Dig(value,cat) -> t :: paths *)
    | Other orth -> t :: paths
    | Lemma(lemma,pos,interp) -> if pos = "brev" then paths else t :: paths
    | Proper(lemma,pos,interp,cat) -> if pos = "brev" then paths else t :: paths
(*     | Compound _ -> t :: paths *)
    | _ -> paths))

let load_proper_name proper = function
    [lemma; types] ->
    let types = Str.split (Str.regexp "|") types in
    StringMap.add_inc proper lemma types (fun types2 -> types @ types2)
  | l -> failwith ("proper_names: " ^ String.concat " " l)

let load_proper_names filename proper =
  File.fold_tab filename proper load_proper_name

let proper_names =
  let proper = File.catch_no_file (load_proper_names proper_names_filename) StringMap.empty in
  let proper = File.catch_no_file (load_proper_names proper_names_filename2) proper in
  let proper = File.catch_no_file (load_proper_names proper_names_filename3) proper in
  proper

let remove l s =
  Xlist.fold l [] (fun l t ->
      if s = t then l else t :: l)

let find_proper_names t =
  match t.token with
    Lemma(lemma,pos,interp) ->
    if StringMap.mem proper_names lemma then
      {t with token=Proper(lemma,pos,interp,StringMap.find proper_names lemma);
              attrs=remove t.attrs "notvalidated proper"} else
    if Xlist.mem t.attrs "notvalidated proper" then
      {t with token=Proper(lemma,pos,interp,[])}
    else t
  | _ -> t

let parse query =
  let l = ENIAMtokenizer.parse query in
(*   print_endline "a6"; *)
  let paths = ENIAMpaths.translate_into_paths l in
(*   print_endline "a7"; *)
  let paths = ENIAMpaths.lemmatize paths in
(*   print_endline "a8"; *)
  let paths,_ = ENIAM_MWE.process paths in
(*   print_endline "a12"; *)
(*  let paths = find_proper_names paths in*)
  let paths =  List.rev (Xlist.rev_map paths find_proper_names) in
(*   print_endline "a13"; *)
  let paths = modify_weights paths in
  let paths = translate_digs paths in
(*  let paths = assign_senses paths in
(*   print_endline "a14"; *)
  let paths = assign_valence paths in*)
(*   print_endline "a15"; *)
  let paths = combine_interps paths in
(*   print_endline "a16"; *)
(*  let paths = disambiguate_senses paths in
  let paths = assign_simplified_valence paths in
  let paths = PreSemantics.assign_semantics paths in*)
(*   print_endline "a16"; *)
  let paths = select_tokens paths in
(*   print_endline "a17"; *)
(*   print_endline "a18"; *)
  paths(*, next_id*)

let parse_text_tokens tokens query =
  (* print_endline ("parse_text_tokens: " ^ query); *)
  let paragraphs = Xstring.split "\n\\|\r" query in
  let paragraphs = List.rev (Xlist.fold paragraphs [] (fun l -> function "" -> l | s -> s :: l)) in
  let n = if Xlist.size paragraphs = 1 then 0 else 1 in
  let paragraphs,_ = Xlist.fold paragraphs ([],n) (fun (paragraphs,n) paragraph ->
      let paths = parse paragraph in
      (* print_endline "parse_text 1"; *)
      let pid = if n = 0 then "" else string_of_int n ^ "_" in
      let sentences = ENIAMsentences.split_into_sentences pid paragraph tokens paths in
      (AltParagraph[Raw,RawParagraph paragraph; Struct,StructParagraph sentences]) :: paragraphs, n+1) in
  AltText[Raw,RawText query; Struct,StructText(List.rev paragraphs)], tokens

let parse_text query =
  (* print_endline ("parse_text: " ^ query); *)
  let tokens = ExtArray.make 100 empty_token_env in
  let _ = ExtArray.add tokens empty_token_env in (* id=0 jest zarezerwowane dla pro; FIXME: czy to jest jeszcze aktualne? *)
  parse_text_tokens tokens query