ENIAM_MWE.ml 13 KB
(*
 *  ENIAMsubsyntax: MWE, abbreviation and sentence detecion for Polish
 *  Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
 *  Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
 *
 *  This library is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *)

open Xstd
open ENIAMsubsyntaxTypes
open ENIAMtokenizerTypes

type sel = V of string | S of string | G

type t =
    L of string * string * sel list
  | O of string
  | D of string * string

let process_interp lemma interp =
  match Xstring.split ":" interp with
    cat :: interp -> L(lemma,cat,Xlist.map interp (function
        "$c" -> S "c"
      | "$n" -> S "n"
      | "$g" -> S "g"
      | "$d" -> S "d"
      | "$C" -> S "C"
      | "_" -> G
      | s -> if String.get s 0 = '$' then failwith ("process_interp: " ^ s) else V s))
  | _ -> failwith "process_interp"

let load_mwe_dict filename dict =
  File.fold_tab filename dict (fun dict -> function
      [orths; lemma; interp] ->
        let orths = Xstring.split " " orths in
        if orths = [] then failwith "load_mwe_dict" else
        let s = List.hd orths in
        let orths = Xlist.map orths (fun s -> O s) in
        let lemma,cat,interp = match process_interp lemma interp with
            L(lemma,cat,interp) -> lemma,cat,interp
          | _ -> failwith "load_mwe_dict2" in
        StringMap.add_inc dict s [orths,lemma,cat,interp] (fun l -> (orths,lemma,cat,interp) :: l)
    | l -> failwith ("load_mwe_dict '" ^ String.concat "\t" l ^ "'"))

(*<<<<<<< HEAD
let load_mwe_dict () =
  let dict = load_dict StringMap.empty brev_filename in
  let dict = try load_dict dict fixed_filename with _ -> (prerr_endline ("ENIAMsubsyntax file " ^ fixed_filename ^ " not found"); dict) in
(*    let dict = load_dict dict complete_entries_filename in*)
  let dict = load_dict dict mwe_filename in
  dict

let mwe_dict = ref (StringMap.empty : (string * string * string) list StringMap.t)

let preselect_dict orths dict =
  StringSet.fold orths [] (fun rules orth ->
    try
      let l = StringMap.find dict orth in
      Xlist.fold l rules (fun rules (orth,lemma,interp) ->
               (* print_endline ("preselect_dict: " ^ orth); *)
               let match_list = Str.split (Str.regexp " ") orth in
               let b = Xlist.fold match_list true (fun b s ->
                   (* if not (StringSet.mem orths s) then print_endline s; *)
                   StringSet.mem orths s && b) in
               if b then (match_list,lemma,interp) :: rules else rules)
    with Not_found -> rules)*)

let process_orth = function
    [Lexer.T lemma; Lexer.B("(",")",[Lexer.T interp])] -> process_interp lemma interp
  | [Lexer.T orth] -> O orth
  | [Lexer.B("{","}",l); Lexer.B("(",")",[Lexer.T interp])] -> process_interp (Lexer.string_of_token_list l) interp
  | [Lexer.B("{","}",l)] -> O(Lexer.string_of_token_list l)
  | tokens -> failwith ("process_orth: " ^ Lexer.string_of_token_list tokens)

let load_mwe_dict2 filename (dict,dict2) =
  File.fold_tab filename (dict,dict2) (fun (dict,dict2) -> function
      [orths; lemma] ->
        (* print_endline (orths ^ "\t" ^ lemma); *)
        let tokens = Lexer.split "(\\|)\\|{\\|}\\| " orths in
        (* print_endline ("load_dict2 1: " ^ Lexer.string_of_token_list tokens); *)
        let tokens = Lexer.find_brackets ["{","}";"(",")"] [] tokens in
        (* print_endline ("load_dict2 2: " ^ Lexer.string_of_token_list tokens); *)
        let orths = List.rev (Xlist.rev_map (Lexer.split_symbol (Lexer.T " ") [] tokens) process_orth) in
        let tokens = Lexer.split "(\\|)\\|{\\|}" lemma in
        (* print_endline ("load_dict2 3: " ^ Lexer.string_of_token_list tokens); *)
        let tokens = Lexer.find_brackets ["{","}";"(",")"] [] tokens in
        (* print_endline ("load_dict2 4: " ^ Lexer.string_of_token_list tokens); *)
        let lemma,cat,interp = match process_orth tokens with
            L(lemma,cat,interp) -> lemma,cat,interp
          | _ -> failwith "load_mwe_dict2" in
        if orths = [] then failwith "load_mwe_dict2" else
        (match List.hd orths with
            L(s,_,_) -> dict, StringMap.add_inc dict2 s [orths,lemma,cat,interp] (fun l -> (orths,lemma,cat,interp) :: l)
          | O s -> StringMap.add_inc dict s [orths,lemma,cat,interp] (fun l -> (orths,lemma,cat,interp) :: l), dict2
          | D _ -> failwith "load_mwe_dict2")
    | l -> failwith ("load_mwe_dict2 '" ^ String.concat "\t" l ^ "'"))

let load_mwe_dicts () =
  let dict = File.catch_no_file (load_mwe_dict brev_filename) StringMap.empty in
  let dict = File.catch_no_file (load_mwe_dict fixed_filename) dict in
  let dict = File.catch_no_file (load_mwe_dict mwe_filename) dict in
  let dict,dict2 = File.catch_no_file (load_mwe_dict2 sejf_filename) (dict,StringMap.empty) in
  let dict,dict2 = File.catch_no_file (load_mwe_dict2 sejfek_filename) (dict,dict2) in
  let dict,dict2 = File.catch_no_file (load_mwe_dict2 sawa_filename) (dict,dict2) in
  let dict,dict2 = File.catch_no_file (load_mwe_dict2 mwe2_filename) (dict,dict2) in
  dict,dict2

let mwe_dict = ref (StringMap.empty : (t list * string * string * sel list) list StringMap.t)
let mwe_dict2 = ref (StringMap.empty : (t list * string * string * sel list) list StringMap.t)

let get_orths paths =
  IntMap.fold paths StringSet.empty (fun orths _ map ->
    IntMap.fold map orths (fun orths _ l ->
      TokenEnvSet.fold l orths (fun orths t ->
        StringSet.add orths (ENIAMtokens.get_orth t.token))))

let get_lemmas paths =
  IntMap.fold paths StringSet.empty (fun orths _ map ->
    IntMap.fold map orths (fun orths _ l ->
      TokenEnvSet.fold l orths (fun orths t ->
        StringSet.add orths (ENIAMtokens.get_lemma t.token))))

let get_intnum_orths paths =
  IntMap.fold paths StringMap.empty (fun orths _ map ->
    IntMap.fold map orths (fun orths _ l ->
      TokenEnvSet.fold l orths (fun orths t ->
        match t.token with
          Dig(lemma,"intnum") -> StringMap.add_inc orths (ENIAMtokens.get_orth t.token) (StringSet.singleton lemma) (fun set -> StringSet.add set lemma)
        | _ -> orths)))

let preselect orths lemmas rules l =
  Xlist.fold l rules (fun rules (match_list,lemma,cat,interp) ->
    let b = Xlist.fold match_list true (fun b -> function
        O s -> StringSet.mem orths s && b
      | L(s,_,_) -> StringSet.mem lemmas s && b
      | D(s,_) -> failwith "preselect") in
    if b then (Xlist.size match_list > 1,match_list,lemma,cat,interp) :: rules else rules)

let preselect_dict orths lemmas dict rules =
  StringSet.fold orths rules (fun rules orth ->
    try
      preselect orths lemmas rules (StringMap.find dict orth)
    with Not_found -> rules)

let preselect_dict2 orths lemmas dict2 rules =
  StringSet.fold lemmas rules (fun rules lemma ->
    try
      preselect orths lemmas rules (StringMap.find dict2 lemma)
    with Not_found -> rules)

let add_ordnum_rules orths rules =
  StringMap.fold orths rules (fun rules orth lemmas ->
    StringSet.fold lemmas rules (fun rules lemma ->
      (* Printf.printf "%s %s\n%!" orth lemma; *)
      (false,[D(orth,"intnum");O "."],lemma,"ordnum",[]) :: rules))

let select_rules paths mwe_dict mwe_dict2 =
  let orths = get_orths paths in
  let lemmas = get_lemmas paths in
  let intnum_orths = get_intnum_orths paths in
  let rules = preselect_dict orths lemmas mwe_dict [] in
  let rules = preselect_dict2 orths lemmas mwe_dict2 rules in
  let rules = add_ordnum_rules intnum_orths rules in
  rules

let rec check_interp sels = function
    [],[] -> true
  | s :: interp, ["_"] :: interp2 -> check_interp sels (interp,interp2)
  | V s :: interp, l2 :: interp2 -> if Xlist.mem l2 s then check_interp sels (interp,interp2) else false
  | S s :: interp, l2 :: interp2 ->
      (try
        let l = Xlist.assoc sels s in
        let b = Xlist.fold l false (fun b s -> Xlist.mem l2 s || b) in
        if b then check_interp sels (interp,interp2) else false
      with Not_found -> check_interp sels (interp,interp2))
  | G :: interp, l2 :: interp2 -> check_interp sels (interp,interp2)
  | _ -> failwith "check_interp"

let rec get_sels sels = function
    [],[] -> sels
  | s :: interp, ["_"] :: interp2 -> get_sels sels (interp,interp2)
  | V s :: interp, l2 :: interp2 -> get_sels sels (interp,interp2)
  | S s :: interp, l2 :: interp2 ->
      (try
        let l = Xlist.assoc sels s in
        let sels = List.remove_assoc s sels in
        let l = Xlist.fold l [] (fun l s -> if Xlist.mem l2 s then s :: l else l) in
        get_sels ((s,l) :: sels) (interp,interp2)
      with Not_found -> get_sels ((s,l2) :: sels) (interp,interp2))
  | G :: interp, l2 :: interp2 -> get_sels sels (interp,interp2)
  | _ -> failwith "get_sels"

let rec match_path_rec map found (t:token_env) sels rev = function
    [] -> (t :: rev, sels) :: found
  | s :: l ->
     let map2 = try IntMap.find map t.next with Not_found -> IntMap.empty in
     let found2 = IntMap.fold map2 [] (fun found2 _ l ->
       TokenEnvSet.fold l found2 (fun found2 new_t ->
           match s,new_t.token with
             O s, token -> if ENIAMtokens.get_orth token = s then (new_t,sels) :: found2 else found2
           | L(s,cat,interp), Lemma(s2,cat2,interps2) ->
               Xlist.fold interps2 found2 (fun found2 interp2 ->
                 if s=s2 && cat=cat2 && check_interp sels (interp,interp2) then
                   (new_t,get_sels sels (interp,interp2)) :: found2 else found2)
           | D(s,cat), Dig(s2,cat2) -> if s=s2 && cat=cat2 then (new_t,sels) :: found2 else found2
           | _ -> found2)) in
     Xlist.fold found2 found (fun found (new_t,sels) -> match_path_rec map found new_t sels (t :: rev) l)

let match_path map = function
    [] -> failwith "match_path"
  | s :: l ->
     let found = IntMap.fold map [] (fun found i map2 ->
       IntMap.fold map2 found (fun found j l ->
         TokenEnvSet.fold l found (fun found t ->
           match s,t.token with
             O s, token -> if ENIAMtokens.get_orth token = s then (t,[]) :: found else found
           | L(s,cat,interp), Lemma(s2,cat2,interps2) ->
               Xlist.fold interps2 found (fun found interp2 ->
                 if s=s2 && cat=cat2 && check_interp [] (interp,interp2) then
                   (t,get_sels [] (interp,interp2)) :: found else found)
           | D(s,cat), Dig(s2,cat2) -> if s=s2 && cat=cat2 then (t,[]) :: found else found
           | _ -> found))) in
     Xlist.fold found [] (fun found (t,sels) -> match_path_rec map found t sels [] l)

let concat_orths l =
  let s = String.concat "" (Xlist.map l (fun t -> t.orth ^ (if t.beg+t.len=t.next then "" else " "))) in
  let n = Xstring.size s in
  if String.get s (n-1) = ' ' then String.sub s 0 (n-1) else s

let create_token is_mwe (matching:token_env list) sels lemma cat interp = (* FIXME: problem z nazwami własnymi *)
  let l = List.rev matching in
  let beg = (List.hd l).beg in
  let t = List.hd matching in
  let len = t.beg + t.len - beg in
   {empty_token_env with
    orth=concat_orths l;
    beg=beg;
    len=len;
    next=t.next;
    token=Lemma(lemma,cat,[Xlist.map interp (function
        S s -> (try Xlist.assoc sels s with Not_found -> ["_"])
      | V s -> Xstring.split "\\." s
      | G -> ["_"])]);
    weight=0.; (* FIXME: dodać wagi do konkretnych reguł i uwzględnić wagi maczowanych tokenów *)
    attrs=(if is_mwe then [MWE ]else []) @ ENIAMtokens.merge_attrs l}

let add_token paths t =
  let map = try IntMap.find paths t.beg with Not_found -> IntMap.empty in
  let map = IntMap.add_inc map t.next (TokenEnvSet.singleton t) (fun set -> TokenEnvSet.add set t) in
  IntMap.add paths t.beg map

let apply_rule paths (is_mwe,match_list,lemma,cat,interp) =
  (* print_endline ("apply_rule: " ^ lemma); *)
  let matchings_found = match_path paths match_list in
  Xlist.fold matchings_found paths (fun paths (matching,sels) ->
    try
      let token = create_token is_mwe matching sels lemma cat interp in
      add_token paths token
    with Not_found -> paths)

let process (paths,last) =
  let paths = Xlist.fold paths IntMap.empty add_token in
  let rules = select_rules paths !mwe_dict !mwe_dict2 in
  let paths = Xlist.fold rules paths apply_rule in
  let rules = select_rules paths !mwe_dict !mwe_dict2 in
  let paths = Xlist.fold rules paths apply_rule in
  let rules = select_rules paths !mwe_dict !mwe_dict2 in
  let paths = Xlist.fold rules paths apply_rule in
  let rules = select_rules paths !mwe_dict !mwe_dict2 in
  let paths = Xlist.fold rules paths apply_rule in
  let paths = IntMap.fold paths [] (fun paths _ map ->
    IntMap.fold map paths (fun paths _ l ->
      TokenEnvSet.fold l paths (fun paths t ->
        t :: paths))) in
  ENIAMpaths.sort (paths,last)