ENIAM_MWE.ml 12.2 KB
(*
 *  ENIAMsubsyntax: MWE, abbreviation and sentence detecion for Polish
 *  Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
 *  Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
 *
 *  This library is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *)

open Xstd
open ENIAMsubsyntaxTypes
open ENIAMtokenizerTypes

let load_dict dict filename =
  File.fold_tab filename dict (fun dict -> function
      [orth; lemma; interp] ->
        let s = List.hd (Str.split_delim (Str.regexp " ") orth) in
        StringMap.add_inc dict s [orth,lemma,interp] (fun l -> (orth,lemma,interp) :: l)
    | l -> failwith ("load_mwe_dict '" ^ String.concat "\t" l ^ "'"))

let mwe_dict =
  let dict = load_dict StringMap.empty brev_filename in
  let dict = try load_dict dict fixed_filename with _ -> (prerr_endline ("ENIAMsubsyntax file " ^ fixed_filename ^ " not found"); dict) in
(*    let dict = load_dict dict complete_entries_filename in*)
  let dict = load_dict dict mwe_filename in
  dict

let preselect_dict orths dict =
  StringSet.fold orths [] (fun rules orth ->
    try
      let l = StringMap.find dict orth in
      Xlist.fold l rules (fun rules (orth,lemma,interp) ->
               (* print_endline ("preselect_dict: " ^ orth); *)
               let match_list = Str.split (Str.regexp " ") orth in
               let b = Xlist.fold match_list true (fun b s ->
                   (* if not (StringSet.mem orths s) then print_endline s; *)
                   StringSet.mem orths s && b) in
               if b then (match_list,lemma,interp) :: rules else rules)
    with Not_found -> rules)


(*
type matching = {
  prefix: tokens list;
  matched: token_record list;
  suffix: tokens list;
  pattern: pat list;
  command: token_record list -> token;
  last: int
  }

let rec find_abr_pattern_tail matchings found = function
    [] -> found
  | token :: l ->
      let matchings,found = Xlist.fold matchings ([],found) (fun (matchings,found) matching ->
        match matching.pattern with
          [pat] ->
            let matchings = if token.beg <= matching.last then matching :: matchings else matchings in
            if PrePatterns.match_token (pat,token.token) && token.beg = matching.last then
              matchings, {matching with matched = token :: matching.matched; last=token.next; pattern=[]} :: found else
            matchings, found
        | pat :: pattern ->
            let matchings = if token.beg <= matching.last then matching :: matchings else matchings in
            if PrePatterns.match_token (pat,token.token) && token.beg = matching.last then
              {matching with matched = token :: matching.matched; last=token.next; pattern=pattern} :: matchings, found else
            matchings, found
        | [] -> matchings, matching :: found) in
      if matchings = [] then found else find_abr_pattern_tail matchings found l

let rec find_abr_pattern all_matchings found = function
    token :: l ->
      let matchings = Xlist.fold all_matchings [] (fun matchings matching ->
        match matching.pattern with
          pat :: pattern ->
            (if PrePatterns.match_token (pat,token.token) then
              [{matching with matched = token :: matching.matched; last=token.next; pattern=pattern}] else []) @ matchings
        | _ -> failwith "find_abr_pattern: ni") in
      let found = if matchings = [] then found else find_abr_pattern_tail matchings found l in
      find_abr_pattern all_matchings found l
  | [] -> found

let rec make_abr_orth = function
    [] -> ""
  | [t] -> t.orth
  | t :: l -> if t.beg + t.len = t.next then t.orth ^ (make_abr_orth l) else t.orth ^ " " ^ (make_abr_orth l)

let find_abr_patterns patterns tokens =
  let found = find_abr_pattern (Xlist.map patterns (fun pattern ->
    {prefix=[]; matched=[]; suffix=[]; pattern=pattern; command=(fun _ -> Symbol ""); last=0})) [] tokens in
  Xlist.rev_map found (fun matching ->
    let t1 = List.hd (List.rev matching.matched) in
    let t2 = List.hd matching.matched in
    t1.beg,
    t2.beg + t2.len - t1.beg,
    t2.next,
    make_abr_orth (List.rev matching.matched))

let split_interp line gloss interp =
  if interp = "xxx" then [gloss, "xxx"] else
  Xlist.map (Str.split (Str.regexp " ") interp) (fun s ->
    match Str.split (Str.regexp "|") s with
        [lemma;interp] -> lemma, interp
      | _ -> failwith ("bad brev entry: " ^ line))

let load_brev_dict () =
  let lines = File.load_lines "data/brev_20151215.tab" in
  List.rev (Xlist.rev_map lines (fun line ->
    match Str.split_delim (Str.regexp "\t") line with
      [_; orth; gloss; interp; _] -> Str.split (Str.regexp " ") orth, split_interp line gloss interp
    | [_; orth; gloss; interp] -> Str.split (Str.regexp " ") orth, split_interp line gloss interp
    | _ -> failwith ("load_brev_dict: " ^ line)))

let parse_lemma lemma =
  if lemma = ":" then lemma,"" else
  match Str.split (Str.regexp ":") lemma with
    [x] -> x,""
  | [x;y] -> x,y
  | _ -> failwith ("parse_lemma: " ^ lemma)

let make_orths orth beg len lexeme_postags_list =
  let n = Xlist.size lexeme_postags_list in
  let orth_list =
    if n = 1 then [orth,beg,len] else
    List.rev (Int.fold 1 n [] (fun l i ->
      (orth ^ "_" ^ string_of_int i,
       (if i=1 then beg else beg+len-n+i-1),
       if i=1 then len-n+1 else 1) :: l)) in
  List.rev (Xlist.fold (List.combine orth_list lexeme_postags_list) [] (fun orth_list ((orth,beg,len),(lemma,postags)) ->
    (orth, fst (parse_lemma lemma), ENIAMtokens.parse_postags postags, beg, len) :: orth_list))

let brev_dict = load_brev_dict ()

(* FIXME: trzeba zmienić reprezentację skrótów nazw własnych: przenieść do mwe,
   Gdy skrót jest częścią nazwy własnej powinien być dalej przetwarzalny *)
let process_brev paths (*tokens*) = paths
(*  let paths = Xlist.fold brev_dict paths (fun paths (pattern,lexeme_postags_list) ->
    let matchings_found = find_abr_patterns [Xlist.map pattern (fun pat -> O pat)] tokens in
    Xlist.fold matchings_found paths (fun paths (beg,len,next,orth) ->
      let orths = make_orths orth beg len lexeme_postags_list in
      ENIAMpaths.add_path paths beg next orths)) in
  paths*)

let rec preselect_mwe_dict_token set = function
    SmallLetter orth -> StringSet.add set orth
  | CapLetter(orth,lc) -> StringSet.add set orth
  | AllSmall orth -> StringSet.add set orth
  | AllCap(orth,lc,lc2) -> StringSet.add set orth
  | FirstCap(orth,lc,_,_) -> StringSet.add set orth
  | SomeCap orth -> StringSet.add set orth
  | Symbol orth  -> StringSet.add set orth
  | Dig(v,"dig") -> StringSet.add set v
  | Other2 orth  -> StringSet.add set orth
  | _ -> set

let rec preselect_mwe_dict_tokens set = function
    Token t -> preselect_mwe_dict_token set t.token
  | Seq l -> Xlist.fold l set preselect_mwe_dict_tokens
  | Variant l -> Xlist.fold l set preselect_mwe_dict_tokens

let preselect_mwe_dict mwe_dict tokens =
  let set = Xlist.fold tokens StringSet.empty preselect_mwe_dict_tokens in
  let set = StringSet.fold set StringSet.empty (fun set orth ->
    try
      let l = StringMap.find mwe_dict orth in
      Xlist.fold l set StringSet.add
    with Not_found -> set) in
(*   StringSet.iter set print_endline; *)
  StringSet.fold set [] (fun l s ->
    match Str.split_delim (Str.regexp "\t") s with
      [lemma; interp; sense] ->
        (match Str.split_delim (Str.regexp ":") interp with
           orths :: tags -> (Str.split (Str.regexp " ") orths, lemma, String.concat ":" tags, sense) :: l
         | _ -> failwith "preselect_mwe_dict")
    | _ -> failwith "preselect_mwe_dict")

let simplify_lemma lemma =
         match Str.split (Str.regexp "-") lemma with
          [x;"1"] -> x
        | [x;"2"] -> x
        | [x;"3"] -> x
        | [x;"4"] -> x
        | [x;"5"] -> x
        | _ -> lemma

let mwe_dict = load_mwe_dict ()

let process_mwe paths (*tokens*) = paths
(*  let mwe_dict = preselect_mwe_dict mwe_dict tokens  in
  let paths = Xlist.fold mwe_dict paths (fun paths (pattern,lexeme,interp,sense) ->
    let matchings_found = find_abr_patterns [Xlist.map pattern (fun pat -> O pat)] tokens in
    Xlist.fold matchings_found paths (fun paths (beg,len,next,orth) ->
      let orths = make_orths orth beg len [simplify_lemma lexeme,interp] in
      ENIAMpaths.add_path paths beg next orths)) in
  paths*)
*)

let get_orths paths =
  IntMap.fold paths StringSet.empty (fun orths _ map ->
    IntMap.fold map orths (fun orths _ l ->
      Xlist.fold l orths (fun orths t ->
        StringSet.add orths (ENIAMtokens.get_orth t.token))))

let get_intnum_orths paths =
  IntMap.fold paths StringMap.empty (fun orths _ map ->
    IntMap.fold map orths (fun orths _ l ->
      Xlist.fold l orths (fun orths t ->
        match t.token with
          Dig(lemma,"intnum") -> StringMap.add_inc orths (ENIAMtokens.get_orth t.token) (StringSet.singleton lemma) (fun set -> StringSet.add set lemma)
        | _ -> orths)))

let rec match_path_rec map found (t:token_env) rev = function
    [] -> (t :: rev) :: found
  | s :: l ->
     let map2 = try IntMap.find map t.next with Not_found -> IntMap.empty in
     let found2 = IntMap.fold map2 [] (fun found2 _ l ->
       Xlist.fold l found2 (fun found2 new_t ->
         if ENIAMtokens.get_orth new_t.token = s then new_t :: found2 else found2)) in
     Xlist.fold found2 found (fun found new_t -> match_path_rec map found new_t (t :: rev) l)

let match_path map = function
    [] -> failwith "match_path"
  | s :: l ->
     let found = IntMap.fold map [] (fun found i map2 ->
       IntMap.fold map2 found (fun found j l ->
         Xlist.fold l found (fun found t ->
           if ENIAMtokens.get_orth t.token = s then t :: found else found))) in
     Xlist.fold found [] (fun found t -> match_path_rec map found t [] l)

let concat_orths l =
  let s = String.concat "" (Xlist.map l (fun t -> t.orth ^ (if t.beg+t.len=t.next then "" else " "))) in
  let n = Xstring.size s in
  if String.get s (n-1) = ' ' then String.sub s 0 (n-1) else s

let create_token (matching:token_env list) lemma interp = (* FIXME: problem z nazwami własnymi *)
  let l = List.rev matching in
  let beg = (List.hd l).beg in
  let t = List.hd matching in
  let len = t.beg + t.len - beg in
   {empty_token_env with
    orth=concat_orths l;
    beg=beg;
    len=len;
    next=t.next;
    token=ENIAMtokens.make_lemma (lemma,interp);
    weight=0.; (* FIXME: dodać wagi do konkretnych reguł i uwzględnić wagi maczowanych tokenów *)
    attrs=ENIAMtokens.merge_attrs l}

let add_token paths t =
  let map = try IntMap.find paths t.beg with Not_found -> IntMap.empty in
  let map = IntMap.add_inc map t.next [t] (fun l -> t :: l) in
  IntMap.add paths t.beg map

let apply_rule paths (match_list,lemma,interp) =
  (* print_endline ("apply_rule: " ^ lemma); *)
  let matchings_found = match_path paths match_list in
  Xlist.fold matchings_found paths (fun paths matching ->
    try
      let token = create_token matching lemma interp in
      add_token paths token
    with Not_found -> paths)

(* FIXME: reguły dla ordnum powinny maczować część mowy a nie tylko orth *)
let add_ordnum_rules rules paths =
  let orths = get_intnum_orths paths in
  StringMap.fold orths rules (fun rules orth lemmas ->
    StringSet.fold lemmas rules (fun rules lemma ->
      (* Printf.printf "%s %s\n%!" orth lemma; *)
      ([orth;"."],lemma,"ordnum") :: rules))

let process (paths,last) =
  let paths = Xlist.fold paths IntMap.empty add_token in
  let orths = get_orths paths in
  let rules = preselect_dict orths mwe_dict in
  let rules = add_ordnum_rules rules paths in
  let paths = Xlist.fold rules paths apply_rule in
  let paths = IntMap.fold paths [] (fun paths _ map ->
    IntMap.fold map paths (fun paths _ l ->
      Xlist.fold l paths (fun paths t ->
        t :: paths))) in
  ENIAMpaths.sort (paths,last)