morf.ml 4.68 KB
(*
 *  ENIAM: Categorial Syntactic-Semantic Parser for Polish
 *  Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
 *  Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *)

open Xstd

let load_tab filename =
  let l = Str.split_delim (Str.regexp "\n") (File.load_file filename) in
  List.rev (Xlist.fold l [] (fun l line ->
    if String.length line = 0 then l else
    if String.get line 0 = '#' then l else
    match Str.split (Str.regexp "\t") line with
      orth :: lemma :: interp :: _ -> (orth,lemma,interp) :: l
    | _ -> failwith ("load_tab: " ^ line)))

let load_tab_full filename =
  let l = Str.split_delim (Str.regexp "\n") (File.load_file filename) in
  List.rev (Xlist.fold l [] (fun l line ->
    if String.length line = 0 then l else
    if String.get line 0 = '#' then l else
    match Str.split (Str.regexp "\t") line with
      [orth; lemma; interp] -> (orth,lemma,interp,"","") :: l
    | [orth; lemma; interp; cl] -> (orth,lemma,interp,cl,"") :: l
    | [orth; lemma; interp; cl; cl2] -> (orth,lemma,interp,cl,cl2) :: l
(*     | orth :: lemma :: interp :: cl :: cl2 -> (orth,lemma,interp,cl,String.concat ";" cl2) :: l *)
    | _ -> failwith ("load_tab_full: " ^ line)))

let merge_dicts l =
  Xlist.fold l StringMap.empty (fun dicts tab ->
    Xlist.fold tab dicts (fun dicts (orth,lemma,interp) ->
      let interps = try StringMap.find dicts lemma with Not_found -> StringMap.empty in
      let interps = StringMap.add_inc interps interp [orth] (fun orths ->
        if Xlist.mem orths orth then orths else orth :: orths) in
      StringMap.add dicts lemma interps))

let load_interp_sel filename =
  let l = Str.split_delim (Str.regexp "\n") (File.load_file filename) in
  Xlist.fold l StringMap.empty (fun interp_sel line ->
    if String.length line = 0 then interp_sel else
    if String.get line 0 = '#' then interp_sel else
    match Str.split (Str.regexp "\t") line with
      [group;interp;label] -> StringMap.add_inc interp_sel group [interp,label] (fun l -> (interp,label) :: l)
    | _ -> failwith ("load_interp_sel: " ^ line))


let rec merge_digraph = function
    [] -> []
  | "c" :: "h" :: l -> "ch" :: (merge_digraph l)
  | "c" :: "z" :: l -> "cz" :: (merge_digraph l)
  | "d" :: "z" :: l -> "dz" :: (merge_digraph l)
  | "d" :: "ź" :: l -> "dź" :: (merge_digraph l)
  | "d" :: "ż" :: l -> "dż" :: (merge_digraph l)
  | "r" :: "z" :: l -> "rz" :: (merge_digraph l)
  | "s" :: "z" :: l -> "sz" :: (merge_digraph l)
  | "b" :: "'" :: l -> "b'" :: (merge_digraph l)
  | "f" :: "'" :: l -> "f'" :: (merge_digraph l)
  | s :: l -> s :: (merge_digraph l)

let text_to_chars s = Xunicode.classified_chars_of_utf8_string s
(*  (try UTF8.validate s with UTF8.Malformed_code -> failwith ("Invalid UTF8 string: " ^ s));
  let r = ref [] in
  UTF8.iter (fun c ->
    r := (UTF8.init 1 (fun _ -> c)) :: (!r)) s;
  merge_digraph (List.rev (!r))*)


let check_prefix pat s =
  let n = String.length pat in
  if n > String.length s then false else
  String.sub s 0 n = pat

let cut_prefix pat s =
  let i = String.length pat in
  let n = String.length s in
  if i >= n then "" else
  try String.sub s i (n-i) with _ -> failwith ("cut_prefix: " ^ s ^ " " ^ string_of_int i)

let check_sufix pat s =
  let n = String.length pat in
  let m = String.length s in
  if n > m then false else
  String.sub s (m-n) n = pat

let cut_sufix pat s =
  let i = String.length pat in
  let n = String.length s in
  try String.sub s 0 (n-i) with _ -> failwith ("cut_sufix: " ^ s)

let apply_transform orth (s,t) =
  if check_sufix s orth then cut_sufix s orth ^ t else raise Not_found

let split_colon s =
  match Str.split_delim (Str.regexp ":") s with
    [s] -> s, ""
  | [s;t] -> s, t
  | _ -> failwith "split_colon"

let get_cat s =
  match Str.split_delim (Str.regexp ":") s with
    cat :: _ -> cat
  | _ -> failwith "get_cat"

let select_interps interps interp_sel =
  Xlist.fold interp_sel StringMap.empty (fun new_interps (interp,_) ->
    try
      StringMap.add new_interps interp (StringMap.find interps interp)
    with Not_found -> new_interps)