fonetics.ml 5.9 KB
open Xstd
open Printf

type status = Idle | Symbols | Rules | RevSymbols | RevRules
type rule = {set: string; find: string; suf: string}

module CharTree = struct

  type t = M of t CharMap.t * rule list

  let empty = M(CharMap.empty,[])

  let rec add_path_rules rule orth i (M(map,rules)) =
    if i = String.length orth then M(map,rule :: rules) else
    let tree = try CharMap.find map (String.get orth i) with Not_found -> empty in
    let tree = add_path_rules rule orth (i+1) tree in
    M(CharMap.add map (String.get orth i) tree,rules)

  let create rules =
    let tree = Xlist.fold rules empty (fun tree rule ->
      add_path_rules rule (rule.find ^ rule.suf) 0 tree) in
    tree

  let rec find_rec l i orth (M(map,rules)) =
    if i = String.length orth then Xlist.fold rules l (fun l rule -> ("", rule) :: l) else
    let l = try find_rec l (i+1) orth (CharMap.find map (String.get orth i)) with Not_found -> l in
    Xlist.fold rules l (fun l rule -> (String.sub orth i (String.length orth - i), rule) :: l)

  let find tree orth =
    let found = find_rec [] 0 orth tree in
    (* printf "%d\n%!" (Xlist.size found); *)
    (* Xlist.iter found (fun (stem,rule) -> printf "F %s\t%s\n" stem (string_of_rule rule)); *)
    found

(*  let add_char c rule =
    let s = String.make 1 c in
    {rule with find=s ^ rule.find; set=s ^ rule.set}

  let rec disjoint_rec super (M(map,rules)) =
    let rules = rules @ super in
    if CharMap.is_empty map then M(map,rules) else
    M(CharMap.mapi map (fun c tree ->
      disjoint_rec (Xlist.rev_map rules (add_char c)) tree),[])

  let disjoint trees =
    Xlist.rev_map trees (fun (pref,tree) ->
      pref, disjoint_rec [] tree)

  let rec print_rules_rec file (M(map,rules)) =
    Xlist.iter rules (print_rule file);
    CharMap.iter map (fun _ tree -> print_rules_rec file tree)

  let print_rules filename trees =
    File.file_out filename (fun file ->
      Xlist.iter trees (fun (_,tree) ->
        print_rules_rec file tree))*)

end

let load_rules filename =
  let status,symbol_defs,rev_symbol_defs,rules,rev_rules =
    File.fold_tab filename (Idle,StringMap.empty,StringMap.empty,[],[]) (fun (status,symbol_defs,rev_symbol_defs,rules,rev_rules) -> function
        ["@symbols"] -> Symbols,symbol_defs,rev_symbol_defs,rules,rev_rules
      | ["@rev_symbols"] -> RevSymbols,symbol_defs,rev_symbol_defs,rules,rev_rules
      | ["@rules"] -> Rules,symbol_defs,rev_symbol_defs,rules,rev_rules
      | ["@rev_rules"] -> RevRules,symbol_defs,rev_symbol_defs,rules,rev_rules
      | [key;vals] ->
           (match status with
             Symbols -> status, StringMap.add symbol_defs key (Xstring.split " " vals), rev_symbol_defs, rules, rev_rules
           | RevSymbols -> status, symbol_defs, StringMap.add rev_symbol_defs key (Xstring.split " " vals), rules, rev_rules
           | _ -> failwith "Fonetics.load_rules: status 1")
    | [v;r;s] ->
           (match status with
             Rules -> status, symbol_defs, rev_symbol_defs, {set=v; find=r; suf=s} :: rules, rev_rules
           | RevRules -> status, symbol_defs, rev_symbol_defs, rules, {set=r; find=v; suf=s} :: rev_rules
           | _ -> failwith "Fonetics.load_rules: status 2")
    | line -> failwith ("load_rules: " ^ (String.concat "\t" line))) in
  if status <> Rules && status <> RevRules then failwith "Fonetics.load_rules: status 3" else
  symbol_defs, rev_symbol_defs, rules, rev_rules

let prepare_rules symbol_defs rules =
  (* let symbol_defs = StringMap.map symbol_defs (function
      "ε" -> ""
    | s -> s) in *)
  let rules = List.flatten (Xlist.rev_map rules (fun r ->
    let suf = Xunicode.utf8_chars_of_utf8_string r.suf in
    let suf = Xlist.map suf (fun s ->
      try StringMap.find symbol_defs s with Not_found -> [s]) in
    Xlist.rev_map (Xlist.multiply_list suf) (fun l ->
      {r with suf=String.concat "" l}))) in
  CharTree.create rules

let rules, rev_rules =
  let symbol_defs,rev_symbol_defs,rules,rev_rules =  load_rules "data/fonetics_pl.dic" in
  prepare_rules symbol_defs rules,
  prepare_rules rev_symbol_defs rev_rules

let rec translate_rec closure found rules s =
  if s = "ε" then [List.rev found] else
  let l = CharTree.find rules s in
  (* Xlist.iter l (fun (t,r) ->
      printf "s=%s t=%s set=%s find=%s suf=%s\n%!" s t r.set r.find r.suf); *)
  let l = if l = [] && closure then
    let n = String.length s in
    let c = String.sub s 0 1 in
    [String.sub s 1 (n-1),{find=c; set=c; suf=""}] else l in
  List.flatten (Xlist.rev_map l (fun (t,r) ->
    translate_rec closure (r.set :: found) rules (r.suf ^ t)))

let translate closure rules s =
  (* printf "translate 1: %s\n%!" s; *)
  let ll = translate_rec closure [] rules (s ^ "ε") in
  Xlist.rev_map ll (String.concat "")
(*  let s = String.concat "" l in
  (* printf "translate 2: %s\n%!" s; *)
  s*)

(*let _ = translate rules "blafickie"
let _ = translate rules "blafiacki"
let _ = translate rules "dudzia"
let _ = translate rules "rzódża"
let _ = translate rules "łódź"*)

exception NotFound of string * string
exception NotEqual of string * string * string
exception MulipleSolutions of string * string * string list

let translate_and_check closure rules rev_rules x =
  let l = translate true rules x in
  Xlist.iter l (fun s ->
    match StringSet.to_list (StringSet.of_list (translate true rev_rules s)) with
      [] -> raise (NotFound(x,s))
    | [t] -> if t <> x then raise (NotEqual(x,s,t))
    | l -> raise (MulipleSolutions(x,s,l)));
  l
let translate_and_check closure rules rev_rules x =
  let l = translate closure rules x in
  Xlist.iter l (fun s ->
    match StringSet.to_list (StringSet.of_list (translate closure rev_rules s)) with
      [] -> raise (NotFound(x,s))
    | [t] -> if t <> x then raise (NotEqual(x,s,t))
    | l -> raise (MulipleSolutions(x,s,l)));
  l

let translate_single closure rules x =
  match StringSet.to_list (StringSet.of_list (translate closure rules x)) with
    [] -> raise (NotFound(x,""))
  | [t] -> t
  | l -> raise (MulipleSolutions(x,"",l))