fonetics.ml 6.15 KB
open Xstd
open Printf

type status = Idle | Symbols | Rules | RevSymbols | RevRules
type rule = {set: string; find: string; suf: string}

module CharTree = struct

  type t = M of t CharMap.t * rule list

  let empty = M(CharMap.empty,[])

  let rec add_path_rules rule orth i (M(map,rules)) =
    if i = String.length orth then M(map,rule :: rules) else
    let tree = try CharMap.find map (String.get orth i) with Not_found -> empty in
    let tree = add_path_rules rule orth (i+1) tree in
    M(CharMap.add map (String.get orth i) tree,rules)

  let create rules =
    let tree = Xlist.fold rules empty (fun tree rule ->
      add_path_rules rule (rule.find ^ rule.suf) 0 tree) in
    tree

  let rec find_rec l i orth (M(map,rules)) =
    if i = String.length orth then Xlist.fold rules l (fun l rule -> ("", rule) :: l) else
    let l = try find_rec l (i+1) orth (CharMap.find map (String.get orth i)) with Not_found -> l in
    Xlist.fold rules l (fun l rule -> (String.sub orth i (String.length orth - i), rule) :: l)

  let find tree orth =
    let found = find_rec [] 0 orth tree in
    (* printf "%d\n%!" (Xlist.size found); *)
    (* Xlist.iter found (fun (stem,rule) -> printf "F %s\t%s\n" stem (string_of_rule rule)); *)
    found

end

let load_rules filename =
  let status,symbol_defs,rev_symbol_defs,rules,rev_rules =
    File.fold_tab filename (Idle,StringMap.empty,StringMap.empty,[],[]) (fun (status,symbol_defs,rev_symbol_defs,rules,rev_rules) -> function
        ["@symbols"] -> Symbols,symbol_defs,rev_symbol_defs,rules,rev_rules
      | ["@rev_symbols"] -> RevSymbols,symbol_defs,rev_symbol_defs,rules,rev_rules
      | ["@rules"] -> Rules,symbol_defs,rev_symbol_defs,rules,rev_rules
      | ["@rev_rules"] -> RevRules,symbol_defs,rev_symbol_defs,rules,rev_rules
      | [key;vals] ->
           (match status with
             Symbols -> status, StringMap.add symbol_defs key (Xstring.split " " vals), rev_symbol_defs, rules, rev_rules
           | RevSymbols -> status, symbol_defs, StringMap.add rev_symbol_defs key (Xstring.split " " vals), rules, rev_rules
           | _ -> failwith "Fonetics.load_rules: status 1")
    | [v;r;s] ->
           (match status with
             Rules -> status, symbol_defs, rev_symbol_defs, {set=v; find=r; suf=s} :: rules, rev_rules
           | RevRules -> status, symbol_defs, rev_symbol_defs, rules, {set=r; find=v; suf=s} :: rev_rules
           | _ -> failwith "Fonetics.load_rules: status 2")
    | line -> failwith ("load_rules: " ^ (String.concat "\t" line))) in
  if status <> Rules && status <> RevRules then failwith "Fonetics.load_rules: status 3" else
  symbol_defs, rev_symbol_defs, rules, rev_rules

let prepare_rules symbol_defs rules =
  let rules = List.flatten (Xlist.rev_map rules (fun r ->
    let suf = Xunicode.utf8_chars_of_utf8_string r.suf in
    let suf = Xlist.map suf (fun s ->
      try StringMap.find symbol_defs s with Not_found -> [s]) in
    Xlist.rev_map (Xlist.multiply_list suf) (fun l ->
      {r with suf=String.concat "" l}))) in
  CharTree.create rules

let rules, rev_rules =
  let symbol_defs,rev_symbol_defs,rules,rev_rules =  load_rules "data/fonetics_pl.dic" in
  prepare_rules symbol_defs rules,
  prepare_rules rev_symbol_defs rev_rules

let sufs = ["ω"; "iκ"; "ρ"; "δ"; "λ"; "i"; "zi"]

let make_key s =
  match Xunicode.utf8_chars_of_utf8_string s with
    ["ʒ"] -> "d"
  | ["ř"] -> "r"
  | [c] -> c
  | ["d";"j"] -> "dj"
  | ["d";"ʲ"] -> "dj"
  | ["t";"j"] -> "tj"
  | ["t";"ʲ"] -> "tj"
  | [c;"′"] -> c
  | [c;"j"] -> c
  | [c;"ʲ"] -> c
  | [c;"ʲ";"j"] -> c
  | [c;"′";"j"] -> c
  | _ -> failwith ("make_key: " ^ s)

let latex_escape_char = function
    "′" -> "$'$"
  | "ʲ" -> "\\textipa{\\super{j}}"
  | "ʒ" -> "\\textipa{Z}"
  | "ǯ" -> "\\textipa{\\v{Z}}"
  | c -> c

let latex_of_rule rule =
  String.concat "" (Xlist.map (Xunicode.utf8_chars_of_utf8_string rule.set) latex_escape_char) ^ " $\\leftarrow$ " ^ rule.find

let latex_of_rules filename =
  let symbol_defs,rev_symbol_defs,rules,rev_rules =  load_rules filename in
  let map =  Xlist.fold rules StringMap.empty (fun map rule ->
    let key = make_key rule.set in
    let map2 = try StringMap.find map key with Not_found -> StringMap.empty in
    if not (Xlist.mem sufs rule.suf) then failwith ("latex_of_rules: " ^ rule.suf) else
    let map2 = StringMap.add_inc map2 rule.suf rule (fun rule2 -> failwith ("latex_of_rules: " ^ key)) in
    StringMap.add map key map2) in
  StringMap.iter map (fun _ map2 ->
    let line = Xlist.map sufs (fun suf ->
      try latex_of_rule (StringMap.find map2 suf) with Not_found -> "") in
    print_endline (String.concat " & " line ^ "\\\\"))

let rec translate_rec closure found rules s =
  if s = "ε" then [List.rev found] else
  let l = CharTree.find rules s in
  (* Xlist.iter l (fun (t,r) ->
      printf "s=%s t=%s set=%s find=%s suf=%s\n%!" s t r.set r.find r.suf); *)
  let l = if l = [] && closure then
    let n = String.length s in
    let c = String.sub s 0 1 in
    [String.sub s 1 (n-1),{find=c; set=c; suf=""}] else l in
  List.flatten (Xlist.rev_map l (fun (t,r) ->
    translate_rec closure (r.set :: found) rules (r.suf ^ t)))

let translate closure rules s =
  (* printf "translate 1: %s\n%!" s; *)
  let ll = translate_rec closure [] rules (s ^ "ε") in
  Xlist.rev_map ll (String.concat "")
(*  let s = String.concat "" l in
  (* printf "translate 2: %s\n%!" s; *)
  s*)

(*let _ = translate rules "blafickie"
let _ = translate rules "blafiacki"
let _ = translate rules "dudzia"
let _ = translate rules "rzódża"
let _ = translate rules "łódź"*)

exception NotFound of string * string
exception NotEqual of string * string * string
exception MulipleSolutions of string * string * string list

let translate_and_check closure rules rev_rules x =
  let l = translate closure rules x in
  Xlist.iter l (fun s ->
    match StringSet.to_list (StringSet.of_list (translate closure rev_rules s)) with
      [] -> raise (NotFound(x,s))
    | [t] -> if t <> x then raise (NotEqual(x,s,t))
    | l -> raise (MulipleSolutions(x,s,l)));
  l

let translate_single closure rules x =
  match StringSet.to_list (StringSet.of_list (translate closure rules x)) with
    [] -> raise (NotFound(x,""))
  | [t] -> t
  | l -> raise (MulipleSolutions(x,"",l))