fonetics.ml 8.56 KB
open Xstd
open Printf

type status = Idle | Symbols | Rules | RevSymbols | RevRules
type rule = {set: string; find: string; suf: string; lang: string}

let string_of_phon p =
  Printf.sprintf "%s  %s" p.Types.phon (String.concat " " (Xlist.map p.Types.mapping (fun (a,b) -> a ^ "->" ^ b)))

module CharTree = struct

  type t = M of t CharMap.t * rule list

  let empty = M(CharMap.empty,[])

  let rec add_path_rules rule orth i (M(map,rules)) =
    if i = String.length orth then M(map,rule :: rules) else
    let tree = try CharMap.find map (String.get orth i) with Not_found -> empty in
    let tree = add_path_rules rule orth (i+1) tree in
    M(CharMap.add map (String.get orth i) tree,rules)

  let create rules =
    let tree = Xlist.fold rules empty (fun tree rule ->
      add_path_rules rule (rule.find ^ rule.suf) 0 tree) in
    tree

  let rec find_rec l i orth (M(map,rules)) =
    if i = String.length orth then Xlist.fold rules l (fun l rule -> ("", rule) :: l) else
    let l = try find_rec l (i+1) orth (CharMap.find map (String.get orth i)) with Not_found -> l in
    Xlist.fold rules l (fun l rule -> (String.sub orth i (String.length orth - i), rule) :: l)

  let find tree orth =
    let found = find_rec [] 0 orth tree in
    (* printf "%d\n%!" (Xlist.size found); *)
    (* Xlist.iter found (fun (stem,rule) -> printf "F %s\t%s\n" stem (string_of_rule rule)); *)
    found

end

let load_rules filename =
  let status,symbol_defs,rev_symbol_defs,rules,rev_rules =
    File.fold_tab filename (Idle,StringMap.empty,StringMap.empty,[],[]) (fun (status,symbol_defs,rev_symbol_defs,rules,rev_rules) -> function
        ["@symbols"] -> Symbols,symbol_defs,rev_symbol_defs,rules,rev_rules
      | ["@rev_symbols"] -> RevSymbols,symbol_defs,rev_symbol_defs,rules,rev_rules
      | ["@rules"] -> Rules,symbol_defs,rev_symbol_defs,rules,rev_rules
      | ["@rev_rules"] -> RevRules,symbol_defs,rev_symbol_defs,rules,rev_rules
      | [key;vals] ->
           (match status with
             Symbols -> status, StringMap.add symbol_defs key (Xstring.split " " vals), rev_symbol_defs, rules, rev_rules
           | RevSymbols -> status, symbol_defs, StringMap.add rev_symbol_defs key (Xstring.split " " vals), rules, rev_rules
           | _ -> failwith ("Fonetics.load_rules status 1: " ^ key ^ "\t" ^ vals))
      | [lang;v;r;s] ->
           (match status with
             Rules -> status, symbol_defs, rev_symbol_defs, {set=v; find=r; suf=s; lang=lang} :: rules, rev_rules
           | RevRules -> status, symbol_defs, rev_symbol_defs, rules, {set=r; find=v; suf=s; lang=lang} :: rev_rules
           | _ -> failwith "Fonetics.load_rules: status 2")
      | line -> failwith ("load_rules: " ^ (String.concat "\t" line))) in
  if status <> Rules && status <> RevRules then failwith "Fonetics.load_rules: status 3" else
  symbol_defs, rev_symbol_defs, rules, rev_rules

let prepare_rules symbol_defs rules =
  let rules = List.flatten (Xlist.rev_map rules (fun r ->
    let suf = Xunicode.utf8_chars_of_utf8_string r.suf in
    let suf = Xlist.map suf (fun s ->
      try StringMap.find symbol_defs s with Not_found -> [s]) in
    Xlist.rev_map (Xlist.multiply_list suf) (fun l ->
      {r with suf=String.concat "" l}))) in
  CharTree.create rules

let select_rules lang rules =
  Xlist.fold rules [] (fun rules r ->
    if r.lang = lang then r :: rules else rules)

let rules, rev_rules, core_rules, core_rev_rules =
  let symbol_defs,rev_symbol_defs,rules,rev_rules = load_rules "data/fonetics.dic" in
  let core_rules = select_rules "core" rules in
  let core_rev_rules = select_rules "core" rev_rules in
  prepare_rules symbol_defs rules,
  prepare_rules rev_symbol_defs rev_rules,
  prepare_rules symbol_defs core_rules,
  prepare_rules rev_symbol_defs core_rev_rules

let sufs = ["ω"; "iκ"; "ρ"; "δ"; "λ"; "i"(*; "zi"*)]

let make_key s =
  match Xunicode.utf8_chars_of_utf8_string s with
    ["ʒ"] -> "d"
  | ["ř"] -> "r"
  | [c] -> c
  | ["d";"j"] -> "dj"
  | ["d";"ʲ"] -> "dj"
  | ["t";"j"] -> "tj"
  | ["t";"ʲ"] -> "tj"
  | [c;"′"] -> c
  | [c;"j"] -> c
  | [c;"ʲ"] -> c
  | [c;"ʲ";"j"] -> c
  | [c;"′";"j"] -> c
  | _ -> failwith ("make_key: " ^ s)

let latex_escape_char = function
    "′" -> "$'$"
  | "ʲ" -> "\\textipa{\\super{j}}"
  | "ʒ" -> "\\textipa{Z}"
  | "ǯ" -> "\\textipa{\\v{Z}}"
  | c -> c

let latex_of_rule rule =
  String.concat "" (Xlist.map (Xunicode.utf8_chars_of_utf8_string rule.set) latex_escape_char) ^ " $\\leftarrow$ " ^ rule.find

let is_excluded r =
  match r.set, r.find, r.suf with
    "r", "r", "zi" -> true
  | "mar", "mar", "z" -> true
  | "m′er", "mier", "z" -> true
  | "n′e", "nie", "i" -> true
  | _ -> false


let latex_of_rules filename =
  let symbol_defs,rev_symbol_defs,rules,rev_rules =  load_rules filename in
  let map =  Xlist.fold rules StringMap.empty (fun map rule ->
    if is_excluded rule then map else
    let key = make_key rule.set in
    let map2 = try StringMap.find map key with Not_found -> StringMap.empty in
    if not (Xlist.mem sufs rule.suf) then failwith ("latex_of_rules: " ^ rule.suf) else
    let map2 = StringMap.add_inc map2 rule.suf rule (fun rule2 -> failwith ("latex_of_rules: " ^ key)) in
    StringMap.add map key map2) in
  StringMap.iter map (fun _ map2 ->
    let line = Xlist.map sufs (fun suf ->
      try latex_of_rule (StringMap.find map2 suf) with Not_found -> "") in
    print_endline (String.concat " & " line ^ "\\\\"))

let rec translate_rec closure found found_maping rules s =
  if s = "ε" then [List.rev found,List.rev found_maping] else
  let l = CharTree.find rules s in
  (* Xlist.iter l (fun (t,r) ->
      printf "s=%s t=%s set=%s find=%s suf=%s\n%!" s t r.set r.find r.suf); *)
  let l = if l = [] && closure then
    let c,s = Xunicode.first_utf8_char_of_utf8_string s in
    [s,{find=c; set=c; suf=""; lang=""}] else l in
  List.flatten (Xlist.rev_map l (fun (t,r) ->
    translate_rec closure (r.set :: found) ((r.find,r.set) :: found_maping) rules (r.suf ^ t)))

let translate closure rules s =
  (* printf "translate 1: %s\n%!" s; *)
  let ll = translate_rec closure [] [] rules (s ^ "ε") in
  Xlist.rev_map ll (fun (phon,mapping) -> {Types.phon=String.concat "" phon; Types.mapping=mapping})

exception NotFound of string * string
exception NotEqual of string * string * string
exception MulipleSolutions of string * string * string list

let translate_simple closure rules s =
  let ll = translate_rec closure [] [] rules (s ^ "ε") in
  if ll = [] then raise (NotFound(s,"")) else
  Xlist.rev_map ll (fun (phon,mapping) -> String.concat "" phon)

let print_phon p = print_endline (string_of_phon p)

(*let _ = translate rules "blafickie"
let _ = translate rules "blafiacki"
let _ = translate rules "dudzia"*)
(* let _ = Xlist.iter (translate true rules "rzódża") print_phon
let _ = Xlist.iter (translate true rules "Mia") print_phon
let _ = Xlist.iter (translate true rules "mia") print_phon
let _ = Xlist.iter (translate true rules "łódź") print_phon
let _ = Xlist.iter (translate true rules "Łódź") print_phon *)
(* let _ = translate true rules "izolował" *)

let translate_and_check closure rules rev_rules orth =
  let l = translate closure rules orth in
  Xlist.iter l (fun s ->
    let y = translate closure rev_rules s.Types.phon in
    let y = Xlist.map y (fun s -> s.Types.phon) in
    match StringSet.to_list (StringSet.of_list y) with
      [] -> raise (NotFound(orth,s.Types.phon))
    | [t] -> if t <> orth then raise (NotEqual(orth,s.Types.phon,t))
    | l -> raise (MulipleSolutions(orth,s.Types.phon,l)))


(*let translate_single closure rules x =
  let y = translate closure rev_rules x in
  let y = Xlist.map y (fun s -> s.Types.phon) in
  match StringSet.to_list (StringSet.of_list y) with
    [] -> raise (NotFound(x,""))
  | [t] -> t
  | l ->
     Printf.printf "%s  %s\n" x (String.concat " " l);
     raise (MulipleSolutions(x,"",l))

let rec rev_translate_rec x s = function
    [] -> x,s,[]
  | (_,"") :: m -> rev_translate_rec x s m
  | (a,b) :: m ->
      if Xstring.check_prefix b s then rev_translate_rec (x^a) (Xstring.cut_prefix b s) m
      else x,s,m

let rev_translate closure rev_rules s m =
  let x,s,_ = rev_translate_rec "" s m in
  if s = "" then x else
  x ^ (translate_single closure rev_rules s)*)

(*let rev_translate2 closure rev_rules s m =
  let x,s,_ = rev_translate_rec "" s m in
  if s = "" then [x] else
  let l = translate closure rev_rules s in
  if l = [] then raise (NotFound(s,"")) else
  Xlist.rev_map l (fun y -> x ^ y.Types.phon)*)

let rec get_short_stem x s = function
    [] -> if s = "" then x else failwith "get_short_stem"
  | (a,b) :: m ->
      if Xstring.check_prefix b s then get_short_stem (x^a) (Xstring.cut_prefix b s) m
      else x