fonetics.ml
6.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
open Xstd
open Printf
type status = Idle | Symbols | Rules | RevSymbols | RevRules
type rule = {set: string; find: string; suf: string}
module CharTree = struct
type t = M of t CharMap.t * rule list
let empty = M(CharMap.empty,[])
let rec add_path_rules rule orth i (M(map,rules)) =
if i = String.length orth then M(map,rule :: rules) else
let tree = try CharMap.find map (String.get orth i) with Not_found -> empty in
let tree = add_path_rules rule orth (i+1) tree in
M(CharMap.add map (String.get orth i) tree,rules)
let create rules =
let tree = Xlist.fold rules empty (fun tree rule ->
add_path_rules rule (rule.find ^ rule.suf) 0 tree) in
tree
let rec find_rec l i orth (M(map,rules)) =
if i = String.length orth then Xlist.fold rules l (fun l rule -> ("", rule) :: l) else
let l = try find_rec l (i+1) orth (CharMap.find map (String.get orth i)) with Not_found -> l in
Xlist.fold rules l (fun l rule -> (String.sub orth i (String.length orth - i), rule) :: l)
let find tree orth =
let found = find_rec [] 0 orth tree in
(* printf "%d\n%!" (Xlist.size found); *)
(* Xlist.iter found (fun (stem,rule) -> printf "F %s\t%s\n" stem (string_of_rule rule)); *)
found
end
let load_rules filename =
let status,symbol_defs,rev_symbol_defs,rules,rev_rules =
File.fold_tab filename (Idle,StringMap.empty,StringMap.empty,[],[]) (fun (status,symbol_defs,rev_symbol_defs,rules,rev_rules) -> function
["@symbols"] -> Symbols,symbol_defs,rev_symbol_defs,rules,rev_rules
| ["@rev_symbols"] -> RevSymbols,symbol_defs,rev_symbol_defs,rules,rev_rules
| ["@rules"] -> Rules,symbol_defs,rev_symbol_defs,rules,rev_rules
| ["@rev_rules"] -> RevRules,symbol_defs,rev_symbol_defs,rules,rev_rules
| [key;vals] ->
(match status with
Symbols -> status, StringMap.add symbol_defs key (Xstring.split " " vals), rev_symbol_defs, rules, rev_rules
| RevSymbols -> status, symbol_defs, StringMap.add rev_symbol_defs key (Xstring.split " " vals), rules, rev_rules
| _ -> failwith "Fonetics.load_rules: status 1")
| [v;r;s] ->
(match status with
Rules -> status, symbol_defs, rev_symbol_defs, {set=v; find=r; suf=s} :: rules, rev_rules
| RevRules -> status, symbol_defs, rev_symbol_defs, rules, {set=r; find=v; suf=s} :: rev_rules
| _ -> failwith "Fonetics.load_rules: status 2")
| line -> failwith ("load_rules: " ^ (String.concat "\t" line))) in
if status <> Rules && status <> RevRules then failwith "Fonetics.load_rules: status 3" else
symbol_defs, rev_symbol_defs, rules, rev_rules
let prepare_rules symbol_defs rules =
let rules = List.flatten (Xlist.rev_map rules (fun r ->
let suf = Xunicode.utf8_chars_of_utf8_string r.suf in
let suf = Xlist.map suf (fun s ->
try StringMap.find symbol_defs s with Not_found -> [s]) in
Xlist.rev_map (Xlist.multiply_list suf) (fun l ->
{r with suf=String.concat "" l}))) in
CharTree.create rules
let rules, rev_rules =
let symbol_defs,rev_symbol_defs,rules,rev_rules = load_rules "data/fonetics_pl.dic" in
prepare_rules symbol_defs rules,
prepare_rules rev_symbol_defs rev_rules
let sufs = ["ω"; "iκ"; "ρ"; "δ"; "λ"; "i"; "zi"]
let make_key s =
match Xunicode.utf8_chars_of_utf8_string s with
["ʒ"] -> "d"
| ["ř"] -> "r"
| [c] -> c
| ["d";"j"] -> "dj"
| ["d";"ʲ"] -> "dj"
| ["t";"j"] -> "tj"
| ["t";"ʲ"] -> "tj"
| [c;"′"] -> c
| [c;"j"] -> c
| [c;"ʲ"] -> c
| [c;"ʲ";"j"] -> c
| [c;"′";"j"] -> c
| _ -> failwith ("make_key: " ^ s)
let latex_escape_char = function
"′" -> "$'$"
| "ʲ" -> "\\textipa{\\super{j}}"
| "ʒ" -> "\\textipa{Z}"
| "ǯ" -> "\\textipa{\\v{Z}}"
| c -> c
let latex_of_rule rule =
String.concat "" (Xlist.map (Xunicode.utf8_chars_of_utf8_string rule.set) latex_escape_char) ^ " $\\leftarrow$ " ^ rule.find
let latex_of_rules filename =
let symbol_defs,rev_symbol_defs,rules,rev_rules = load_rules filename in
let map = Xlist.fold rules StringMap.empty (fun map rule ->
let key = make_key rule.set in
let map2 = try StringMap.find map key with Not_found -> StringMap.empty in
if not (Xlist.mem sufs rule.suf) then failwith ("latex_of_rules: " ^ rule.suf) else
let map2 = StringMap.add_inc map2 rule.suf rule (fun rule2 -> failwith ("latex_of_rules: " ^ key)) in
StringMap.add map key map2) in
StringMap.iter map (fun _ map2 ->
let line = Xlist.map sufs (fun suf ->
try latex_of_rule (StringMap.find map2 suf) with Not_found -> "") in
print_endline (String.concat " & " line ^ "\\\\"))
let rec translate_rec closure found rules s =
if s = "ε" then [List.rev found] else
let l = CharTree.find rules s in
(* Xlist.iter l (fun (t,r) ->
printf "s=%s t=%s set=%s find=%s suf=%s\n%!" s t r.set r.find r.suf); *)
let l = if l = [] && closure then
let n = String.length s in
let c = String.sub s 0 1 in
[String.sub s 1 (n-1),{find=c; set=c; suf=""}] else l in
List.flatten (Xlist.rev_map l (fun (t,r) ->
translate_rec closure (r.set :: found) rules (r.suf ^ t)))
let translate closure rules s =
(* printf "translate 1: %s\n%!" s; *)
let ll = translate_rec closure [] rules (s ^ "ε") in
Xlist.rev_map ll (String.concat "")
(* let s = String.concat "" l in
(* printf "translate 2: %s\n%!" s; *)
s*)
(*let _ = translate rules "blafickie"
let _ = translate rules "blafiacki"
let _ = translate rules "dudzia"
let _ = translate rules "rzódża"
let _ = translate rules "łódź"*)
exception NotFound of string * string
exception NotEqual of string * string * string
exception MulipleSolutions of string * string * string list
let translate_and_check closure rules rev_rules x =
let l = translate closure rules x in
Xlist.iter l (fun s ->
match StringSet.to_list (StringSet.of_list (translate closure rev_rules s)) with
[] -> raise (NotFound(x,s))
| [t] -> if t <> x then raise (NotEqual(x,s,t))
| l -> raise (MulipleSolutions(x,s,l)));
l
let translate_single closure rules x =
match StringSet.to_list (StringSet.of_list (translate closure rules x)) with
[] -> raise (NotFound(x,""))
| [t] -> t
| l -> raise (MulipleSolutions(x,"",l))