fonetics.ml
12.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
(*
* ENIAMmorphology, a morphological analyser and a guesser for Polish
* Copyright (C) 2016-2017 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
* Copyright (C) 2016-2017 Institute of Computer Science Polish Academy of Sciences
*
* This library is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*)
open Xstd
open Printf
open ENIAMmorphologyTypes
type status = Idle | Symbols | Rules | RevSymbols | RevRules
let string_of_phon p =
Printf.sprintf "%s %s" p.phon (String.concat " " (Xlist.map p.mapping (fun r -> r.pfind ^ "->" ^ r.pset))) (*fun (a,b) -> a ^ "->" ^ b))*)
module CharTree = struct
type t = M of t CharMap.t * phon_rule list
let empty = M(CharMap.empty,[])
let rec add_path_rules rule orth i (M(map,rules)) =
if i = String.length orth then M(map,rule :: rules) else
let tree = try CharMap.find map (String.get orth i) with Not_found -> empty in
let tree = add_path_rules rule orth (i+1) tree in
M(CharMap.add map (String.get orth i) tree,rules)
let create rules =
let tree = Xlist.fold rules empty (fun tree rule ->
add_path_rules rule (rule.pfind ^ rule.psuf) 0 tree) in
tree
let rec find_rec l i orth (M(map,rules)) =
if i = String.length orth then Xlist.fold rules l (fun l rule -> ("", rule) :: l) else
let l = try find_rec l (i+1) orth (CharMap.find map (String.get orth i)) with Not_found -> l in
Xlist.fold rules l (fun l rule -> (String.sub orth i (String.length orth - i), rule) :: l)
let find tree orth =
let found = find_rec [] 0 orth tree in
(* printf "%d\n%!" (Xlist.size found); *)
(* Xlist.iter found (fun (stem,rule) -> printf "F %s\t%s\n" stem (string_of_rule rule)); *)
found
end
let load_rules filename =
let status,symbol_defs,rev_symbol_defs,rules,rev_rules =
File.fold_tab filename (Idle,StringMap.empty,StringMap.empty,[],[]) (fun (status,symbol_defs,rev_symbol_defs,rules,rev_rules) -> function
["@symbols"] -> Symbols,symbol_defs,rev_symbol_defs,rules,rev_rules
| ["@rev_symbols"] -> RevSymbols,symbol_defs,rev_symbol_defs,rules,rev_rules
| ["@rules"] -> Rules,symbol_defs,rev_symbol_defs,rules,rev_rules
| ["@rev_rules"] -> RevRules,symbol_defs,rev_symbol_defs,rules,rev_rules
| [key;vals] ->
(match status with
Symbols -> status, StringMap.add symbol_defs key (Xstring.split " " vals), rev_symbol_defs, rules, rev_rules
| RevSymbols -> status, symbol_defs, StringMap.add rev_symbol_defs key (Xstring.split " " vals), rules, rev_rules
| _ -> failwith ("Fonetics.load_rules status 1: " ^ key ^ "\t" ^ vals))
| [lang;v;r;s] ->
(match status with
Rules -> status, symbol_defs, rev_symbol_defs, {pset=v; pfind=r; psuf=s; plang=lang} :: rules, rev_rules
| RevRules -> status, symbol_defs, rev_symbol_defs, rules, {pset=r; pfind=v; psuf=s; plang=lang} :: rev_rules
| _ -> failwith "Fonetics.load_rules: status 2")
| line -> failwith ("load_rules: " ^ (String.concat "\t" line))) in
if status <> Rules && status <> RevRules then failwith "Fonetics.load_rules: status 3" else
symbol_defs, rev_symbol_defs, rules, rev_rules
let prepare_rules symbol_defs rules =
let rules = List.flatten (Xlist.rev_map rules (fun r ->
let suf = Xunicode.utf8_chars_of_utf8_string r.psuf in
let suf = Xlist.map suf (fun s ->
try StringMap.find symbol_defs s with Not_found -> [s]) in
Xlist.rev_map (Xlist.multiply_list suf) (fun l ->
{r with psuf=String.concat "" l}))) in
CharTree.create rules
let select_rules lang rules =
Xlist.fold rules [] (fun rules r ->
if r.plang = lang then r :: rules else rules)
let deselect_rules lang rules =
Xlist.fold rules [] (fun rules r ->
if r.plang <> lang || String.get r.pset 0 = '{' || String.get r.pfind 0 = '{' then r :: rules else rules)
let rules, rev_rules, core_rules, core_rev_rules =
let symbol_defs,rev_symbol_defs,rules,rev_rules = load_rules "data/fonetics.dic" in
let core_rules = select_rules "core" rules in
let core_rev_rules = select_rules "core" rev_rules in
prepare_rules symbol_defs rules,
prepare_rules rev_symbol_defs rev_rules,
prepare_rules symbol_defs core_rules,
prepare_rules rev_symbol_defs core_rev_rules
let sufs = [
["ω"; "iκ"; "ρ"; "δ"; "λ"; "i"](*; "zi"*);
["γ"; "eά"; "owieε"; "β"]; ["ε"; ""];
["α"; "ά"; "’"; "’eά"; "iβ"; "g"];
["e"; "y"; "è"; "a"; "h"; "u"; "n"; "é"; "cieε"]]
let make_key s =
match Xunicode.utf8_chars_of_utf8_string s with
["ʒ"] -> "d"
| ["ř"] -> "r"
| [c] -> c
| ["d";"j"] -> "dj"
| ["d";"ʲ"] -> "dj"
| ["t";"j"] -> "tj"
| ["t";"ʲ"] -> "tj"
| [c;"′"] -> c
| [c;"j"] -> c
| [c;"ʲ"] -> c
| [c;"ʲ";"j"] -> c
| [c;"′";"j"] -> c
| [] -> ""
| "{" :: c :: "}" :: _ -> "{" ^ c
| "{" :: c :: d :: "}" :: _ -> "{" ^ c ^ d
| _ -> (*failwith ("make_key: '" ^ s ^ "'")*) s
let latex_escape_char = function
"′" -> "$'$"
| "ʲ" -> "\\textipa{\\super{j}}"
| "ʒ" -> "\\textipa{Z}"
| "ǯ" -> "\\textipa{\\v{Z}}"
| "{" -> "\\{"
| "}" -> "\\}"
| "ε" -> "$\\varepsilon$"
| c -> c
let latex_escape_utf8_string s =
String.concat "" (Xlist.map (Xunicode.utf8_chars_of_utf8_string s) latex_escape_char)
let latex_of_rule rule =
latex_escape_utf8_string rule.pset ^ " $\\leftarrow$ " ^ latex_escape_utf8_string rule.pfind
(* String.concat "" (Xlist.map (Xunicode.utf8_chars_of_utf8_string rule.pset) latex_escape_char) ^ " $\\leftarrow$ " ^ rule.pfind *)
let latex_of_rev_rule rule =
latex_escape_utf8_string rule.pfind ^ " $\\rightarrow$ " ^ latex_escape_utf8_string rule.pset
let is_excluded r =
match r.pset, r.pfind, r.psuf with
"r", "r", "zi" -> true
| "mar", "mar", "z" -> true
| "m′er", "mier", "z" -> true
| "n′e", "nie", "i" -> true
| _ -> false
let latex_of_core_rules filename =
let symbol_defs,rev_symbol_defs,rules,rev_rules = load_rules filename in
let core_rules = select_rules "core" rules in
let map = Xlist.fold core_rules StringMap.empty (fun map rule ->
if is_excluded rule then map else (
(* print_endline (latex_of_rule rule); *)
let key = make_key rule.pset in
let map2 = try StringMap.find map key with Not_found -> StringMap.empty in
if not (Xlist.mem (List.flatten sufs) rule.psuf) then print_endline ("latex_of_rules 1: " ^ rule.psuf);
let map2 = StringMap.add_inc map2 rule.psuf rule (fun rule2 -> print_endline(*failwith*) ("latex_of_rules 2: " ^ key); rule) in
StringMap.add map key map2)) in
Xlist.iter sufs (fun sufs ->
print_endline ("\\begin{longtable}{" ^ String.concat "|" (Xlist.map sufs (fun _ -> "l")) ^ "}");
StringMap.iter map (fun _ map2 ->
let line = Xlist.map sufs (fun suf ->
try latex_of_rule (StringMap.find map2 suf) with Not_found -> "") in
if String.concat "" line <> "" then
print_endline (String.concat " & " line ^ "\\\\"));
print_endline "\\end{longtable}\n"
)
let latex_of_non_core_rules filename =
let symbol_defs,rev_symbol_defs,rules,rev_rules = load_rules filename in
let non_core_rules = deselect_rules "core" rules in
let map = Xlist.fold non_core_rules StringMap.empty (fun map rule ->
if is_excluded rule then map else (
let key = make_key rule.pset in
let map2 = try StringMap.find map key with Not_found -> StringMap.empty in
let map2 = StringMap.add_inc map2 rule.psuf [rule] (fun rules -> rule :: rules) in
StringMap.add map key map2)) in
print_endline ("\\begin{longtable}{lp{10cm}}");
StringMap.iter map (fun key map2 ->
StringMap.iter map2 (fun _ rules ->
Xlist.iter rules (fun rule ->
let suf = Xunicode.utf8_chars_of_utf8_string rule.psuf in
let suf = Xlist.map suf (fun s ->
try StringMap.find symbol_defs s with Not_found -> [s]) in
let suf = String.concat " " (List.sort compare (Xlist.rev_map (Xlist.multiply_list suf) (String.concat ""))) in
let suf = latex_escape_utf8_string suf in
printf "%s & %s\\\\\n" (latex_of_rule rule) suf)));
print_endline "\\end{longtable}\n"
let latex_of_non_core_rev_rules filename =
let symbol_defs,rev_symbol_defs,rules,rev_rules = load_rules filename in
let non_core_rules = deselect_rules "core" rev_rules in
let map = Xlist.fold non_core_rules StringMap.empty (fun map rule ->
if is_excluded rule then map else (
let key = make_key rule.pset in
let map2 = try StringMap.find map key with Not_found -> StringMap.empty in
let map2 = StringMap.add_inc map2 rule.psuf [rule] (fun rules -> rule :: rules) in
StringMap.add map key map2)) in
print_endline ("\\begin{longtable}{lp{10cm}}");
StringMap.iter map (fun key map2 ->
StringMap.iter map2 (fun _ rules ->
Xlist.iter rules (fun rule ->
let suf = Xunicode.utf8_chars_of_utf8_string rule.psuf in
let suf = Xlist.map suf (fun s ->
try StringMap.find rev_symbol_defs s with Not_found -> [s]) in
let suf = String.concat " " (List.sort compare (Xlist.rev_map (Xlist.multiply_list suf) (String.concat ""))) in
let suf = latex_escape_utf8_string suf in
printf "%s & %s\\\\\n" (latex_of_rev_rule rule) suf)));
print_endline "\\end{longtable}\n"
let rec translate_rec closure found found_maping rules s =
if s = "ε" then [List.rev found,List.rev found_maping] else
let l = CharTree.find rules s in
(* Xlist.iter l (fun (t,r) ->
printf "s=%s t=%s set=%s find=%s suf=%s\n%!" s t r.set r.find r.suf); *)
let l = if l = [] && closure then
let c,s = Xunicode.first_utf8_char_of_utf8_string s in
[s,{pfind=c; pset=c; psuf=""; plang=""}] else l in
List.flatten (Xlist.rev_map l (fun (t,r) ->
translate_rec closure (r.pset :: found) (r(*r.pfind,r.pset*) :: found_maping) rules (r.psuf ^ t)))
let translate closure rules s =
(* printf "translate 1: %s\n%!" s; *)
let ll = translate_rec closure [] [] rules (s ^ "ε") in
Xlist.rev_map ll (fun (phon,mapping) -> {phon=String.concat "" phon; mapping=mapping})
exception NotFound of string * string
exception NotEqual of string * string * string
exception MulipleSolutions of string * string * string list
let translate_simple closure rules s =
let ll = translate_rec closure [] [] rules (s ^ "ε") in
if ll = [] then raise (NotFound(s,"")) else
Xlist.rev_map ll (fun (phon,mapping) -> String.concat "" phon)
let print_phon p = print_endline (string_of_phon p)
(*let _ = translate rules "blafickie"
let _ = translate rules "blafiacki"
let _ = translate rules "dudzia"*)
(* let _ = Xlist.iter (translate true rules "rzódża") print_phon
let _ = Xlist.iter (translate true rules "Mia") print_phon
let _ = Xlist.iter (translate true rules "mia") print_phon
let _ = Xlist.iter (translate true rules "łódź") print_phon
let _ = Xlist.iter (translate true rules "Łódź") print_phon *)
(* let _ = translate true rules "izolował" *)
let translate_and_check closure rules rev_rules orth =
let l = translate closure rules orth in
Xlist.iter l (fun s ->
let y = translate closure rev_rules s.phon in
let y = Xlist.map y (fun s -> s.phon) in
match StringSet.to_list (StringSet.of_list y) with
[] -> raise (NotFound(orth,s.phon))
| [t] -> if t <> orth then raise (NotEqual(orth,s.phon,t))
| l -> raise (MulipleSolutions(orth,s.phon,l)))
(*let translate_single closure rules x =
let y = translate closure rev_rules x in
let y = Xlist.map y (fun s -> s.phon) in
match StringSet.to_list (StringSet.of_list y) with
[] -> raise (NotFound(x,""))
| [t] -> t
| l ->
Printf.printf "%s %s\n" x (String.concat " " l);
raise (MulipleSolutions(x,"",l))
let rec rev_translate_rec x s = function
[] -> x,s,[]
| (_,"") :: m -> rev_translate_rec x s m
| (a,b) :: m ->
if Xstring.check_prefix b s then rev_translate_rec (x^a) (Xstring.cut_prefix b s) m
else x,s,m
let rev_translate closure rev_rules s m =
let x,s,_ = rev_translate_rec "" s m in
if s = "" then x else
x ^ (translate_single closure rev_rules s)*)
(*let rev_translate2 closure rev_rules s m =
let x,s,_ = rev_translate_rec "" s m in
if s = "" then [x] else
let l = translate closure rev_rules s in
if l = [] then raise (NotFound(s,"")) else
Xlist.rev_map l (fun y -> x ^ y.phon)*)
let rec get_short_stem x s = function
[] -> if s = "" then x else failwith "get_short_stem"
| r :: m ->
if Xstring.check_prefix r.pset s then get_short_stem (x^r.pfind) (Xstring.cut_prefix r.pset s) m
else x