fonetics.ml
5.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
open Xstd
open Printf
type status = Idle | Symbols | Rules | RevSymbols | RevRules
type rule = {set: string; find: string; suf: string}
module CharTree = struct
type t = M of t CharMap.t * rule list
let empty = M(CharMap.empty,[])
let rec add_path_rules rule orth i (M(map,rules)) =
if i = String.length orth then M(map,rule :: rules) else
let tree = try CharMap.find map (String.get orth i) with Not_found -> empty in
let tree = add_path_rules rule orth (i+1) tree in
M(CharMap.add map (String.get orth i) tree,rules)
let create rules =
let tree = Xlist.fold rules empty (fun tree rule ->
add_path_rules rule (rule.find ^ rule.suf) 0 tree) in
tree
let rec find_rec l i orth (M(map,rules)) =
if i = String.length orth then Xlist.fold rules l (fun l rule -> ("", rule) :: l) else
let l = try find_rec l (i+1) orth (CharMap.find map (String.get orth i)) with Not_found -> l in
Xlist.fold rules l (fun l rule -> (String.sub orth i (String.length orth - i), rule) :: l)
let find tree orth =
let found = find_rec [] 0 orth tree in
(* printf "%d\n%!" (Xlist.size found); *)
(* Xlist.iter found (fun (stem,rule) -> printf "F %s\t%s\n" stem (string_of_rule rule)); *)
found
(* let add_char c rule =
let s = String.make 1 c in
{rule with find=s ^ rule.find; set=s ^ rule.set}
let rec disjoint_rec super (M(map,rules)) =
let rules = rules @ super in
if CharMap.is_empty map then M(map,rules) else
M(CharMap.mapi map (fun c tree ->
disjoint_rec (Xlist.rev_map rules (add_char c)) tree),[])
let disjoint trees =
Xlist.rev_map trees (fun (pref,tree) ->
pref, disjoint_rec [] tree)
let rec print_rules_rec file (M(map,rules)) =
Xlist.iter rules (print_rule file);
CharMap.iter map (fun _ tree -> print_rules_rec file tree)
let print_rules filename trees =
File.file_out filename (fun file ->
Xlist.iter trees (fun (_,tree) ->
print_rules_rec file tree))*)
end
let load_rules filename =
let status,symbol_defs,rev_symbol_defs,rules,rev_rules =
File.fold_tab filename (Idle,StringMap.empty,StringMap.empty,[],[]) (fun (status,symbol_defs,rev_symbol_defs,rules,rev_rules) -> function
["@symbols"] -> Symbols,symbol_defs,rev_symbol_defs,rules,rev_rules
| ["@rev_symbols"] -> RevSymbols,symbol_defs,rev_symbol_defs,rules,rev_rules
| ["@rules"] -> Rules,symbol_defs,rev_symbol_defs,rules,rev_rules
| ["@rev_rules"] -> RevRules,symbol_defs,rev_symbol_defs,rules,rev_rules
| [key;vals] ->
(match status with
Symbols -> status, StringMap.add symbol_defs key (Xstring.split " " vals), rev_symbol_defs, rules, rev_rules
| RevSymbols -> status, symbol_defs, StringMap.add rev_symbol_defs key (Xstring.split " " vals), rules, rev_rules
| _ -> failwith "Fonetics.load_rules: status 1")
| [v;r;s] ->
(match status with
Rules -> status, symbol_defs, rev_symbol_defs, {set=v; find=r; suf=s} :: rules, rev_rules
| RevRules -> status, symbol_defs, rev_symbol_defs, rules, {set=r; find=v; suf=s} :: rev_rules
| _ -> failwith "Fonetics.load_rules: status 2")
| line -> failwith ("load_rules: " ^ (String.concat "\t" line))) in
if status <> Rules && status <> RevRules then failwith "Fonetics.load_rules: status 3" else
symbol_defs, rev_symbol_defs, rules, rev_rules
let prepare_rules symbol_defs rules =
(* let symbol_defs = StringMap.map symbol_defs (function
"ε" -> ""
| s -> s) in *)
let rules = List.flatten (Xlist.rev_map rules (fun r ->
let suf = Xunicode.utf8_chars_of_utf8_string r.suf in
let suf = Xlist.map suf (fun s ->
try StringMap.find symbol_defs s with Not_found -> [s]) in
Xlist.rev_map (Xlist.multiply_list suf) (fun l ->
{r with suf=String.concat "" l}))) in
CharTree.create rules
let rules, rev_rules =
let symbol_defs,rev_symbol_defs,rules,rev_rules = load_rules "data/fonetics_pl.dic" in
prepare_rules symbol_defs rules,
prepare_rules rev_symbol_defs rev_rules
let rec translate_rec closure found rules s =
if s = "ε" then [List.rev found] else
let l = CharTree.find rules s in
(* Xlist.iter l (fun (t,r) ->
printf "s=%s t=%s set=%s find=%s suf=%s\n%!" s t r.set r.find r.suf); *)
let l = if l = [] && closure then
let n = String.length s in
let c = String.sub s 0 1 in
[String.sub s 1 (n-1),{find=c; set=c; suf=""}] else l in
List.flatten (Xlist.rev_map l (fun (t,r) ->
translate_rec closure (r.set :: found) rules (r.suf ^ t)))
let translate closure rules s =
(* printf "translate 1: %s\n%!" s; *)
let ll = translate_rec closure [] rules (s ^ "ε") in
Xlist.rev_map ll (String.concat "")
(* let s = String.concat "" l in
(* printf "translate 2: %s\n%!" s; *)
s*)
(*let _ = translate rules "blafickie"
let _ = translate rules "blafiacki"
let _ = translate rules "dudzia"
let _ = translate rules "rzódża"
let _ = translate rules "łódź"*)
exception NotFound of string * string
exception NotEqual of string * string * string
exception MulipleSolutions of string * string * string list
let translate_and_check closure rules rev_rules x =
let l = translate true rules x in
Xlist.iter l (fun s ->
match StringSet.to_list (StringSet.of_list (translate true rev_rules s)) with
[] -> raise (NotFound(x,s))
| [t] -> if t <> x then raise (NotEqual(x,s,t))
| l -> raise (MulipleSolutions(x,s,l)));
l
let translate_and_check closure rules rev_rules x =
let l = translate closure rules x in
Xlist.iter l (fun s ->
match StringSet.to_list (StringSet.of_list (translate closure rev_rules s)) with
[] -> raise (NotFound(x,s))
| [t] -> if t <> x then raise (NotEqual(x,s,t))
| l -> raise (MulipleSolutions(x,s,l)));
l
let translate_single closure rules x =
match StringSet.to_list (StringSet.of_list (translate closure rules x)) with
[] -> raise (NotFound(x,""))
| [t] -> t
| l -> raise (MulipleSolutions(x,"",l))