morf.ml
4.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
(*
* ENIAM: Categorial Syntactic-Semantic Parser for Polish
* Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
* Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*)
open Xstd
let load_tab filename =
let l = Str.split_delim (Str.regexp "\n") (File.load_file filename) in
List.rev (Xlist.fold l [] (fun l line ->
if String.length line = 0 then l else
if String.get line 0 = '#' then l else
match Str.split (Str.regexp "\t") line with
orth :: lemma :: interp :: _ -> (orth,lemma,interp) :: l
| _ -> failwith ("load_tab: " ^ line)))
let load_tab_full filename =
let l = Str.split_delim (Str.regexp "\n") (File.load_file filename) in
List.rev (Xlist.fold l [] (fun l line ->
if String.length line = 0 then l else
if String.get line 0 = '#' then l else
match Str.split (Str.regexp "\t") line with
[orth; lemma; interp] -> (orth,lemma,interp,"","") :: l
| [orth; lemma; interp; cl] -> (orth,lemma,interp,cl,"") :: l
| [orth; lemma; interp; cl; cl2] -> (orth,lemma,interp,cl,cl2) :: l
(* | orth :: lemma :: interp :: cl :: cl2 -> (orth,lemma,interp,cl,String.concat ";" cl2) :: l *)
| _ -> failwith ("load_tab_full: " ^ line)))
let merge_dicts l =
Xlist.fold l StringMap.empty (fun dicts tab ->
Xlist.fold tab dicts (fun dicts (orth,lemma,interp) ->
let interps = try StringMap.find dicts lemma with Not_found -> StringMap.empty in
let interps = StringMap.add_inc interps interp [orth] (fun orths ->
if Xlist.mem orths orth then orths else orth :: orths) in
StringMap.add dicts lemma interps))
let load_interp_sel filename =
let l = Str.split_delim (Str.regexp "\n") (File.load_file filename) in
Xlist.fold l StringMap.empty (fun interp_sel line ->
if String.length line = 0 then interp_sel else
if String.get line 0 = '#' then interp_sel else
match Str.split (Str.regexp "\t") line with
[group;interp;label] -> StringMap.add_inc interp_sel group [interp,label] (fun l -> (interp,label) :: l)
| _ -> failwith ("load_interp_sel: " ^ line))
let rec merge_digraph = function
[] -> []
| "c" :: "h" :: l -> "ch" :: (merge_digraph l)
| "c" :: "z" :: l -> "cz" :: (merge_digraph l)
| "d" :: "z" :: l -> "dz" :: (merge_digraph l)
| "d" :: "ź" :: l -> "dź" :: (merge_digraph l)
| "d" :: "ż" :: l -> "dż" :: (merge_digraph l)
| "r" :: "z" :: l -> "rz" :: (merge_digraph l)
| "s" :: "z" :: l -> "sz" :: (merge_digraph l)
| "b" :: "'" :: l -> "b'" :: (merge_digraph l)
| "f" :: "'" :: l -> "f'" :: (merge_digraph l)
| s :: l -> s :: (merge_digraph l)
let text_to_chars s = Xunicode.classified_chars_of_utf8_string s
(* (try UTF8.validate s with UTF8.Malformed_code -> failwith ("Invalid UTF8 string: " ^ s));
let r = ref [] in
UTF8.iter (fun c ->
r := (UTF8.init 1 (fun _ -> c)) :: (!r)) s;
merge_digraph (List.rev (!r))*)
let check_prefix pat s =
let n = String.length pat in
if n > String.length s then false else
String.sub s 0 n = pat
let cut_prefix pat s =
let i = String.length pat in
let n = String.length s in
if i >= n then "" else
try String.sub s i (n-i) with _ -> failwith ("cut_prefix: " ^ s ^ " " ^ string_of_int i)
let check_sufix pat s =
let n = String.length pat in
let m = String.length s in
if n > m then false else
String.sub s (m-n) n = pat
let cut_sufix pat s =
let i = String.length pat in
let n = String.length s in
try String.sub s 0 (n-i) with _ -> failwith ("cut_sufix: " ^ s)
let apply_transform orth (s,t) =
if check_sufix s orth then cut_sufix s orth ^ t else raise Not_found
let split_colon s =
match Str.split_delim (Str.regexp ":") s with
[s] -> s, ""
| [s;t] -> s, t
| _ -> failwith "split_colon"
let get_cat s =
match Str.split_delim (Str.regexp ":") s with
cat :: _ -> cat
| _ -> failwith "get_cat"
let select_interps interps interp_sel =
Xlist.fold interp_sel StringMap.empty (fun new_interps (interp,_) ->
try
StringMap.add new_interps interp (StringMap.find interps interp)
with Not_found -> new_interps)