suffix_tree.ml
4.56 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
(*
* ENIAMmorphology, a morphological analyser and a guesser for Polish
* Copyright (C) 2016-2017 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
* Copyright (C) 2016-2017 Institute of Computer Science Polish Academy of Sciences
*
* This library is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*)
open Xstd
type star = Productive | Star | Ndm
type rule = {star: star; pref: string; find: string; set: string; tags: (string * string) list;
interp: string; id: string; freq: int}
let parse_star = function
"" -> Productive
| "*" -> Star
| "ndm" -> Ndm
| s -> failwith "parse_star"
let string_of_star = function
Productive -> ""
| Star -> "*"
| Ndm -> "ndm"
let print_rule file rule =
Printf.fprintf file "%s\t%d\t%s\t%s\t%s\t%s\t%s\n" rule.id rule.freq (string_of_star rule.star)
rule.pref rule.find rule.set rule.interp
module CharTrees = struct
type t = M of t CharMap.t * rule list
let empty = M(CharMap.empty,[])
let rec add_path_rules rule orth_suf i (M(map,rules)) =
if i = -1 then M(map,rule :: rules) else
let tree = try CharMap.find map (String.get orth_suf i) with Not_found -> empty in
let tree = add_path_rules rule orth_suf (i-1) tree in
M(CharMap.add map (String.get orth_suf i) tree,rules)
let create_char_tree rules =
let tree = Xlist.fold rules empty (fun tree rule ->
add_path_rules rule rule.find (String.length rule.find - 1) tree) in
tree
let create rules =
let prefix_map = Xlist.fold rules StringMap.empty (fun prefix_map rule ->
StringMap.add_inc prefix_map rule.pref [rule] (fun l -> rule :: l)) in
StringMap.fold prefix_map [] (fun trees prefix rules -> (prefix, create_char_tree rules) :: trees)
let rec find_rec l i orth (M(map,rules)) =
if i = 0 then Xlist.fold rules l (fun l rule -> ("", rule) :: l) else
let l = try find_rec l (i-1) orth (CharMap.find map (String.get orth (i-1))) with Not_found -> l in
Xlist.fold rules l (fun l rule -> (String.sub orth 0 i, rule) :: l)
let find trees orth =
Xlist.fold trees [] (fun found (pref,tree) ->
(* print_endline pref; *)
if Xstring.check_prefix pref orth then (
let orth = Xstring.cut_prefix pref orth in
(* printf "%s %d " orth (Xlist.size found); *)
let found = find_rec found (String.length orth) orth tree in
(* printf "%d\n%!" (Xlist.size found); *)
(* Xlist.iter found (fun (stem,rule) -> printf "F %s\t%s\n" stem (string_of_rule rule)); *)
found)
else found)
let add_char c rule =
let s = String.make 1 c in
{rule with find=s ^ rule.find; set=s ^ rule.set}
let rec disjoint_rec super (M(map,rules)) =
let rules = rules @ super in
if CharMap.is_empty map then M(map,rules) else
M(CharMap.mapi map (fun c tree ->
disjoint_rec (Xlist.rev_map rules (add_char c)) tree),[])
let disjoint trees =
Xlist.rev_map trees (fun (pref,tree) ->
pref, disjoint_rec [] tree)
let rec print_rules_rec file (M(map,rules)) =
Xlist.iter rules (print_rule file);
CharMap.iter map (fun _ tree -> print_rules_rec file tree)
let print_rules filename trees =
File.file_out filename (fun file ->
Xlist.iter trees (fun (_,tree) ->
print_rules_rec file tree))
end
let load_freq_rules filename =
File.fold_tab filename [] (fun rules -> function
[id; freq; star; pref; find; set; interp] ->
{id=id; freq=int_of_string freq; star=parse_star star; pref=pref; find=find; set=set;
tags=[]; interp=interp} :: rules
| _ -> failwith "load_freq_rules")
let load_rules resource_path rules_filename =
let rules = load_freq_rules (resource_path ^ rules_filename) in
let rules = CharTrees.create rules in
let rules = CharTrees.disjoint rules in
rules
let rules_filename = "freq_rules.tab"
let adj_rules_filename = "adj_freq_rules.tab"
let resource_path = "../resources/SGJP/"
let _ =
let rules = load_rules resource_path adj_rules_filename in
CharTrees.print_rules "results/disjoint_rules_adj" rules