generate.ml
2.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
(*
* ENIAMmorphology, a morphological analyser and a guesser for Polish
* Copyright (C) 2016-2017 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
* Copyright (C) 2016-2017 Institute of Computer Science Polish Academy of Sciences
*
* This library is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*)
open Xstd
(* Directory holding the external NLP resources, given relative to the
   directory this program is run from. The trailing slash is required
   because paths are built by plain string concatenation. *)
let nlp_resources_path = "../../NLP resources/"

(* Directory of the SGJP dictionary inside the NLP resources directory. *)
let sgjp_path = String.concat "" [nlp_resources_path; "SGJP/"]

(* File name of the gzipped, tab-separated SGJP dump used as the main
   dictionary input. *)
let sgjp_filename = "sgjp-20170730.tab.gz"
(* Dictionary inputs as [(directory, filename)] pairs: the SGJP dump comes
   first, followed by the supplementary tables stored under ["data/"], in
   the order listed here. The entry point passes this list to
   [Dict.generate_rule_frequencies_list]. *)
let sources =
  let supplementary_files = [
    "noun-supplement-acro.tab";
    "noun-supplement-polimorf.tab";
    "dial_ach.tab";
    "dial_ami2.tab";
    "dial_ami3.tab";
    "dial_ami4.tab";
    "dial_ami.tab";
    "dial_ą2.tab";
    "dial_ą.tab";
    "dial_ę.tab";
    "dial_my.tab";
    "dial_sz.tab";
    "dial_ym.tab";
  ] in
  (sgjp_path, sgjp_filename)
  :: List.map (fun name -> ("data/", name)) supplementary_files
(* [compound_rules] is the result of [ENIAMmorphologyRules.make_compound_rules ()];
   [interp_compound_rule_trees] is built from it by
   [ENIAMmorphologyRules.make_interp_compound_rule_trees].
   Both are computed once, when this module is initialised. *)
let compound_rules, interp_compound_rule_trees =
  let rules = ENIAMmorphologyRules.make_compound_rules () in
  (rules, ENIAMmorphologyRules.make_interp_compound_rule_trees rules)
(* [generate_alt rules_filename path filename out_filename] loads the
   frequency rules from [rules_filename], loads the dictionary at
   [path ^ filename], runs it through the [Dict] steps below and prints the
   result to [out_filename].

   The steps, in order: merge entries, process interpretations, remove the
   ["cond"] category, validate interpretations against the rule trees, and
   remove the validated forms. Presumably what remains are the forms the
   rules could not account for -- confirm against [Dict]. *)
let generate_alt rules_filename path filename out_filename =
  (* The rules are loaded before the dictionary, as in the original order. *)
  let rule_trees =
    rules_filename
    |> ENIAMmorphologyRules.load_freq_rules
    |> ENIAMmorphologyRules.CharTrees.create in
  Dict.load_tab (path ^ filename)
  |> Dict.merge_entries
  |> Dict.process_interps
  |> Dict.remove_cat "cond"
  (* The [Dict.mark_ndm] step is intentionally disabled here. *)
  |> Dict.validate_interp rule_trees
  |> Dict.remove_validated_forms
  |> Dict.print out_filename
(* [generate_lemmata path filename out_filename] loads the dictionary at
   [path ^ filename], passes the lemma of every entry through
   [Stem.simplify_lemma], collects the results in a [StringSet] (so each
   distinct value is kept once) and writes them to [out_filename], one per
   line, in the set's iteration order. *)
let generate_lemmata path filename out_filename =
  let add_simplified_lemma acc entry =
    let simplified = Stem.simplify_lemma entry.ENIAMmorphologyTypes.lemma in
    StringSet.add acc simplified in
  let entries = Dict.load_tab (path ^ filename) in
  let unique_lemmata = Xlist.fold entries StringSet.empty add_simplified_lemma in
  File.file_out out_filename (fun out ->
    let write_line = Printf.fprintf out "%s\n" in
    StringSet.iter unique_lemmata write_line)
(* Entry point: regenerates the files under ["resources/"], in a fixed order.
   The rule-frequency table must come first: [generate_alt] loads it by file
   name, and [Dict.generate_stem_dict] is handed the same file name
   (presumably to read it as well -- confirm against [Dict]). *)
let () =
  let freq_rules_filename = "resources/freq_rules.tab" in
  Dict.generate_rule_frequencies_list
    interp_compound_rule_trees sources freq_rules_filename;
  generate_alt freq_rules_filename sgjp_path sgjp_filename "resources/alt.tab";
  Dict.generate_stem_dict
    freq_rules_filename sgjp_path sgjp_filename "resources/stem.tab";
  Dict.generate_wyglos sgjp_path sgjp_filename "resources/wyglos.tab";
  generate_lemmata sgjp_path sgjp_filename "resources/lemmata.tab";
  ()