test2.ml
6.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
(*
* ENIAM_LCGlexicon is a library that provides an LCG lexicon for Polish
* Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
* Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
*
* This library is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*)
open ENIAM_LCGlexiconTypes
open ENIAM_LCGtypes
open ENIAMsubsyntaxTypes
(** Lexicon rules produced by [ENIAM_LCGlexicon.make_rules] from the file
    named by [ENIAM_LCGlexiconTypes.rules_filename]. Evaluated once, when
    this module is initialised. *)
let rules =
  let filename = ENIAM_LCGlexiconTypes.rules_filename in
  ENIAM_LCGlexicon.make_rules filename
(** Test inputs as [(name, text)] pairs: [name] becomes the prefix of the
    generated result files and [text] is the Polish input handed to the
    parser. Entries kept inside comments are disabled examples. *)
let examples =
  [
    (* "Szpak","Szpak śpiewa.";*)
    (* "miał","Miałem miał."; *)
    (* "Ala","Ala ma kota.";
    "Ale","Ale mają kota:"; *)
    (* "zima","Szpak frunie zimą.";*)
    (* "październik","Kot miauczy w październiku."; *)
    (* "Szpak-Kot","Szpak frunie. Kot miauczy.";
    "powiedział","Szpak powiedział: „Frunę. Kiszę.”";*)
    ("teraz", "Teraz frunie jakiś szpak.");
    ("chłopcy", "Chłopcy mają ulicę kwiatami.");
    (* "arabia","Arabia Saudyjska biegnie.";*)
    (* "Tom","Tom idzie."; *)
  ]
(** [clarify_categories senses token] returns the list of categories that
    [ENIAMcategoriesPL.clarify_categories] yields for [token].

    - [Lemma] and [Proper] tokens are expanded once per interpretation and
      the per-interpretation results are concatenated in order.
    - [Interp] tokens are looked up with part of speech ["interp"] and an
      empty interpretation.
    - Every other kind of token yields [[]]. *)
let clarify_categories senses token =
  (* Expands each interpretation of (lemma, pos) and concatenates the
     results. [is_proper] is forwarded as the first argument of
     [ENIAMcategoriesPL.clarify_categories]; it is [true] only for [Proper]
     tokens (presumably marking proper names; confirm in ENIAMcategoriesPL). *)
  let expand is_proper lemma pos interps =
    let per_interp =
      Xlist.map interps (fun interp ->
        ENIAMcategoriesPL.clarify_categories is_proper senses (lemma, pos, interp))
    in
    List.concat per_interp
  in
  match token.ENIAMtokenizerTypes.token with
  | ENIAMtokenizerTypes.Lemma (lemma, pos, interps) ->
      expand false lemma pos interps
  | ENIAMtokenizerTypes.Proper (lemma, pos, interps, _) ->
      expand true lemma pos interps
  | ENIAMtokenizerTypes.Interp lemma ->
      ENIAMcategoriesPL.clarify_categories false senses (lemma, "interp", [])
  | _ -> []
(** [create_chart tokens lex_sems paths last] builds the initial LCG chart
    for one sentence.

    [paths] is a list of [(id, lnode, rnode)] edges; [id] indexes both
    [tokens] and [lex_sems]. For every edge the lexicon entries generated
    from the token are added to the chart between [lnode] and [rnode].
    [last] is passed to [ENIAM_LCGchart.make] when the empty chart is
    created.

    Side effects: resets the renderer's variable numbers once up front, and
    for each edge resets the variable names and adds variable numbers. *)
let create_chart tokens lex_sems paths last =
  ENIAM_LCGrenderer.reset_variable_numbers ();
  let empty_chart = ENIAM_LCGchart.make last in
  (* Adds the lexicon entries of a single edge to the chart built so far. *)
  let add_edge acc (id, lnode, rnode) =
    let tok = ExtArray.get tokens id in
    let sem = ExtArray.get lex_sems id in
    ENIAM_LCGrenderer.reset_variable_names ();
    ENIAM_LCGrenderer.add_variable_numbers ();
    (* NOTE(review): ["X"] is the fixed sense list used by this test; its
       meaning is defined by ENIAMcategoriesPL — confirm there. *)
    let cats = clarify_categories ["X"] tok in
    let entries =
      ENIAM_LCGlexicon.create_entries rules id tok.ENIAMtokenizerTypes.orth
        cats sem.ENIAMlexSemanticsTypes.schemata
    in
    ENIAM_LCGchart.add_inc_list acc lnode rnode entries 0
  in
  Xlist.fold paths empty_chart add_edge
(** [test_example name tokens lex_sems paths last] runs the parsing pipeline
    on one sentence and writes every intermediate result under ["results/"],
    using [name] as the file-name prefix.

    Stages: build the chart, lazify it, parse it, and (if parsing succeeded)
    print the parsed term, reduce it to a dependency tree, then (if the tree
    is reduced) assign labels and remove cuts. A LaTeX or graph file is
    written after each stage. Prints ["not parsed"] or ["not reduced"] to
    standard output when the corresponding stage fails. *)
let test_example name tokens lex_sems paths last =
  let dir = "results/" in
  (* Output file name for one pipeline stage. *)
  let out_name suffix = name ^ suffix in
  ENIAM_LCGreductions.reset_variant_label ();
  let chart = create_chart tokens lex_sems paths last in
  ENIAM_LCGlatexOf.print_chart dir (out_name "1_chart") "a1" chart;
  let chart, references = ENIAM_LCGchart.lazify chart in
  ENIAM_LCGlatexOf.print_chart dir (out_name "2_chart") "a4" chart;
  ENIAM_LCGlatexOf.print_references dir (out_name "2_references") "a4" references;
  (* Note: [parse] implicitly mutates [references] in place. *)
  let chart = ENIAM_LCGchart.parse chart references 30. Sys.time in
  ENIAM_LCGlatexOf.print_chart dir (out_name "3_chart") "a4" chart;
  ENIAM_LCGlatexOf.print_references dir (out_name "3_references") "a4" references;
  if not (ENIAM_LCGchart.is_parsed chart) then print_endline "not parsed"
  else begin
    let term = ENIAM_LCGchart.get_parsed_term chart in
    Xlatex.latex_file_out dir (out_name "4_term") "a4" false (fun out ->
      Printf.fprintf out "\\[%s\\]\n" (ENIAM_LCGlatexOf.linear_term 0 term));
    Xlatex.latex_compile_and_clean dir (out_name "4_term");
    let dependency_tree = ENIAM_LCGreductions.reduce term references in
    ENIAM_LCGlatexOf.print_dependency_tree dir (out_name "4_dependency_tree") "a0" dependency_tree;
    if not (ENIAM_LCGreductions.is_reduced_dependency_tree dependency_tree) then
      print_endline "not reduced"
    else begin
      (* Note: [assign_labels] implicitly mutates [dependency_tree] in place. *)
      ENIAM_LCGreductions.assign_labels dependency_tree;
      ENIAM_LCGlatexOf.print_dependency_tree dir (out_name "5_dependency_tree") "a4" dependency_tree;
      (* Note: [remove_cuts] implicitly mutates [dependency_tree] in place. *)
      ENIAM_LCGreductions.remove_cuts dependency_tree;
      ENIAM_LCGlatexOf.print_dependency_tree dir (out_name "6_dependency_tree") "a4" dependency_tree;
      ENIAM_LCGgraphOf.print_dependency_tree dir (out_name "6_dependency_tree") dependency_tree;
      ENIAM_LCGgraphOf.print_simplified_dependency_tree dir (out_name "6_simple_dependency_tree") dependency_tree;
      ()
    end
  end
(** [parse_sentence name id tokens lex_sems sentence] walks [sentence] and
    runs [test_example] on every [StructSentence] it contains, using
    [name ^ string_of_int id ^ "_"] as the output prefix.

    [id] is a running counter: it is incremented once per tested
    [StructSentence], and the value after the last one is returned.
    [RawSentence] and [DepSentence] are skipped and leave [id] unchanged;
    [QuotedSentences] and [AltSentence] are traversed recursively, left to
    right. *)
let rec parse_sentence name id tokens lex_sems sentence =
  (* Recurse into a nested sentence, threading the running counter. *)
  let descend next_id nested =
    parse_sentence name next_id tokens lex_sems nested
  in
  match sentence with
  | RawSentence _ | DepSentence _ -> id
  | StructSentence (paths, last) ->
      let prefix = name ^ string_of_int id ^ "_" in
      test_example prefix tokens lex_sems paths last;
      id + 1
  | QuotedSentences quoted ->
      Xlist.fold quoted id (fun next_id record ->
        descend next_id record.sentence)
  | AltSentence variants ->
      (* The mode attached to each alternative is not used here. *)
      Xlist.fold variants id (fun next_id (_, variant) ->
        descend next_id variant)
(** [parse_paragraph name id tokens lex_sems paragraph] applies
    [parse_sentence] to every sentence of [paragraph], threading the running
    counter [id] from left to right, and returns the counter's final value.
    [RawParagraph] is skipped and leaves [id] unchanged; each alternative of
    an [AltParagraph] is traversed recursively. *)
let rec parse_paragraph name id tokens lex_sems paragraph =
  match paragraph with
  | RawParagraph _ -> id
  | StructParagraph records ->
      Xlist.fold records id (fun next_id record ->
        parse_sentence name next_id tokens lex_sems record.sentence)
  | AltParagraph variants ->
      (* The mode attached to each alternative is not used here. *)
      Xlist.fold variants id (fun next_id (_, variant) ->
        parse_paragraph name next_id tokens lex_sems variant)
(** [parse_text name id tokens lex_sems text] applies [parse_paragraph] to
    every paragraph of [text], threading the running counter [id] from left
    to right, and returns the counter's final value. [RawText] is skipped
    and leaves [id] unchanged; each alternative of an [AltText] is traversed
    recursively. *)
let rec parse_text name id tokens lex_sems text =
  match text with
  | RawText _ -> id
  | StructText paragraphs ->
      Xlist.fold paragraphs id (fun next_id paragraph ->
        parse_paragraph name next_id tokens lex_sems paragraph)
  | AltText variants ->
      (* The mode attached to each alternative is not used here. *)
      Xlist.fold variants id (fun next_id (_, variant) ->
        parse_text name next_id tokens lex_sems variant)
(* Entry point: for each (name, text) pair in [examples], run subsyntax
   analysis, assign lexical semantics, and test every structured sentence
   found. Sentence numbering restarts at 1 for every example. *)
let () =
  let run_example (name, example) =
    let text, tokens = ENIAMsubsyntax.parse_text example in
    let lex_sems = ENIAMlexSemantics.assign tokens text in
    (* The returned counter (next free sentence number) is not needed. *)
    ignore (parse_text name 1 tokens lex_sems text : int)
  in
  Xlist.iter examples run_example