(* semparser.ml *)
open Xstd
open ENIAMsubsyntaxTypes
let rules = ENIAM_LCGlexicon.make_rules false ENIAM_LCGlexiconTypes.user_lexicon_filename
let load_senses_map filename =
  File.fold_tab filename StringMap.empty (fun map -> function
      [lemma;cat] -> StringMap.add_inc map lemma [cat] (fun l -> cat :: l)
    | l -> failwith ("load_senses_map: " ^ String.concat "\t" l))
let senses_map = load_senses_map ENIAM_LCGlexiconTypes.user_senses_filename
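(* Added note (not in the original source): the file behind user_senses_filename is read as
   tab-separated lines of the form lemma<TAB>category, one pair per line; a lemma that occurs on
   several lines accumulates a list of sense categories. The concrete category names depend on
   the lexicon and are not fixed here. *)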
let examples = [
  (* "liceum","W 1984-89 uczęszczał do VII Liceum Ogólnokształcącego im. K.K. Baczyńskiego w Szczecinie."; *)
  "studia","Następnie studiował architekturę na Politechnice Szczecińskiej, dyplom uzyskał w 1994.";
  ]
let clarify_categories token =
  match token.ENIAMtokenizerTypes.token with
    ENIAMtokenizerTypes.Lemma(lemma,pos,interp) ->
      let senses = try StringMap.find senses_map lemma with Not_found -> ["X"] in
      List.flatten (Xlist.map interp (fun interp -> ENIAMcategoriesPL.clarify_categories false senses (lemma,pos,interp)))
  | ENIAMtokenizerTypes.Proper(lemma,pos,interp,senses) ->
      List.flatten (Xlist.map interp (fun interp -> ENIAMcategoriesPL.clarify_categories true senses (lemma,pos,interp)))
  | ENIAMtokenizerTypes.Interp lemma -> ENIAMcategoriesPL.clarify_categories false ["X"] (lemma,"interp",[])
  | _ -> []
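(* Added note: clarify_categories maps a single token to the list of grammatical category records
   used to build its lexical entries. Lemma tokens take their sense list from senses_map, falling
   back to the dummy sense X for unknown lemmas; Proper tokens already carry their senses; Interp
   tokens (punctuation and similar) get the dummy sense. Each morphosyntactic interpretation
   contributes its own records, hence the List.flatten. *)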
let create_chart tokens paths last =
  ENIAM_LCGrenderer.reset_variable_numbers ();
  let chart = ENIAM_LCGchart.make last in
  let chart = Xlist.fold paths chart (fun chart (id,lnode,rnode) ->
      let t = ExtArray.get tokens id in
      ENIAM_LCGrenderer.reset_variable_names ();
      ENIAM_LCGrenderer.add_variable_numbers ();
      let cats = clarify_categories t in
      let l = ENIAM_LCGlexicon.create_entries rules id t.ENIAMtokenizerTypes.orth cats [] in
      ENIAM_LCGchart.add_inc_list chart lnode rnode l 0) in
  chart
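(* Added note: create_chart turns the token graph delivered by the subsyntax stage into an LCG
   chart. Each path edge (id,lnode,rnode) adds the lexical entries generated for token id between
   chart positions lnode and rnode; last is the index of the final node, i.e. the chart length.
   A hypothetical call, assuming tokens is an ExtArray of token_env values covering ids 1 and 2:
     let chart = create_chart tokens [(1,0,1);(2,1,2)] 2 in
     ignore chart
*)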
let test_example name tokens paths last =
  ENIAM_LCGreductions.reset_variant_label ();
  let chart = create_chart tokens paths last in
  ENIAM_LCGlatexOf.print_chart "results/" (name^"1_chart") "a1" chart;
  let chart,references = ENIAM_LCGchart.lazify chart in
  ENIAM_LCGlatexOf.print_chart "results/" (name^"2_chart") "a4" chart;
  ENIAM_LCGlatexOf.print_references "results/" (name^"2_references") "a4" references;
  let chart = ENIAM_LCGchart.parse chart references 30. Sys.time in (* note: references is modified imperatively as a side effect *)
  ENIAM_LCGlatexOf.print_chart "results/" (name^"3_chart") "a4" chart;
  ENIAM_LCGlatexOf.print_references "results/" (name^"3_references") "a4" references;
  if ENIAM_LCGchart.is_parsed chart then (
    let term = ENIAM_LCGchart.get_parsed_term chart in
    Xlatex.latex_file_out "results/" (name^"4_term") "a4" false (fun file ->
      Printf.fprintf file "\\[%s\\]\n" (ENIAM_LCGlatexOf.linear_term 0 term));
    Xlatex.latex_compile_and_clean "results/" (name^"4_term");
    let dependency_tree = ENIAM_LCGreductions.reduce term references in
    ENIAM_LCGlatexOf.print_dependency_tree "results/" (name^"4_dependency_tree") "a0" dependency_tree;
    if ENIAM_LCGreductions.is_reduced_dependency_tree dependency_tree then (
      ENIAM_LCGreductions.assign_labels dependency_tree; (* note: dependency_tree is modified imperatively as a side effect *)
      ENIAM_LCGlatexOf.print_dependency_tree "results/" (name^"5_dependency_tree") "a4" dependency_tree;
      ENIAM_LCGreductions.remove_cuts dependency_tree; (* note: dependency_tree is modified imperatively as a side effect *)
      ENIAM_LCGlatexOf.print_dependency_tree "results/" (name^"6_dependency_tree") "a4" dependency_tree;
      ENIAM_LCGgraphOf.print_dependency_tree "results/" (name^"6_dependency_tree") dependency_tree;
      ENIAM_LCGgraphOf.print_simplified_dependency_tree "results/" (name^"6_simple_dependency_tree") dependency_tree;
      ())
    else print_endline "not reduced")
  else print_endline "not parsed"
let rec parse_sentence name id tokens = function
    RawSentence s -> id
  | StructSentence(paths,last) ->
      test_example (name ^ string_of_int id ^ "_") tokens paths last;
      id + 1
  | DepSentence(paths) -> id
  | QuotedSentences sentences ->
      Xlist.fold sentences id (fun id p ->
        parse_sentence name id tokens p.sentence)
  | AltSentence l ->
      Xlist.fold l id (fun id (mode,sentence) ->
        parse_sentence name id tokens sentence)

let rec parse_paragraph name id tokens = function
    RawParagraph s -> id
  | StructParagraph sentences ->
      Xlist.fold sentences id (fun id p ->
        parse_sentence name id tokens p.sentence)
  | AltParagraph l ->
      Xlist.fold l id (fun id (mode,paragraph) ->
        parse_paragraph name id tokens paragraph)

let rec parse_text name id tokens = function
    RawText s -> id
  | StructText paragraphs ->
      Xlist.fold paragraphs id (fun id paragraph ->
        parse_paragraph name id tokens paragraph)
  | AltText l ->
      Xlist.fold l id (fun id (mode,text) ->
        parse_text name id tokens text)
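(* Added note: parse_sentence, parse_paragraph and parse_text thread an integer counter through
   the folds; it is incremented only when a StructSentence is actually parsed, so every parsed
   sentence gets a unique file-name prefix of the form name ^ string_of_int id ^ "_". Raw and
   dependency sentences leave the counter unchanged. *)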
(* let _ =
  Xlist.iter examples (fun (name,example) ->
    let text,tokens = ENIAMsubsyntax.parse_text example in
    ignore(parse_text name 1 tokens text)) *)
(*
type entry = {title: string; info:string; biogram:string; (*primary:string; secondary:string;*) author:string}
let process_xml = function
Xml.Element("entries",[],entries) ->
List.rev (Xlist.rev_map entries (function
Xml.Element("entry",[],[title;info;biogram(*;primary;secondary*);author]) ->
{title=Xml.to_string title; info=Xml.to_string info; biogram=Xml.to_string biogram;
(*primary=Xml.to_string primary; secondary=Xml.to_string secondary;*) author=Xml.to_string author}
| _ -> failwith "process_xml 1"))
| _ -> failwith "process_xml 2"
let load_ppibl filename =
let ppibl = File.load_file_gen ("data/" ^ filename) in
process_xml (Xml.parse_string ppibl)
let named_entities =
File.fold_tab "data/ne.tab" StringMap.empty (fun map -> function
[lemma;cat] -> StringMap.add_inc map lemma [cat] (fun l -> cat :: l)
| _ -> failwith "named_entities")
let assign_named_entities t =
match t.token with
Lemma(lemma,"subst",interp) ->
(try
let cat = StringMap.find named_entities lemma in
{t with token=Proper(lemma,"subst",interp,cat)}
with Not_found -> t)
| Proper(lemma,"subst",interp,_) ->
(try
let cat = StringMap.find named_entities lemma in
{t with token=Proper(lemma,"subst",interp,cat)}
with Not_found -> t)
| _ -> t
let test_strings = [
(* "Debiutował opowiadaniem pt. <i>Zlecenie na dostawę</i>."; *)
"W 1984-89 uczęszczał do VII Liceum Ogólnokształcącego im. K.K. Baczyńskiego w Szczecinie.";
(* "Następnie studiował architekturę na Politechnice Szczecińskiej, dyplom uzyskał w 1994." *)
(* "W 2003 obronił doktorat nauk technicznych w zakresie architektury i urbanistyki na Politechnice Krakowskiej i został adiunktem w Zakładzie Teorii Architektury, Historii i Konserwacji Zabytków IAiPP." *)
]
(* let _ =
let entries = load_ppibl "ak322269.xml" in
Xlist.iter entries (fun entry -> print_endline entry.biogram) *)
(*
let test_strings = [
"Szpak frunie.";
"Kot np. miauczy.";
"Ala ma kota.";
"Ale mają kota:"
]
let test_strings2 = [
"Szpak frunie. Kot miauczy.";
"Szpak powiedział: „Frunę. Kiszę.”";
]
*)
let grammar = [
"pos=year", Basic "year",symbol_weight;
"pos=year-interval", Basic "year-interval",symbol_weight;
"lemma=w,pos=prep,case=loc", Basic "time/(year+year-interval)",0.;
"lemma=w,pos=prep,case=loc", Basic "locat/np*MIASTO*T*loc*T",0.;
"lemma=uczęszczać,pos=praet|fin,person=ter,negation=aff,mood=indicative", Basic "ip*number*gender{|(1+time),|(1+pp*ORGANIZACJA*do*gen),|(1+locat)}",0.;
"lemma=do,pos=prep,case=gen", Basic "pp*sense*lemma*case/np*sense*T*case*T",0.;
]
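(* Added note: each entry of this toy grammar appears to pair a selector over token attributes
   (lemma, pos, case, person, negation, mood) with a Basic LCG category and a weight. In the
   category strings, * separates a category from its feature values, / introduces an argument,
   { } lists dependents, + marks alternatives and T stands for an unconstrained feature value,
   following the ENIAM_LCGlexicon notation as far as this sketch is concerned. *)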
let _ =
print_endline "Testy wbudowane";
Xlist.iter test_strings (fun s ->
print_endline ("\nTEST: " ^ s);
let paths = ENIAMsubsyntax.parse s in
let paths = Xlist.map paths assign_named_entities in
(* print_endline (ENIAMtokenizer.xml_of tokens); *)
print_endline (ENIAMpaths.to_string (paths,0)));
(* Xlist.iter test_strings2 (fun s ->
print_endline ("\nTEST: " ^ s);
let text,tokens = ENIAMsubsyntax.parse_text s in
(* print_endline (ENIAMtokenizer.xml_of tokens); *)
print_endline (ENIAMsubsyntaxStringOf.tokens tokens);
print_endline "";
print_endline (ENIAMsubsyntaxStringOf.text "" tokens text));*)
(* print_endline "Testy użytkownika.";
print_endline "Wpisz tekst i naciśnij ENTER, pusty tekst kończy.";
let s = ref (read_line ()) in
while !s <> "" do
let tokens = ENIAMtokenizer.parse !s in
(* print_endline (ENIAMtokenizer.xml_of tokens); *)
Xlist.iter tokens (fun token -> print_endline (ENIAMtokenizer.string_of 0 token));
print_endline "Wpisz tekst i naciśnij ENTER, pusty tekst kończy.";
s := read_line ()
done;*)
()
open ENIAM_LCGlexiconTypes
open ENIAM_LCGtypes
(*
type output = Text | Xml | Html | Marsh | Graphviz
let output = ref Text
let comm_stdio = ref true
let sentence_split = ref true
let port = ref 0
let spec_list = [
"-s", Arg.Unit (fun () -> sentence_split:=true), "Split input into sentences (default)";
"-n", Arg.Unit (fun () -> sentence_split:=false), "Do not split input into sentences";
"-i", Arg.Unit (fun () -> comm_stdio:=true), "Communication using stdio (default)";
"-p", Arg.Int (fun p -> comm_stdio:=false; port:=p), "<port> Communication using sockets on given port number";
"-t", Arg.Unit (fun () -> output:=Text), "Output as plain text (default)";
"-x", Arg.Unit (fun () -> output:=Xml), "Output as XML";
"-m", Arg.Unit (fun () -> output:=Marsh), "Output as marshalled Ocaml data structure";
"-h", Arg.Unit (fun () -> output:=Html), "Output as HTML";
"-g", Arg.Unit (fun () -> output:=Graphviz; sentence_split:=false), "Output as graphviz dot file; turns sentence split off";
(* "-r", Arg.String (fun p ->
ENIAMtokenizerTypes.set_resource_path p;
ENIAMmorphologyTypes.set_resource_path p;
ENIAMsubsyntaxTypes.set_resource_path p), "<path> Set resource path"; *)
]
let usage_msg =
  "Usage: subsyntax <options>\nInput is a sequence of lines. An empty line ends the sequence and invokes parsing. A double empty line shuts down the parser.\nOptions are:"
*)*)
let message = "ENIAM_LCGparser, a parser for Logical Categorial Grammar formalism\n\
Copyright (C) 2017 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>\n\
Copyright (C) 2017 Institute of Computer Science Polish Academy of Sciences"
(*
let anon_fun s = raise (Arg.Bad ("invalid argument: " ^ s))
*)
let input_text channel =
  let s = ref (try input_line channel with End_of_file -> "") in
  let lines = ref [] in
  while !s <> "" do
    lines := !s :: !lines;
    s := try input_line channel with End_of_file -> ""
  done;
  String.concat "\n" (List.rev !lines)
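(* Added usage sketch (hypothetical, not part of the original): input_text collects lines from a
   channel until the first empty line or end of file and returns them joined with newlines, e.g.
     let request = input_text stdin in
     if request = "" then prerr_endline "empty request" else prerr_endline request
   An immediately empty line therefore yields the empty string, which main_loop below uses as its
   termination condition. *)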
let rec main_loop sub_in sub_out in_chan out_chan =
  let text = input_text in_chan in
  if text = "" then () else (
    Printf.fprintf sub_out "%s\n\n%!" text;
    let text,tokens = (Marshal.from_channel sub_in : ENIAMsubsyntaxTypes.text * ENIAMtokenizerTypes.token_env ExtArray.t) in
    (* let text,tokens = ENIAMsubsyntax.parse_text text in *)
    ignore(parse_text "E"(*name*) 1 tokens text)
    (* print_endline "input text begin";
       print_endline text;
       print_endline "input text end"; *)
    (*if !sentence_split then
        let text,tokens = ENIAMsubsyntax.parse_text text in
        (match !output with
          Text -> output_string out_chan (ENIAMsubsyntaxStringOf.text "" tokens text ^ "\n" ^ ENIAMsubsyntaxStringOf.token_extarray tokens ^ "\n\n")
        | Xml -> output_string out_chan (Xml.to_string (ENIAMsubsyntaxXMLof.text_and_tokens text tokens) ^ "\n\n")
        | Html -> output_string out_chan (ENIAMsubsyntaxHTMLof.text_and_tokens text tokens ^ "\n\n")
        | Marsh -> Marshal.to_channel out_chan (text,tokens) []
        | Graphviz -> failwith "main_loop: ni")
      else
        let tokens = ENIAMsubsyntax.parse text in
        (match !output with
          Text -> output_string out_chan (ENIAMsubsyntaxStringOf.token_list tokens ^ "\n\n")
        | Xml -> output_string out_chan (Xml.to_string (ENIAMsubsyntaxXMLof.token_list tokens) ^ "\n\n")
        | Html -> output_string out_chan (ENIAMsubsyntaxHTMLof.token_list tokens ^ "\n\n")
        | Marsh -> Marshal.to_channel out_chan tokens []
        | Graphviz -> output_string out_chan (ENIAMsubsyntaxGraphOf.token_list tokens ^ "\n\n"))*);
    flush out_chan;
    main_loop sub_in sub_out in_chan out_chan)
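(* Added note: main_loop speaks a simple line-based protocol. The request text, terminated by an
   empty line, is forwarded over sub_out to the subsyntax server, which is expected to answer on
   sub_in with a marshalled (text, token array) pair; the result is then parsed locally with
   parse_text. An empty request ends the loop. Marshal.from_channel assumes the server was built
   against the same type definitions, otherwise unmarshalling is unsafe. *)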
let get_sock_addr host_name port =
  let he = Unix.gethostbyname host_name in
  let addr = he.Unix.h_addr_list in
  Unix.ADDR_INET(addr.(0),port)
let sub_host = "localhost"
let sub_port = 5739
let _ =
  prerr_endline message;
  (* ENIAMsubsyntax.initialize (); *)
  ENIAMcategoriesPL.initialize ();
  (* Arg.parse spec_list anon_fun usage_msg; *)
  Gc.compact ();
  let sub_in,sub_out = Unix.open_connection (get_sock_addr sub_host sub_port) in
  prerr_endline "Ready!";
  (*if !comm_stdio then*) main_loop sub_in sub_out stdin stdout
  (*else
    let sockaddr = Unix.ADDR_INET(Unix.inet_addr_any,!port) in
    Unix.establish_server main_loop sockaddr*)