Commit cc1f25eb75999621d427e059e1614a05cdbf1e7f
1 parent
4102403e
Wstępna wersja biblioteki eniam-exec
Showing 15 changed files with 729 additions and 676 deletions
LCGlexicon/interface.ml deleted
1 | -(* | |
2 | - * ENIAM_LCGlexicon is a library that provides LCG lexicon for Polish | |
3 | - * Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl> | |
4 | - * Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences | |
5 | - * | |
6 | - * This library is free software: you can redistribute it and/or modify | |
7 | - * it under the terms of the GNU Lesser General Public License as published by | |
8 | - * the Free Software Foundation, either version 3 of the License, or | |
9 | - * (at your option) any later version. | |
10 | - * | |
11 | - * This library is distributed in the hope that it will be useful, | |
12 | - * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | - * GNU Lesser General Public License for more details. | |
15 | - * | |
16 | - * You should have received a copy of the GNU Lesser General Public License | |
17 | - * along with this program. If not, see <http://www.gnu.org/licenses/>. | |
18 | - *) | |
19 | - | |
20 | -open ENIAM_LCGlexiconTypes | |
21 | -open ENIAM_LCGtypes | |
22 | -open ENIAMsubsyntaxTypes | |
23 | - | |
24 | -let rules = ENIAM_LCGlexicon.make_rules ENIAM_LCGlexiconTypes.rules_filename | |
25 | - | |
26 | -let examples = [ | |
27 | - (* "Szpak","Szpak śpiewa.";*) | |
28 | - (* "miał","Miałem miał."; *) | |
29 | -(* "Ala","Ala ma kota."; | |
30 | - "Ale","Ale mają kota:"; *) | |
31 | - (* "zima","Szpak frunie zimą.";*) | |
32 | - (* "październik","Kot miauczy w październiku."; *) | |
33 | -(* "Szpak-Kot","Szpak frunie. Kot miauczy."; | |
34 | - "powiedział","Szpak powiedział: „Frunę. Kiszę.”";*) | |
35 | - (* "teraz","Teraz frunie jakiś szpak."; | |
36 | - "chłopcy","Chłopcy mają ulicę kwiatami."; *) | |
37 | - (* "arabia","Arabia Saudyjska biegnie.";*) | |
38 | -(* "Tom","Tom idzie."; *) | |
39 | - "liceum","W 1984-89 uczęszczał do VII Liceum Ogólnokształcącego im. K.K. Baczyńskiego w Szczecinie."; | |
40 | - "studia","Następnie studiował architekturę na Politechnice Szczecińskiej, dyplom uzyskał w 1994."; | |
41 | -] | |
42 | - | |
43 | -let clarify_categories senses token = | |
44 | - match token.ENIAMtokenizerTypes.token with | |
45 | - ENIAMtokenizerTypes.Lemma(lemma,pos,interp) -> List.flatten (Xlist.map interp (fun interp -> ENIAMcategoriesPL.clarify_categories false senses (lemma,pos,interp))) | |
46 | - | ENIAMtokenizerTypes.Proper(lemma,pos,interp,_) -> List.flatten (Xlist.map interp (fun interp -> ENIAMcategoriesPL.clarify_categories true senses (lemma,pos,interp))) | |
47 | - | ENIAMtokenizerTypes.Interp lemma -> ENIAMcategoriesPL.clarify_categories false senses (lemma,"interp",[]) | |
48 | - | _ -> [] | |
49 | - | |
50 | -let create_chart tokens lex_sems paths last = | |
51 | - ENIAM_LCGrenderer.reset_variable_numbers (); | |
52 | - let chart = ENIAM_LCGchart.make last in | |
53 | - let chart = Xlist.fold paths chart (fun chart (id,lnode,rnode) -> | |
54 | - let t = ExtArray.get tokens id in | |
55 | - let s = ExtArray.get lex_sems id in | |
56 | - ENIAM_LCGrenderer.reset_variable_names (); | |
57 | - ENIAM_LCGrenderer.add_variable_numbers (); | |
58 | - let cats = clarify_categories ["X"] t in | |
59 | - let l = ENIAM_LCGlexicon.create_entries rules id t.ENIAMtokenizerTypes.orth cats s.ENIAMlexSemanticsTypes.schemata in | |
60 | - ENIAM_LCGchart.add_inc_list chart lnode rnode l 0) in | |
61 | - chart | |
62 | - | |
63 | -let test_example name tokens lex_sems paths last = | |
64 | - ENIAM_LCGreductions.reset_variant_label (); | |
65 | - let chart = create_chart tokens lex_sems paths last in | |
66 | - ENIAM_LCGlatexOf.print_chart "results/" (name^"1_chart") "a1" chart; | |
67 | - let chart,references = ENIAM_LCGchart.lazify chart in | |
68 | - ENIAM_LCGlatexOf.print_chart "results/" (name^"2_chart") "a4" chart; | |
69 | - ENIAM_LCGlatexOf.print_references "results/" (name^"2_references") "a4" references; | |
70 | - let chart = ENIAM_LCGchart.parse chart references 30. Sys.time in (* uwaga: niejawna zmiana imperatywna w references *) | |
71 | - ENIAM_LCGlatexOf.print_chart "results/" (name^"3_chart") "a4" chart; | |
72 | - ENIAM_LCGlatexOf.print_references "results/" (name^"3_references") "a4" references; | |
73 | - if ENIAM_LCGchart.is_parsed chart then ( | |
74 | - let term = ENIAM_LCGchart.get_parsed_term chart in | |
75 | - Xlatex.latex_file_out "results/" (name^"4_term") "a4" false (fun file -> | |
76 | - Printf.fprintf file "\\[%s\\]\n" (ENIAM_LCGlatexOf.linear_term 0 term)); | |
77 | - Xlatex.latex_compile_and_clean "results/" (name^"4_term"); | |
78 | - let dependency_tree = ENIAM_LCGreductions.reduce term references in | |
79 | - ENIAM_LCGlatexOf.print_dependency_tree "results/" (name^"4_dependency_tree") "a0" dependency_tree; | |
80 | - if ENIAM_LCGreductions.is_reduced_dependency_tree dependency_tree then ( | |
81 | - ENIAM_LCGreductions.assign_labels dependency_tree; (* uwaga: niejawna zmiana imperatywna w dependency_tree *) | |
82 | - ENIAM_LCGlatexOf.print_dependency_tree "results/" (name^"5_dependency_tree") "a4" dependency_tree; | |
83 | - ENIAM_LCGreductions.remove_cuts dependency_tree; (* uwaga: niejawna zmiana imperatywna w dependency_tree *) | |
84 | - ENIAM_LCGlatexOf.print_dependency_tree "results/" (name^"6_dependency_tree") "a4" dependency_tree; | |
85 | - ENIAM_LCGgraphOf.print_dependency_tree "results/" (name^"6_dependency_tree") dependency_tree; | |
86 | - ENIAM_LCGgraphOf.print_simplified_dependency_tree "results/" (name^"6_simple_dependency_tree") dependency_tree; | |
87 | - ()) | |
88 | - else print_endline "not reduced") | |
89 | - else print_endline "not parsed" | |
90 | - | |
91 | -let rec parse_sentence name id tokens lex_sems = function | |
92 | - RawSentence s -> id | |
93 | - | StructSentence(paths,last) -> | |
94 | - test_example (name ^ string_of_int id ^ "_") tokens lex_sems paths last; | |
95 | - id + 1 | |
96 | - | DepSentence(paths) -> id | |
97 | - | QuotedSentences sentences -> | |
98 | - Xlist.fold sentences id (fun id p -> | |
99 | - parse_sentence name id tokens lex_sems p.sentence) | |
100 | - | AltSentence l -> | |
101 | - Xlist.fold l id (fun id (mode,sentence) -> | |
102 | - parse_sentence name id tokens lex_sems sentence) | |
103 | - | |
104 | -let rec parse_paragraph name id tokens lex_sems = function | |
105 | - RawParagraph s -> id | |
106 | - | StructParagraph sentences -> | |
107 | - Xlist.fold sentences id (fun id p -> | |
108 | - parse_sentence name id tokens lex_sems p.sentence) | |
109 | - | AltParagraph l -> | |
110 | - Xlist.fold l id (fun id (mode,paragraph) -> | |
111 | - parse_paragraph name id tokens lex_sems paragraph) | |
112 | - | |
113 | -let rec parse_text name id tokens lex_sems = function | |
114 | - RawText s -> id | |
115 | - | StructText paragraphs -> | |
116 | - Xlist.fold paragraphs id (fun id paragraph -> | |
117 | - parse_paragraph name id tokens lex_sems paragraph) | |
118 | - | AltText l -> | |
119 | - Xlist.fold l id (fun id (mode,text) -> | |
120 | - parse_text name id tokens lex_sems text) | |
121 | - | |
122 | - | |
123 | -let _ = | |
124 | - ENIAMsubsyntax.initialize (); | |
125 | - ENIAMcategoriesPL.initialize (); | |
126 | - Xlist.iter examples (fun (name,example) -> | |
127 | - let text,tokens = ENIAMsubsyntax.parse_text example in | |
128 | - let lex_sems = ENIAMlexSemantics.assign tokens text in | |
129 | - ignore(parse_text name 1 tokens lex_sems text)) | |
130 | - | |
131 | -(* | |
132 | -type output = Text | Xml | Html | Marsh | Graphviz | |
133 | - | |
134 | -let output = ref Text | |
135 | -let comm_stdio = ref true | |
136 | -let sentence_split = ref true | |
137 | -let port = ref 0 | |
138 | - | |
139 | -let spec_list = [ | |
140 | - "-s", Arg.Unit (fun () -> sentence_split:=true), "Split input into sentences (default)"; | |
141 | - "-n", Arg.Unit (fun () -> sentence_split:=false), "Do not split input into sentences"; | |
142 | - "-i", Arg.Unit (fun () -> comm_stdio:=true), "Communication using stdio (default)"; | |
143 | - "-p", Arg.Int (fun p -> comm_stdio:=false; port:=p), "<port> Communication using sockets on given port number"; | |
144 | - "-t", Arg.Unit (fun () -> output:=Text), "Output as plain text (default)"; | |
145 | - "-x", Arg.Unit (fun () -> output:=Xml), "Output as XML"; | |
146 | - "-m", Arg.Unit (fun () -> output:=Marsh), "Output as marshalled Ocaml data structure"; | |
147 | - "-h", Arg.Unit (fun () -> output:=Html), "Output as HTML"; | |
148 | - "-g", Arg.Unit (fun () -> output:=Graphviz; sentence_split:=false), "Output as graphviz dot file; turns sentence split off"; | |
149 | - (* "-r", Arg.String (fun p -> | |
150 | - ENIAMtokenizerTypes.set_resource_path p; | |
151 | - ENIAMmorphologyTypes.set_resource_path p; | |
152 | - ENIAMsubsyntaxTypes.set_resource_path p), "<path> Set resource path"; *) | |
153 | - ] | |
154 | - | |
155 | -let usage_msg = | |
156 | - "Usage: subsyntax <options>\nInput is a sequence of lines. Empty line ends the sequence and invoke parsing. Double empty line shutdown parser.\nOptions are:" | |
157 | - | |
158 | -let message = "ENIAMsubsyntax: MWE, abbreviation and sentence detection for Polish\n\ | |
159 | -Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>\n\ | |
160 | -Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences" | |
161 | - | |
162 | -let anon_fun s = raise (Arg.Bad ("invalid argument: " ^ s)) | |
163 | - | |
164 | -let input_text channel = | |
165 | - let s = ref (try input_line channel with End_of_file -> "") in | |
166 | - let lines = ref [] in | |
167 | - while !s <> "" do | |
168 | - lines := !s :: !lines; | |
169 | - s := try input_line channel with End_of_file -> "" | |
170 | - done; | |
171 | - String.concat "\n" (List.rev !lines) | |
172 | - | |
173 | -let rec main_loop in_chan out_chan = | |
174 | - let text = input_text in_chan in | |
175 | - if text = "" then () else ( | |
176 | - (* print_endline "input text begin"; | |
177 | - print_endline text; | |
178 | - print_endline "input text end"; *) | |
179 | - (if !sentence_split then | |
180 | - let text,tokens = ENIAMsubsyntax.parse_text text in | |
181 | - (match !output with | |
182 | - Text -> output_string out_chan (ENIAMsubsyntaxStringOf.text "" tokens text ^ "\n" ^ ENIAMsubsyntaxStringOf.token_extarray tokens ^ "\n\n") | |
183 | - | Xml -> output_string out_chan (Xml.to_string (ENIAMsubsyntaxXMLof.text_and_tokens text tokens) ^ "\n\n") | |
184 | - | Html -> output_string out_chan (ENIAMsubsyntaxHTMLof.text_and_tokens text tokens ^ "\n\n") | |
185 | - | Marsh -> Marshal.to_channel out_chan (text,tokens) [] | |
186 | - | Graphviz -> failwith "main_loop: ni") | |
187 | - else | |
188 | - let tokens = ENIAMsubsyntax.parse text in | |
189 | - (match !output with | |
190 | - Text -> output_string out_chan (ENIAMsubsyntaxStringOf.token_list tokens ^ "\n\n") | |
191 | - | Xml -> output_string out_chan (Xml.to_string (ENIAMsubsyntaxXMLof.token_list tokens) ^ "\n\n") | |
192 | - | Html -> output_string out_chan (ENIAMsubsyntaxHTMLof.token_list tokens ^ "\n\n") | |
193 | - | Marsh -> Marshal.to_channel out_chan tokens [] | |
194 | - | Graphviz -> output_string out_chan (ENIAMsubsyntaxGraphOf.token_list tokens ^ "\n\n"))); | |
195 | - flush out_chan; | |
196 | - main_loop in_chan out_chan) | |
197 | - | |
198 | -let _ = | |
199 | - prerr_endline message; | |
200 | - Arg.parse spec_list anon_fun usage_msg; | |
201 | - Gc.compact (); | |
202 | - prerr_endline "Ready!"; | |
203 | - if !comm_stdio then main_loop stdin stdout | |
204 | - else | |
205 | - let sockaddr = Unix.ADDR_INET(Unix.inet_addr_any,!port) in | |
206 | - Unix.establish_server main_loop sockaddr | |
207 | -*) |
LCGlexicon/makefile
... | ... | @@ -4,8 +4,6 @@ OCAMLDEP=ocamldep |
4 | 4 | INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I +eniam |
5 | 5 | OCAMLFLAGS=$(INCLUDES) -g |
6 | 6 | OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-lcg-parser.cmxa eniam-lcg-lexicon.cmxa |
7 | -OCAMLOPTFLAGS2=$(OCAMLOPTFLAGS) eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa | |
8 | -OCAMLOPTFLAGS3=$(OCAMLOPTFLAGS2) eniam-lexSemantics.cmxa | |
9 | 7 | INSTALLDIR=`ocamlc -where`/eniam |
10 | 8 | |
11 | 9 | SOURCES= ENIAM_LCGlexiconTypes.ml ENIAMcategoriesPL.ml ENIAM_LCGlexiconParser.ml ENIAM_LCGlexicon.ml |
... | ... | @@ -39,17 +37,6 @@ test: test.ml |
39 | 37 | mkdir -p results |
40 | 38 | $(OCAMLOPT) -o test $(OCAMLOPTFLAGS) test.ml |
41 | 39 | |
42 | -test2: test2.ml | |
43 | - mkdir -p results | |
44 | - $(OCAMLOPT) -o test2 $(OCAMLOPTFLAGS3) test2.ml | |
45 | - | |
46 | -interface: interface.ml | |
47 | - $(OCAMLOPT) -o parser $(OCAMLOPTFLAGS3) interface.ml | |
48 | - | |
49 | -semparser: semparser.ml | |
50 | - mkdir -p results | |
51 | - $(OCAMLOPT) -o semparser $(OCAMLOPTFLAGS2) semparser.ml | |
52 | - | |
53 | 40 | print_lexicon: ENIAM_LCGlexiconLatexOf.ml |
54 | 41 | mkdir -p results |
55 | 42 | $(OCAMLOPT) -o print_lexicon $(OCAMLOPTFLAGS) ENIAM_LCGlexiconLatexOf.ml |
... | ... | @@ -75,4 +62,4 @@ print_lexicon: ENIAM_LCGlexiconLatexOf.ml |
75 | 62 | $(OCAMLOPT) $(OCAMLOPTFLAGS) -c $< |
76 | 63 | |
77 | 64 | clean: |
78 | - rm -f *~ *.cm[aoix] *.o *.so *.cmxa *.a test test2 parser print_lexicon | |
65 | + rm -f *~ *.cm[aoix] *.o *.so *.cmxa *.a test print_lexicon | |
... | ... |
LCGlexicon/semparser.ml deleted
1 | -open Xstd | |
2 | -open ENIAMsubsyntaxTypes | |
3 | - | |
4 | -let rules = ENIAM_LCGlexicon.make_rules false ENIAM_LCGlexiconTypes.user_lexicon_filename | |
5 | - | |
6 | -let load_senses_map filename = | |
7 | - File.fold_tab filename StringMap.empty (fun map -> function | |
8 | - [lemma;cat] -> StringMap.add_inc map lemma [cat] (fun l -> cat :: l) | |
9 | - | l -> failwith ("load_senses_map: " ^ String.concat "\t" l)) | |
10 | - | |
11 | -let senses_map = load_senses_map ENIAM_LCGlexiconTypes.user_senses_filename | |
12 | - | |
13 | - | |
14 | -let examples = [ | |
15 | - (* "liceum","W 1984-89 uczęszczał do VII Liceum Ogólnokształcącego im. K.K. Baczyńskiego w Szczecinie."; *) | |
16 | - "studia","Następnie studiował architekturę na Politechnice Szczecińskiej, dyplom uzyskał w 1994."; | |
17 | -] | |
18 | - | |
19 | -let clarify_categories token = | |
20 | - match token.ENIAMtokenizerTypes.token with | |
21 | - ENIAMtokenizerTypes.Lemma(lemma,pos,interp) -> | |
22 | - let senses = try StringMap.find senses_map lemma with Not_found -> ["X"] in | |
23 | - List.flatten (Xlist.map interp (fun interp -> ENIAMcategoriesPL.clarify_categories false senses (lemma,pos,interp))) | |
24 | - | ENIAMtokenizerTypes.Proper(lemma,pos,interp,senses) -> List.flatten (Xlist.map interp (fun interp -> ENIAMcategoriesPL.clarify_categories true senses (lemma,pos,interp))) | |
25 | - | ENIAMtokenizerTypes.Interp lemma -> ENIAMcategoriesPL.clarify_categories false ["X"] (lemma,"interp",[]) | |
26 | - | _ -> [] | |
27 | - | |
28 | -let create_chart tokens paths last = | |
29 | - ENIAM_LCGrenderer.reset_variable_numbers (); | |
30 | - let chart = ENIAM_LCGchart.make last in | |
31 | - let chart = Xlist.fold paths chart (fun chart (id,lnode,rnode) -> | |
32 | - let t = ExtArray.get tokens id in | |
33 | - ENIAM_LCGrenderer.reset_variable_names (); | |
34 | - ENIAM_LCGrenderer.add_variable_numbers (); | |
35 | - let cats = clarify_categories t in | |
36 | - let l = ENIAM_LCGlexicon.create_entries rules id t.ENIAMtokenizerTypes.orth cats [] in | |
37 | - ENIAM_LCGchart.add_inc_list chart lnode rnode l 0) in | |
38 | - chart | |
39 | - | |
40 | -let test_example name tokens paths last = | |
41 | - ENIAM_LCGreductions.reset_variant_label (); | |
42 | - let chart = create_chart tokens paths last in | |
43 | - ENIAM_LCGlatexOf.print_chart "results/" (name^"1_chart") "a1" chart; | |
44 | - let chart,references = ENIAM_LCGchart.lazify chart in | |
45 | - ENIAM_LCGlatexOf.print_chart "results/" (name^"2_chart") "a4" chart; | |
46 | - ENIAM_LCGlatexOf.print_references "results/" (name^"2_references") "a4" references; | |
47 | - let chart = ENIAM_LCGchart.parse chart references 30. Sys.time in (* uwaga: niejawna zmiana imperatywna w references *) | |
48 | - ENIAM_LCGlatexOf.print_chart "results/" (name^"3_chart") "a4" chart; | |
49 | - ENIAM_LCGlatexOf.print_references "results/" (name^"3_references") "a4" references; | |
50 | - if ENIAM_LCGchart.is_parsed chart then ( | |
51 | - let term = ENIAM_LCGchart.get_parsed_term chart in | |
52 | - Xlatex.latex_file_out "results/" (name^"4_term") "a4" false (fun file -> | |
53 | - Printf.fprintf file "\\[%s\\]\n" (ENIAM_LCGlatexOf.linear_term 0 term)); | |
54 | - Xlatex.latex_compile_and_clean "results/" (name^"4_term"); | |
55 | - let dependency_tree = ENIAM_LCGreductions.reduce term references in | |
56 | - ENIAM_LCGlatexOf.print_dependency_tree "results/" (name^"4_dependency_tree") "a0" dependency_tree; | |
57 | - if ENIAM_LCGreductions.is_reduced_dependency_tree dependency_tree then ( | |
58 | - ENIAM_LCGreductions.assign_labels dependency_tree; (* uwaga: niejawna zmiana imperatywna w dependency_tree *) | |
59 | - ENIAM_LCGlatexOf.print_dependency_tree "results/" (name^"5_dependency_tree") "a4" dependency_tree; | |
60 | - ENIAM_LCGreductions.remove_cuts dependency_tree; (* uwaga: niejawna zmiana imperatywna w dependency_tree *) | |
61 | - ENIAM_LCGlatexOf.print_dependency_tree "results/" (name^"6_dependency_tree") "a4" dependency_tree; | |
62 | - ENIAM_LCGgraphOf.print_dependency_tree "results/" (name^"6_dependency_tree") dependency_tree; | |
63 | - ENIAM_LCGgraphOf.print_simplified_dependency_tree "results/" (name^"6_simple_dependency_tree") dependency_tree; | |
64 | - ()) | |
65 | - else print_endline "not reduced") | |
66 | - else print_endline "not parsed" | |
67 | - | |
68 | -let rec parse_sentence name id tokens = function | |
69 | - RawSentence s -> id | |
70 | - | StructSentence(paths,last) -> | |
71 | - test_example (name ^ string_of_int id ^ "_") tokens paths last; | |
72 | - id + 1 | |
73 | - | DepSentence(paths) -> id | |
74 | - | QuotedSentences sentences -> | |
75 | - Xlist.fold sentences id (fun id p -> | |
76 | - parse_sentence name id tokens p.sentence) | |
77 | - | AltSentence l -> | |
78 | - Xlist.fold l id (fun id (mode,sentence) -> | |
79 | - parse_sentence name id tokens sentence) | |
80 | - | |
81 | -let rec parse_paragraph name id tokens = function | |
82 | - RawParagraph s -> id | |
83 | - | StructParagraph sentences -> | |
84 | - Xlist.fold sentences id (fun id p -> | |
85 | - parse_sentence name id tokens p.sentence) | |
86 | - | AltParagraph l -> | |
87 | - Xlist.fold l id (fun id (mode,paragraph) -> | |
88 | - parse_paragraph name id tokens paragraph) | |
89 | - | |
90 | -let rec parse_text name id tokens = function | |
91 | - RawText s -> id | |
92 | - | StructText paragraphs -> | |
93 | - Xlist.fold paragraphs id (fun id paragraph -> | |
94 | - parse_paragraph name id tokens paragraph) | |
95 | - | AltText l -> | |
96 | - Xlist.fold l id (fun id (mode,text) -> | |
97 | - parse_text name id tokens text) | |
98 | - | |
99 | - | |
100 | -(* let _ = | |
101 | - Xlist.iter examples (fun (name,example) -> | |
102 | - let text,tokens = ENIAMsubsyntax.parse_text example in | |
103 | - ignore(parse_text name 1 tokens text)) *) | |
104 | - | |
105 | -(* | |
106 | -type entry = {title: string; info:string; biogram:string; (*primary:string; secondary:string;*) author:string} | |
107 | - | |
108 | -let process_xml = function | |
109 | - Xml.Element("entries",[],entries) -> | |
110 | - List.rev (Xlist.rev_map entries (function | |
111 | - Xml.Element("entry",[],[title;info;biogram(*;primary;secondary*);author]) -> | |
112 | - {title=Xml.to_string title; info=Xml.to_string info; biogram=Xml.to_string biogram; | |
113 | - (*primary=Xml.to_string primary; secondary=Xml.to_string secondary;*) author=Xml.to_string author} | |
114 | - | _ -> failwith "process_xml 1")) | |
115 | - | _ -> failwith "process_xml 2" | |
116 | - | |
117 | - | |
118 | -let load_ppibl filename = | |
119 | - let ppibl = File.load_file_gen ("data/" ^ filename) in | |
120 | - process_xml (Xml.parse_string ppibl) | |
121 | - | |
122 | -let named_entities = | |
123 | - File.fold_tab "data/ne.tab" StringMap.empty (fun map -> function | |
124 | - [lemma;cat] -> StringMap.add_inc map lemma [cat] (fun l -> cat :: l) | |
125 | - | _ -> failwith "named_entities") | |
126 | - | |
127 | -let assign_named_entities t = | |
128 | - match t.token with | |
129 | - Lemma(lemma,"subst",interp) -> | |
130 | - (try | |
131 | - let cat = StringMap.find named_entities lemma in | |
132 | - {t with token=Proper(lemma,"subst",interp,cat)} | |
133 | - with Not_found -> t) | |
134 | - | Proper(lemma,"subst",interp,_) -> | |
135 | - (try | |
136 | - let cat = StringMap.find named_entities lemma in | |
137 | - {t with token=Proper(lemma,"subst",interp,cat)} | |
138 | - with Not_found -> t) | |
139 | - | _ -> t | |
140 | - | |
141 | -let test_strings = [ | |
142 | - (* "Debiutował opowiadaniem pt. <i>Zlecenie na dostawę</i>."; *) | |
143 | - "W 1984-89 uczęszczał do VII Liceum Ogólnokształcącego im. K.K. Baczyńskiego w Szczecinie."; | |
144 | - (* "Następnie studiował architekturę na Politechnice Szczecińskiej, dyplom uzyskał w 1994." *) | |
145 | - (* "W 2003 obronił doktorat nauk technicznych w zakresie architektury i urbanistyki na Politechnice Krakowskiej i został adiunktem w Zakładzie Teorii Architektury, Historii i Konserwacji Zabytków IAiPP." *) | |
146 | -] | |
147 | - | |
148 | -(* let _ = | |
149 | - let entries = load_ppibl "ak322269.xml" in | |
150 | - Xlist.iter entries (fun entry -> print_endline entry.biogram) *) | |
151 | - | |
152 | -(* | |
153 | -let test_strings = [ | |
154 | - "Szpak frunie."; | |
155 | - "Kot np. miauczy."; | |
156 | - "Ala ma kota."; | |
157 | - "Ale mają kota:" | |
158 | - ] | |
159 | - | |
160 | -let test_strings2 = [ | |
161 | - "Szpak frunie. Kot miauczy."; | |
162 | - "Szpak powiedział: „Frunę. Kiszę.”"; | |
163 | - ] | |
164 | -*) | |
165 | - | |
166 | -let grammar = [ | |
167 | - "pos=year", Basic "year",symbol_weight; | |
168 | - "pos=year-interval", Basic "year-interval",symbol_weight; | |
169 | - "lemma=w,pos=prep,case=loc", Basic "time/(year+year-interval)",0.; | |
170 | - "lemma=w,pos=prep,case=loc", Basic "locat/np*MIASTO*T*loc*T",0.; | |
171 | - | |
172 | - "lemma=uczęszczać,pos=praet|fin,person=ter,negation=aff,mood=indicative", Basic "ip*number*gender{|(1+time),|(1+pp*ORGANIZACJA*do*gen),|(1+locat)}",0.; | |
173 | - "lemma=do,pos=prep,case=gen", Basic "pp*sense*lemma*case/np*sense*T*case*T",0.; | |
174 | - | |
175 | -] | |
176 | - | |
177 | -let _ = | |
178 | - print_endline "Testy wbudowane"; | |
179 | - Xlist.iter test_strings (fun s -> | |
180 | - print_endline ("\nTEST: " ^ s); | |
181 | - let paths = ENIAMsubsyntax.parse s in | |
182 | - let paths = Xlist.map paths assign_named_entities in | |
183 | - (* print_endline (ENIAMtokenizer.xml_of tokens); *) | |
184 | - print_endline (ENIAMpaths.to_string (paths,0))); | |
185 | -(* Xlist.iter test_strings2 (fun s -> | |
186 | - print_endline ("\nTEST: " ^ s); | |
187 | - let text,tokens = ENIAMsubsyntax.parse_text s in | |
188 | - (* print_endline (ENIAMtokenizer.xml_of tokens); *) | |
189 | - print_endline (ENIAMsubsyntaxStringOf.tokens tokens); | |
190 | - print_endline ""; | |
191 | - print_endline (ENIAMsubsyntaxStringOf.text "" tokens text));*) | |
192 | -(* print_endline "Testy użytkownika."; | |
193 | - print_endline "Wpisz tekst i naciśnij ENTER, pusty tekst kończy."; | |
194 | - let s = ref (read_line ()) in | |
195 | - while !s <> "" do | |
196 | - let tokens = ENIAMtokenizer.parse !s in | |
197 | - (* print_endline (ENIAMtokenizer.xml_of tokens); *) | |
198 | - Xlist.iter tokens (fun token -> print_endline (ENIAMtokenizer.string_of 0 token)); | |
199 | - print_endline "Wpisz tekst i naciśnij ENTER, pusty tekst kończy."; | |
200 | - s := read_line () | |
201 | - done;*) | |
202 | - () | |
203 | - | |
204 | -open ENIAM_LCGlexiconTypes | |
205 | -open ENIAM_LCGtypes | |
206 | - | |
207 | - | |
208 | -(* | |
209 | -type output = Text | Xml | Html | Marsh | Graphviz | |
210 | - | |
211 | -let output = ref Text | |
212 | -let comm_stdio = ref true | |
213 | -let sentence_split = ref true | |
214 | -let port = ref 0 | |
215 | - | |
216 | -let spec_list = [ | |
217 | - "-s", Arg.Unit (fun () -> sentence_split:=true), "Split input into sentences (default)"; | |
218 | - "-n", Arg.Unit (fun () -> sentence_split:=false), "Do not split input into sentences"; | |
219 | - "-i", Arg.Unit (fun () -> comm_stdio:=true), "Communication using stdio (default)"; | |
220 | - "-p", Arg.Int (fun p -> comm_stdio:=false; port:=p), "<port> Communication using sockets on given port number"; | |
221 | - "-t", Arg.Unit (fun () -> output:=Text), "Output as plain text (default)"; | |
222 | - "-x", Arg.Unit (fun () -> output:=Xml), "Output as XML"; | |
223 | - "-m", Arg.Unit (fun () -> output:=Marsh), "Output as marshalled Ocaml data structure"; | |
224 | - "-h", Arg.Unit (fun () -> output:=Html), "Output as HTML"; | |
225 | - "-g", Arg.Unit (fun () -> output:=Graphviz; sentence_split:=false), "Output as graphviz dot file; turns sentence split off"; | |
226 | - (* "-r", Arg.String (fun p -> | |
227 | - ENIAMtokenizerTypes.set_resource_path p; | |
228 | - ENIAMmorphologyTypes.set_resource_path p; | |
229 | - ENIAMsubsyntaxTypes.set_resource_path p), "<path> Set resource path"; *) | |
230 | - ] | |
231 | - | |
232 | -let usage_msg = | |
233 | - "Usage: subsyntax <options>\nInput is a sequence of lines. Empty line ends the sequence and invoke parsing. Double empty line shutdown parser.\nOptions are:" | |
234 | -*)*) | |
235 | -let message = "ENIAM_LCGparser, a parser for Logical Categorial Grammar formalism\n\ | |
236 | -Copyright (C) 2017 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>\n\ | |
237 | -Copyright (C) 2017 Institute of Computer Science Polish Academy of Sciences" | |
238 | -(* | |
239 | -let anon_fun s = raise (Arg.Bad ("invalid argument: " ^ s)) | |
240 | -*) | |
241 | -let input_text channel = | |
242 | - let s = ref (try input_line channel with End_of_file -> "") in | |
243 | - let lines = ref [] in | |
244 | - while !s <> "" do | |
245 | - lines := !s :: !lines; | |
246 | - s := try input_line channel with End_of_file -> "" | |
247 | - done; | |
248 | - String.concat "\n" (List.rev !lines) | |
249 | - | |
250 | -let rec main_loop sub_in sub_out in_chan out_chan = | |
251 | - let text = input_text in_chan in | |
252 | - if text = "" then () else ( | |
253 | - Printf.fprintf sub_out "%s\n\n%!" text; | |
254 | - let text,tokens = (Marshal.from_channel sub_in : ENIAMsubsyntaxTypes.text * ENIAMtokenizerTypes.token_env ExtArray.t) in | |
255 | - (* let text,tokens = ENIAMsubsyntax.parse_text text in *) | |
256 | - ignore(parse_text "E"(*name*) 1 tokens text) | |
257 | - (* print_endline "input text begin"; | |
258 | - print_endline text; | |
259 | - print_endline "input text end"; *) | |
260 | - (*if !sentence_split then | |
261 | - let text,tokens = ENIAMsubsyntax.parse_text text in | |
262 | - (match !output with | |
263 | - Text -> output_string out_chan (ENIAMsubsyntaxStringOf.text "" tokens text ^ "\n" ^ ENIAMsubsyntaxStringOf.token_extarray tokens ^ "\n\n") | |
264 | - | Xml -> output_string out_chan (Xml.to_string (ENIAMsubsyntaxXMLof.text_and_tokens text tokens) ^ "\n\n") | |
265 | - | Html -> output_string out_chan (ENIAMsubsyntaxHTMLof.text_and_tokens text tokens ^ "\n\n") | |
266 | - | Marsh -> Marshal.to_channel out_chan (text,tokens) [] | |
267 | - | Graphviz -> failwith "main_loop: ni") | |
268 | - else | |
269 | - let tokens = ENIAMsubsyntax.parse text in | |
270 | - (match !output with | |
271 | - Text -> output_string out_chan (ENIAMsubsyntaxStringOf.token_list tokens ^ "\n\n") | |
272 | - | Xml -> output_string out_chan (Xml.to_string (ENIAMsubsyntaxXMLof.token_list tokens) ^ "\n\n") | |
273 | - | Html -> output_string out_chan (ENIAMsubsyntaxHTMLof.token_list tokens ^ "\n\n") | |
274 | - | Marsh -> Marshal.to_channel out_chan tokens [] | |
275 | - | Graphviz -> output_string out_chan (ENIAMsubsyntaxGraphOf.token_list tokens ^ "\n\n"))*); | |
276 | - flush out_chan; | |
277 | - main_loop sub_in sub_out in_chan out_chan) | |
278 | - | |
279 | -let get_sock_addr host_name port = | |
280 | - let he = Unix.gethostbyname host_name in | |
281 | - let addr = he.Unix.h_addr_list in | |
282 | - Unix.ADDR_INET(addr.(0),port) | |
283 | - | |
284 | -let sub_host = "localhost" | |
285 | -let sub_port = 5739 | |
286 | - | |
287 | -let _ = | |
288 | - prerr_endline message; | |
289 | - (* ENIAMsubsyntax.initialize (); *) | |
290 | - ENIAMcategoriesPL.initialize (); | |
291 | - (* Arg.parse spec_list anon_fun usage_msg; *) | |
292 | - Gc.compact (); | |
293 | - let sub_in,sub_out = Unix.open_connection (get_sock_addr sub_host sub_port) in | |
294 | - prerr_endline "Ready!"; | |
295 | - (*if !comm_stdio then*) main_loop sub_in sub_out stdin stdout | |
296 | - (*else | |
297 | - let sockaddr = Unix.ADDR_INET(Unix.inet_addr_any,!port) in | |
298 | - Unix.establish_server main_loop sockaddr*) |
LCGlexicon/test2.ml deleted
1 | -(* | |
2 | - * ENIAM_LCGlexicon is a library that provides LCG lexicon for Polish | |
3 | - * Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl> | |
4 | - * Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences | |
5 | - * | |
6 | - * This library is free software: you can redistribute it and/or modify | |
7 | - * it under the terms of the GNU Lesser General Public License as published by | |
8 | - * the Free Software Foundation, either version 3 of the License, or | |
9 | - * (at your option) any later version. | |
10 | - * | |
11 | - * This library is distributed in the hope that it will be useful, | |
12 | - * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | - * GNU Lesser General Public License for more details. | |
15 | - * | |
16 | - * You should have received a copy of the GNU Lesser General Public License | |
17 | - * along with this program. If not, see <http://www.gnu.org/licenses/>. | |
18 | - *) | |
19 | - | |
20 | -open ENIAM_LCGlexiconTypes | |
21 | -open ENIAM_LCGtypes | |
22 | -open ENIAMsubsyntaxTypes | |
23 | - | |
24 | -let rules = ENIAM_LCGlexicon.make_rules false ENIAM_LCGlexiconTypes.rules_filename | |
25 | - | |
26 | -let examples = [ | |
27 | - (* "Szpak","Szpak śpiewa.";*) | |
28 | - (* "miał","Miałem miał."; *) | |
29 | -(* "Ala","Ala ma kota."; | |
30 | - "Ale","Ale mają kota:"; *) | |
31 | - (* "zima","Szpak frunie zimą.";*) | |
32 | - (* "październik","Kot miauczy w październiku."; *) | |
33 | -(* "Szpak-Kot","Szpak frunie. Kot miauczy."; | |
34 | - "powiedział","Szpak powiedział: „Frunę. Kiszę.”";*) | |
35 | - "teraz","Teraz frunie jakiś szpak."; | |
36 | - "chłopcy","Chłopcy mają ulicę kwiatami."; | |
37 | - (* "arabia","Arabia Saudyjska biegnie.";*) | |
38 | -(* "Tom","Tom idzie."; *) | |
39 | -] | |
40 | - | |
41 | -let clarify_categories senses token = | |
42 | - match token.ENIAMtokenizerTypes.token with | |
43 | - ENIAMtokenizerTypes.Lemma(lemma,pos,interp) -> List.flatten (Xlist.map interp (fun interp -> ENIAMcategoriesPL.clarify_categories false senses (lemma,pos,interp))) | |
44 | - | ENIAMtokenizerTypes.Proper(lemma,pos,interp,_) -> List.flatten (Xlist.map interp (fun interp -> ENIAMcategoriesPL.clarify_categories true senses (lemma,pos,interp))) | |
45 | - | ENIAMtokenizerTypes.Interp lemma -> ENIAMcategoriesPL.clarify_categories false senses (lemma,"interp",[]) | |
46 | - | _ -> [] | |
47 | - | |
48 | -let create_chart tokens lex_sems paths last = | |
49 | - ENIAM_LCGrenderer.reset_variable_numbers (); | |
50 | - let chart = ENIAM_LCGchart.make last in | |
51 | - let chart = Xlist.fold paths chart (fun chart (id,lnode,rnode) -> | |
52 | - let t = ExtArray.get tokens id in | |
53 | - let s = ExtArray.get lex_sems id in | |
54 | - ENIAM_LCGrenderer.reset_variable_names (); | |
55 | - ENIAM_LCGrenderer.add_variable_numbers (); | |
56 | - let cats = clarify_categories ["X"] t in | |
57 | - let l = ENIAM_LCGlexicon.create_entries rules id t.ENIAMtokenizerTypes.orth cats s.ENIAMlexSemanticsTypes.schemata in | |
58 | - ENIAM_LCGchart.add_inc_list chart lnode rnode l 0) in | |
59 | - chart | |
60 | - | |
61 | -let test_example name tokens lex_sems paths last = | |
62 | - ENIAM_LCGreductions.reset_variant_label (); | |
63 | - let chart = create_chart tokens lex_sems paths last in | |
64 | - ENIAM_LCGlatexOf.print_chart "results/" (name^"1_chart") "a1" chart; | |
65 | - let chart,references = ENIAM_LCGchart.lazify chart in | |
66 | - ENIAM_LCGlatexOf.print_chart "results/" (name^"2_chart") "a4" chart; | |
67 | - ENIAM_LCGlatexOf.print_references "results/" (name^"2_references") "a4" references; | |
68 | - let chart = ENIAM_LCGchart.parse chart references 30. Sys.time in (* uwaga: niejawna zmiana imperatywna w references *) | |
69 | - ENIAM_LCGlatexOf.print_chart "results/" (name^"3_chart") "a4" chart; | |
70 | - ENIAM_LCGlatexOf.print_references "results/" (name^"3_references") "a4" references; | |
71 | - if ENIAM_LCGchart.is_parsed chart then ( | |
72 | - let term = ENIAM_LCGchart.get_parsed_term chart in | |
73 | - Xlatex.latex_file_out "results/" (name^"4_term") "a4" false (fun file -> | |
74 | - Printf.fprintf file "\\[%s\\]\n" (ENIAM_LCGlatexOf.linear_term 0 term)); | |
75 | - Xlatex.latex_compile_and_clean "results/" (name^"4_term"); | |
76 | - let dependency_tree = ENIAM_LCGreductions.reduce term references in | |
77 | - ENIAM_LCGlatexOf.print_dependency_tree "results/" (name^"4_dependency_tree") "a0" dependency_tree; | |
78 | - if ENIAM_LCGreductions.is_reduced_dependency_tree dependency_tree then ( | |
79 | - ENIAM_LCGreductions.assign_labels dependency_tree; (* uwaga: niejawna zmiana imperatywna w dependency_tree *) | |
80 | - ENIAM_LCGlatexOf.print_dependency_tree "results/" (name^"5_dependency_tree") "a4" dependency_tree; | |
81 | - ENIAM_LCGreductions.remove_cuts dependency_tree; (* uwaga: niejawna zmiana imperatywna w dependency_tree *) | |
82 | - ENIAM_LCGlatexOf.print_dependency_tree "results/" (name^"6_dependency_tree") "a4" dependency_tree; | |
83 | - ENIAM_LCGgraphOf.print_dependency_tree "results/" (name^"6_dependency_tree") dependency_tree; | |
84 | - ENIAM_LCGgraphOf.print_simplified_dependency_tree "results/" (name^"6_simple_dependency_tree") dependency_tree; | |
85 | - ()) | |
86 | - else print_endline "not reduced") | |
87 | - else print_endline "not parsed" | |
88 | - | |
89 | -let rec parse_sentence name id tokens lex_sems = function | |
90 | - RawSentence s -> id | |
91 | - | StructSentence(paths,last) -> | |
92 | - test_example (name ^ string_of_int id ^ "_") tokens lex_sems paths last; | |
93 | - id + 1 | |
94 | - | DepSentence(paths) -> id | |
95 | - | QuotedSentences sentences -> | |
96 | - Xlist.fold sentences id (fun id p -> | |
97 | - parse_sentence name id tokens lex_sems p.sentence) | |
98 | - | AltSentence l -> | |
99 | - Xlist.fold l id (fun id (mode,sentence) -> | |
100 | - parse_sentence name id tokens lex_sems sentence) | |
101 | - | |
102 | -let rec parse_paragraph name id tokens lex_sems = function | |
103 | - RawParagraph s -> id | |
104 | - | StructParagraph sentences -> | |
105 | - Xlist.fold sentences id (fun id p -> | |
106 | - parse_sentence name id tokens lex_sems p.sentence) | |
107 | - | AltParagraph l -> | |
108 | - Xlist.fold l id (fun id (mode,paragraph) -> | |
109 | - parse_paragraph name id tokens lex_sems paragraph) | |
110 | - | |
111 | -let rec parse_text name id tokens lex_sems = function | |
112 | - RawText s -> id | |
113 | - | StructText paragraphs -> | |
114 | - Xlist.fold paragraphs id (fun id paragraph -> | |
115 | - parse_paragraph name id tokens lex_sems paragraph) | |
116 | - | AltText l -> | |
117 | - Xlist.fold l id (fun id (mode,text) -> | |
118 | - parse_text name id tokens lex_sems text) | |
119 | - | |
120 | - | |
121 | -let _ = | |
122 | - ENIAMsubsyntax.initialize (); | |
123 | - ENIAMcategoriesPL.initialize (); | |
124 | - Xlist.iter examples (fun (name,example) -> | |
125 | - let text,tokens = ENIAMsubsyntax.parse_text example in | |
126 | - let lex_sems = ENIAMlexSemantics.assign tokens text in | |
127 | - ignore(parse_text name 1 tokens lex_sems text)) |
parser/exec.ml renamed to exec/ENIAMexec.ml
1 | 1 | (* |
2 | - * ENIAM: Categorial Syntactic-Semantic Parser for Polish | |
3 | - * Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl> | |
4 | - * Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences | |
2 | + * ENIAMexec implements ENIAM processing stream | |
3 | + * Copyright (C) 2016-2017 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl> | |
4 | + * Copyright (C) 2016-2017 Institute of Computer Science Polish Academy of Sciences | |
5 | 5 | * |
6 | - * This program is free software: you can redistribute it and/or modify | |
7 | - * it under the terms of the GNU General Public License as published by | |
6 | + * This library is free software: you can redistribute it and/or modify | |
7 | + * it under the terms of the GNU Lesser General Public License as published by | |
8 | 8 | * the Free Software Foundation, either version 3 of the License, or |
9 | 9 | * (at your option) any later version. |
10 | 10 | * |
11 | - * This program is distributed in the hope that it will be useful, | |
11 | + * This library is distributed in the hope that it will be useful, | |
12 | 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
14 | - * GNU General Public License for more details. | |
14 | + * GNU Lesser General Public License for more details. | |
15 | 15 | * |
16 | - * You should have received a copy of the GNU General Public License | |
16 | + * You should have received a copy of the GNU Lesser General Public License | |
17 | 17 | * along with this program. If not, see <http://www.gnu.org/licenses/>. |
18 | 18 | *) |
19 | 19 | |
20 | -open LCGtypes | |
21 | -open ExecTypes | |
20 | +(* open LCGtypes *) | |
21 | +open ENIAMexecTypes | |
22 | +open Xstd | |
23 | + | |
24 | +let clarify_categories senses_map token = | |
25 | + match token.ENIAMtokenizerTypes.token with | |
26 | + ENIAMtokenizerTypes.Lemma(lemma,pos,interp) -> | |
27 | + let senses = try StringMap.find senses_map lemma with Not_found -> ["X"] in | |
28 | + List.flatten (Xlist.map interp (fun interp -> ENIAMcategoriesPL.clarify_categories false senses (lemma,pos,interp))) | |
29 | + | ENIAMtokenizerTypes.Proper(lemma,pos,interp,senses2) -> | |
30 | + let senses = try StringMap.find senses_map lemma with Not_found -> senses2 in | |
31 | + List.flatten (Xlist.map interp (fun interp -> ENIAMcategoriesPL.clarify_categories true senses (lemma,pos,interp))) | |
32 | + | ENIAMtokenizerTypes.Interp lemma -> | |
33 | + let senses = try StringMap.find senses_map lemma with Not_found -> ["X"] in | |
34 | + ENIAMcategoriesPL.clarify_categories false senses (lemma,"interp",[]) | |
35 | + | _ -> [] | |
36 | + | |
37 | +let create_chart rules senses_map tokens lex_sems paths last = | |
38 | + ENIAM_LCGrenderer.reset_variable_numbers (); | |
39 | + let chart = ENIAM_LCGchart.make last in | |
40 | + let chart = Xlist.fold paths chart (fun chart (id,lnode,rnode) -> | |
41 | + let t = ExtArray.get tokens id in | |
42 | + let s = ExtArray.get lex_sems id in | |
43 | + ENIAM_LCGrenderer.reset_variable_names (); | |
44 | + ENIAM_LCGrenderer.add_variable_numbers (); | |
45 | + let cats = clarify_categories senses_map t in | |
46 | + let l = ENIAM_LCGlexicon.create_entries rules id t.ENIAMtokenizerTypes.orth cats s.ENIAMlexSemanticsTypes.schemata in | |
47 | + ENIAM_LCGchart.add_inc_list chart lnode rnode l 0) in | |
48 | + chart | |
49 | + | |
50 | +let test_example rules senses_map name tokens lex_sems paths last = | |
51 | + ENIAM_LCGreductions.reset_variant_label (); | |
52 | + let chart = create_chart rules senses_map tokens lex_sems paths last in | |
53 | + ENIAM_LCGlatexOf.print_chart "results/" (name^"1_chart") "a1" chart; | |
54 | + let chart,references = ENIAM_LCGchart.lazify chart in | |
55 | + ENIAM_LCGlatexOf.print_chart "results/" (name^"2_chart") "a4" chart; | |
56 | + ENIAM_LCGlatexOf.print_references "results/" (name^"2_references") "a4" references; | |
57 | + let chart = ENIAM_LCGchart.parse chart references 30. Sys.time in (* uwaga: niejawna zmiana imperatywna w references *) | |
58 | + ENIAM_LCGlatexOf.print_chart "results/" (name^"3_chart") "a4" chart; | |
59 | + ENIAM_LCGlatexOf.print_references "results/" (name^"3_references") "a4" references; | |
60 | + if ENIAM_LCGchart.is_parsed chart then ( | |
61 | + let term = ENIAM_LCGchart.get_parsed_term chart in | |
62 | + Xlatex.latex_file_out "results/" (name^"4_term") "a4" false (fun file -> | |
63 | + Printf.fprintf file "\\[%s\\]\n" (ENIAM_LCGlatexOf.linear_term 0 term)); | |
64 | + Xlatex.latex_compile_and_clean "results/" (name^"4_term"); | |
65 | + let dependency_tree = ENIAM_LCGreductions.reduce term references in | |
66 | + ENIAM_LCGlatexOf.print_dependency_tree "results/" (name^"4_dependency_tree") "a0" dependency_tree; | |
67 | + if ENIAM_LCGreductions.is_reduced_dependency_tree dependency_tree then ( | |
68 | + ENIAM_LCGreductions.assign_labels dependency_tree; (* uwaga: niejawna zmiana imperatywna w dependency_tree *) | |
69 | + ENIAM_LCGlatexOf.print_dependency_tree "results/" (name^"5_dependency_tree") "a4" dependency_tree; | |
70 | + ENIAM_LCGreductions.remove_cuts dependency_tree; (* uwaga: niejawna zmiana imperatywna w dependency_tree *) | |
71 | + ENIAM_LCGlatexOf.print_dependency_tree "results/" (name^"6_dependency_tree") "a4" dependency_tree; | |
72 | + ENIAM_LCGgraphOf.print_dependency_tree "results/" (name^"6_dependency_tree") dependency_tree; | |
73 | + ENIAM_LCGgraphOf.print_simplified_dependency_tree "results/" (name^"6_simple_dependency_tree") dependency_tree; | |
74 | + ()) | |
75 | + else print_endline "not reduced") | |
76 | + else print_endline "not parsed" | |
77 | + | |
78 | +let parse rules senses_map name id tokens lex_sems = | |
79 | + ENIAMsubsyntaxTypes.fold_text ENIAMsubsyntaxTypes.Struct id (fun mode id -> function | |
80 | + ENIAMsubsyntaxTypes.RawSentence s -> id | |
81 | + | ENIAMsubsyntaxTypes.StructSentence(paths,last) -> | |
82 | + test_example rules senses_map (name ^ string_of_int id ^ "_") tokens lex_sems paths last; | |
83 | + id + 1 | |
84 | + | ENIAMsubsyntaxTypes.DepSentence(paths) -> id | |
85 | + | _ -> failwith "parse") | |
22 | 86 | |
87 | +(* | |
23 | 88 | let empty_result = { |
24 | 89 | input_text=RawText ""; |
25 | 90 | pre_text=RawText ""; |
... | ... | @@ -676,3 +741,4 @@ let process_file_id filename output_filename timeout = |
676 | 741 | Printf.fprintf oc "\n%!"; |
677 | 742 | let _ = Unix.shutdown_connection ic in |
678 | 743 | ()*) |
744 | +*) | |
... | ... |
parser/execTypes.ml renamed to exec/ENIAMexecTypes.ml
1 | 1 | (* |
2 | - * ENIAM: Categorial Syntactic-Semantic Parser for Polish | |
3 | - * Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl> | |
4 | - * Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences | |
2 | + * ENIAMexec implements ENIAM processing stream | |
3 | + * Copyright (C) 2016-2017 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl> | |
4 | + * Copyright (C) 2016-2017 Institute of Computer Science Polish Academy of Sciences | |
5 | 5 | * |
6 | - * This program is free software: you can redistribute it and/or modify | |
7 | - * it under the terms of the GNU General Public License as published by | |
6 | + * This library is free software: you can redistribute it and/or modify | |
7 | + * it under the terms of the GNU Lesser General Public License as published by | |
8 | 8 | * the Free Software Foundation, either version 3 of the License, or |
9 | 9 | * (at your option) any later version. |
10 | 10 | * |
11 | - * This program is distributed in the hope that it will be useful, | |
11 | + * This library is distributed in the hope that it will be useful, | |
12 | 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
14 | - * GNU General Public License for more details. | |
14 | + * GNU Lesser General Public License for more details. | |
15 | 15 | * |
16 | - * You should have received a copy of the GNU General Public License | |
16 | + * You should have received a copy of the GNU Lesser General Public License | |
17 | 17 | * along with this program. If not, see <http://www.gnu.org/licenses/>. |
18 | 18 | *) |
19 | - | |
19 | +(* | |
20 | 20 | type status = Idle | PreprocessingError | LexiconError | ParseError | ParseTimeout | Parsed | TooManyNodes | NotParsed | NotReduced | ReductionError | SemError | NotTranslated |
21 | 21 | |
22 | 22 | type eniam_parse_result = { |
... | ... | @@ -145,3 +145,4 @@ type message_to_overseer = |
145 | 145 | |
146 | 146 | let time_fun = Unix.gettimeofday |
147 | 147 | (* let time_fun = Sys.time () *) |
148 | +*) | |
... | ... |
exec/README
0 → 100644
1 | +ENIAMexec Version 1.0 : | |
2 | +----------------------- | |
3 | + | |
4 | +ENIAMexec implements ENIAM processing stream. | |
5 | + | |
6 | +Install | |
7 | +------- | |
8 | + | |
9 | +ENIAMexec requires OCaml version 4.02.3 compiler | |
10 | +together with Xlib library version 3.2 or later | |
11 | +and ENIAM_LCGparser library version 1.0. | |
12 | + | |
13 | +In order to install type: | |
14 | + | |
15 | +make install | |
16 | + | |
17 | +By default, ENIAMexec is installed in the 'ocamlc -where'/eniam directory. | |
18 | +You can change it by editing the makefile. | |
19 | + | |
20 | +In order to test library type: | |
21 | +make test | |
22 | +./test | |
23 | + | |
24 | +In order to print lexicon as pdf file type: | |
25 | +make print_lexicon | |
26 | +./print_lexicon | |
27 | + | |
28 | +Both test and print_lexicon require pdflatex installed. | |
29 | + | |
30 | +By default ENIAM_LCGlexicon looks for resources in /usr/share/eniam directory. | |
31 | +However this behaviour may be changed by setting and exporting ENIAM_RESOURCE_PATH | |
32 | +environment variable. | |
33 | + | |
34 | +Credits | |
35 | +------- | |
36 | +Copyright © 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl> | |
37 | +Copyright © 2016 Institute of Computer Science Polish Academy of Sciences | |
38 | + | |
39 | +Licence | |
40 | +------- | |
41 | + | |
42 | +This library is free software: you can redistribute it and/or modify | |
43 | +it under the terms of the GNU Lesser General Public License as published by | |
44 | +the Free Software Foundation, either version 3 of the License, or | |
45 | +(at your option) any later version. | |
46 | + | |
47 | +This library is distributed in the hope that it will be useful, | |
48 | +but WITHOUT ANY WARRANTY; without even the implied warranty of | |
49 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
50 | +GNU Lesser General Public License for more details. | |
51 | + | |
52 | +You should have received a copy of the GNU Lesser General Public License | |
53 | +along with this program. If not, see <http://www.gnu.org/licenses/>. | |
... | ... |
exec/TODO
0 → 100644
exec/lgpl-3.0.txt
0 → 100644
1 | + GNU LESSER GENERAL PUBLIC LICENSE | |
2 | + Version 3, 29 June 2007 | |
3 | + | |
4 | + Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/> | |
5 | + Everyone is permitted to copy and distribute verbatim copies | |
6 | + of this license document, but changing it is not allowed. | |
7 | + | |
8 | + | |
9 | + This version of the GNU Lesser General Public License incorporates | |
10 | +the terms and conditions of version 3 of the GNU General Public | |
11 | +License, supplemented by the additional permissions listed below. | |
12 | + | |
13 | + 0. Additional Definitions. | |
14 | + | |
15 | + As used herein, "this License" refers to version 3 of the GNU Lesser | |
16 | +General Public License, and the "GNU GPL" refers to version 3 of the GNU | |
17 | +General Public License. | |
18 | + | |
19 | + "The Library" refers to a covered work governed by this License, | |
20 | +other than an Application or a Combined Work as defined below. | |
21 | + | |
22 | + An "Application" is any work that makes use of an interface provided | |
23 | +by the Library, but which is not otherwise based on the Library. | |
24 | +Defining a subclass of a class defined by the Library is deemed a mode | |
25 | +of using an interface provided by the Library. | |
26 | + | |
27 | + A "Combined Work" is a work produced by combining or linking an | |
28 | +Application with the Library. The particular version of the Library | |
29 | +with which the Combined Work was made is also called the "Linked | |
30 | +Version". | |
31 | + | |
32 | + The "Minimal Corresponding Source" for a Combined Work means the | |
33 | +Corresponding Source for the Combined Work, excluding any source code | |
34 | +for portions of the Combined Work that, considered in isolation, are | |
35 | +based on the Application, and not on the Linked Version. | |
36 | + | |
37 | + The "Corresponding Application Code" for a Combined Work means the | |
38 | +object code and/or source code for the Application, including any data | |
39 | +and utility programs needed for reproducing the Combined Work from the | |
40 | +Application, but excluding the System Libraries of the Combined Work. | |
41 | + | |
42 | + 1. Exception to Section 3 of the GNU GPL. | |
43 | + | |
44 | + You may convey a covered work under sections 3 and 4 of this License | |
45 | +without being bound by section 3 of the GNU GPL. | |
46 | + | |
47 | + 2. Conveying Modified Versions. | |
48 | + | |
49 | + If you modify a copy of the Library, and, in your modifications, a | |
50 | +facility refers to a function or data to be supplied by an Application | |
51 | +that uses the facility (other than as an argument passed when the | |
52 | +facility is invoked), then you may convey a copy of the modified | |
53 | +version: | |
54 | + | |
55 | + a) under this License, provided that you make a good faith effort to | |
56 | + ensure that, in the event an Application does not supply the | |
57 | + function or data, the facility still operates, and performs | |
58 | + whatever part of its purpose remains meaningful, or | |
59 | + | |
60 | + b) under the GNU GPL, with none of the additional permissions of | |
61 | + this License applicable to that copy. | |
62 | + | |
63 | + 3. Object Code Incorporating Material from Library Header Files. | |
64 | + | |
65 | + The object code form of an Application may incorporate material from | |
66 | +a header file that is part of the Library. You may convey such object | |
67 | +code under terms of your choice, provided that, if the incorporated | |
68 | +material is not limited to numerical parameters, data structure | |
69 | +layouts and accessors, or small macros, inline functions and templates | |
70 | +(ten or fewer lines in length), you do both of the following: | |
71 | + | |
72 | + a) Give prominent notice with each copy of the object code that the | |
73 | + Library is used in it and that the Library and its use are | |
74 | + covered by this License. | |
75 | + | |
76 | + b) Accompany the object code with a copy of the GNU GPL and this license | |
77 | + document. | |
78 | + | |
79 | + 4. Combined Works. | |
80 | + | |
81 | + You may convey a Combined Work under terms of your choice that, | |
82 | +taken together, effectively do not restrict modification of the | |
83 | +portions of the Library contained in the Combined Work and reverse | |
84 | +engineering for debugging such modifications, if you also do each of | |
85 | +the following: | |
86 | + | |
87 | + a) Give prominent notice with each copy of the Combined Work that | |
88 | + the Library is used in it and that the Library and its use are | |
89 | + covered by this License. | |
90 | + | |
91 | + b) Accompany the Combined Work with a copy of the GNU GPL and this license | |
92 | + document. | |
93 | + | |
94 | + c) For a Combined Work that displays copyright notices during | |
95 | + execution, include the copyright notice for the Library among | |
96 | + these notices, as well as a reference directing the user to the | |
97 | + copies of the GNU GPL and this license document. | |
98 | + | |
99 | + d) Do one of the following: | |
100 | + | |
101 | + 0) Convey the Minimal Corresponding Source under the terms of this | |
102 | + License, and the Corresponding Application Code in a form | |
103 | + suitable for, and under terms that permit, the user to | |
104 | + recombine or relink the Application with a modified version of | |
105 | + the Linked Version to produce a modified Combined Work, in the | |
106 | + manner specified by section 6 of the GNU GPL for conveying | |
107 | + Corresponding Source. | |
108 | + | |
109 | + 1) Use a suitable shared library mechanism for linking with the | |
110 | + Library. A suitable mechanism is one that (a) uses at run time | |
111 | + a copy of the Library already present on the user's computer | |
112 | + system, and (b) will operate properly with a modified version | |
113 | + of the Library that is interface-compatible with the Linked | |
114 | + Version. | |
115 | + | |
116 | + e) Provide Installation Information, but only if you would otherwise | |
117 | + be required to provide such information under section 6 of the | |
118 | + GNU GPL, and only to the extent that such information is | |
119 | + necessary to install and execute a modified version of the | |
120 | + Combined Work produced by recombining or relinking the | |
121 | + Application with a modified version of the Linked Version. (If | |
122 | + you use option 4d0, the Installation Information must accompany | |
123 | + the Minimal Corresponding Source and Corresponding Application | |
124 | + Code. If you use option 4d1, you must provide the Installation | |
125 | + Information in the manner specified by section 6 of the GNU GPL | |
126 | + for conveying Corresponding Source.) | |
127 | + | |
128 | + 5. Combined Libraries. | |
129 | + | |
130 | + You may place library facilities that are a work based on the | |
131 | +Library side by side in a single library together with other library | |
132 | +facilities that are not Applications and are not covered by this | |
133 | +License, and convey such a combined library under terms of your | |
134 | +choice, if you do both of the following: | |
135 | + | |
136 | + a) Accompany the combined library with a copy of the same work based | |
137 | + on the Library, uncombined with any other library facilities, | |
138 | + conveyed under the terms of this License. | |
139 | + | |
140 | + b) Give prominent notice with the combined library that part of it | |
141 | + is a work based on the Library, and explaining where to find the | |
142 | + accompanying uncombined form of the same work. | |
143 | + | |
144 | + 6. Revised Versions of the GNU Lesser General Public License. | |
145 | + | |
146 | + The Free Software Foundation may publish revised and/or new versions | |
147 | +of the GNU Lesser General Public License from time to time. Such new | |
148 | +versions will be similar in spirit to the present version, but may | |
149 | +differ in detail to address new problems or concerns. | |
150 | + | |
151 | + Each version is given a distinguishing version number. If the | |
152 | +Library as you received it specifies that a certain numbered version | |
153 | +of the GNU Lesser General Public License "or any later version" | |
154 | +applies to it, you have the option of following the terms and | |
155 | +conditions either of that published version or of any later version | |
156 | +published by the Free Software Foundation. If the Library as you | |
157 | +received it does not specify a version number of the GNU Lesser | |
158 | +General Public License, you may choose any version of the GNU Lesser | |
159 | +General Public License ever published by the Free Software Foundation. | |
160 | + | |
161 | + If the Library as you received it specifies that a proxy can decide | |
162 | +whether future versions of the GNU Lesser General Public License shall | |
163 | +apply, that proxy's public statement of acceptance of any version is | |
164 | +permanent authorization for you to choose that version for the | |
165 | +Library. | |
... | ... |
exec/makefile
0 → 100755
1 | +OCAMLC=ocamlc | |
2 | +OCAMLOPT=ocamlopt | |
3 | +OCAMLDEP=ocamldep | |
4 | +INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I +eniam | |
5 | +OCAMLFLAGS=$(INCLUDES) -g | |
6 | +OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa \ | |
7 | + eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa \ | |
8 | + eniam-lcg-parser.cmxa eniam-lcg-lexicon.cmxa \ | |
9 | + eniam-lexSemantics.cmxa eniam-exec.cmxa | |
10 | +INSTALLDIR=`ocamlc -where`/eniam | |
11 | + | |
12 | +SOURCES= ENIAMexecTypes.ml ENIAMexec.ml | |
13 | + | |
14 | +all: eniam-exec.cma eniam-exec.cmxa | |
15 | + | |
16 | +install: all | |
17 | + mkdir -p $(INSTALLDIR) | |
18 | + cp eniam-exec.cmxa eniam-exec.a eniam-exec.cma $(INSTALLDIR) | |
19 | + cp ENIAMexecTypes.cmi ENIAMexec.cmi $(INSTALLDIR) | |
20 | + cp ENIAMexecTypes.cmx ENIAMexec.cmx $(INSTALLDIR) | |
21 | + | |
22 | + | |
23 | +eniam-exec.cma: $(SOURCES) | |
24 | + ocamlc -linkall -a -o eniam-exec.cma $(OCAMLFLAGS) $^ | |
25 | + | |
26 | +eniam-exec.cmxa: $(SOURCES) | |
27 | + ocamlopt -linkall -a -o eniam-exec.cmxa $(INCLUDES) $^ | |
28 | + | |
29 | +parser: parser.ml | |
30 | + $(OCAMLOPT) -o parser $(OCAMLOPTFLAGS) parser.ml | |
31 | + | |
32 | +semparser: semparser.ml | |
33 | + mkdir -p results | |
34 | + $(OCAMLOPT) -o semparser $(OCAMLOPTFLAGS) semparser.ml | |
35 | + | |
36 | +.SUFFIXES: .mll .mly .ml .mli .cmo .cmi .cmx | |
37 | + | |
38 | +.mll.ml: | |
39 | + ocamllex $< | |
40 | + | |
41 | +.mly.mli: | |
42 | + ocamlyacc $< | |
43 | + | |
44 | +.mly.ml: | |
45 | + ocamlyacc $< | |
46 | + | |
47 | +.ml.cmo: | |
48 | + $(OCAMLC) $(OCAMLFLAGS) -c $< | |
49 | + | |
50 | +.mli.cmi: | |
51 | + $(OCAMLC) $(OCAMLFLAGS) -c $< | |
52 | + | |
53 | +.ml.cmx: | |
54 | + $(OCAMLOPT) $(OCAMLOPTFLAGS) -c $< | |
55 | + | |
56 | +clean: | |
57 | + rm -f *~ *.cm[aoix] *.o *.so *.cmxa *.a parser semparser | |
... | ... |
exec/parser.ml
0 → 100644
1 | +(* | |
2 | + * ENIAMexec implements ENIAM processing stream | |
3 | + * Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl> | |
4 | + * Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences | |
5 | + * | |
6 | + * This library is free software: you can redistribute it and/or modify | |
7 | + * it under the terms of the GNU Lesser General Public License as published by | |
8 | + * the Free Software Foundation, either version 3 of the License, or | |
9 | + * (at your option) any later version. | |
10 | + * | |
11 | + * This library is distributed in the hope that it will be useful, | |
12 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | + * GNU Lesser General Public License for more details. | |
15 | + * | |
16 | + * You should have received a copy of the GNU Lesser General Public License | |
17 | + * along with this program. If not, see <http://www.gnu.org/licenses/>. | |
18 | + *) | |
19 | + | |
20 | +open ENIAM_LCGlexiconTypes | |
21 | +open ENIAM_LCGtypes | |
22 | +open ENIAMsubsyntaxTypes | |
23 | +open Xstd | |
24 | + | |
25 | +let rules = ENIAM_LCGlexicon.make_rules false ENIAM_LCGlexiconTypes.rules_filename | |
26 | + | |
27 | +let examples = [ | |
28 | + "Szpak","Szpak śpiewa."; | |
29 | + (* "miał","Miałem miał."; *) | |
30 | +(* "Ala","Ala ma kota."; | |
31 | + "Ale","Ale mają kota:"; *) | |
32 | + (* "zima","Szpak frunie zimą.";*) | |
33 | + (* "październik","Kot miauczy w październiku."; *) | |
34 | +(* "Szpak-Kot","Szpak frunie. Kot miauczy."; | |
35 | + "powiedział","Szpak powiedział: „Frunę. Kiszę.”";*) | |
36 | + (* "teraz","Teraz frunie jakiś szpak."; | |
37 | + "chłopcy","Chłopcy mają ulicę kwiatami."; *) | |
38 | + (* "arabia","Arabia Saudyjska biegnie.";*) | |
39 | +(* "Tom","Tom idzie."; *) | |
40 | + (* "liceum","W 1984-89 uczęszczał do VII Liceum Ogólnokształcącego im. K.K. Baczyńskiego w Szczecinie."; | |
41 | + "studia","Następnie studiował architekturę na Politechnice Szczecińskiej, dyplom uzyskał w 1994."; *) | |
42 | +] | |
43 | + | |
44 | + | |
45 | + | |
46 | + | |
47 | +let _ = | |
48 | + ENIAMsubsyntax.initialize (); | |
49 | + ENIAMcategoriesPL.initialize (); | |
50 | + ENIAMwalParser.initialize (); | |
51 | + ENIAMwalReduce.initialize (); | |
52 | + Xlist.iter examples (fun (name,example) -> | |
53 | + let text,tokens = ENIAMsubsyntax.parse_text example in | |
54 | + let lex_sems = ENIAMlexSemantics.assign tokens text in | |
55 | + ignore(ENIAMexec.parse rules StringMap.empty name 1 tokens lex_sems text)) | |
56 | + | |
57 | +(* | |
58 | +type output = Text | Xml | Html | Marsh | Graphviz | |
59 | + | |
60 | +let output = ref Text | |
61 | +let comm_stdio = ref true | |
62 | +let sentence_split = ref true | |
63 | +let port = ref 0 | |
64 | + | |
65 | +let spec_list = [ | |
66 | + "-s", Arg.Unit (fun () -> sentence_split:=true), "Split input into sentences (default)"; | |
67 | + "-n", Arg.Unit (fun () -> sentence_split:=false), "Do not split input into sentences"; | |
68 | + "-i", Arg.Unit (fun () -> comm_stdio:=true), "Communication using stdio (default)"; | |
69 | + "-p", Arg.Int (fun p -> comm_stdio:=false; port:=p), "<port> Communication using sockets on given port number"; | |
70 | + "-t", Arg.Unit (fun () -> output:=Text), "Output as plain text (default)"; | |
71 | + "-x", Arg.Unit (fun () -> output:=Xml), "Output as XML"; | |
72 | + "-m", Arg.Unit (fun () -> output:=Marsh), "Output as marshalled Ocaml data structure"; | |
73 | + "-h", Arg.Unit (fun () -> output:=Html), "Output as HTML"; | |
74 | + "-g", Arg.Unit (fun () -> output:=Graphviz; sentence_split:=false), "Output as graphviz dot file; turns sentence split off"; | |
75 | + (* "-r", Arg.String (fun p -> | |
76 | + ENIAMtokenizerTypes.set_resource_path p; | |
77 | + ENIAMmorphologyTypes.set_resource_path p; | |
78 | + ENIAMsubsyntaxTypes.set_resource_path p), "<path> Set resource path"; *) | |
79 | + ] | |
80 | + | |
81 | +let usage_msg = | |
82 | + "Usage: subsyntax <options>\nInput is a sequence of lines. Empty line ends the sequence and invoke parsing. Double empty line shutdown parser.\nOptions are:" | |
83 | + | |
84 | +let message = "ENIAMsubsyntax: MWE, abbreviation and sentence detecion for Polish\n\ | |
85 | +Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>\n\ | |
86 | +Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences" | |
87 | + | |
88 | +let anon_fun s = raise (Arg.Bad ("invalid argument: " ^ s)) | |
89 | + | |
90 | +let input_text channel = | |
91 | + let s = ref (try input_line channel with End_of_file -> "") in | |
92 | + let lines = ref [] in | |
93 | + while !s <> "" do | |
94 | + lines := !s :: !lines; | |
95 | + s := try input_line channel with End_of_file -> "" | |
96 | + done; | |
97 | + String.concat "\n" (List.rev !lines) | |
98 | + | |
99 | +let rec main_loop in_chan out_chan = | |
100 | + let text = input_text in_chan in | |
101 | + if text = "" then () else ( | |
102 | + (* print_endline "input text begin"; | |
103 | + print_endline text; | |
104 | + print_endline "input text end"; *) | |
105 | + (if !sentence_split then | |
106 | + let text,tokens = ENIAMsubsyntax.parse_text text in | |
107 | + (match !output with | |
108 | + Text -> output_string out_chan (ENIAMsubsyntaxStringOf.text "" tokens text ^ "\n" ^ ENIAMsubsyntaxStringOf.token_extarray tokens ^ "\n\n") | |
109 | + | Xml -> output_string out_chan (Xml.to_string (ENIAMsubsyntaxXMLof.text_and_tokens text tokens) ^ "\n\n") | |
110 | + | Html -> output_string out_chan (ENIAMsubsyntaxHTMLof.text_and_tokens text tokens ^ "\n\n") | |
111 | + | Marsh -> Marshal.to_channel out_chan (text,tokens) [] | |
112 | + | Graphviz -> failwith "main_loop: ni") | |
113 | + else | |
114 | + let tokens = ENIAMsubsyntax.parse text in | |
115 | + (match !output with | |
116 | + Text -> output_string out_chan (ENIAMsubsyntaxStringOf.token_list tokens ^ "\n\n") | |
117 | + | Xml -> output_string out_chan (Xml.to_string (ENIAMsubsyntaxXMLof.token_list tokens) ^ "\n\n") | |
118 | + | Html -> output_string out_chan (ENIAMsubsyntaxHTMLof.token_list tokens ^ "\n\n") | |
119 | + | Marsh -> Marshal.to_channel out_chan tokens [] | |
120 | + | Graphviz -> output_string out_chan (ENIAMsubsyntaxGraphOf.token_list tokens ^ "\n\n"))); | |
121 | + flush out_chan; | |
122 | + main_loop in_chan out_chan) | |
123 | + | |
124 | +let _ = | |
125 | + prerr_endline message; | |
126 | + Arg.parse spec_list anon_fun usage_msg; | |
127 | + Gc.compact (); | |
128 | + prerr_endline "Ready!"; | |
129 | + if !comm_stdio then main_loop stdin stdout | |
130 | + else | |
131 | + let sockaddr = Unix.ADDR_INET(Unix.inet_addr_any,!port) in | |
132 | + Unix.establish_server main_loop sockaddr | |
133 | +*) | |
... | ... |
exec/semparser.ml
0 → 100644
1 | +(* | |
2 | + * ENIAMexec implements ENIAM processing stream | |
3 | + * Copyright (C) 2016-2017 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl> | |
4 | + * Copyright (C) 2016-2017 Institute of Computer Science Polish Academy of Sciences | |
5 | + * | |
6 | + * This library is free software: you can redistribute it and/or modify | |
7 | + * it under the terms of the GNU Lesser General Public License as published by | |
8 | + * the Free Software Foundation, either version 3 of the License, or | |
9 | + * (at your option) any later version. | |
10 | + * | |
11 | + * This library is distributed in the hope that it will be useful, | |
12 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | + * GNU Lesser General Public License for more details. | |
15 | + * | |
16 | + * You should have received a copy of the GNU Lesser General Public License | |
17 | + * along with this program. If not, see <http://www.gnu.org/licenses/>. | |
18 | + *) | |
19 | + | |
20 | +open Xstd | |
21 | +open ENIAMsubsyntaxTypes | |
22 | + | |
(* LCG lexicon rules, built once at module initialisation from the user
   lexicon file. NOTE(review): the meaning of the [false] flag is defined by
   ENIAM_LCGlexicon.make_rules and is not visible here. *)
let rules =
  ENIAM_LCGlexicon.make_rules false ENIAM_LCGlexiconTypes.user_lexicon_filename

(** [load_senses_map filename] folds over the tab-separated rows of
    [filename] and returns a map from each lemma to the list of categories
    listed for it. Every row must have exactly two fields, lemma and category.
    @raise Failure with the offending row if a row has another shape. *)
let load_senses_map filename =
  let add_row map fields =
    match fields with
    | [lemma; cat] ->
      StringMap.add_inc map lemma [cat] (fun cats -> cat :: cats)
    | _ -> failwith ("load_senses_map: " ^ String.concat "\t" fields) in
  File.fold_tab filename StringMap.empty add_row

(* Sense map loaded once at module initialisation from the user senses file. *)
let senses_map =
  load_senses_map ENIAM_LCGlexiconTypes.user_senses_filename
31 | + | |
32 | + | |
(* Sample inputs as (name, text) pairs. One further example is kept disabled:
   "liceum","W 1984-89 uczęszczał do VII Liceum Ogólnokształcącego im. K.K. Baczyńskiego w Szczecinie." *)
let examples =
  [ ("studia",
     "Następnie studiował architekturę na Politechnice Szczecińskiej, dyplom uzyskał w 1994.") ]
37 | + | |
38 | + | |
39 | +(* | |
40 | +type output = Text | Xml | Html | Marsh | Graphviz | |
41 | + | |
42 | +let output = ref Text | |
43 | +let comm_stdio = ref true | |
44 | +let sentence_split = ref true | |
45 | +let port = ref 0 | |
46 | +*) | |
(* Tokenizer selection: when [subsyntax_built_in] holds, ENIAMsubsyntax runs
   in-process; otherwise an ENIAMsubsyntax server is contacted at
   [subsyntax_host]:[subsyntax_port]. *)
let subsyntax_built_in = ref true
let subsyntax_host = ref "localhost"
let subsyntax_port = ref 5739

(* Command line options understood by the program, in the format expected by
   [Arg.parse]. Giving a port or a host switches to the external server.
   The options of the subsyntax front end (-s, -n, -i, -p, -t, -x, -m, -h, -g
   and -r) are disabled in this file. *)
let spec_list =
  let use_built_in () = subsyntax_built_in := true in
  let use_remote_port p = (subsyntax_built_in := false; subsyntax_port := p) in
  let use_remote_host s = (subsyntax_built_in := false; subsyntax_host := s) in
  [
    "-b", Arg.Unit use_built_in,
    "Use built in version of ENIAMsubsyntax (default)";
    "--port", Arg.Int use_remote_port,
    "<port> Connect to ENIAMsubsyntax on a given port";
    "--host", Arg.String use_remote_host,
    "<hostname> Connect to ENIAMsubsyntax on a given host (by default localhost)";
  ]
69 | + | |
(* Header of the help text handed to [Arg.parse]. *)
let usage_msg =
  String.concat "\n" [
    "Usage: semparser <options>";
    "Input is a sequence of lines. Empty line ends the sequence and invoke parsing. Double empty line shutdown parser.";
    "Options are:";
  ]

(* Banner written to standard error when the program starts. *)
let message =
  String.concat "\n" [
    "ENIAM_LCGsemparser, a parser for Logical Categorial Grammar formalism";
    "Copyright (C) 2017 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>";
    "Copyright (C) 2017 Institute of Computer Science Polish Academy of Sciences";
  ]

(** [anon_fun s] rejects every anonymous command line argument.
    @raise Arg.Bad always, naming the offending argument [s]. *)
let anon_fun s =
  let reason = "invalid argument: " ^ s in
  raise (Arg.Bad reason)
78 | + | |
(** [input_text channel] reads consecutive lines from [channel] up to the
    first empty line or the end of input, whichever comes first, and returns
    the lines read joined with newline characters. The terminating empty line
    is consumed but not returned. The result is [""] when the very first line
    is empty or the channel is already exhausted. *)
let input_text channel =
  (* End of input is treated exactly like an empty line. *)
  let next_line () = try input_line channel with End_of_file -> "" in
  let rec collect acc =
    match next_line () with
    | "" -> List.rev acc
    | line -> collect (line :: acc) in
  String.concat "\n" (collect [])
87 | + | |
(** [main_loop sub_in sub_out in_chan out_chan] reads one query at a time
    with [input_text in_chan] and parses it, until an empty query (an empty
    line or the end of input) is read.

    When [!subsyntax_built_in] holds, the query is tokenized in-process with
    [ENIAMsubsyntax.parse_text]. Otherwise the query, followed by an empty
    line, is written to [sub_out] and the marshalled answer is read back from
    [sub_in].

    The value returned by [ENIAMexec.parse] is currently discarded: nothing is
    written to [out_chan], which is only flushed after each query. *)
let rec main_loop sub_in sub_out in_chan out_chan =
  let text = input_text in_chan in
  if text = "" then () else (
    let text,tokens =
      if !subsyntax_built_in then ENIAMsubsyntax.parse_text text
      else (
        (* The request is sent only when an external server is used. In
           built-in mode the caller passes stdout as sub_out, so sending it
           unconditionally echoed every query to standard output. *)
        Printf.fprintf sub_out "%s\n\n%!" text;
        (* NOTE(review): Marshal.from_channel is not type-safe; the answer is
           trusted to have exactly the annotated type, so only connect to a
           trusted server built from matching sources. *)
        (Marshal.from_channel sub_in : ENIAMsubsyntaxTypes.text * ENIAMtokenizerTypes.token_env ExtArray.t)) in
    (* One lexical-semantics slot per token, all initially empty. *)
    let lex_sems = ExtArray.make (ExtArray.size tokens) ENIAMlexSemanticsTypes.empty_lex_sem in
    ignore (ENIAMexec.parse rules senses_map "E" (* name *) 1 tokens lex_sems text);
    flush out_chan;
    main_loop sub_in sub_out in_chan out_chan)
118 | + | |
(** [get_sock_addr host_name port] resolves [host_name] with
    [Unix.gethostbyname] and returns the Internet socket address made of the
    first address found and [port].
    @raise Failure if the host is unknown or no address is listed for it. *)
let get_sock_addr host_name port =
  let he =
    (* gethostbyname signals an unknown host with a bare Not_found; turn it
       into a message that names the host. *)
    try Unix.gethostbyname host_name
    with Not_found -> failwith ("get_sock_addr: unknown host " ^ host_name) in
  match he.Unix.h_addr_list with
  | [||] -> failwith ("get_sock_addr: no address for host " ^ host_name)
  | addrs -> Unix.ADDR_INET (addrs.(0), port)
123 | + | |
(* Program entry point. Prints the banner, initialises the category data,
   parses the command line, initialises the built-in tokenizer when it is
   selected, and then serves queries read from stdin. When the built-in
   tokenizer is not selected, a connection to the ENIAMsubsyntax server at
   subsyntax_host:subsyntax_port provides the tokenizer channels instead.
   Serving clients over a socket is not enabled here. *)
let () =
  prerr_endline message;
  ENIAMcategoriesPL.initialize ();
  Arg.parse spec_list anon_fun usage_msg;
  let built_in = !subsyntax_built_in in
  if built_in then ENIAMsubsyntax.initialize ();
  Gc.compact ();
  let sub_in, sub_out =
    match built_in with
    | true -> stdin, stdout
    | false ->
      Unix.open_connection (get_sock_addr !subsyntax_host !subsyntax_port) in
  prerr_endline "Ready!";
  main_loop sub_in sub_out stdin stdout
... | ... |
lexSemantics/ENIAMwalParser.ml
... | ... | @@ -448,11 +448,19 @@ let load_meanings filename = |
448 | 448 | Xlist.fold l IntMap.empty (fun meanings m -> |
449 | 449 | IntMap.add meanings m.mng_id m) |
450 | 450 | |
451 | -let phrases = load_phrases phrases_filename | |
452 | -let entries = load_entries entries_filename | |
453 | -let schemata = load_schemata schemata_filename | |
454 | -let connected = load_connected connected_filename | |
455 | -let meanings = load_meanings meanings_filename | |
451 | +let phrases = ref IntMap.empty | |
452 | +let entries = ref StringMap.empty | |
453 | +let schemata = ref StringMap.empty | |
454 | +let connected = ref StringMap.empty | |
455 | +let meanings = ref IntMap.empty | |
456 | + | |
457 | +let initialize () = | |
458 | + phrases := load_phrases phrases_filename; | |
459 | + entries := load_entries entries_filename; | |
460 | + schemata := load_schemata schemata_filename; | |
461 | + connected := load_connected connected_filename; | |
462 | + meanings := load_meanings meanings_filename; | |
463 | + () | |
456 | 464 | |
457 | 465 | |
458 | 466 | (* |
... | ... |
lexSemantics/ENIAMwalReduce.ml
... | ... | @@ -68,14 +68,23 @@ let create_comprep_adjuncts comprep_reqs comprep_reqs2 = |
68 | 68 | StringMap.map map (fun l -> |
69 | 69 | Xlist.map l (fun s -> s, try StringMap.find comprep_reqs s with Not_found -> StringSet.empty)) |
70 | 70 | |
71 | -let comprep_reqs,comprep_reqs2 = create_comprep_reqs ENIAMwalParser.entries | |
72 | -let lexarg_reqs = create_lexarg_reqs ENIAMwalParser.entries | |
73 | -let comprep_adjuncts = create_comprep_adjuncts comprep_reqs comprep_reqs2 | |
71 | +let comprep_reqs = ref StringMap.empty | |
72 | +let comprep_reqs2 = ref StringMap.empty | |
73 | +let lexarg_reqs = ref IntMap.empty | |
74 | +let comprep_adjuncts = ref StringMap.empty | |
75 | + | |
76 | +let initialize () = | |
77 | + let a,b = create_comprep_reqs !ENIAMwalParser.entries in | |
78 | + comprep_reqs := a; | |
79 | + comprep_reqs2 := b; | |
80 | + lexarg_reqs := create_lexarg_reqs !ENIAMwalParser.entries; | |
81 | + comprep_adjuncts := create_comprep_adjuncts !comprep_reqs !comprep_reqs2; | |
82 | + () | |
74 | 83 | |
75 | 84 | let select_comprep_adjuncts lexemes = |
76 | 85 | StringSet.fold lexemes [] (fun l lemma -> |
77 | 86 | try |
78 | - Xlist.fold (StringMap.find comprep_adjuncts lemma) l (fun l (s,reqs) -> | |
87 | + Xlist.fold (StringMap.find !comprep_adjuncts lemma) l (fun l (s,reqs) -> | |
79 | 88 | (* Printf.printf "%s: %s: %s\n" lemma s (String.concat " " (StringSet.to_list reqs)); *) |
80 | 89 | if StringSet.is_empty reqs || |
81 | 90 | not (StringSet.is_empty (StringSet.intersection reqs lexemes)) then s :: l else l) |
... | ... | @@ -216,8 +225,8 @@ let select_all_entries phrases entries schemata connected meanings = |
216 | 225 | entries,schemata,connected |
217 | 226 | |
218 | 227 | let select_entries lexemes = |
219 | - select_entries_full ENIAMwalParser.phrases ENIAMwalParser.entries ENIAMwalParser.schemata | |
220 | - ENIAMwalParser.connected ENIAMwalParser.meanings comprep_reqs comprep_reqs2 lexarg_reqs lexemes | |
228 | + select_entries_full !ENIAMwalParser.phrases !ENIAMwalParser.entries !ENIAMwalParser.schemata | |
229 | + !ENIAMwalParser.connected !ENIAMwalParser.meanings !comprep_reqs !comprep_reqs2 !lexarg_reqs lexemes | |
221 | 230 | |
222 | 231 | (* let entries,schemata,connected = |
223 | 232 | (* let lexemes = StringSet.of_list ["Ala"; "ma"; "kot"] in *) |
... | ... |
subsyntax/ENIAMsubsyntaxTypes.ml
... | ... | @@ -76,3 +76,65 @@ let int_of_mode = function |
76 | 76 | |
77 | 77 | let compare_mode x y = |
78 | 78 | compare (int_of_mode x) (int_of_mode y) |
79 | + | |
80 | + | |
(** [map_sentence mode f sentence] rebuilds [sentence], replacing every
    sentence that is neither [QuotedSentences] nor [AltSentence] by the result
    of [f] applied to the mode in force and that sentence. Quoted sentences
    are traversed with the current [mode]; each alternative of an
    [AltSentence] is traversed with its own mode. The order of list elements
    is preserved. *)
let rec map_sentence mode f sentence =
  match sentence with
  | QuotedSentences quoted ->
    let mapped_rev = Xlist.rev_map quoted (fun p ->
        {p with sentence = map_sentence mode f p.sentence}) in
    QuotedSentences (List.rev mapped_rev)
  | AltSentence alternatives ->
    let mapped_rev = Xlist.rev_map alternatives (fun (alt_mode, alt) ->
        alt_mode, map_sentence alt_mode f alt) in
    AltSentence (List.rev mapped_rev)
  | leaf -> f mode leaf
92 | + | |
(** [map_paragraph mode f paragraph] applies [map_sentence mode f] to every
    sentence record of a [StructParagraph], recurses into each alternative of
    an [AltParagraph] using the mode of that alternative, and leaves a
    [RawParagraph] as it is. The order of list elements is preserved. *)
let rec map_paragraph mode f paragraph =
  match paragraph with
  | RawParagraph _ as raw -> raw
  | StructParagraph records ->
    let mapped_rev = Xlist.rev_map records (fun p ->
        {p with sentence = map_sentence mode f p.sentence}) in
    StructParagraph (List.rev mapped_rev)
  | AltParagraph alternatives ->
    let mapped_rev = Xlist.rev_map alternatives (fun (alt_mode, alt) ->
        alt_mode, map_paragraph alt_mode f alt) in
    AltParagraph (List.rev mapped_rev)
104 | + | |
(** [map_text mode f text] applies [map_paragraph mode f] to every paragraph
    of a [StructText], recurses into each alternative of an [AltText] using
    the mode of that alternative, and leaves a [RawText] as it is. The order
    of list elements is preserved. *)
let rec map_text mode f text =
  match text with
  | RawText _ as raw -> raw
  | StructText paragraphs ->
    let mapped_rev = Xlist.rev_map paragraphs (fun paragraph ->
        map_paragraph mode f paragraph) in
    StructText (List.rev mapped_rev)
  | AltText alternatives ->
    let mapped = Xlist.map alternatives (fun (alt_mode, alt) ->
        alt_mode, map_text alt_mode f alt) in
    AltText mapped
113 | + | |
114 | + | |
(** [fold_sentence mode s f sentence] threads the accumulator [s] through
    [sentence]: every sentence that is neither [QuotedSentences] nor
    [AltSentence] is combined with [f mode acc sentence]. Quoted sentences are
    visited with the current [mode]; each alternative of an [AltSentence] is
    visited with its own mode. *)
let rec fold_sentence mode s f sentence =
  match sentence with
  | QuotedSentences quoted ->
    Xlist.fold quoted s (fun acc p ->
        fold_sentence mode acc f p.sentence)
  | AltSentence alternatives ->
    Xlist.fold alternatives s (fun acc (alt_mode, alt) ->
        fold_sentence alt_mode acc f alt)
  | leaf -> f mode s leaf
123 | + | |
(** [fold_paragraph mode s f paragraph] threads the accumulator [s] through
    the sentences of [paragraph] with [fold_sentence]. A [RawParagraph]
    contributes nothing; each alternative of an [AltParagraph] is visited with
    its own mode. *)
let rec fold_paragraph mode s f paragraph =
  match paragraph with
  | RawParagraph _ -> s
  | StructParagraph records ->
    Xlist.fold records s (fun acc p ->
        fold_sentence mode acc f p.sentence)
  | AltParagraph alternatives ->
    Xlist.fold alternatives s (fun acc (alt_mode, alt) ->
        fold_paragraph alt_mode acc f alt)
132 | + | |
(** [fold_text mode s f text] threads the accumulator [s] through the
    paragraphs of [text] with [fold_paragraph]. A [RawText] contributes
    nothing; each alternative of an [AltText] is visited with its own mode. *)
let rec fold_text mode s f text =
  match text with
  | RawText _ -> s
  | StructText paragraphs ->
    Xlist.fold paragraphs s (fun acc paragraph ->
        fold_paragraph mode acc f paragraph)
  | AltText alternatives ->
    Xlist.fold alternatives s (fun acc (alt_mode, alt) ->
        fold_text alt_mode acc f alt)
... | ... |