Commit b369a30acf75ced421d7d50d287a58cac987ec2d
1 parent
2f308cb1
Parser gramatyk semantycznych
Showing 12 changed files with 348 additions and 94 deletions
LCGlexicon/ENIAM_LCGlexiconTypes.ml
... | ... | @@ -83,7 +83,13 @@ let resource_path = |
83 | 83 | if Sys.file_exists "/usr/local/share/eniam" then "/usr/local/share/eniam" else |
84 | 84 | failwith "resource directory does not exists" |
85 | 85 | |
86 | +let data_path = | |
87 | + try Sys.getenv "ENIAM_USER_DATA_PATH" | |
88 | + with Not_found -> "data" | |
89 | + | |
86 | 90 | let rules_filename = resource_path ^ "/LCGlexicon/lexicon-pl.dic" |
91 | +let user_lexicon_filename = data_path ^ "/lexicon.dic" | |
92 | +let user_senses_filename = data_path ^ "/senses.tab" | |
87 | 93 | |
88 | 94 | let subst_uncountable_lexemes_filename = resource_path ^ "/LCGlexicon/subst_uncountable.dat" |
89 | 95 | let subst_uncountable_lexemes_filename2 = resource_path ^ "/LCGlexicon/subst_uncountable_stare.dat" |
... | ... | @@ -91,7 +97,4 @@ let subst_container_lexemes_filename = resource_path ^ "/LCGlexicon/subst_contai |
91 | 97 | let subst_numeral_lexemes_filename = resource_path ^ "/LCGlexicon/subst_numeral.dat" |
92 | 98 | let subst_time_lexemes_filename = resource_path ^ "/LCGlexicon/subst_time.dat" |
93 | 99 | |
94 | -(*let proper_names_filename = resource_path ^ "/lexSemantics/proper_names_sgjp_polimorf.tab" | |
95 | - let proper_names_filename2 = resource_path ^ "/lexSemantics/proper_names.tab"*) | |
96 | - | |
97 | 100 | let adv_modes_filename = resource_path ^ "/Walenty/adv_modes.tab" |
... | ... |
LCGlexicon/ENIAMcategoriesPL.ml
... | ... | @@ -36,7 +36,7 @@ let selector_values = Xlist.fold [ |
36 | 36 | "match-result";"url";"email";"obj-id";"adj";"adjc";"adjp";"adja"; |
37 | 37 | "adv";"ger";"pact";"ppas";"fin";"bedzie";"praet";"winien";"impt"; |
38 | 38 | "imps";"pred";"aglt";"inf";"pcon";"pant";"qub";"comp";"conj";"interj"; |
39 | - "sinterj";"burk";"interp";"unk"]; | |
39 | + "sinterj";"burk";"interp";"unk";"html-tag"]; | |
40 | 40 | Pos2, []; |
41 | 41 | Cat, []; |
42 | 42 | Number, all_numbers; |
... | ... | @@ -74,22 +74,26 @@ let split_voc cases = |
74 | 74 | "voc" -> cases, "voc" :: voc |
75 | 75 | | s -> s :: cases, voc) |
76 | 76 | |
77 | -let subst_uncountable_lexemes = StringSet.of_list (File.load_lines subst_uncountable_lexemes_filename) | |
78 | -let subst_uncountable_lexemes2 = StringSet.of_list (File.load_lines subst_uncountable_lexemes_filename2) | |
79 | -let subst_container_lexemes = StringSet.of_list (File.load_lines subst_container_lexemes_filename) | |
80 | -let subst_numeral_lexemes = StringSet.of_list (File.load_lines subst_numeral_lexemes_filename) | |
81 | -let subst_time_lexemes = StringSet.of_list (File.load_lines subst_time_lexemes_filename) | |
77 | +let load_subst_data filename _ = | |
78 | + StringSet.of_list (File.load_lines filename) | |
79 | + | |
80 | +let subst_uncountable_lexemes = File.catch_no_file (load_subst_data subst_uncountable_lexemes_filename) StringSet.empty | |
81 | +let subst_uncountable_lexemes2 = File.catch_no_file (load_subst_data subst_uncountable_lexemes_filename2) StringSet.empty | |
82 | +let subst_container_lexemes = File.catch_no_file (load_subst_data subst_container_lexemes_filename) StringSet.empty | |
83 | +let subst_numeral_lexemes = File.catch_no_file (load_subst_data subst_numeral_lexemes_filename) StringSet.empty | |
84 | +let subst_time_lexemes = File.catch_no_file (load_subst_data subst_time_lexemes_filename) StringSet.empty | |
82 | 85 | |
83 | 86 | let subst_pronoun_lexemes = StringSet.of_list ["co"; "kto"; "cokolwiek"; "ktokolwiek"; "nic"; "nikt"; "coś"; "ktoś"; "to"] |
84 | 87 | let adj_pronoun_lexemes = StringSet.of_list ["czyj"; "jaki"; "który"; "jakiś"; "ten"; "taki"] |
85 | 88 | |
86 | 89 | (* let adj_quant_lexemes = StringSet.of_list ["każdy"; "wszelki"; "wszystek"; "żaden"; "jakiś"; "pewien"; "niektóry"; "jedyny"; "sam"] *) |
87 | 90 | |
88 | -let adv_modes = | |
89 | - try File.fold_tab adv_modes_filename StringMap.empty (fun adv_modes -> function | |
91 | +let load_adv_modes filename adv_modes = | |
92 | + File.fold_tab filename adv_modes (fun adv_modes -> function | |
90 | 93 | [adv;mode] -> StringMap.add_inc adv_modes adv [mode] (fun l -> mode :: l) |
91 | 94 | | _ -> failwith "adv_modes") |
92 | - with _ -> (prerr_endline ("ENIAMlexicon adv_modes file " ^ adv_modes_filename ^ " not found"); StringMap.empty) | |
95 | + | |
96 | +let adv_modes = File.catch_no_file (load_adv_modes adv_modes_filename) StringMap.empty | |
93 | 97 | |
94 | 98 | let noun_type proper lemma pos = |
95 | 99 | let nsyn = |
... | ... | @@ -347,6 +351,7 @@ let clarify_categories proper cat = function |
347 | 351 | | lemma,"interp",[] -> [{empty_cats with lemma=lemma; pos="interp"; pos2="interp"}] |
348 | 352 | | lemma,"unk",[] -> |
349 | 353 | [{empty_cats with lemma=lemma; pos="unk"; pos2="noun"; numbers=all_numbers; cases=all_cases; genders=all_genders; persons=["ter"]}] |
354 | + | lemma,"html-tag",[] -> [{empty_cats with lemma=lemma; pos="html-tag"; pos2="html-tag"}] | |
350 | 355 | | lemma,c,l -> failwith ("clarify_categories: " ^ lemma ^ ":" ^ c ^ ":" ^ (String.concat ":" (Xlist.map l (String.concat ".")))) |
351 | 356 | |
352 | 357 | (* FIXME: przenieść gdzieś indziej *) |
... | ... | @@ -547,4 +552,5 @@ let pos_categories = Xlist.fold [ |
547 | 552 | "burk",[Lemma;]; |
548 | 553 | "interp",[Lemma;]; |
549 | 554 | "unk",[Lemma;Number;Case;Gender;Person;]; |
555 | + "html-tag",[Lemma;]; | |
550 | 556 | ] StringMap.empty (fun map (k,l) -> StringMap.add map k l) |
... | ... |
LCGlexicon/TODO
LCGlexicon/makefile
... | ... | @@ -4,7 +4,8 @@ OCAMLDEP=ocamldep |
4 | 4 | INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I +eniam |
5 | 5 | OCAMLFLAGS=$(INCLUDES) -g |
6 | 6 | OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-lcg-parser.cmxa eniam-lcg-lexicon.cmxa |
7 | -OCAMLOPTFLAGS2=$(OCAMLOPTFLAGS) eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa eniam-lexSemantics.cmxa | |
7 | +OCAMLOPTFLAGS2=$(OCAMLOPTFLAGS) eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa | |
8 | +OCAMLOPTFLAGS3=$(OCAMLOPTFLAGS2) eniam-lexSemantics.cmxa | |
8 | 9 | INSTALLDIR=`ocamlc -where`/eniam |
9 | 10 | |
10 | 11 | SOURCES= ENIAM_LCGlexiconTypes.ml ENIAMcategoriesPL.ml ENIAM_LCGlexiconParser.ml ENIAM_LCGlexicon.ml |
... | ... | @@ -40,10 +41,14 @@ test: test.ml |
40 | 41 | |
41 | 42 | test2: test2.ml |
42 | 43 | mkdir -p results |
43 | - $(OCAMLOPT) -o test2 $(OCAMLOPTFLAGS2) test2.ml | |
44 | + $(OCAMLOPT) -o test2 $(OCAMLOPTFLAGS3) test2.ml | |
44 | 45 | |
45 | 46 | interface: interface.ml |
46 | - $(OCAMLOPT) -o parser $(OCAMLOPTFLAGS2) interface.ml | |
47 | + $(OCAMLOPT) -o parser $(OCAMLOPTFLAGS3) interface.ml | |
48 | + | |
49 | +parser: parser.ml | |
50 | + mkdir -p results | |
51 | + $(OCAMLOPT) -o parser $(OCAMLOPTFLAGS2) parser.ml | |
47 | 52 | |
48 | 53 | print_lexicon: ENIAM_LCGlexiconLatexOf.ml |
49 | 54 | mkdir -p results |
... | ... |
LCGlexicon/parser.ml
0 → 100644
1 | +open Xstd | |
2 | +open ENIAMsubsyntaxTypes | |
3 | + | |
4 | +let rules = ENIAM_LCGlexicon.make_rules false ENIAM_LCGlexiconTypes.user_lexicon_filename | |
5 | + | |
6 | +let load_senses_map filename = | |
7 | + File.fold_tab filename StringMap.empty (fun map -> function | |
8 | + [lemma;cat] -> StringMap.add_inc map lemma [cat] (fun l -> cat :: l) | |
9 | + | l -> failwith ("load_senses_map: " ^ String.concat "\t" l)) | |
10 | + | |
11 | +let senses_map = load_senses_map ENIAM_LCGlexiconTypes.user_senses_filename | |
12 | + | |
13 | + | |
14 | +let examples = [ | |
15 | + (* "liceum","W 1984-89 uczęszczał do VII Liceum Ogólnokształcącego im. K.K. Baczyńskiego w Szczecinie."; *) | |
16 | + "studia","Następnie studiował architekturę na Politechnice Szczecińskiej, dyplom uzyskał w 1994."; | |
17 | +] | |
18 | + | |
19 | +let clarify_categories token = | |
20 | + match token.ENIAMtokenizerTypes.token with | |
21 | + ENIAMtokenizerTypes.Lemma(lemma,pos,interp) -> | |
22 | + let senses = try StringMap.find senses_map lemma with Not_found -> ["X"] in | |
23 | + List.flatten (Xlist.map interp (fun interp -> ENIAMcategoriesPL.clarify_categories false senses (lemma,pos,interp))) | |
24 | + | ENIAMtokenizerTypes.Proper(lemma,pos,interp,senses) -> List.flatten (Xlist.map interp (fun interp -> ENIAMcategoriesPL.clarify_categories true senses (lemma,pos,interp))) | |
25 | + | ENIAMtokenizerTypes.Interp lemma -> ENIAMcategoriesPL.clarify_categories false ["X"] (lemma,"interp",[]) | |
26 | + | _ -> [] | |
27 | + | |
28 | +let create_chart tokens paths last = | |
29 | + ENIAM_LCGrenderer.reset_variable_numbers (); | |
30 | + let chart = ENIAM_LCGchart.make last in | |
31 | + let chart = Xlist.fold paths chart (fun chart (id,lnode,rnode) -> | |
32 | + let t = ExtArray.get tokens id in | |
33 | + ENIAM_LCGrenderer.reset_variable_names (); | |
34 | + ENIAM_LCGrenderer.add_variable_numbers (); | |
35 | + let cats = clarify_categories t in | |
36 | + let l = ENIAM_LCGlexicon.create_entries rules id t.ENIAMtokenizerTypes.orth cats [] in | |
37 | + ENIAM_LCGchart.add_inc_list chart lnode rnode l 0) in | |
38 | + chart | |
39 | + | |
40 | +let test_example name tokens paths last = | |
41 | + ENIAM_LCGreductions.reset_variant_label (); | |
42 | + let chart = create_chart tokens paths last in | |
43 | + ENIAM_LCGlatexOf.print_chart "results/" (name^"1_chart") "a1" chart; | |
44 | + let chart,references = ENIAM_LCGchart.lazify chart in | |
45 | + ENIAM_LCGlatexOf.print_chart "results/" (name^"2_chart") "a4" chart; | |
46 | + ENIAM_LCGlatexOf.print_references "results/" (name^"2_references") "a4" references; | |
47 | + let chart = ENIAM_LCGchart.parse chart references 30. Sys.time in (* uwaga: niejawna zmiana imperatywna w references *) | |
48 | + ENIAM_LCGlatexOf.print_chart "results/" (name^"3_chart") "a4" chart; | |
49 | + ENIAM_LCGlatexOf.print_references "results/" (name^"3_references") "a4" references; | |
50 | + if ENIAM_LCGchart.is_parsed chart then ( | |
51 | + let term = ENIAM_LCGchart.get_parsed_term chart in | |
52 | + Xlatex.latex_file_out "results/" (name^"4_term") "a4" false (fun file -> | |
53 | + Printf.fprintf file "\\[%s\\]\n" (ENIAM_LCGlatexOf.linear_term 0 term)); | |
54 | + Xlatex.latex_compile_and_clean "results/" (name^"4_term"); | |
55 | + let dependency_tree = ENIAM_LCGreductions.reduce term references in | |
56 | + ENIAM_LCGlatexOf.print_dependency_tree "results/" (name^"4_dependency_tree") "a0" dependency_tree; | |
57 | + if ENIAM_LCGreductions.is_reduced_dependency_tree dependency_tree then ( | |
58 | + ENIAM_LCGreductions.assign_labels dependency_tree; (* uwaga: niejawna zmiana imperatywna w dependency_tree *) | |
59 | + ENIAM_LCGlatexOf.print_dependency_tree "results/" (name^"5_dependency_tree") "a4" dependency_tree; | |
60 | + ENIAM_LCGreductions.remove_cuts dependency_tree; (* uwaga: niejawna zmiana imperatywna w dependency_tree *) | |
61 | + ENIAM_LCGlatexOf.print_dependency_tree "results/" (name^"6_dependency_tree") "a4" dependency_tree; | |
62 | + ENIAM_LCGgraphOf.print_dependency_tree "results/" (name^"6_dependency_tree") dependency_tree; | |
63 | + ENIAM_LCGgraphOf.print_simplified_dependency_tree "results/" (name^"6_simple_dependency_tree") dependency_tree; | |
64 | + ()) | |
65 | + else print_endline "not reduced") | |
66 | + else print_endline "not parsed" | |
67 | + | |
68 | +let rec parse_sentence name id tokens = function | |
69 | + RawSentence s -> id | |
70 | + | StructSentence(paths,last) -> | |
71 | + test_example (name ^ string_of_int id ^ "_") tokens paths last; | |
72 | + id + 1 | |
73 | + | DepSentence(paths) -> id | |
74 | + | QuotedSentences sentences -> | |
75 | + Xlist.fold sentences id (fun id p -> | |
76 | + parse_sentence name id tokens p.sentence) | |
77 | + | AltSentence l -> | |
78 | + Xlist.fold l id (fun id (mode,sentence) -> | |
79 | + parse_sentence name id tokens sentence) | |
80 | + | |
81 | +let rec parse_paragraph name id tokens = function | |
82 | + RawParagraph s -> id | |
83 | + | StructParagraph sentences -> | |
84 | + Xlist.fold sentences id (fun id p -> | |
85 | + parse_sentence name id tokens p.sentence) | |
86 | + | AltParagraph l -> | |
87 | + Xlist.fold l id (fun id (mode,paragraph) -> | |
88 | + parse_paragraph name id tokens paragraph) | |
89 | + | |
90 | +let rec parse_text name id tokens = function | |
91 | + RawText s -> id | |
92 | + | StructText paragraphs -> | |
93 | + Xlist.fold paragraphs id (fun id paragraph -> | |
94 | + parse_paragraph name id tokens paragraph) | |
95 | + | AltText l -> | |
96 | + Xlist.fold l id (fun id (mode,text) -> | |
97 | + parse_text name id tokens text) | |
98 | + | |
99 | + | |
100 | +(* let _ = | |
101 | + Xlist.iter examples (fun (name,example) -> | |
102 | + let text,tokens = ENIAMsubsyntax.parse_text example in | |
103 | + ignore(parse_text name 1 tokens text)) *) | |
104 | + | |
105 | +(* | |
106 | +type entry = {title: string; info:string; biogram:string; (*primary:string; secondary:string;*) author:string} | |
107 | + | |
108 | +let process_xml = function | |
109 | + Xml.Element("entries",[],entries) -> | |
110 | + List.rev (Xlist.rev_map entries (function | |
111 | + Xml.Element("entry",[],[title;info;biogram(*;primary;secondary*);author]) -> | |
112 | + {title=Xml.to_string title; info=Xml.to_string info; biogram=Xml.to_string biogram; | |
113 | + (*primary=Xml.to_string primary; secondary=Xml.to_string secondary;*) author=Xml.to_string author} | |
114 | + | _ -> failwith "process_xml 1")) | |
115 | + | _ -> failwith "process_xml 2" | |
116 | + | |
117 | + | |
118 | +let load_ppibl filename = | |
119 | + let ppibl = File.load_file_gen ("data/" ^ filename) in | |
120 | + process_xml (Xml.parse_string ppibl) | |
121 | + | |
122 | +let named_entities = | |
123 | + File.fold_tab "data/ne.tab" StringMap.empty (fun map -> function | |
124 | + [lemma;cat] -> StringMap.add_inc map lemma [cat] (fun l -> cat :: l) | |
125 | + | _ -> failwith "named_entities") | |
126 | + | |
127 | +let assign_named_entities t = | |
128 | + match t.token with | |
129 | + Lemma(lemma,"subst",interp) -> | |
130 | + (try | |
131 | + let cat = StringMap.find named_entities lemma in | |
132 | + {t with token=Proper(lemma,"subst",interp,cat)} | |
133 | + with Not_found -> t) | |
134 | + | Proper(lemma,"subst",interp,_) -> | |
135 | + (try | |
136 | + let cat = StringMap.find named_entities lemma in | |
137 | + {t with token=Proper(lemma,"subst",interp,cat)} | |
138 | + with Not_found -> t) | |
139 | + | _ -> t | |
140 | + | |
141 | +let test_strings = [ | |
142 | + (* "Debiutował opowiadaniem pt. <i>Zlecenie na dostawę</i>."; *) | |
143 | + "W 1984-89 uczęszczał do VII Liceum Ogólnokształcącego im. K.K. Baczyńskiego w Szczecinie."; | |
144 | + (* "Następnie studiował architekturę na Politechnice Szczecińskiej, dyplom uzyskał w 1994." *) | |
145 | + (* "W 2003 obronił doktorat nauk technicznych w zakresie architektury i urbanistyki na Politechnice Krakowskiej i został adiunktem w Zakładzie Teorii Architektury, Historii i Konserwacji Zabytków IAiPP." *) | |
146 | +] | |
147 | + | |
148 | +(* let _ = | |
149 | + let entries = load_ppibl "ak322269.xml" in | |
150 | + Xlist.iter entries (fun entry -> print_endline entry.biogram) *) | |
151 | + | |
152 | +(* | |
153 | +let test_strings = [ | |
154 | + "Szpak frunie."; | |
155 | + "Kot np. miauczy."; | |
156 | + "Ala ma kota."; | |
157 | + "Ale mają kota:" | |
158 | + ] | |
159 | + | |
160 | +let test_strings2 = [ | |
161 | + "Szpak frunie. Kot miauczy."; | |
162 | + "Szpak powiedział: „Frunę. Kiszę.”"; | |
163 | + ] | |
164 | +*) | |
165 | + | |
166 | +let grammar = [ | |
167 | + "pos=year", Basic "year",symbol_weight; | |
168 | + "pos=year-interval", Basic "year-interval",symbol_weight; | |
169 | + "lemma=w,pos=prep,case=loc", Basic "time/(year+year-interval)",0.; | |
170 | + "lemma=w,pos=prep,case=loc", Basic "locat/np*MIASTO*T*loc*T",0.; | |
171 | + | |
172 | + "lemma=uczęszczać,pos=praet|fin,person=ter,negation=aff,mood=indicative", Basic "ip*number*gender{|(1+time),|(1+pp*ORGANIZACJA*do*gen),|(1+locat)}",0.; | |
173 | + "lemma=do,pos=prep,case=gen", Basic "pp*sense*lemma*case/np*sense*T*case*T",0.; | |
174 | + | |
175 | +] | |
176 | + | |
177 | +let _ = | |
178 | + print_endline "Testy wbudowane"; | |
179 | + Xlist.iter test_strings (fun s -> | |
180 | + print_endline ("\nTEST: " ^ s); | |
181 | + let paths = ENIAMsubsyntax.parse s in | |
182 | + let paths = Xlist.map paths assign_named_entities in | |
183 | + (* print_endline (ENIAMtokenizer.xml_of tokens); *) | |
184 | + print_endline (ENIAMpaths.to_string (paths,0))); | |
185 | +(* Xlist.iter test_strings2 (fun s -> | |
186 | + print_endline ("\nTEST: " ^ s); | |
187 | + let text,tokens = ENIAMsubsyntax.parse_text s in | |
188 | + (* print_endline (ENIAMtokenizer.xml_of tokens); *) | |
189 | + print_endline (ENIAMsubsyntaxStringOf.tokens tokens); | |
190 | + print_endline ""; | |
191 | + print_endline (ENIAMsubsyntaxStringOf.text "" tokens text));*) | |
192 | +(* print_endline "Testy użytkownika."; | |
193 | + print_endline "Wpisz tekst i naciśnij ENTER, pusty tekst kończy."; | |
194 | + let s = ref (read_line ()) in | |
195 | + while !s <> "" do | |
196 | + let tokens = ENIAMtokenizer.parse !s in | |
197 | + (* print_endline (ENIAMtokenizer.xml_of tokens); *) | |
198 | + Xlist.iter tokens (fun token -> print_endline (ENIAMtokenizer.string_of 0 token)); | |
199 | + print_endline "Wpisz tekst i naciśnij ENTER, pusty tekst kończy."; | |
200 | + s := read_line () | |
201 | + done;*) | |
202 | + () | |
203 | + | |
204 | +open ENIAM_LCGlexiconTypes | |
205 | +open ENIAM_LCGtypes | |
206 | + | |
207 | + | |
208 | +(* | |
209 | +type output = Text | Xml | Html | Marsh | Graphviz | |
210 | + | |
211 | +let output = ref Text | |
212 | +let comm_stdio = ref true | |
213 | +let sentence_split = ref true | |
214 | +let port = ref 0 | |
215 | + | |
216 | +let spec_list = [ | |
217 | + "-s", Arg.Unit (fun () -> sentence_split:=true), "Split input into sentences (default)"; | |
218 | + "-n", Arg.Unit (fun () -> sentence_split:=false), "Do not split input into sentences"; | |
219 | + "-i", Arg.Unit (fun () -> comm_stdio:=true), "Communication using stdio (default)"; | |
220 | + "-p", Arg.Int (fun p -> comm_stdio:=false; port:=p), "<port> Communication using sockets on given port number"; | |
221 | + "-t", Arg.Unit (fun () -> output:=Text), "Output as plain text (default)"; | |
222 | + "-x", Arg.Unit (fun () -> output:=Xml), "Output as XML"; | |
223 | + "-m", Arg.Unit (fun () -> output:=Marsh), "Output as marshalled Ocaml data structure"; | |
224 | + "-h", Arg.Unit (fun () -> output:=Html), "Output as HTML"; | |
225 | + "-g", Arg.Unit (fun () -> output:=Graphviz; sentence_split:=false), "Output as graphviz dot file; turns sentence split off"; | |
226 | + (* "-r", Arg.String (fun p -> | |
227 | + ENIAMtokenizerTypes.set_resource_path p; | |
228 | + ENIAMmorphologyTypes.set_resource_path p; | |
229 | + ENIAMsubsyntaxTypes.set_resource_path p), "<path> Set resource path"; *) | |
230 | + ] | |
231 | + | |
232 | +let usage_msg = | |
233 | + "Usage: subsyntax <options>\nInput is a sequence of lines. Empty line ends the sequence and invoke parsing. Double empty line shutdown parser.\nOptions are:" | |
234 | +*)*) | |
235 | +let message = "ENIAM_LCGparser, a parser for Logical Categorial Grammar formalism\n\ | |
236 | +Copyright (C) 2017 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>\n\ | |
237 | +Copyright (C) 2017 Institute of Computer Science Polish Academy of Sciences" | |
238 | +(* | |
239 | +let anon_fun s = raise (Arg.Bad ("invalid argument: " ^ s)) | |
240 | +*) | |
241 | +let input_text channel = | |
242 | + let s = ref (try input_line channel with End_of_file -> "") in | |
243 | + let lines = ref [] in | |
244 | + while !s <> "" do | |
245 | + lines := !s :: !lines; | |
246 | + s := try input_line channel with End_of_file -> "" | |
247 | + done; | |
248 | + String.concat "\n" (List.rev !lines) | |
249 | + | |
250 | +let rec main_loop in_chan out_chan = | |
251 | + let text = input_text in_chan in | |
252 | + if text = "" then () else ( | |
253 | + let text,tokens = ENIAMsubsyntax.parse_text text in | |
254 | + ignore(parse_text "E"(*name*) 1 tokens text) | |
255 | + (* print_endline "input text begin"; | |
256 | + print_endline text; | |
257 | + print_endline "input text end"; *) | |
258 | + (*if !sentence_split then | |
259 | + let text,tokens = ENIAMsubsyntax.parse_text text in | |
260 | + (match !output with | |
261 | + Text -> output_string out_chan (ENIAMsubsyntaxStringOf.text "" tokens text ^ "\n" ^ ENIAMsubsyntaxStringOf.token_extarray tokens ^ "\n\n") | |
262 | + | Xml -> output_string out_chan (Xml.to_string (ENIAMsubsyntaxXMLof.text_and_tokens text tokens) ^ "\n\n") | |
263 | + | Html -> output_string out_chan (ENIAMsubsyntaxHTMLof.text_and_tokens text tokens ^ "\n\n") | |
264 | + | Marsh -> Marshal.to_channel out_chan (text,tokens) [] | |
265 | + | Graphviz -> failwith "main_loop: ni") | |
266 | + else | |
267 | + let tokens = ENIAMsubsyntax.parse text in | |
268 | + (match !output with | |
269 | + Text -> output_string out_chan (ENIAMsubsyntaxStringOf.token_list tokens ^ "\n\n") | |
270 | + | Xml -> output_string out_chan (Xml.to_string (ENIAMsubsyntaxXMLof.token_list tokens) ^ "\n\n") | |
271 | + | Html -> output_string out_chan (ENIAMsubsyntaxHTMLof.token_list tokens ^ "\n\n") | |
272 | + | Marsh -> Marshal.to_channel out_chan tokens [] | |
273 | + | Graphviz -> output_string out_chan (ENIAMsubsyntaxGraphOf.token_list tokens ^ "\n\n"))*); | |
274 | + flush out_chan; | |
275 | + main_loop in_chan out_chan) | |
276 | + | |
277 | +let _ = | |
278 | + prerr_endline message; | |
279 | + (* Arg.parse spec_list anon_fun usage_msg; *) | |
280 | + Gc.compact (); | |
281 | + prerr_endline "Ready!"; | |
282 | + (*if !comm_stdio then*) main_loop stdin stdout | |
283 | + (*else | |
284 | + let sockaddr = Unix.ADDR_INET(Unix.inet_addr_any,!port) in | |
285 | + Unix.establish_server main_loop sockaddr*) | |
... | ... |
subsyntax/ENIAM_MWE.ml
... | ... | @@ -40,7 +40,7 @@ let process_interp lemma interp = |
40 | 40 | | s -> if String.get s 0 = '$' then failwith ("process_interp: " ^ s) else V s)) |
41 | 41 | | _ -> failwith "process_interp" |
42 | 42 | |
43 | -let load_mwe_dict dict filename = | |
43 | +let load_mwe_dict filename dict = | |
44 | 44 | File.fold_tab filename dict (fun dict -> function |
45 | 45 | [orths; lemma; interp] -> |
46 | 46 | let orths = Xstring.split " " orths in |
... | ... | @@ -60,7 +60,7 @@ let process_orth = function |
60 | 60 | | [Lexer.B("{","}",l)] -> O(Lexer.string_of_token_list l) |
61 | 61 | | tokens -> failwith ("process_orth: " ^ Lexer.string_of_token_list tokens) |
62 | 62 | |
63 | -let load_mwe_dict2 (dict,dict2) filename = | |
63 | +let load_mwe_dict2 filename (dict,dict2) = | |
64 | 64 | File.fold_tab filename (dict,dict2) (fun (dict,dict2) -> function |
65 | 65 | [orths; lemma] -> |
66 | 66 | (* print_endline (orths ^ "\t" ^ lemma); *) |
... | ... | @@ -84,12 +84,13 @@ let load_mwe_dict2 (dict,dict2) filename = |
84 | 84 | | l -> failwith ("load_mwe_dict2 '" ^ String.concat "\t" l ^ "'")) |
85 | 85 | |
86 | 86 | let mwe_dict,mwe_dict2 = |
87 | - let dict = load_mwe_dict StringMap.empty brev_filename in | |
88 | - let dict = try load_mwe_dict dict fixed_filename with _ -> (prerr_endline ("ENIAMsubsyntax file " ^ fixed_filename ^ " not found"); dict) in | |
89 | - let dict = load_mwe_dict dict mwe_filename in | |
90 | - let dict,dict2 = load_mwe_dict2 (dict,StringMap.empty) sejf_filename in | |
91 | - let dict,dict2 = load_mwe_dict2 (dict,dict2) sejfek_filename in | |
92 | - let dict,dict2 = load_mwe_dict2 (dict,dict2) sawa_filename in | |
87 | + let dict = File.catch_no_file (load_mwe_dict brev_filename) StringMap.empty in | |
88 | + let dict = File.catch_no_file (load_mwe_dict fixed_filename) dict in | |
89 | + let dict = File.catch_no_file (load_mwe_dict mwe_filename) dict in | |
90 | + let dict,dict2 = File.catch_no_file (load_mwe_dict2 sejf_filename) (dict,StringMap.empty) in | |
91 | + let dict,dict2 = File.catch_no_file (load_mwe_dict2 sejfek_filename) (dict,dict2) in | |
92 | + let dict,dict2 = File.catch_no_file (load_mwe_dict2 sawa_filename) (dict,dict2) in | |
93 | + let dict,dict2 = File.catch_no_file (load_mwe_dict2 mwe2_filename) (dict,dict2) in | |
93 | 94 | dict,dict2 |
94 | 95 | |
95 | 96 | let get_orths paths = |
... | ... | @@ -223,7 +224,7 @@ let create_token (matching:token_env list) sels lemma cat interp = (* FIXME: pro |
223 | 224 | next=t.next; |
224 | 225 | token=Lemma(lemma,cat,[Xlist.map interp (function |
225 | 226 | S s -> (try Xlist.assoc sels s with Not_found -> ["_"]) |
226 | - | V s -> [s] | |
227 | + | V s -> Xstring.split "\\." s | |
227 | 228 | | G -> ["_"])]); |
228 | 229 | weight=0.; (* FIXME: dodać wagi do konkretnych reguł i uwzględnić wagi maczowanych tokenów *) |
229 | 230 | attrs=ENIAMtokens.merge_attrs l} |
... | ... |
subsyntax/ENIAMsubsyntax.ml
... | ... | @@ -21,16 +21,16 @@ open ENIAMsubsyntaxTypes |
21 | 21 | open ENIAMtokenizerTypes |
22 | 22 | open Xstd |
23 | 23 | |
24 | -let load_lemma_frequencies filename = | |
24 | +let load_lemma_frequencies filename map = | |
25 | 25 | let l = Str.split_delim (Str.regexp "\n") (File.load_file filename) in |
26 | - Xlist.fold l StringMap.empty (fun map line -> | |
26 | + Xlist.fold l map (fun map line -> | |
27 | 27 | if String.length line = 0 then map else |
28 | 28 | if String.get line 0 = '#' then map else |
29 | 29 | match Str.split_delim (Str.regexp "\t") line with |
30 | 30 | [count; lemma; cat] -> StringMap.add map (lemma ^ "\t" ^ cat) (log10 (float_of_string count +. 1.)) |
31 | 31 | | _ -> failwith ("load_lemma_frequencies: " ^ line)) |
32 | 32 | |
33 | -let lemma_frequencies = load_lemma_frequencies lemma_frequencies_filename | |
33 | +let lemma_frequencies = File.catch_no_file (load_lemma_frequencies lemma_frequencies_filename) StringMap.empty | |
34 | 34 | |
35 | 35 | let modify_weights paths = |
36 | 36 | List.rev (Xlist.fold paths [] (fun paths t -> |
... | ... | @@ -210,10 +210,13 @@ let load_proper_name proper = function |
210 | 210 | StringMap.add_inc proper lemma types (fun types2 -> types @ types2) |
211 | 211 | | l -> failwith ("proper_names: " ^ String.concat " " l) |
212 | 212 | |
213 | +let load_proper_names filename proper = | |
214 | + File.fold_tab filename proper load_proper_name | |
215 | + | |
213 | 216 | let proper_names = |
214 | - let proper = File.fold_tab proper_names_filename StringMap.empty load_proper_name in | |
215 | - let proper = File.fold_tab proper_names_filename2 proper load_proper_name in | |
216 | - let proper = File.fold_tab proper_names_filename3 proper load_proper_name in | |
217 | + let proper = File.catch_no_file (load_proper_names proper_names_filename) StringMap.empty in | |
218 | + let proper = File.catch_no_file (load_proper_names proper_names_filename2) proper in | |
219 | + let proper = File.catch_no_file (load_proper_names proper_names_filename3) proper in | |
217 | 220 | proper |
218 | 221 | |
219 | 222 | let remove l s = |
... | ... |
subsyntax/ENIAMsubsyntaxTypes.ml
... | ... | @@ -44,10 +44,15 @@ type text = |
44 | 44 | | StructText of paragraph list (* * token_record ExtArray.t*) (* akapity * tokeny *) |
45 | 45 | | AltText of (mode * text) list |
46 | 46 | |
47 | +let data_path = | |
48 | + try Sys.getenv "ENIAM_USER_DATA_PATH" | |
49 | + with Not_found -> "data" | |
50 | + | |
47 | 51 | let brev_filename = resource_path ^ "/subsyntax/brev.tab" |
48 | 52 | let fixed_filename = resource_path ^ "/Walenty/fixed.tab" |
49 | -let complete_entries_filename = resource_path ^ "/subsyntax/complete_entries.tab" | |
50 | -let mwe_filename = resource_path ^ "/subsyntax/mwe.tab" | |
53 | +(* let complete_entries_filename = resource_path ^ "/subsyntax/complete_entries.tab" *) | |
54 | +let mwe_filename = data_path ^ "/mwe.tab" | |
55 | +let mwe2_filename = data_path ^ "/mwe2.tab" | |
51 | 56 | let sawa_filename = resource_path ^ "/subsyntax/SAWA.dic" |
52 | 57 | let sejf_filename = resource_path ^ "/subsyntax/SEJF.dic" |
53 | 58 | let sejfek_filename = resource_path ^ "/subsyntax/SEJFEK.dic" |
... | ... | @@ -58,7 +63,7 @@ let lemma_frequencies_filename = resource_path ^ "/subsyntax/NKJP1M-lemma-freq.t |
58 | 63 | let proper_names_filename2 = resource_path ^ "/subsyntax/proper_names.tab" *) |
59 | 64 | let proper_names_filename = resource_path ^ "/subsyntax/proper_names_sgjp_polimorf_20151020.tab" |
60 | 65 | let proper_names_filename2 = resource_path ^ "/subsyntax/proper_names_20160104.tab" |
61 | -let proper_names_filename3 = resource_path ^ "/subsyntax/ne.tab" | |
66 | +let proper_names_filename3 = data_path ^ "/ne.tab" | |
62 | 67 | |
63 | 68 | let int_of_mode = function |
64 | 69 | Raw -> 0 |
... | ... |
subsyntax/makefile
... | ... | @@ -3,7 +3,7 @@ OCAMLOPT=ocamlopt |
3 | 3 | OCAMLDEP=ocamldep |
4 | 4 | INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I +eniam |
5 | 5 | OCAMLFLAGS=$(INCLUDES) -g |
6 | -OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa #eniam-subsyntax.cmxa | |
6 | +OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa | |
7 | 7 | INSTALLDIR=`ocamlc -where`/eniam |
8 | 8 | |
9 | 9 | SOURCES= ENIAMsubsyntaxTypes.ml ENIAMsubsyntaxStringOf.ml ENIAMsubsyntaxHTMLof.ml ENIAMsubsyntaxXMLof.ml ENIAMsubsyntaxGraphOf.ml ENIAMpaths.ml ENIAM_MWE.ml ENIAMsentences.ml ENIAMsubsyntax.ml |
... | ... | @@ -32,8 +32,8 @@ eniam-subsyntax.cma: $(SOURCES) |
32 | 32 | eniam-subsyntax.cmxa: $(SOURCES) |
33 | 33 | ocamlopt -linkall -a -o eniam-subsyntax.cmxa $(INCLUDES) $^ |
34 | 34 | |
35 | -test: $(SOURCES) test.ml | |
36 | - $(OCAMLOPT) -o test $(OCAMLOPTFLAGS) $(SOURCES) test.ml | |
35 | +test: test.ml | |
36 | + $(OCAMLOPT) -o test $(OCAMLOPTFLAGS) test.ml | |
37 | 37 | |
38 | 38 | interface: interface.ml |
39 | 39 | $(OCAMLOPT) -o subsyntax $(OCAMLOPTFLAGS) interface.ml |
... | ... |
subsyntax/resources/mwe.tab deleted
1 | -Akademii Sztuki w Szczecinie Akademia Sztuki w Szczecinie subst:sg:gen.dat.loc:f | |
2 | -Atelier Bizio + Ligierko Atelier Bizio + Ligierko subst:sg:_:n2 | |
3 | -Instytucie Architektury i Planowania Przestrzennego Instytut Architektury i Planowania Przestrzennego subst:sg:loc.voc:m3 | |
4 | -Katedrze Architektury Współczesnej Teorii i Metodologii Projektowania Katedra Architektury Współczesnej Teorii i Metodologii Projektowania subst:sg:dat.loc:f | |
5 | -VII Liceum Ogólnokształcącego im . K . K . Baczyńskiego VII Liceum Ogólnokształcące im. K.K. Baczyńskiego subst:sg:gen:m3 | |
6 | -IV Liceum Ogólnokształcącego im . L . Szenwalda IV Liceum Ogólnokształcące im. L. Szenwalda subst:sg:gen:m3 | |
7 | -Muzeum Narodowym Muzeum Narodowe subst:sg:inst.loc:n2 | |
8 | -Nagrodę Artystyczną m . Szczecina Nagroda Artystyczna m. Szczecina subst:sg:acc:f | |
9 | -Zachodniopomorskiego Nobla Zachodniopomorski Nobel subst:sg:acc.gen:m3 | |
10 | -Politechnice Krakowskiej Politechnika Krakowska subst:sg:dat.loc:f | |
11 | -Politechnice Szczecińskiej Politechnika Szczecińska subst:sg:dat.loc:f | |
12 | -Politechniki Szczecińskiej Politechnika Szczecińska subst:sg:gen:f | |
13 | -Pracowni Podstaw Projektowania Pracownia Podstaw Projektowania subst:sg:gen.dat.loc:f | |
14 | -Przeglądu Teatrów Małych Form „ Kontrapunkt ” Przegląd Teatrów Małych Form „Kontrapunkt” subst:sg:gen:m3 | |
15 | -Mistrzowską Szkołę Reżyserii Filmowej Andrzeja Wajdy Mistrzowska Szkoła Reżyserii Filmowej Andrzeja Wajdy subst:sg:acc:f | |
16 | -Uniwersytecie im . M . Kopernika Uniwersytet im. M. Kopernika subst:sg:loc.voc:m3 | |
17 | -Zachodniopomorski Uniwersytet Technologiczny Zachodniopomorski Uniwersytet Technologiczny subst:sg:acc.nom:m3 | |
18 | -Wydziale Budownictwa i Architektury Wydział Budownictwa i Architektury subst:sg:loc.voc:m3 | |
19 | -Wydziale Stuk Wizualnych Wydział Stuk Wizualnych subst:sg:loc.voc:m3 | |
20 | -Zakładzie Teorii Architektury , Historii i Konserwacji Zabytków Zakład Teorii Architektury, Historii i Konserwacji Zabytków subst:sg:loc.voc:m3 | |
21 | -Festiwalu Polskich Sztuk Współczesnych R @ Port Festiwalu Polskich Sztuk Współczesnych R@Port subst:sg:gen.loc.voc:m3 | |
22 | -Arabia Saudyjska Arabia Saudyjska subst:sg:nom:f |
subsyntax/resources/ne.tab deleted
1 | -Akademia Sztuki ORGANIZACJA | |
2 | -Atelier Bizio + Ligierko ORGANIZACJA | |
3 | -Instytut Architektury i Planowania Przestrzennego ORGANIZACJA | |
4 | -Katedra Architektury Współczesnej Teorii i Metodologii Projektowania ORGANIZACJA | |
5 | -VII Liceum Ogólnokształcące im. K.K. Baczyńskiego ORGANIZACJA | |
6 | -IV Liceum Ogólnokształcące im. L. Szenwalda ORGANIZACJA | |
7 | -Muzeum Narodowe ORGANIZACJA | |
8 | -Nagroda Artystyczna m. Szczecina WYRÓŻNIENIE | |
9 | -Zachodniopomorski Nobel WYRÓŻNIENIE | |
10 | -Politechnika Krakowska ORGANIZACJA | |
11 | -Politechnika Szczecińska ORGANIZACJA | |
12 | -Pracownia Podstaw Projektowania ORGANIZACJA | |
13 | -Przegląd Teatrów Małych Form „Kontrapunkt” ORGANIZACJA | |
14 | -Mistrzowska Szkoła Reżyserii Filmowej Andrzeja Wajdy ORGANIZACJA | |
15 | -Uniwersytet im. M. Kopernika ORGANIZACJA | |
16 | -Zachodniopomorski Uniwersytet Technologiczny ORGANIZACJA | |
17 | -Wydział Budownictwa i Architektury ORGANIZACJA | |
18 | -Wydział Stuk Wizualnych ORGANIZACJA | |
19 | -Zakład Teorii Architektury, Historii i Konserwacji Zabytków ORGANIZACJA | |
20 | -Festiwal Polskich Sztuk Współczesnych R@Port WYDARZENIE | |
21 | -Sosnowiec MIASTO | |
22 | -Stefan IMIĘ | |
23 | -Józefa IMIĘ | |
24 | -Szczecin MIASTO | |
25 | -Waldemar IMIĘ | |
26 | -Marzęcki NAZWISKO | |
27 | -Austria KRAJ | |
28 | -Czechy KRAJ | |
29 | -Niemcy KRAJ | |
30 | -Francja KRAJ | |
31 | -Litwa KRAJ | |
32 | -USA KRAJ | |
33 | -Rosja KRAJ | |
34 | - |
tokenizer/ENIAMacronyms.ml
... | ... | @@ -19,9 +19,10 @@ |
19 | 19 | |
20 | 20 | open ENIAMtokenizerTypes |
21 | 21 | |
22 | +let load_mte mte_filename _ = File.load_lines mte_filename | |
23 | + | |
22 | 24 | let mte_patterns = |
23 | - let lines = try File.load_lines mte_filename | |
24 | - with _ -> (prerr_endline ("ENIAMtokenizer mte file " ^ mte_filename ^ " not found"); []) in | |
25 | + let lines = File.catch_no_file (load_mte mte_filename) [] in | |
25 | 26 | let l = List.rev (Xlist.rev_map lines (fun line -> |
26 | 27 | match Str.split (Str.regexp "\t") line with |
27 | 28 | [orths; lemma; interp] -> Str.split (Str.regexp " ") orths, lemma, interp |
... | ... |