Commit df1486af60e99aceb686f5ccc8894d6dae28df9f
1 parent
caeb305a
poprawki w interfejsie subsyntax
Showing
10 changed files
with
314 additions
and
11 deletions
LCGlexicon/interface.ml
0 → 100644
1 | +(* | ||
2 | + * ENIAM_LCGlexicon is a library that provides LCG lexicon for Polish | ||
3 | + * Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl> | ||
4 | + * Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences | ||
5 | + * | ||
6 | + * This library is free software: you can redistribute it and/or modify | ||
7 | + * it under the terms of the GNU Lesser General Public License as published by | ||
8 | + * the Free Software Foundation, either version 3 of the License, or | ||
9 | + * (at your option) any later version. | ||
10 | + * | ||
11 | + * This library is distributed in the hope that it will be useful, | ||
12 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
14 | + * GNU Lesser General Public License for more details. | ||
15 | + * | ||
16 | + * You should have received a copy of the GNU Lesser General Public License | ||
17 | + * along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
18 | + *) | ||
19 | + | ||
20 | +open ENIAM_LCGlexiconTypes | ||
21 | +open ENIAM_LCGtypes | ||
22 | +open ENIAMsubsyntaxTypes | ||
23 | + | ||
24 | +let rules = ENIAM_LCGlexicon.make_rules ENIAM_LCGlexiconTypes.rules_filename (* lexicon rules, built once when this module is initialised from rules_filename; consumed by create_chart *) | ||
25 | + | ||
26 | +let examples = [ (* (name, text) pairs processed by the top-level driver; name becomes the prefix of the files written under results/; entries wrapped in comments are disabled examples *) | ||
27 | + (* "Szpak","Szpak śpiewa.";*) | ||
28 | + (* "miał","Miałem miał."; *) | ||
29 | +(* "Ala","Ala ma kota."; | ||
30 | + "Ale","Ale mają kota:"; *) | ||
31 | + (* "zima","Szpak frunie zimą.";*) | ||
32 | + (* "październik","Kot miauczy w październiku."; *) | ||
33 | +(* "Szpak-Kot","Szpak frunie. Kot miauczy."; | ||
34 | + "powiedział","Szpak powiedział: „Frunę. Kiszę.”";*) | ||
35 | + (* "teraz","Teraz frunie jakiś szpak."; | ||
36 | + "chłopcy","Chłopcy mają ulicę kwiatami."; *) | ||
37 | + (* "arabia","Arabia Saudyjska biegnie.";*) | ||
38 | +(* "Tom","Tom idzie."; *) | ||
39 | + "liceum","W 1984-89 uczęszczał do VII Liceum Ogólnokształcącego im. K.K. Baczyńskiego w Szczecinie."; | ||
40 | + "studia","Następnie studiował architekturę na Politechnice Szczecińskiej, dyplom uzyskał w 1994."; | ||
41 | +] | ||
42 | + | ||
43 | +let clarify_categories senses token = (* Collects the results of ENIAMcategoriesPL.clarify_categories for the given token: one call per interpretation for Lemma and Proper tokens, a single call for Interp tokens *) | ||
44 | + match token.ENIAMtokenizerTypes.token with | ||
45 | + ENIAMtokenizerTypes.Lemma(lemma,pos,interp) -> List.flatten (Xlist.map interp (fun interp -> ENIAMcategoriesPL.clarify_categories false senses (lemma,pos,interp))) (* boolean flag is false for ordinary lemmas *) | ||
46 | + | ENIAMtokenizerTypes.Proper(lemma,pos,interp,_) -> List.flatten (Xlist.map interp (fun interp -> ENIAMcategoriesPL.clarify_categories true senses (lemma,pos,interp))) (* boolean flag is true for proper names; the fourth component of Proper is ignored *) | ||
47 | + | ENIAMtokenizerTypes.Interp lemma -> ENIAMcategoriesPL.clarify_categories false senses (lemma,"interp",[]) (* punctuation gets the fixed pos interp and an empty interpretation *) | ||
48 | + | _ -> [] (* every other token kind yields no categories *) | ||
49 | + | ||
50 | +let create_chart tokens lex_sems paths last = (* Creates a chart with ENIAM_LCGchart.make last and, for every (id,lnode,rnode) in paths, adds the lexicon entries generated for token id between nodes lnode and rnode; returns the filled chart *) | ||
51 | + ENIAM_LCGrenderer.reset_variable_numbers (); (* renderer variable numbers are reset once per chart *) | ||
52 | + let chart = ENIAM_LCGchart.make last in | ||
53 | + let chart = Xlist.fold paths chart (fun chart (id,lnode,rnode) -> | ||
54 | + let t = ExtArray.get tokens id in (* token record for this path element *) | ||
55 | + let s = ExtArray.get lex_sems id in (* lexical-semantics record stored under the same id *) | ||
56 | + ENIAM_LCGrenderer.reset_variable_names (); (* renderer variable names are reset for every token *) | ||
57 | + ENIAM_LCGrenderer.add_variable_numbers (); | ||
58 | + let cats = clarify_categories ["X"] t in (* the senses argument is fixed to the single placeholder X here *) | ||
59 | + let l = ENIAM_LCGlexicon.create_entries rules id t.ENIAMtokenizerTypes.orth cats s.ENIAMlexSemanticsTypes.schemata in | ||
60 | + ENIAM_LCGchart.add_inc_list chart lnode rnode l 0) in (* NOTE(review): meaning of the trailing 0 is not visible here; confirm against ENIAM_LCGchart *) | ||
61 | + chart | ||
62 | + | ||
63 | +let test_example name tokens lex_sems paths last = (* Runs the whole pipeline for one sentence: builds the chart, lazifies and parses it and, when parsing succeeds, reduces the term to a dependency tree; every intermediate result is printed under results/ with name as the file name prefix *) | ||
64 | + ENIAM_LCGreductions.reset_variant_label (); | ||
65 | + let chart = create_chart tokens lex_sems paths last in | ||
66 | + ENIAM_LCGlatexOf.print_chart "results/" (name^"1_chart") "a1" chart; | ||
67 | + let chart,references = ENIAM_LCGchart.lazify chart in (* lazify returns the new chart together with a references structure *) | ||
68 | + ENIAM_LCGlatexOf.print_chart "results/" (name^"2_chart") "a4" chart; | ||
69 | + ENIAM_LCGlatexOf.print_references "results/" (name^"2_references") "a4" references; | ||
70 | + let chart = ENIAM_LCGchart.parse chart references 30. Sys.time in (* note: implicit imperative (in-place) change to references; NOTE(review): 30. and Sys.time are presumably a time limit and its clock, confirm against ENIAM_LCGchart.parse *) | ||
71 | + ENIAM_LCGlatexOf.print_chart "results/" (name^"3_chart") "a4" chart; | ||
72 | + ENIAM_LCGlatexOf.print_references "results/" (name^"3_references") "a4" references; | ||
73 | + if ENIAM_LCGchart.is_parsed chart then ( (* the remaining stages run only for a parsed chart *) | ||
74 | + let term = ENIAM_LCGchart.get_parsed_term chart in | ||
75 | + Xlatex.latex_file_out "results/" (name^"4_term") "a4" false (fun file -> | ||
76 | + Printf.fprintf file "\\[%s\\]\n" (ENIAM_LCGlatexOf.linear_term 0 term)); | ||
77 | + Xlatex.latex_compile_and_clean "results/" (name^"4_term"); | ||
78 | + let dependency_tree = ENIAM_LCGreductions.reduce term references in | ||
79 | + ENIAM_LCGlatexOf.print_dependency_tree "results/" (name^"4_dependency_tree") "a0" dependency_tree; | ||
80 | + if ENIAM_LCGreductions.is_reduced_dependency_tree dependency_tree then ( | ||
81 | + ENIAM_LCGreductions.assign_labels dependency_tree; (* note: implicit imperative (in-place) change to dependency_tree *) | ||
82 | + ENIAM_LCGlatexOf.print_dependency_tree "results/" (name^"5_dependency_tree") "a4" dependency_tree; | ||
83 | + ENIAM_LCGreductions.remove_cuts dependency_tree; (* note: implicit imperative (in-place) change to dependency_tree *) | ||
84 | + ENIAM_LCGlatexOf.print_dependency_tree "results/" (name^"6_dependency_tree") "a4" dependency_tree; | ||
85 | + ENIAM_LCGgraphOf.print_dependency_tree "results/" (name^"6_dependency_tree") dependency_tree; | ||
86 | + ENIAM_LCGgraphOf.print_simplified_dependency_tree "results/" (name^"6_simple_dependency_tree") dependency_tree; | ||
87 | + ()) | ||
88 | + else print_endline "not reduced") (* failures are only reported on stdout, no exception is raised here *) | ||
89 | + else print_endline "not parsed" | ||
90 | + | ||
91 | +let rec parse_sentence name id tokens lex_sems = function (* Walks one sentence structure, running test_example on every StructSentence it contains; id is the running sentence counter and the updated counter is returned *) | ||
92 | + RawSentence s -> id (* raw sentences are skipped, counter unchanged *) | ||
93 | + | StructSentence(paths,last) -> | ||
94 | + test_example (name ^ string_of_int id ^ "_") tokens lex_sems paths last; (* output prefix is name followed by the counter and an underscore *) | ||
95 | + id + 1 | ||
96 | + | DepSentence(paths) -> id (* dependency sentences are skipped, counter unchanged *) | ||
97 | + | QuotedSentences sentences -> | ||
98 | + Xlist.fold sentences id (fun id p -> | ||
99 | + parse_sentence name id tokens lex_sems p.sentence) (* the counter is threaded through all nested sentences *) | ||
100 | + | AltSentence l -> | ||
101 | + Xlist.fold l id (fun id (mode,sentence) -> | ||
102 | + parse_sentence name id tokens lex_sems sentence) (* every alternative is processed; mode is ignored *) | ||
103 | + | ||
104 | +let rec parse_paragraph name id tokens lex_sems = function (* Applies parse_sentence to every sentence of a paragraph, threading the sentence counter id; returns the updated counter *) | ||
105 | + RawParagraph s -> id (* raw paragraphs are skipped, counter unchanged *) | ||
106 | + | StructParagraph sentences -> | ||
107 | + Xlist.fold sentences id (fun id p -> | ||
108 | + parse_sentence name id tokens lex_sems p.sentence) | ||
109 | + | AltParagraph l -> | ||
110 | + Xlist.fold l id (fun id (mode,paragraph) -> | ||
111 | + parse_paragraph name id tokens lex_sems paragraph) (* every alternative is processed; mode is ignored *) | ||
112 | + | ||
113 | +let rec parse_text name id tokens lex_sems = function (* Applies parse_paragraph to every paragraph of a text, threading the sentence counter id; returns the updated counter *) | ||
114 | + RawText s -> id (* raw text is skipped, counter unchanged *) | ||
115 | + | StructText paragraphs -> | ||
116 | + Xlist.fold paragraphs id (fun id paragraph -> | ||
117 | + parse_paragraph name id tokens lex_sems paragraph) | ||
118 | + | AltText l -> | ||
119 | + Xlist.fold l id (fun id (mode,text) -> | ||
120 | + parse_text name id tokens lex_sems text) (* every alternative is processed; mode is ignored *) | ||
121 | + | ||
122 | + | ||
123 | +let _ = (* Program entry point: runs the whole pipeline on every pair listed in examples *) | ||
124 | + Xlist.iter examples (fun (name,example) -> | ||
125 | + let text,tokens = ENIAMsubsyntax.parse_text example in | ||
126 | + let lex_sems = ENIAMlexSemantics.assign tokens text in | ||
127 | + ignore(parse_text name 1 tokens lex_sems text)) (* sentence numbering starts at 1; the final counter value is discarded *) | ||
128 | + | ||
129 | +(* | ||
130 | +type output = Text | Xml | Html | Marsh | Graphviz | ||
131 | + | ||
132 | +let output = ref Text | ||
133 | +let comm_stdio = ref true | ||
134 | +let sentence_split = ref true | ||
135 | +let port = ref 0 | ||
136 | + | ||
137 | +let spec_list = [ | ||
138 | + "-s", Arg.Unit (fun () -> sentence_split:=true), "Split input into sentences (default)"; | ||
139 | + "-n", Arg.Unit (fun () -> sentence_split:=false), "Do not split input into sentences"; | ||
140 | + "-i", Arg.Unit (fun () -> comm_stdio:=true), "Communication using stdio (default)"; | ||
141 | + "-p", Arg.Int (fun p -> comm_stdio:=false; port:=p), "<port> Communication using sockets on given port number"; | ||
142 | + "-t", Arg.Unit (fun () -> output:=Text), "Output as plain text (default)"; | ||
143 | + "-x", Arg.Unit (fun () -> output:=Xml), "Output as XML"; | ||
144 | + "-m", Arg.Unit (fun () -> output:=Marsh), "Output as marshalled Ocaml data structure"; | ||
145 | + "-h", Arg.Unit (fun () -> output:=Html), "Output as HTML"; | ||
146 | + "-g", Arg.Unit (fun () -> output:=Graphviz; sentence_split:=false), "Output as graphviz dot file; turns sentence split off"; | ||
147 | + (* "-r", Arg.String (fun p -> | ||
148 | + ENIAMtokenizerTypes.set_resource_path p; | ||
149 | + ENIAMmorphologyTypes.set_resource_path p; | ||
150 | + ENIAMsubsyntaxTypes.set_resource_path p), "<path> Set resource path"; *) | ||
151 | + ] | ||
152 | + | ||
153 | +let usage_msg = | ||
154 | + "Usage: subsyntax <options>\nInput is a sequence of lines. Empty line ends the sequence and invoke parsing. Double empty line shutdown parser.\nOptions are:" | ||
155 | + | ||
156 | +let message = "ENIAMsubsyntax: MWE, abbreviation and sentence detecion for Polish\n\ | ||
157 | +Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>\n\ | ||
158 | +Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences" | ||
159 | + | ||
160 | +let anon_fun s = raise (Arg.Bad ("invalid argument: " ^ s)) | ||
161 | + | ||
162 | +let input_text channel = | ||
163 | + let s = ref (try input_line channel with End_of_file -> "") in | ||
164 | + let lines = ref [] in | ||
165 | + while !s <> "" do | ||
166 | + lines := !s :: !lines; | ||
167 | + s := try input_line channel with End_of_file -> "" | ||
168 | + done; | ||
169 | + String.concat "\n" (List.rev !lines) | ||
170 | + | ||
171 | +let rec main_loop in_chan out_chan = | ||
172 | + let text = input_text in_chan in | ||
173 | + if text = "" then () else ( | ||
174 | + (* print_endline "input text begin"; | ||
175 | + print_endline text; | ||
176 | + print_endline "input text end"; *) | ||
177 | + (if !sentence_split then | ||
178 | + let text,tokens = ENIAMsubsyntax.parse_text text in | ||
179 | + (match !output with | ||
180 | + Text -> output_string out_chan (ENIAMsubsyntaxStringOf.text "" tokens text ^ "\n" ^ ENIAMsubsyntaxStringOf.token_extarray tokens ^ "\n\n") | ||
181 | + | Xml -> output_string out_chan (Xml.to_string (ENIAMsubsyntaxXMLof.text_and_tokens text tokens) ^ "\n\n") | ||
182 | + | Html -> output_string out_chan (ENIAMsubsyntaxHTMLof.text_and_tokens text tokens ^ "\n\n") | ||
183 | + | Marsh -> Marshal.to_channel out_chan (text,tokens) [] | ||
184 | + | Graphviz -> failwith "main_loop: ni") | ||
185 | + else | ||
186 | + let tokens = ENIAMsubsyntax.parse text in | ||
187 | + (match !output with | ||
188 | + Text -> output_string out_chan (ENIAMsubsyntaxStringOf.token_list tokens ^ "\n\n") | ||
189 | + | Xml -> output_string out_chan (Xml.to_string (ENIAMsubsyntaxXMLof.token_list tokens) ^ "\n\n") | ||
190 | + | Html -> output_string out_chan (ENIAMsubsyntaxHTMLof.token_list tokens ^ "\n\n") | ||
191 | + | Marsh -> Marshal.to_channel out_chan tokens [] | ||
192 | + | Graphviz -> output_string out_chan (ENIAMsubsyntaxGraphOf.token_list tokens ^ "\n\n"))); | ||
193 | + flush out_chan; | ||
194 | + main_loop in_chan out_chan) | ||
195 | + | ||
196 | +let _ = | ||
197 | + prerr_endline message; | ||
198 | + Arg.parse spec_list anon_fun usage_msg; | ||
199 | + Gc.compact (); | ||
200 | + prerr_endline "Ready!"; | ||
201 | + if !comm_stdio then main_loop stdin stdout | ||
202 | + else | ||
203 | + let sockaddr = Unix.ADDR_INET(Unix.inet_addr_any,!port) in | ||
204 | + Unix.establish_server main_loop sockaddr | ||
205 | +*) |
LCGlexicon/makefile
@@ -42,6 +42,9 @@ test2: test2.ml | @@ -42,6 +42,9 @@ test2: test2.ml | ||
42 | mkdir -p results | 42 | mkdir -p results |
43 | $(OCAMLOPT) -o test2 $(OCAMLOPTFLAGS2) test2.ml | 43 | $(OCAMLOPT) -o test2 $(OCAMLOPTFLAGS2) test2.ml |
44 | 44 | ||
45 | +interface: interface.ml | ||
46 | + $(OCAMLOPT) -o parser $(OCAMLOPTFLAGS2) interface.ml | ||
47 | + | ||
45 | print_lexicon: ENIAM_LCGlexiconLatexOf.ml | 48 | print_lexicon: ENIAM_LCGlexiconLatexOf.ml |
46 | mkdir -p results | 49 | mkdir -p results |
47 | $(OCAMLOPT) -o print_lexicon $(OCAMLOPTFLAGS) ENIAM_LCGlexiconLatexOf.ml | 50 | $(OCAMLOPT) -o print_lexicon $(OCAMLOPTFLAGS) ENIAM_LCGlexiconLatexOf.ml |
@@ -67,4 +70,4 @@ print_lexicon: ENIAM_LCGlexiconLatexOf.ml | @@ -67,4 +70,4 @@ print_lexicon: ENIAM_LCGlexiconLatexOf.ml | ||
67 | $(OCAMLOPT) $(OCAMLOPTFLAGS) -c $< | 70 | $(OCAMLOPT) $(OCAMLOPTFLAGS) -c $< |
68 | 71 | ||
69 | clean: | 72 | clean: |
70 | - rm -f *~ *.cm[aoix] *.o *.so *.cmxa *.a test test2 print_lexicon | 73 | + rm -f *~ *.cm[aoix] *.o *.so *.cmxa *.a test test2 parser print_lexicon |
NKJP2/ENIAM_NKJP.ml
@@ -158,14 +158,72 @@ let load_morphosyntax path name = | @@ -158,14 +158,72 @@ let load_morphosyntax path name = | ||
158 | List.rev (Xlist.rev_map entries load_morph_entry) | 158 | List.rev (Xlist.rev_map entries load_morph_entry) |
159 | | _ -> failwith "load_morphosyntax" | 159 | | _ -> failwith "load_morphosyntax" |
160 | 160 | ||
161 | -let rec merge_entries rev = function | 161 | +let parse_seg_corresp corresp = |
162 | + if not (Xstring.check_prefix "text.xml#string-range(" corresp) then failwith "parse_seg_corresp" else | ||
163 | + if not (Xstring.check_sufix ")" corresp) then failwith "parse_seg_corresp" else | ||
164 | + let corresp = Xstring.cut_prefix "text.xml#string-range(" corresp in | ||
165 | + let corresp = Xstring.cut_sufix ")" corresp in | ||
166 | + let id,beg,len = match Xstring.split "," corresp with | ||
167 | + [id;beg;len] -> parse_id id, int_of_string beg, int_of_string len | ||
168 | + | _ -> failwith "parse_seg_corresp" in | ||
169 | + let id_div,id_ab = match id with | ||
170 | + {corref=""; prefix="txt"; numbers=[id_div;id_ab]; suffix="ab"} -> id_div,id_ab | ||
171 | + | _ -> failwith "parse_seg_corresp" in | ||
172 | + id_div,id_ab,beg,len | ||
173 | + | ||
174 | +let pos_set = StringSet.of_list (* part-of-speech tags that parse_disamb accepts in the pos position of a disamb string *) | ||
175 | + ["subst";"depr";"ppron12";"ppron3";"siebie";"prep";"adj";"adjc";"adjp";"adja";"num"; | ||
176 | + "adv";"ger";"pact";"ppas";"fin";"bedzie";"praet";"winien";"impt"; | ||
177 | + "imps";"pred";"aglt";"inf";"pcon";"pant";"qub";"comp";"conj";"interj";"burk";"interp"; | ||
178 | + "brev";"xxx";"numcol"] | ||
179 | + | ||
180 | +let parse_disamb disamb = (* Splits a colon-separated disamb string into a triple (lemma, pos, interp list); fails with Failure when the pos field is not in pos_set or when fewer than two fields are present *) | ||
181 | + if disamb = "::interp" then ":","interp",[] else (* the lemma is a colon itself, so splitting on colons would lose it *) | ||
182 | + if disamb = ":-):interp" then ":-)","interp",[] else (* emoticon lemma that contains a colon *) | ||
183 | + (* if Xstring.check_sufix ":interp" disamb then Xstring.cut_sufix ":interp" disamb, "interp", [] else *) | ||
184 | + match Xstring.split_delim ":" disamb with | ||
185 | + lemma1 :: lemma2 :: "subst" :: interp -> lemma1 ^ ":" ^ lemma2,"subst",interp (* lemma containing one colon, recognised only when the pos is subst *) | ||
186 | + | lemma1 :: lemma2 :: lemma3 :: "subst" :: interp -> lemma1 ^ ":" ^ lemma2 ^ ":" ^ lemma3,"subst",interp (* lemma containing two colons, recognised only when the pos is subst *) | ||
187 | + | lemma :: pos :: interp -> | ||
188 | + if StringSet.mem pos_set pos then lemma,pos,interp | ||
189 | + else failwith ("parse_disamb: " ^ disamb) (* unknown pos: the offending string is included in the message *) | ||
190 | + | _ -> failwith "parse_disamb" | ||
191 | + | ||
192 | +let rec merge_tokens name id_p rev = function (* Walks the segmentation and morphosyntax token lists of one sentence in parallel, checks that their paragraph and sentence numbers agree, and returns tuples (id_div,id_ab,beg,nps,len,orth,lemma,cat,interp) in input order; rev is the reversed accumulator; name and id_p are only used by the disabled diagnostics and passed on *) | ||
193 | + (corresp,nps,{corref=""; prefix="segm"; numbers=[id_segm_p;id_segm_s]; suffix="seg"}) :: segmentation, | ||
194 | + ({corref="ann_segmentation.xml"; prefix="segm"; numbers=[c_segm_p;c_segm_s]; suffix="seg"}, | ||
195 | + {corref=""; prefix="morph"; numbers=[id_morph_p;id_morph_s]; suffix="seg"},orth,disamb) :: morphosyntax -> | ||
196 | + (* if id_p <> id_segm_p then Printf.printf "merge_tokens inconsistent numbering: %s segm_%d-p segm_%d.%d-s\n" name id_p id_segm_p id_segm_s; *) | ||
197 | + if id_segm_p <> c_segm_p || id_segm_p <> id_morph_p then failwith "merge_tokens 2" else (* first number of the ids must match across the segmentation id, its back-reference and the morphosyntax id *) | ||
198 | + if id_segm_s <> c_segm_s || c_segm_s <> id_morph_s then failwith "merge_tokens 3" else (* same check for the second number of the ids *) | ||
199 | + let id_div,id_ab,beg,len = parse_seg_corresp corresp in( | ||
200 | + (* if id_div <> id_p then (*failwith*)print_endline (Printf.sprintf "merge_tokens 4: %s %d %s" name id_p corresp); (*else*) *) | ||
201 | + let lemma,cat,interp = parse_disamb disamb in | ||
202 | + merge_tokens name id_p ((id_div,id_ab,beg,nps,len,orth,lemma,cat,interp) :: rev) (segmentation,morphosyntax)) | ||
203 | + | [],[] -> List.rev rev (* both lists exhausted together: restore input order *) | ||
204 | + | _ -> failwith "merge_tokens 1" (* lists of different length or an id that does not have the expected shape *) | ||
205 | + | ||
206 | +let rec merge_sentences name id_p rev = function (* Walks the segmentation and morphosyntax sentence lists of one paragraph in parallel, checks that their numbering agrees, merges the tokens of each sentence with merge_tokens and returns triples (id_segm_p,id_segm_s,tokens) in input order; rev is the reversed accumulator *) | ||
207 | + ({corref=""; prefix="segm"; numbers=[id_segm_p;id_segm_s]; suffix="s"},segm_tokens) :: segmentation, | ||
208 | + ({corref="ann_segmentation.xml"; prefix="segm"; numbers=[c_segm_p;c_segm_s]; suffix="s"}, | ||
209 | + {corref=""; prefix="morph"; numbers=[id_morph_p;id_morph_s]; suffix="s"},morph_tokens) :: morphosyntax -> | ||
210 | + (* if id_p <> id_segm_p then Printf.printf "merge_sentences inconsistent numbering: %s segm_%d-p segm_%d.%d-s\n" name id_p id_segm_p id_segm_s; *) | ||
211 | + if id_segm_p <> c_segm_p || id_segm_p <> id_morph_p then failwith "merge_sentences 2" else (* first number of the ids must match across all three ids *) | ||
212 | + if id_segm_s <> c_segm_s || c_segm_s <> id_morph_s then failwith "merge_sentences 3" else (* same check for the second number of the ids *) | ||
213 | + let tokens = merge_tokens name id_p [] (segm_tokens,morph_tokens) in | ||
214 | + merge_sentences name id_p ((id_segm_p,id_segm_s,tokens) :: rev) (segmentation,morphosyntax) | ||
215 | + | [],[] -> List.rev rev (* both lists exhausted together: restore input order *) | ||
216 | + | _ -> failwith "merge_sentences" (* lists of different length or an id that does not have the expected shape *) | ||
217 | + | ||
218 | +let rec merge_entries name rev = function | ||
162 | ({corref=""; prefix="txt"; numbers=[id_div]; suffix="div"},paragraphs) :: text, | 219 | ({corref=""; prefix="txt"; numbers=[id_div]; suffix="div"},paragraphs) :: text, |
163 | ({corref="text.xml"; prefix="txt"; numbers=[c_div]; suffix="div"}, | 220 | ({corref="text.xml"; prefix="txt"; numbers=[c_div]; suffix="div"}, |
164 | {corref=""; prefix="segm"; numbers=[id_segm_p]; suffix="p"},segm_sentences) :: segmentation, | 221 | {corref=""; prefix="segm"; numbers=[id_segm_p]; suffix="p"},segm_sentences) :: segmentation, |
165 | ({corref="ann_segmentation.xml"; prefix="segm"; numbers=[c_segm_p]; suffix="p"}, | 222 | ({corref="ann_segmentation.xml"; prefix="segm"; numbers=[c_segm_p]; suffix="p"}, |
166 | {corref=""; prefix="morph"; numbers=[id_morph_p]; suffix="p"},morph_sentences) :: morphosyntax -> | 223 | {corref=""; prefix="morph"; numbers=[id_morph_p]; suffix="p"},morph_sentences) :: morphosyntax -> |
167 | if id_div <> c_div || c_div <> id_segm_p || id_segm_p <> c_segm_p || c_segm_p <> id_morph_p then failwith "merge_entries 2" else | 224 | if id_div <> c_div || c_div <> id_segm_p || id_segm_p <> c_segm_p || c_segm_p <> id_morph_p then failwith "merge_entries 2" else |
168 | - merge_entries ((id_div,paragraphs,segm_sentences,morph_sentences) :: rev) (text,segmentation,morphosyntax) | 225 | + let sentences = merge_sentences name id_div [] (segm_sentences,morph_sentences) in |
226 | + merge_entries name ((id_div,paragraphs,sentences) :: rev) (text,segmentation,morphosyntax) | ||
169 | | [],[],[] -> List.rev rev | 227 | | [],[],[] -> List.rev rev |
170 | | _ -> failwith "merge_entries" | 228 | | _ -> failwith "merge_entries" |
171 | 229 | ||
@@ -174,7 +232,7 @@ let nkjp_path = "../../NLP resources/NKJP-PodkorpusMilionowy-1.2/" | @@ -174,7 +232,7 @@ let nkjp_path = "../../NLP resources/NKJP-PodkorpusMilionowy-1.2/" | ||
174 | let _ = | 232 | let _ = |
175 | let names = get_folders nkjp_path in | 233 | let names = get_folders nkjp_path in |
176 | Xlist.iter names (fun name -> | 234 | Xlist.iter names (fun name -> |
177 | - print_endline name; | 235 | + (* print_endline name; *) |
178 | let typ,channel = load_header nkjp_path name in | 236 | let typ,channel = load_header nkjp_path name in |
179 | (* print_endline typ; *) | 237 | (* print_endline typ; *) |
180 | (* print_endline channel; *) | 238 | (* print_endline channel; *) |
@@ -182,7 +240,7 @@ let _ = | @@ -182,7 +240,7 @@ let _ = | ||
182 | let text = load_text nkjp_path name in | 240 | let text = load_text nkjp_path name in |
183 | let segmentation = load_segmentation nkjp_path name in | 241 | let segmentation = load_segmentation nkjp_path name in |
184 | let morphosyntax = load_morphosyntax nkjp_path name in | 242 | let morphosyntax = load_morphosyntax nkjp_path name in |
185 | - let entries = merge_entries [] (text,segmentation,morphosyntax) in | 243 | + let entries = merge_entries name [] (text,segmentation,morphosyntax) in |
186 | ()) | 244 | ()) |
187 | 245 | ||
188 | (* | 246 | (* |
subsyntax/ENIAM_MWE.ml
@@ -30,7 +30,7 @@ let load_dict dict filename = | @@ -30,7 +30,7 @@ let load_dict dict filename = | ||
30 | 30 | ||
31 | let mwe_dict = | 31 | let mwe_dict = |
32 | let dict = load_dict StringMap.empty brev_filename in | 32 | let dict = load_dict StringMap.empty brev_filename in |
33 | - let dict = load_dict dict fixed_filename in | 33 | + let dict = try load_dict dict fixed_filename with _ -> (prerr_endline ("ENIAMsubsyntax file " ^ fixed_filename ^ " not found"); dict) in |
34 | (* let dict = load_dict dict complete_entries_filename in*) | 34 | (* let dict = load_dict dict complete_entries_filename in*) |
35 | let dict = load_dict dict mwe_filename in | 35 | let dict = load_dict dict mwe_filename in |
36 | dict | 36 | dict |
subsyntax/ENIAMsubsyntax.ml
@@ -200,7 +200,7 @@ let select_tokens paths = | @@ -200,7 +200,7 @@ let select_tokens paths = | ||
200 | (* | Dig(value,cat) -> t :: paths *) | 200 | (* | Dig(value,cat) -> t :: paths *) |
201 | | Other orth -> t :: paths | 201 | | Other orth -> t :: paths |
202 | | Lemma(lemma,pos,interp) -> if pos = "brev" then paths else t :: paths | 202 | | Lemma(lemma,pos,interp) -> if pos = "brev" then paths else t :: paths |
203 | - | Proper(lemma,pos,interp,cat) -> t :: paths | 203 | + | Proper(lemma,pos,interp,cat) -> if pos = "brev" then paths else t :: paths |
204 | (* | Compound _ -> t :: paths *) | 204 | (* | Compound _ -> t :: paths *) |
205 | | _ -> paths)) | 205 | | _ -> paths)) |
206 | 206 | ||
@@ -213,6 +213,7 @@ let load_proper_name proper = function | @@ -213,6 +213,7 @@ let load_proper_name proper = function | ||
213 | let proper_names = | 213 | let proper_names = |
214 | let proper = File.fold_tab proper_names_filename StringMap.empty load_proper_name in | 214 | let proper = File.fold_tab proper_names_filename StringMap.empty load_proper_name in |
215 | let proper = File.fold_tab proper_names_filename2 proper load_proper_name in | 215 | let proper = File.fold_tab proper_names_filename2 proper load_proper_name in |
216 | + let proper = File.fold_tab proper_names_filename3 proper load_proper_name in | ||
216 | proper | 217 | proper |
217 | 218 | ||
218 | let remove l s = | 219 | let remove l s = |
subsyntax/ENIAMsubsyntaxTypes.ml
@@ -55,6 +55,7 @@ let lemma_frequencies_filename = resource_path ^ "/subsyntax/NKJP1M-lemma-freq.t | @@ -55,6 +55,7 @@ let lemma_frequencies_filename = resource_path ^ "/subsyntax/NKJP1M-lemma-freq.t | ||
55 | let proper_names_filename2 = resource_path ^ "/subsyntax/proper_names.tab" *) | 55 | let proper_names_filename2 = resource_path ^ "/subsyntax/proper_names.tab" *) |
56 | let proper_names_filename = resource_path ^ "/subsyntax/proper_names_sgjp_polimorf_20151020.tab" | 56 | let proper_names_filename = resource_path ^ "/subsyntax/proper_names_sgjp_polimorf_20151020.tab" |
57 | let proper_names_filename2 = resource_path ^ "/subsyntax/proper_names_20160104.tab" | 57 | let proper_names_filename2 = resource_path ^ "/subsyntax/proper_names_20160104.tab" |
58 | +let proper_names_filename3 = resource_path ^ "/subsyntax/ne.tab" | ||
58 | 59 | ||
59 | let int_of_mode = function | 60 | let int_of_mode = function |
60 | Raw -> 0 | 61 | Raw -> 0 |
subsyntax/resources/ne.tab
0 → 100644
1 | +Akademia Sztuki ORGANIZACJA | ||
2 | +Atelier Bizio + Ligierko ORGANIZACJA | ||
3 | +Instytut Architektury i Planowania Przestrzennego ORGANIZACJA | ||
4 | +Katedra Architektury Współczesnej Teorii i Metodologii Projektowania ORGANIZACJA | ||
5 | +VII Liceum Ogólnokształcące im. K.K. Baczyńskiego ORGANIZACJA | ||
6 | +IV Liceum Ogólnokształcące im. L. Szenwalda ORGANIZACJA | ||
7 | +Muzeum Narodowe ORGANIZACJA | ||
8 | +Nagroda Artystyczna m. Szczecina WYRÓŻNIENIE | ||
9 | +Zachodniopomorski Nobel WYRÓŻNIENIE | ||
10 | +Politechnika Krakowska ORGANIZACJA | ||
11 | +Politechnika Szczecińska ORGANIZACJA | ||
12 | +Pracownia Podstaw Projektowania ORGANIZACJA | ||
13 | +Przegląd Teatrów Małych Form „Kontrapunkt” ORGANIZACJA | ||
14 | +Mistrzowska Szkoła Reżyserii Filmowej Andrzeja Wajdy ORGANIZACJA | ||
15 | +Uniwersytet im. M. Kopernika ORGANIZACJA | ||
16 | +Zachodniopomorski Uniwersytet Technologiczny ORGANIZACJA | ||
17 | +Wydział Budownictwa i Architektury ORGANIZACJA | ||
18 | +Wydział Stuk Wizualnych ORGANIZACJA | ||
19 | +Zakład Teorii Architektury, Historii i Konserwacji Zabytków ORGANIZACJA | ||
20 | +Festiwal Polskich Sztuk Współczesnych R@Port WYDARZENIE | ||
21 | +Sosnowiec MIASTO | ||
22 | +Stefan IMIĘ | ||
23 | +Józefa IMIĘ | ||
24 | +Szczecin MIASTO | ||
25 | +Waldemar IMIĘ | ||
26 | +Marzęcki NAZWISKO | ||
27 | +Austria KRAJ | ||
28 | +Czechy KRAJ | ||
29 | +Niemcy KRAJ | ||
30 | +Francja KRAJ | ||
31 | +Litwa KRAJ | ||
32 | +USA KRAJ | ||
33 | +Rosja KRAJ | ||
34 | + |
tokenizer/ENIAMacronyms.ml
@@ -21,7 +21,7 @@ open ENIAMtokenizerTypes | @@ -21,7 +21,7 @@ open ENIAMtokenizerTypes | ||
21 | 21 | ||
22 | let mte_patterns = | 22 | let mte_patterns = |
23 | let lines = try File.load_lines mte_filename | 23 | let lines = try File.load_lines mte_filename |
24 | - with _ -> (print_endline ("ENIAMtokenizer mte file " ^ mte_filename ^ " not found"); []) in | 24 | + with _ -> (prerr_endline ("ENIAMtokenizer mte file " ^ mte_filename ^ " not found"); []) in |
25 | let l = List.rev (Xlist.rev_map lines (fun line -> | 25 | let l = List.rev (Xlist.rev_map lines (fun line -> |
26 | match Str.split (Str.regexp "\t") line with | 26 | match Str.split (Str.regexp "\t") line with |
27 | [orths; lemma; interp] -> Str.split (Str.regexp " ") orths, lemma, interp | 27 | [orths; lemma; interp] -> Str.split (Str.regexp " ") orths, lemma, interp |
tokenizer/ENIAMtokenizerTypes.ml
@@ -72,4 +72,5 @@ let resource_path = | @@ -72,4 +72,5 @@ let resource_path = | ||
72 | if Sys.file_exists "/usr/local/share/eniam" then "/usr/local/share/eniam" else | 72 | if Sys.file_exists "/usr/local/share/eniam" then "/usr/local/share/eniam" else |
73 | failwith "resource directory does not exists" | 73 | failwith "resource directory does not exists" |
74 | 74 | ||
75 | -let mte_filename = resource_path ^ "/tokenizer/mte.tab" | 75 | +(* let mte_filename = resource_path ^ "/tokenizer/mte.tab" *) |
76 | +let mte_filename = resource_path ^ "/tokenizer/mte_20151215.tab" |
tokenizer/makefile
@@ -18,7 +18,7 @@ install: all | @@ -18,7 +18,7 @@ install: all | ||
18 | mkdir -p /usr/share/eniam/tokenizer | 18 | mkdir -p /usr/share/eniam/tokenizer |
19 | cp resources/mte_20151215.tab /usr/share/eniam/tokenizer/mte_20151215.tab | 19 | cp resources/mte_20151215.tab /usr/share/eniam/tokenizer/mte_20151215.tab |
20 | cp resources/README /usr/share/eniam/tokenizer/README | 20 | cp resources/README /usr/share/eniam/tokenizer/README |
21 | - ln -s /usr/share/eniam/tokenizer/mte_20151215.tab /usr/share/eniam/tokenizer/mte.tab | 21 | +# ln -s /usr/share/eniam/tokenizer/mte_20151215.tab /usr/share/eniam/tokenizer/mte.tab |
22 | 22 | ||
23 | install-local: all | 23 | install-local: all |
24 | mkdir -p $(INSTALLDIR) | 24 | mkdir -p $(INSTALLDIR) |
@@ -28,7 +28,7 @@ install-local: all | @@ -28,7 +28,7 @@ install-local: all | ||
28 | mkdir -p /usr/local/share/eniam/tokenizer | 28 | mkdir -p /usr/local/share/eniam/tokenizer |
29 | cp resources/mte_20151215.tab /usr/local/share/eniam/tokenizer/mte_20151215.tab | 29 | cp resources/mte_20151215.tab /usr/local/share/eniam/tokenizer/mte_20151215.tab |
30 | cp resources/README /usr/local/share/eniam/tokenizer/README | 30 | cp resources/README /usr/local/share/eniam/tokenizer/README |
31 | - ln -s /usr/local/share/eniam/tokenizer/mte_20151215.tab /usr/local/share/eniam/tokenizer/mte.tab | 31 | +# ln -s /usr/local/share/eniam/tokenizer/mte_20151215.tab /usr/local/share/eniam/tokenizer/mte.tab |
32 | 32 | ||
33 | eniam-tokenizer.cma: $(SOURCES) | 33 | eniam-tokenizer.cma: $(SOURCES) |
34 | ocamlc -linkall -a -o eniam-tokenizer.cma $(OCAMLFLAGS) $^ | 34 | ocamlc -linkall -a -o eniam-tokenizer.cma $(OCAMLFLAGS) $^ |