Commit df1486af60e99aceb686f5ccc8894d6dae28df9f
1 parent
caeb305a
poprawki w interfejsie subsyntax
Showing
10 changed files
with
314 additions
and
11 deletions
LCGlexicon/interface.ml
0 → 100644
1 | +(* | |
2 | + * ENIAM_LCGlexicon is a library that provides LCG lexicon for Polish | |
3 | + * Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl> | |
4 | + * Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences | |
5 | + * | |
6 | + * This library is free software: you can redistribute it and/or modify | |
7 | + * it under the terms of the GNU Lesser General Public License as published by | |
8 | + * the Free Software Foundation, either version 3 of the License, or | |
9 | + * (at your option) any later version. | |
10 | + * | |
11 | + * This library is distributed in the hope that it will be useful, | |
12 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | + * GNU Lesser General Public License for more details. | |
15 | + * | |
16 | + * You should have received a copy of the GNU Lesser General Public License | |
17 | + * along with this program. If not, see <http://www.gnu.org/licenses/>. | |
18 | + *) | |
19 | + | |
20 | +open ENIAM_LCGlexiconTypes | |
21 | +open ENIAM_LCGtypes | |
22 | +open ENIAMsubsyntaxTypes | |
23 | + | |
24 | +let rules = ENIAM_LCGlexicon.make_rules ENIAM_LCGlexiconTypes.rules_filename (* Lexicon rules, built once when this module is initialised, from the file named by rules_filename. *) | |
25 | + | |
26 | +let examples = [ (* List of (name, text) pairs fed to the driver at the end of this file; name is used as the prefix of the files written under results/. Only the last two entries are active, the others are commented out. *) | |
27 | + (* "Szpak","Szpak śpiewa.";*) | |
28 | + (* "miał","Miałem miał."; *) | |
29 | +(* "Ala","Ala ma kota."; | |
30 | + "Ale","Ale mają kota:"; *) | |
31 | + (* "zima","Szpak frunie zimą.";*) | |
32 | + (* "październik","Kot miauczy w październiku."; *) | |
33 | +(* "Szpak-Kot","Szpak frunie. Kot miauczy."; | |
34 | + "powiedział","Szpak powiedział: „Frunę. Kiszę.”";*) | |
35 | + (* "teraz","Teraz frunie jakiś szpak."; | |
36 | + "chłopcy","Chłopcy mają ulicę kwiatami."; *) | |
37 | + (* "arabia","Arabia Saudyjska biegnie.";*) | |
38 | +(* "Tom","Tom idzie."; *) | |
39 | + "liceum","W 1984-89 uczęszczał do VII Liceum Ogólnokształcącego im. K.K. Baczyńskiego w Szczecinie."; | |
40 | + "studia","Następnie studiował architekturę na Politechnice Szczecińskiej, dyplom uzyskał w 1994."; | |
41 | +] | |
42 | + | |
43 | +let clarify_categories senses token = (* Returns the list of categories that ENIAMcategoriesPL.clarify_categories yields for [token] given [senses]; the result is empty for token kinds not listed below. *) | |
44 | + match token.ENIAMtokenizerTypes.token with | |
45 | + ENIAMtokenizerTypes.Lemma(lemma,pos,interp) -> List.flatten (Xlist.map interp (fun interp -> ENIAMcategoriesPL.clarify_categories false senses (lemma,pos,interp))) (* one call per interpretation, results concatenated *) | |
46 | + | ENIAMtokenizerTypes.Proper(lemma,pos,interp,_) -> List.flatten (Xlist.map interp (fun interp -> ENIAMcategoriesPL.clarify_categories true senses (lemma,pos,interp))) (* same as Lemma but with the boolean flag set to true (presumably a proper-name flag - TODO confirm in ENIAMcategoriesPL); the fourth component of Proper is ignored *) | |
47 | + | ENIAMtokenizerTypes.Interp lemma -> ENIAMcategoriesPL.clarify_categories false senses (lemma,"interp",[]) (* Interp tokens: fixed pos "interp" and an empty interpretation *) | |
48 | + | _ -> [] (* every other token kind contributes no categories *) | |
49 | + | |
50 | +let create_chart tokens lex_sems paths last = (* Builds the LCG chart for one sentence: for every path edge (id,lnode,rnode) the lexicon entries of token id are added to the chart between lnode and rnode. [last] is passed to ENIAM_LCGchart.make. *) | |
51 | + ENIAM_LCGrenderer.reset_variable_numbers (); (* restart the renderer variable numbering for this chart *) | |
52 | + let chart = ENIAM_LCGchart.make last in | |
53 | + let chart = Xlist.fold paths chart (fun chart (id,lnode,rnode) -> | |
54 | + let t = ExtArray.get tokens id in (* tokens and lex_sems are both indexed by the same id *) | |
55 | + let s = ExtArray.get lex_sems id in | |
56 | + ENIAM_LCGrenderer.reset_variable_names (); (* renderer state is reset/advanced once per edge, before entries are created *) | |
57 | + ENIAM_LCGrenderer.add_variable_numbers (); | |
58 | + let cats = clarify_categories ["X"] t in (* the sense list is fixed to ["X"] here *) | |
59 | + let l = ENIAM_LCGlexicon.create_entries rules id t.ENIAMtokenizerTypes.orth cats s.ENIAMlexSemanticsTypes.schemata in | |
60 | + ENIAM_LCGchart.add_inc_list chart lnode rnode l 0) in (* the trailing 0 is passed to add_inc_list as is; its meaning is not visible in this file *) | |
61 | + chart | |
62 | + | |
63 | +let test_example name tokens lex_sems paths last = (* Runs the whole pipeline on one sentence (chart creation, lazify, parse, reduction, labelling, cut removal) and writes each intermediate stage under results/ with [name] as file prefix. Prints "not parsed" or "not reduced" on stdout when the corresponding stage fails. Returns unit. *) | |
64 | + ENIAM_LCGreductions.reset_variant_label (); | |
65 | + let chart = create_chart tokens lex_sems paths last in | |
66 | + ENIAM_LCGlatexOf.print_chart "results/" (name^"1_chart") "a1" chart; (* stage 1: chart as created from the lexicon *) | |
67 | + let chart,references = ENIAM_LCGchart.lazify chart in | |
68 | + ENIAM_LCGlatexOf.print_chart "results/" (name^"2_chart") "a4" chart; (* stage 2: chart and references after lazify *) | |
69 | + ENIAM_LCGlatexOf.print_references "results/" (name^"2_references") "a4" references; | |
70 | + let chart = ENIAM_LCGchart.parse chart references 30. Sys.time in (* note: implicitly mutates references; 30. and Sys.time look like a time limit and its clock - TODO confirm *) | |
71 | + ENIAM_LCGlatexOf.print_chart "results/" (name^"3_chart") "a4" chart; (* stage 3: chart and references after parsing *) | |
72 | + ENIAM_LCGlatexOf.print_references "results/" (name^"3_references") "a4" references; | |
73 | + if ENIAM_LCGchart.is_parsed chart then ( | |
74 | + let term = ENIAM_LCGchart.get_parsed_term chart in | |
75 | + Xlatex.latex_file_out "results/" (name^"4_term") "a4" false (fun file -> | |
76 | + Printf.fprintf file "\\[%s\\]\n" (ENIAM_LCGlatexOf.linear_term 0 term)); (* stage 4: the parsed term, written as a LaTeX display formula *) | |
77 | + Xlatex.latex_compile_and_clean "results/" (name^"4_term"); | |
78 | + let dependency_tree = ENIAM_LCGreductions.reduce term references in | |
79 | + ENIAM_LCGlatexOf.print_dependency_tree "results/" (name^"4_dependency_tree") "a0" dependency_tree; | |
80 | + if ENIAM_LCGreductions.is_reduced_dependency_tree dependency_tree then ( | |
81 | + ENIAM_LCGreductions.assign_labels dependency_tree; (* note: implicitly mutates dependency_tree *) | |
82 | + ENIAM_LCGlatexOf.print_dependency_tree "results/" (name^"5_dependency_tree") "a4" dependency_tree; (* stage 5: tree after label assignment *) | |
83 | + ENIAM_LCGreductions.remove_cuts dependency_tree; (* note: implicitly mutates dependency_tree *) | |
84 | + ENIAM_LCGlatexOf.print_dependency_tree "results/" (name^"6_dependency_tree") "a4" dependency_tree; (* stage 6: tree after cut removal, in LaTeX and graph form *) | |
85 | + ENIAM_LCGgraphOf.print_dependency_tree "results/" (name^"6_dependency_tree") dependency_tree; | |
86 | + ENIAM_LCGgraphOf.print_simplified_dependency_tree "results/" (name^"6_simple_dependency_tree") dependency_tree; | |
87 | + ()) | |
88 | + else print_endline "not reduced") | |
89 | + else print_endline "not parsed" | |
90 | + | |
91 | +let rec parse_sentence name id tokens lex_sems = function (* Runs test_example on every StructSentence reachable from the given sentence value. [id] numbers the output files; the function returns the next unused id. *) | |
92 | + RawSentence s -> id (* nothing to parse, id unchanged *) | |
93 | + | StructSentence(paths,last) -> | |
94 | + test_example (name ^ string_of_int id ^ "_") tokens lex_sems paths last; (* file prefix is name followed by the sentence number and an underscore *) | |
95 | + id + 1 | |
96 | + | DepSentence(paths) -> id (* DepSentence is not processed here, id unchanged *) | |
97 | + | QuotedSentences sentences -> | |
98 | + Xlist.fold sentences id (fun id p -> (* recurse into each quoted sentence, threading id through *) | |
99 | + parse_sentence name id tokens lex_sems p.sentence) | |
100 | + | AltSentence l -> | |
101 | + Xlist.fold l id (fun id (mode,sentence) -> (* every alternative is visited; mode is ignored *) | |
102 | + parse_sentence name id tokens lex_sems sentence) | |
103 | + | |
104 | +let rec parse_paragraph name id tokens lex_sems = function (* Applies parse_sentence to every sentence of a paragraph, threading the sentence counter [id]; returns the next unused id. *) | |
105 | + RawParagraph s -> id (* nothing to parse, id unchanged *) | |
106 | + | StructParagraph sentences -> | |
107 | + Xlist.fold sentences id (fun id p -> | |
108 | + parse_sentence name id tokens lex_sems p.sentence) | |
109 | + | AltParagraph l -> | |
110 | + Xlist.fold l id (fun id (mode,paragraph) -> (* every alternative is visited; mode is ignored *) | |
111 | + parse_paragraph name id tokens lex_sems paragraph) | |
112 | + | |
113 | +let rec parse_text name id tokens lex_sems = function (* Applies parse_paragraph to every paragraph of a text, threading the sentence counter [id]; returns the next unused id. *) | |
114 | + RawText s -> id (* nothing to parse, id unchanged *) | |
115 | + | StructText paragraphs -> | |
116 | + Xlist.fold paragraphs id (fun id paragraph -> | |
117 | + parse_paragraph name id tokens lex_sems paragraph) | |
118 | + | AltText l -> | |
119 | + Xlist.fold l id (fun id (mode,text) -> (* every alternative is visited; mode is ignored *) | |
120 | + parse_text name id tokens lex_sems text) | |
121 | + | |
122 | + | |
123 | +let _ = (* Driver, executed when the module is initialised: parses every entry of examples and writes the results under results/. *) | |
124 | + Xlist.iter examples (fun (name,example) -> | |
125 | + let text,tokens = ENIAMsubsyntax.parse_text example in | |
126 | + let lex_sems = ENIAMlexSemantics.assign tokens text in | |
127 | + ignore(parse_text name 1 tokens lex_sems text)) (* sentence numbering restarts at 1 for each example; the returned next id is discarded *) | |
128 | + | |
129 | +(* | |
130 | +type output = Text | Xml | Html | Marsh | Graphviz | |
131 | + | |
132 | +let output = ref Text | |
133 | +let comm_stdio = ref true | |
134 | +let sentence_split = ref true | |
135 | +let port = ref 0 | |
136 | + | |
137 | +let spec_list = [ | |
138 | + "-s", Arg.Unit (fun () -> sentence_split:=true), "Split input into sentences (default)"; | |
139 | + "-n", Arg.Unit (fun () -> sentence_split:=false), "Do not split input into sentences"; | |
140 | + "-i", Arg.Unit (fun () -> comm_stdio:=true), "Communication using stdio (default)"; | |
141 | + "-p", Arg.Int (fun p -> comm_stdio:=false; port:=p), "<port> Communication using sockets on given port number"; | |
142 | + "-t", Arg.Unit (fun () -> output:=Text), "Output as plain text (default)"; | |
143 | + "-x", Arg.Unit (fun () -> output:=Xml), "Output as XML"; | |
144 | + "-m", Arg.Unit (fun () -> output:=Marsh), "Output as marshalled Ocaml data structure"; | |
145 | + "-h", Arg.Unit (fun () -> output:=Html), "Output as HTML"; | |
146 | + "-g", Arg.Unit (fun () -> output:=Graphviz; sentence_split:=false), "Output as graphviz dot file; turns sentence split off"; | |
147 | + (* "-r", Arg.String (fun p -> | |
148 | + ENIAMtokenizerTypes.set_resource_path p; | |
149 | + ENIAMmorphologyTypes.set_resource_path p; | |
150 | + ENIAMsubsyntaxTypes.set_resource_path p), "<path> Set resource path"; *) | |
151 | + ] | |
152 | + | |
153 | +let usage_msg = | |
154 | + "Usage: subsyntax <options>\nInput is a sequence of lines. Empty line ends the sequence and invoke parsing. Double empty line shutdown parser.\nOptions are:" | |
155 | + | |
156 | +let message = "ENIAMsubsyntax: MWE, abbreviation and sentence detecion for Polish\n\ | |
157 | +Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>\n\ | |
158 | +Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences" | |
159 | + | |
160 | +let anon_fun s = raise (Arg.Bad ("invalid argument: " ^ s)) | |
161 | + | |
162 | +let input_text channel = | |
163 | + let s = ref (try input_line channel with End_of_file -> "") in | |
164 | + let lines = ref [] in | |
165 | + while !s <> "" do | |
166 | + lines := !s :: !lines; | |
167 | + s := try input_line channel with End_of_file -> "" | |
168 | + done; | |
169 | + String.concat "\n" (List.rev !lines) | |
170 | + | |
171 | +let rec main_loop in_chan out_chan = | |
172 | + let text = input_text in_chan in | |
173 | + if text = "" then () else ( | |
174 | + (* print_endline "input text begin"; | |
175 | + print_endline text; | |
176 | + print_endline "input text end"; *) | |
177 | + (if !sentence_split then | |
178 | + let text,tokens = ENIAMsubsyntax.parse_text text in | |
179 | + (match !output with | |
180 | + Text -> output_string out_chan (ENIAMsubsyntaxStringOf.text "" tokens text ^ "\n" ^ ENIAMsubsyntaxStringOf.token_extarray tokens ^ "\n\n") | |
181 | + | Xml -> output_string out_chan (Xml.to_string (ENIAMsubsyntaxXMLof.text_and_tokens text tokens) ^ "\n\n") | |
182 | + | Html -> output_string out_chan (ENIAMsubsyntaxHTMLof.text_and_tokens text tokens ^ "\n\n") | |
183 | + | Marsh -> Marshal.to_channel out_chan (text,tokens) [] | |
184 | + | Graphviz -> failwith "main_loop: ni") | |
185 | + else | |
186 | + let tokens = ENIAMsubsyntax.parse text in | |
187 | + (match !output with | |
188 | + Text -> output_string out_chan (ENIAMsubsyntaxStringOf.token_list tokens ^ "\n\n") | |
189 | + | Xml -> output_string out_chan (Xml.to_string (ENIAMsubsyntaxXMLof.token_list tokens) ^ "\n\n") | |
190 | + | Html -> output_string out_chan (ENIAMsubsyntaxHTMLof.token_list tokens ^ "\n\n") | |
191 | + | Marsh -> Marshal.to_channel out_chan tokens [] | |
192 | + | Graphviz -> output_string out_chan (ENIAMsubsyntaxGraphOf.token_list tokens ^ "\n\n"))); | |
193 | + flush out_chan; | |
194 | + main_loop in_chan out_chan) | |
195 | + | |
196 | +let _ = | |
197 | + prerr_endline message; | |
198 | + Arg.parse spec_list anon_fun usage_msg; | |
199 | + Gc.compact (); | |
200 | + prerr_endline "Ready!"; | |
201 | + if !comm_stdio then main_loop stdin stdout | |
202 | + else | |
203 | + let sockaddr = Unix.ADDR_INET(Unix.inet_addr_any,!port) in | |
204 | + Unix.establish_server main_loop sockaddr | |
205 | +*) | |
... | ... |
LCGlexicon/makefile
... | ... | @@ -42,6 +42,9 @@ test2: test2.ml |
42 | 42 | mkdir -p results |
43 | 43 | $(OCAMLOPT) -o test2 $(OCAMLOPTFLAGS2) test2.ml |
44 | 44 | |
45 | +interface: interface.ml | |
46 | + $(OCAMLOPT) -o parser $(OCAMLOPTFLAGS2) interface.ml | |
47 | + | |
45 | 48 | print_lexicon: ENIAM_LCGlexiconLatexOf.ml |
46 | 49 | mkdir -p results |
47 | 50 | $(OCAMLOPT) -o print_lexicon $(OCAMLOPTFLAGS) ENIAM_LCGlexiconLatexOf.ml |
... | ... | @@ -67,4 +70,4 @@ print_lexicon: ENIAM_LCGlexiconLatexOf.ml |
67 | 70 | $(OCAMLOPT) $(OCAMLOPTFLAGS) -c $< |
68 | 71 | |
69 | 72 | clean: |
70 | - rm -f *~ *.cm[aoix] *.o *.so *.cmxa *.a test test2 print_lexicon | |
73 | + rm -f *~ *.cm[aoix] *.o *.so *.cmxa *.a test test2 parser print_lexicon | |
... | ... |
NKJP2/ENIAM_NKJP.ml
... | ... | @@ -158,14 +158,72 @@ let load_morphosyntax path name = |
158 | 158 | List.rev (Xlist.rev_map entries load_morph_entry) |
159 | 159 | | _ -> failwith "load_morphosyntax" |
160 | 160 | |
161 | -let rec merge_entries rev = function | |
161 | +let parse_seg_corresp corresp = (* Parses a corresp string of the shape text.xml#string-range(ID,BEG,LEN) and returns (id_div, id_ab, beg, len). Fails with "parse_seg_corresp" on any other shape; int_of_string may additionally raise on non-numeric BEG or LEN. *) | |
162 | + if not (Xstring.check_prefix "text.xml#string-range(" corresp) then failwith "parse_seg_corresp" else | |
163 | + if not (Xstring.check_sufix ")" corresp) then failwith "parse_seg_corresp" else | |
164 | + let corresp = Xstring.cut_prefix "text.xml#string-range(" corresp in (* strip the wrapper, leaving the comma-separated arguments *) | |
165 | + let corresp = Xstring.cut_sufix ")" corresp in | |
166 | + let id,beg,len = match Xstring.split "," corresp with | |
167 | + [id;beg;len] -> parse_id id, int_of_string beg, int_of_string len (* exactly three fields are required *) | |
168 | + | _ -> failwith "parse_seg_corresp" in | |
169 | + let id_div,id_ab = match id with | |
170 | + {corref=""; prefix="txt"; numbers=[id_div;id_ab]; suffix="ab"} -> id_div,id_ab (* only ids with empty corref, prefix "txt", suffix "ab" and exactly two numbers are accepted *) | |
171 | + | _ -> failwith "parse_seg_corresp" in | |
172 | + id_div,id_ab,beg,len | |
173 | + | |
174 | +let pos_set = StringSet.of_list (* Set of part-of-speech tags; parse_disamb accepts a field as the pos only when it is a member of this set. *) | |
175 | + ["subst";"depr";"ppron12";"ppron3";"siebie";"prep";"adj";"adjc";"adjp";"adja";"num"; | |
176 | + "adv";"ger";"pact";"ppas";"fin";"bedzie";"praet";"winien";"impt"; | |
177 | + "imps";"pred";"aglt";"inf";"pcon";"pant";"qub";"comp";"conj";"interj";"burk";"interp"; | |
178 | + "brev";"xxx";"numcol"] | |
179 | + | |
180 | +let parse_disamb disamb = (* Splits a colon-separated disamb string into (lemma, pos, interp), where interp is the list of remaining fields. Fails with a "parse_disamb" message when the string does not fit any of the shapes below. *) | |
181 | + if disamb = "::interp" then ":","interp",[] else (* lemmas ":" and ":-)" contain the separator themselves, so they are recognised before splitting *) | |
182 | + if disamb = ":-):interp" then ":-)","interp",[] else | |
183 | + (* if Xstring.check_sufix ":interp" disamb then Xstring.cut_sufix ":interp" disamb, "interp", [] else *) | |
184 | + match Xstring.split_delim ":" disamb with | |
185 | + lemma1 :: lemma2 :: "subst" :: interp -> lemma1 ^ ":" ^ lemma2,"subst",interp (* lemma containing one colon; recognised only when the pos is "subst" *) | |
186 | + | lemma1 :: lemma2 :: lemma3 :: "subst" :: interp -> lemma1 ^ ":" ^ lemma2 ^ ":" ^ lemma3,"subst",interp (* lemma containing two colons, again only for "subst" *) | |
187 | + | lemma :: pos :: interp -> | |
188 | + if StringSet.mem pos_set pos then lemma,pos,interp (* regular case: the second field must be a known pos tag *) | |
189 | + else failwith ("parse_disamb: " ^ disamb) | |
190 | + | _ -> failwith "parse_disamb" (* fewer than two fields *) | |
191 | + | |
192 | +let rec merge_tokens name id_p rev = function (* Walks the segmentation and morphosyntax token lists of one sentence in lockstep and returns a list of tuples (id_div,id_ab,beg,nps,len,orth,lemma,cat,interp) in input order. [rev] is the accumulator; [name] and [id_p] are only used by the commented-out diagnostics. Fails when ids disagree or the lists differ in length. *) | |
193 | + (corresp,nps,{corref=""; prefix="segm"; numbers=[id_segm_p;id_segm_s]; suffix="seg"}) :: segmentation, | |
194 | + ({corref="ann_segmentation.xml"; prefix="segm"; numbers=[c_segm_p;c_segm_s]; suffix="seg"}, | |
195 | + {corref=""; prefix="morph"; numbers=[id_morph_p;id_morph_s]; suffix="seg"},orth,disamb) :: morphosyntax -> | |
196 | + (* if id_p <> id_segm_p then Printf.printf "merge_tokens inconsistent numbering: %s segm_%d-p segm_%d.%d-s\n" name id_p id_segm_p id_segm_s; *) | |
197 | + if id_segm_p <> c_segm_p || id_segm_p <> id_morph_p then failwith "merge_tokens 2" else (* first id number must agree between the segm id, the morph corresp and the morph id *) | |
198 | + if id_segm_s <> c_segm_s || c_segm_s <> id_morph_s then failwith "merge_tokens 3" else (* same check for the second id number *) | |
199 | + let id_div,id_ab,beg,len = parse_seg_corresp corresp in( | |
200 | + (* if id_div <> id_p then (*failwith*)print_endline (Printf.sprintf "merge_tokens 4: %s %d %s" name id_p corresp); (*else*) *) | |
201 | + let lemma,cat,interp = parse_disamb disamb in | |
202 | + merge_tokens name id_p ((id_div,id_ab,beg,nps,len,orth,lemma,cat,interp) :: rev) (segmentation,morphosyntax)) (* accumulate in reverse, continue with the tails *) | |
203 | + | [],[] -> List.rev rev (* both lists exhausted together: restore input order *) | |
204 | + | _ -> failwith "merge_tokens 1" (* lists of different length, or an id of an unexpected shape *) | |
205 | + | |
206 | +let rec merge_sentences name id_p rev = function (* Walks the segmentation and morphosyntax sentence lists of one paragraph in lockstep and returns a list of (id_segm_p, id_segm_s, tokens) in input order, where tokens comes from merge_tokens. [rev] is the accumulator. Fails when ids disagree or the lists differ in length. *) | |
207 | + ({corref=""; prefix="segm"; numbers=[id_segm_p;id_segm_s]; suffix="s"},segm_tokens) :: segmentation, | |
208 | + ({corref="ann_segmentation.xml"; prefix="segm"; numbers=[c_segm_p;c_segm_s]; suffix="s"}, | |
209 | + {corref=""; prefix="morph"; numbers=[id_morph_p;id_morph_s]; suffix="s"},morph_tokens) :: morphosyntax -> | |
210 | + (* if id_p <> id_segm_p then Printf.printf "merge_sentences inconsistent numbering: %s segm_%d-p segm_%d.%d-s\n" name id_p id_segm_p id_segm_s; *) | |
211 | + if id_segm_p <> c_segm_p || id_segm_p <> id_morph_p then failwith "merge_sentences 2" else (* first id number must agree between the segm id, the morph corresp and the morph id *) | |
212 | + if id_segm_s <> c_segm_s || c_segm_s <> id_morph_s then failwith "merge_sentences 3" else (* same check for the second id number *) | |
213 | + let tokens = merge_tokens name id_p [] (segm_tokens,morph_tokens) in | |
214 | + merge_sentences name id_p ((id_segm_p,id_segm_s,tokens) :: rev) (segmentation,morphosyntax) (* accumulate in reverse, continue with the tails *) | |
215 | + | [],[] -> List.rev rev (* both lists exhausted together: restore input order *) | |
216 | + | _ -> failwith "merge_sentences" (* lists of different length, or an id of an unexpected shape *) | |
217 | + | |
218 | +let rec merge_entries name rev = function (* Walks the text, segmentation and morphosyntax entry lists in lockstep and returns a list of (id_div, paragraphs, sentences) in input order, where sentences comes from merge_sentences. [rev] is the accumulator. Fails when ids disagree or the lists differ in length. *) | |
162 | 219 | ({corref=""; prefix="txt"; numbers=[id_div]; suffix="div"},paragraphs) :: text, |
163 | 220 | ({corref="text.xml"; prefix="txt"; numbers=[c_div]; suffix="div"}, |
164 | 221 | {corref=""; prefix="segm"; numbers=[id_segm_p]; suffix="p"},segm_sentences) :: segmentation, |
165 | 222 | ({corref="ann_segmentation.xml"; prefix="segm"; numbers=[c_segm_p]; suffix="p"}, |
166 | 223 | {corref=""; prefix="morph"; numbers=[id_morph_p]; suffix="p"},morph_sentences) :: morphosyntax -> |
167 | 224 | if id_div <> c_div || c_div <> id_segm_p || id_segm_p <> c_segm_p || c_segm_p <> id_morph_p then failwith "merge_entries 2" else (* all five numbers must be equal *) |
168 | - merge_entries ((id_div,paragraphs,segm_sentences,morph_sentences) :: rev) (text,segmentation,morphosyntax) | |
225 | + let sentences = merge_sentences name id_div [] (segm_sentences,morph_sentences) in | |
226 | + merge_entries name ((id_div,paragraphs,sentences) :: rev) (text,segmentation,morphosyntax) | |
169 | 227 | | [],[],[] -> List.rev rev (* all three lists exhausted together: restore input order *) |
170 | 228 | | _ -> failwith "merge_entries" (* lists of different length, or an id of an unexpected shape *) |
171 | 229 | |
... | ... | @@ -174,7 +232,7 @@ let nkjp_path = "../../NLP resources/NKJP-PodkorpusMilionowy-1.2/" |
174 | 232 | let _ = |
175 | 233 | let names = get_folders nkjp_path in |
176 | 234 | Xlist.iter names (fun name -> |
177 | - print_endline name; | |
235 | + (* print_endline name; *) | |
178 | 236 | let typ,channel = load_header nkjp_path name in |
179 | 237 | (* print_endline typ; *) |
180 | 238 | (* print_endline channel; *) |
... | ... | @@ -182,7 +240,7 @@ let _ = |
182 | 240 | let text = load_text nkjp_path name in |
183 | 241 | let segmentation = load_segmentation nkjp_path name in |
184 | 242 | let morphosyntax = load_morphosyntax nkjp_path name in |
185 | - let entries = merge_entries [] (text,segmentation,morphosyntax) in | |
243 | + let entries = merge_entries name [] (text,segmentation,morphosyntax) in | |
186 | 244 | ()) |
187 | 245 | |
188 | 246 | (* |
... | ... |
subsyntax/ENIAM_MWE.ml
... | ... | @@ -30,7 +30,7 @@ let load_dict dict filename = |
30 | 30 | |
31 | 31 | let mwe_dict = (* Dictionary accumulated by load_dict from brev_filename, fixed_filename and mwe_filename, in that order, starting from an empty StringMap. *) |
32 | 32 | let dict = load_dict StringMap.empty brev_filename in |
33 | - let dict = load_dict dict fixed_filename in | |
33 | + let dict = try load_dict dict fixed_filename with _ -> (prerr_endline ("ENIAMsubsyntax file " ^ fixed_filename ^ " not found"); dict) in (* on failure the dictionary built so far is kept. NOTE(review): the handler catches every exception, so errors other than a missing file are also reported as not found *) | |
34 | 34 | (* let dict = load_dict dict complete_entries_filename in*) |
35 | 35 | let dict = load_dict dict mwe_filename in |
36 | 36 | dict |
... | ... |
subsyntax/ENIAMsubsyntax.ml
... | ... | @@ -200,7 +200,7 @@ let select_tokens paths = |
200 | 200 | (* | Dig(value,cat) -> t :: paths *) |
201 | 201 | | Other orth -> t :: paths |
202 | 202 | | Lemma(lemma,pos,interp) -> if pos = "brev" then paths else t :: paths |
203 | - | Proper(lemma,pos,interp,cat) -> t :: paths | |
203 | + | Proper(lemma,pos,interp,cat) -> if pos = "brev" then paths else t :: paths | |
204 | 204 | (* | Compound _ -> t :: paths *) |
205 | 205 | | _ -> paths)) |
206 | 206 | |
... | ... | @@ -213,6 +213,7 @@ let load_proper_name proper = function |
213 | 213 | let proper_names = (* Map built by folding load_proper_name over three tab files in turn, starting from an empty StringMap; each file adds to the map produced by the previous one. *) |
214 | 214 | let proper = File.fold_tab proper_names_filename StringMap.empty load_proper_name in |
215 | 215 | let proper = File.fold_tab proper_names_filename2 proper load_proper_name in |
216 | + let proper = File.fold_tab proper_names_filename3 proper load_proper_name in | |
216 | 217 | proper |
217 | 218 | |
218 | 219 | let remove l s = |
... | ... |
subsyntax/ENIAMsubsyntaxTypes.ml
... | ... | @@ -55,6 +55,7 @@ let lemma_frequencies_filename = resource_path ^ "/subsyntax/NKJP1M-lemma-freq.t |
55 | 55 | let proper_names_filename2 = resource_path ^ "/subsyntax/proper_names.tab" *) |
56 | 56 | let proper_names_filename = resource_path ^ "/subsyntax/proper_names_sgjp_polimorf_20151020.tab" |
57 | 57 | let proper_names_filename2 = resource_path ^ "/subsyntax/proper_names_20160104.tab" |
58 | +let proper_names_filename3 = resource_path ^ "/subsyntax/ne.tab" | |
58 | 59 | |
59 | 60 | let int_of_mode = function |
60 | 61 | Raw -> 0 |
... | ... |
subsyntax/resources/ne.tab
0 → 100644
1 | +Akademia Sztuki ORGANIZACJA | |
2 | +Atelier Bizio + Ligierko ORGANIZACJA | |
3 | +Instytut Architektury i Planowania Przestrzennego ORGANIZACJA | |
4 | +Katedra Architektury Współczesnej Teorii i Metodologii Projektowania ORGANIZACJA | |
5 | +VII Liceum Ogólnokształcące im. K.K. Baczyńskiego ORGANIZACJA | |
6 | +IV Liceum Ogólnokształcące im. L. Szenwalda ORGANIZACJA | |
7 | +Muzeum Narodowe ORGANIZACJA | |
8 | +Nagroda Artystyczna m. Szczecina WYRÓŻNIENIE | |
9 | +Zachodniopomorski Nobel WYRÓŻNIENIE | |
10 | +Politechnika Krakowska ORGANIZACJA | |
11 | +Politechnika Szczecińska ORGANIZACJA | |
12 | +Pracownia Podstaw Projektowania ORGANIZACJA | |
13 | +Przegląd Teatrów Małych Form „Kontrapunkt” ORGANIZACJA | |
14 | +Mistrzowska Szkoła Reżyserii Filmowej Andrzeja Wajdy ORGANIZACJA | |
15 | +Uniwersytet im. M. Kopernika ORGANIZACJA | |
16 | +Zachodniopomorski Uniwersytet Technologiczny ORGANIZACJA | |
17 | +Wydział Budownictwa i Architektury ORGANIZACJA | |
18 | +Wydział Sztuk Wizualnych ORGANIZACJA | |
19 | +Zakład Teorii Architektury, Historii i Konserwacji Zabytków ORGANIZACJA | |
20 | +Festiwal Polskich Sztuk Współczesnych R@Port WYDARZENIE | |
21 | +Sosnowiec MIASTO | |
22 | +Stefan IMIĘ | |
23 | +Józefa IMIĘ | |
24 | +Szczecin MIASTO | |
25 | +Waldemar IMIĘ | |
26 | +Marzęcki NAZWISKO | |
27 | +Austria KRAJ | |
28 | +Czechy KRAJ | |
29 | +Niemcy KRAJ | |
30 | +Francja KRAJ | |
31 | +Litwa KRAJ | |
32 | +USA KRAJ | |
33 | +Rosja KRAJ | |
34 | + | |
... | ... |
tokenizer/ENIAMacronyms.ml
... | ... | @@ -21,7 +21,7 @@ open ENIAMtokenizerTypes |
21 | 21 | |
22 | 22 | let mte_patterns = |
23 | 23 | let lines = try File.load_lines mte_filename |
24 | - with _ -> (print_endline ("ENIAMtokenizer mte file " ^ mte_filename ^ " not found"); []) in | |
24 | + with _ -> (prerr_endline ("ENIAMtokenizer mte file " ^ mte_filename ^ " not found"); []) in | |
25 | 25 | let l = List.rev (Xlist.rev_map lines (fun line -> |
26 | 26 | match Str.split (Str.regexp "\t") line with |
27 | 27 | [orths; lemma; interp] -> Str.split (Str.regexp " ") orths, lemma, interp |
... | ... |
tokenizer/ENIAMtokenizerTypes.ml
... | ... | @@ -72,4 +72,5 @@ let resource_path = |
72 | 72 | if Sys.file_exists "/usr/local/share/eniam" then "/usr/local/share/eniam" else |
73 | 73 | failwith "resource directory does not exists" |
74 | 74 | |
75 | -let mte_filename = resource_path ^ "/tokenizer/mte.tab" | |
75 | +(* let mte_filename = resource_path ^ "/tokenizer/mte.tab" *) | |
76 | +let mte_filename = resource_path ^ "/tokenizer/mte_20151215.tab" | |
... | ... |
tokenizer/makefile
... | ... | @@ -18,7 +18,7 @@ install: all |
18 | 18 | mkdir -p /usr/share/eniam/tokenizer |
19 | 19 | cp resources/mte_20151215.tab /usr/share/eniam/tokenizer/mte_20151215.tab |
20 | 20 | cp resources/README /usr/share/eniam/tokenizer/README |
21 | - ln -s /usr/share/eniam/tokenizer/mte_20151215.tab /usr/share/eniam/tokenizer/mte.tab | |
21 | +# ln -s /usr/share/eniam/tokenizer/mte_20151215.tab /usr/share/eniam/tokenizer/mte.tab | |
22 | 22 | |
23 | 23 | install-local: all |
24 | 24 | mkdir -p $(INSTALLDIR) |
... | ... | @@ -28,7 +28,7 @@ install-local: all |
28 | 28 | mkdir -p /usr/local/share/eniam/tokenizer |
29 | 29 | cp resources/mte_20151215.tab /usr/local/share/eniam/tokenizer/mte_20151215.tab |
30 | 30 | cp resources/README /usr/local/share/eniam/tokenizer/README |
31 | - ln -s /usr/local/share/eniam/tokenizer/mte_20151215.tab /usr/local/share/eniam/tokenizer/mte.tab | |
31 | +# ln -s /usr/local/share/eniam/tokenizer/mte_20151215.tab /usr/local/share/eniam/tokenizer/mte.tab | |
32 | 32 | |
33 | 33 | eniam-tokenizer.cma: $(SOURCES) |
34 | 34 | ocamlc -linkall -a -o eniam-tokenizer.cma $(OCAMLFLAGS) $^ |
... | ... |