Commit df1486af60e99aceb686f5ccc8894d6dae28df9f

Authored by Wojciech Jaworski
1 parent caeb305a

poprawki w interfejsie subsyntax

LCGlexicon/interface.ml 0 → 100644
  1 +(*
  2 + * ENIAM_LCGlexicon is a library that provides LCG lexicon for Polish
  3 + * Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
  4 + * Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
  5 + *
  6 + * This library is free software: you can redistribute it and/or modify
  7 + * it under the terms of the GNU Lesser General Public License as published by
  8 + * the Free Software Foundation, either version 3 of the License, or
  9 + * (at your option) any later version.
  10 + *
  11 + * This library is distributed in the hope that it will be useful,
  12 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14 + * GNU Lesser General Public License for more details.
  15 + *
  16 + * You should have received a copy of the GNU Lesser General Public License
  17 + * along with this program. If not, see <http://www.gnu.org/licenses/>.
  18 + *)
  19 +
  20 +open ENIAM_LCGlexiconTypes
  21 +open ENIAM_LCGtypes
  22 +open ENIAMsubsyntaxTypes
  23 +
  (* Lexicon rules used by create_chart. They are built once, when this module
     is initialised, by ENIAM_LCGlexicon.make_rules from the file named by
     ENIAM_LCGlexiconTypes.rules_filename. *)
  24 +let rules = ENIAM_LCGlexicon.make_rules ENIAM_LCGlexiconTypes.rules_filename
  25 +
  (* Test inputs as (name, sentence) pairs. The top-level driver at the end of
     this file parses every sentence; the name becomes the prefix of the files
     written to results/ (it is extended with a running sentence number in
     parse_sentence). Entries inside comments are disabled examples. *)
  26 +let examples = [
  27 + (* "Szpak","Szpak śpiewa.";*)
  28 + (* "miał","Miałem miał."; *)
  29 +(* "Ala","Ala ma kota.";
  30 + "Ale","Ale mają kota:"; *)
  31 + (* "zima","Szpak frunie zimą.";*)
  32 + (* "październik","Kot miauczy w październiku."; *)
  33 +(* "Szpak-Kot","Szpak frunie. Kot miauczy.";
  34 + "powiedział","Szpak powiedział: „Frunę. Kiszę.”";*)
  35 + (* "teraz","Teraz frunie jakiś szpak.";
  36 + "chłopcy","Chłopcy mają ulicę kwiatami."; *)
  37 + (* "arabia","Arabia Saudyjska biegnie.";*)
  38 +(* "Tom","Tom idzie."; *)
  39 + "liceum","W 1984-89 uczęszczał do VII Liceum Ogólnokształcącego im. K.K. Baczyńskiego w Szczecinie.";
  40 + "studia","Następnie studiował architekturę na Politechnice Szczecińskiej, dyplom uzyskał w 1994.";
  41 +]
  42 +
  (* [clarify_categories senses token] computes the category list for one
     tokenizer token by delegating to ENIAMcategoriesPL.clarify_categories.
     - Lemma: one call per element of the token's interp list, with the boolean
       flag set to false; the per-interpretation results are concatenated.
     - Proper: the same, but with the flag set to true (presumably "is a proper
       name" - TODO confirm against ENIAMcategoriesPL).
     - Interp: a single call with pos "interp" and an empty interpretation.
     - Any other token kind yields the empty list. *)
  43 +let clarify_categories senses token =
  44 + match token.ENIAMtokenizerTypes.token with
  45 + ENIAMtokenizerTypes.Lemma(lemma,pos,interp) -> List.flatten (Xlist.map interp (fun interp -> ENIAMcategoriesPL.clarify_categories false senses (lemma,pos,interp)))
  46 + | ENIAMtokenizerTypes.Proper(lemma,pos,interp,_) -> List.flatten (Xlist.map interp (fun interp -> ENIAMcategoriesPL.clarify_categories true senses (lemma,pos,interp)))
  47 + | ENIAMtokenizerTypes.Interp lemma -> ENIAMcategoriesPL.clarify_categories false senses (lemma,"interp",[])
  48 + | _ -> []
  49 +
  (* [create_chart tokens lex_sems paths last] builds the initial LCG chart for
     one sentence.
     - tokens, lex_sems: ExtArrays indexed by token id.
     - paths: list of (id, lnode, rnode) triples; each one contributes the
       lexicon entries of token [id] to the chart cell (lnode, rnode).
     - last: passed to ENIAM_LCGchart.make (presumably the index of the last
       chart node - TODO confirm).
     The renderer's variable numbering is reset once per chart, and its variable
     names are reset (and numbers advanced) once per token, before the entries
     for that token are created. Categories are clarified with the fixed sense
     list ["X"]. Entries are added with ENIAM_LCGchart.add_inc_list and a final
     argument of 0. *)
  50 +let create_chart tokens lex_sems paths last =
  51 + ENIAM_LCGrenderer.reset_variable_numbers ();
  52 + let chart = ENIAM_LCGchart.make last in
  53 + let chart = Xlist.fold paths chart (fun chart (id,lnode,rnode) ->
  54 + let t = ExtArray.get tokens id in
  55 + let s = ExtArray.get lex_sems id in
  56 + ENIAM_LCGrenderer.reset_variable_names ();
  57 + ENIAM_LCGrenderer.add_variable_numbers ();
  58 + let cats = clarify_categories ["X"] t in
  59 + let l = ENIAM_LCGlexicon.create_entries rules id t.ENIAMtokenizerTypes.orth cats s.ENIAMlexSemanticsTypes.schemata in
  60 + ENIAM_LCGchart.add_inc_list chart lnode rnode l 0) in
  61 + chart
  62 +
  (* [test_example name tokens lex_sems paths last] runs the parsing pipeline on
     one sentence and dumps every intermediate stage into the "results/"
     directory, using [name] as the file-name prefix:
       <name>1_chart                     chart straight after create_chart
       <name>2_chart, 2_references       after ENIAM_LCGchart.lazify
       <name>3_chart, 3_references       after ENIAM_LCGchart.parse
       <name>4_term                      parsed term (LaTeX, compiled)
       <name>4_dependency_tree           after ENIAM_LCGreductions.reduce
       <name>5_dependency_tree           after assign_labels
       <name>6_dependency_tree           after remove_cuts (LaTeX and graph)
       <name>6_simple_dependency_tree    simplified graph
     Stages from 4 onwards are produced only if the chart is parsed; stages 5
     and 6 only if the dependency tree is reduced. Otherwise "not parsed" or
     "not reduced" is printed to stdout.
     The "a0"/"a1"/"a4" arguments are passed through to the LaTeX printers
     (presumably paper sizes - TODO confirm). The arguments [30. Sys.time] given
     to ENIAM_LCGchart.parse are presumably a time limit in seconds and the
     clock used to measure it - TODO confirm.
     The "results/" directory is not created here. *)
  63 +let test_example name tokens lex_sems paths last =
  64 + ENIAM_LCGreductions.reset_variant_label ();
  65 + let chart = create_chart tokens lex_sems paths last in
  66 + ENIAM_LCGlatexOf.print_chart "results/" (name^"1_chart") "a1" chart;
  67 + let chart,references = ENIAM_LCGchart.lazify chart in
  68 + ENIAM_LCGlatexOf.print_chart "results/" (name^"2_chart") "a4" chart;
  69 + ENIAM_LCGlatexOf.print_references "results/" (name^"2_references") "a4" references;
  70 + let chart = ENIAM_LCGchart.parse chart references 30. Sys.time in (* note: references is implicitly modified in place *)
  71 + ENIAM_LCGlatexOf.print_chart "results/" (name^"3_chart") "a4" chart;
  72 + ENIAM_LCGlatexOf.print_references "results/" (name^"3_references") "a4" references;
  73 + if ENIAM_LCGchart.is_parsed chart then (
  74 + let term = ENIAM_LCGchart.get_parsed_term chart in
  75 + Xlatex.latex_file_out "results/" (name^"4_term") "a4" false (fun file ->
  76 + Printf.fprintf file "\\[%s\\]\n" (ENIAM_LCGlatexOf.linear_term 0 term));
  77 + Xlatex.latex_compile_and_clean "results/" (name^"4_term");
  78 + let dependency_tree = ENIAM_LCGreductions.reduce term references in
  79 + ENIAM_LCGlatexOf.print_dependency_tree "results/" (name^"4_dependency_tree") "a0" dependency_tree;
  80 + if ENIAM_LCGreductions.is_reduced_dependency_tree dependency_tree then (
  81 + ENIAM_LCGreductions.assign_labels dependency_tree; (* note: dependency_tree is implicitly modified in place *)
  82 + ENIAM_LCGlatexOf.print_dependency_tree "results/" (name^"5_dependency_tree") "a4" dependency_tree;
  83 + ENIAM_LCGreductions.remove_cuts dependency_tree; (* note: dependency_tree is implicitly modified in place *)
  84 + ENIAM_LCGlatexOf.print_dependency_tree "results/" (name^"6_dependency_tree") "a4" dependency_tree;
  85 + ENIAM_LCGgraphOf.print_dependency_tree "results/" (name^"6_dependency_tree") dependency_tree;
  86 + ENIAM_LCGgraphOf.print_simplified_dependency_tree "results/" (name^"6_simple_dependency_tree") dependency_tree;
  87 + ())
  88 + else print_endline "not reduced")
  89 + else print_endline "not parsed"
  90 +
  (* [parse_sentence name id tokens lex_sems sentence] walks one sentence
     structure and runs test_example on every StructSentence found in it.
     [id] is a running counter: each StructSentence is processed under the
     prefix name ^ string_of_int id ^ "_" and advances the counter by one.
     RawSentence and DepSentence are skipped and leave the counter unchanged.
     QuotedSentences and AltSentence are traversed recursively, threading the
     counter through every nested sentence (for AltSentence every alternative
     is processed and the mode is ignored).
     Returns the counter value to use for the next sentence. *)
  91 +let rec parse_sentence name id tokens lex_sems = function
  92 + RawSentence s -> id
  93 + | StructSentence(paths,last) ->
  94 + test_example (name ^ string_of_int id ^ "_") tokens lex_sems paths last;
  95 + id + 1
  96 + | DepSentence(paths) -> id
  97 + | QuotedSentences sentences ->
  98 + Xlist.fold sentences id (fun id p ->
  99 + parse_sentence name id tokens lex_sems p.sentence)
  100 + | AltSentence l ->
  101 + Xlist.fold l id (fun id (mode,sentence) ->
  102 + parse_sentence name id tokens lex_sems sentence)
  103 +
  (* [parse_paragraph name id tokens lex_sems paragraph] applies parse_sentence
     to every sentence of a paragraph, threading the running sentence counter
     [id] through them. RawParagraph is skipped; for AltParagraph every
     alternative is processed in turn and the mode is ignored.
     Returns the counter value after the paragraph. *)
  104 +let rec parse_paragraph name id tokens lex_sems = function
  105 + RawParagraph s -> id
  106 + | StructParagraph sentences ->
  107 + Xlist.fold sentences id (fun id p ->
  108 + parse_sentence name id tokens lex_sems p.sentence)
  109 + | AltParagraph l ->
  110 + Xlist.fold l id (fun id (mode,paragraph) ->
  111 + parse_paragraph name id tokens lex_sems paragraph)
  112 +
  (* [parse_text name id tokens lex_sems text] applies parse_paragraph to every
     paragraph of a text, threading the running sentence counter [id] through
     them. RawText is skipped; for AltText every alternative is processed in
     turn and the mode is ignored.
     Returns the counter value after the whole text. *)
  113 +let rec parse_text name id tokens lex_sems = function
  114 + RawText s -> id
  115 + | StructText paragraphs ->
  116 + Xlist.fold paragraphs id (fun id paragraph ->
  117 + parse_paragraph name id tokens lex_sems paragraph)
  118 + | AltText l ->
  119 + Xlist.fold l id (fun id (mode,text) ->
  120 + parse_text name id tokens lex_sems text)
  121 +
  122 +
  (* Program entry point: for every (name, sentence) pair in [examples], run
     ENIAMsubsyntax.parse_text on the sentence, attach lexical semantics with
     ENIAMlexSemantics.assign, and parse every structured sentence via
     parse_text. Sentence numbering restarts at 1 for each example; the final
     counter value is discarded. *)
  123 +let _ =
  124 + Xlist.iter examples (fun (name,example) ->
  125 + let text,tokens = ENIAMsubsyntax.parse_text example in
  126 + let lex_sems = ENIAMlexSemantics.assign tokens text in
  127 + ignore(parse_text name 1 tokens lex_sems text))
  128 +
  129 +(*
  130 +type output = Text | Xml | Html | Marsh | Graphviz
  131 +
  132 +let output = ref Text
  133 +let comm_stdio = ref true
  134 +let sentence_split = ref true
  135 +let port = ref 0
  136 +
  137 +let spec_list = [
  138 + "-s", Arg.Unit (fun () -> sentence_split:=true), "Split input into sentences (default)";
  139 + "-n", Arg.Unit (fun () -> sentence_split:=false), "Do not split input into sentences";
  140 + "-i", Arg.Unit (fun () -> comm_stdio:=true), "Communication using stdio (default)";
  141 + "-p", Arg.Int (fun p -> comm_stdio:=false; port:=p), "<port> Communication using sockets on given port number";
  142 + "-t", Arg.Unit (fun () -> output:=Text), "Output as plain text (default)";
  143 + "-x", Arg.Unit (fun () -> output:=Xml), "Output as XML";
  144 + "-m", Arg.Unit (fun () -> output:=Marsh), "Output as marshalled Ocaml data structure";
  145 + "-h", Arg.Unit (fun () -> output:=Html), "Output as HTML";
  146 + "-g", Arg.Unit (fun () -> output:=Graphviz; sentence_split:=false), "Output as graphviz dot file; turns sentence split off";
  147 + (* "-r", Arg.String (fun p ->
  148 + ENIAMtokenizerTypes.set_resource_path p;
  149 + ENIAMmorphologyTypes.set_resource_path p;
  150 + ENIAMsubsyntaxTypes.set_resource_path p), "<path> Set resource path"; *)
  151 + ]
  152 +
  153 +let usage_msg =
  154 + "Usage: subsyntax <options>\nInput is a sequence of lines. Empty line ends the sequence and invoke parsing. Double empty line shutdown parser.\nOptions are:"
  155 +
  156 +let message = "ENIAMsubsyntax: MWE, abbreviation and sentence detection for Polish\n\
  157 +Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>\n\
  158 +Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences"
  159 +
  160 +let anon_fun s = raise (Arg.Bad ("invalid argument: " ^ s))
  161 +
  162 +let input_text channel =
  163 + let s = ref (try input_line channel with End_of_file -> "") in
  164 + let lines = ref [] in
  165 + while !s <> "" do
  166 + lines := !s :: !lines;
  167 + s := try input_line channel with End_of_file -> ""
  168 + done;
  169 + String.concat "\n" (List.rev !lines)
  170 +
  171 +let rec main_loop in_chan out_chan =
  172 + let text = input_text in_chan in
  173 + if text = "" then () else (
  174 + (* print_endline "input text begin";
  175 + print_endline text;
  176 + print_endline "input text end"; *)
  177 + (if !sentence_split then
  178 + let text,tokens = ENIAMsubsyntax.parse_text text in
  179 + (match !output with
  180 + Text -> output_string out_chan (ENIAMsubsyntaxStringOf.text "" tokens text ^ "\n" ^ ENIAMsubsyntaxStringOf.token_extarray tokens ^ "\n\n")
  181 + | Xml -> output_string out_chan (Xml.to_string (ENIAMsubsyntaxXMLof.text_and_tokens text tokens) ^ "\n\n")
  182 + | Html -> output_string out_chan (ENIAMsubsyntaxHTMLof.text_and_tokens text tokens ^ "\n\n")
  183 + | Marsh -> Marshal.to_channel out_chan (text,tokens) []
  184 + | Graphviz -> failwith "main_loop: ni")
  185 + else
  186 + let tokens = ENIAMsubsyntax.parse text in
  187 + (match !output with
  188 + Text -> output_string out_chan (ENIAMsubsyntaxStringOf.token_list tokens ^ "\n\n")
  189 + | Xml -> output_string out_chan (Xml.to_string (ENIAMsubsyntaxXMLof.token_list tokens) ^ "\n\n")
  190 + | Html -> output_string out_chan (ENIAMsubsyntaxHTMLof.token_list tokens ^ "\n\n")
  191 + | Marsh -> Marshal.to_channel out_chan tokens []
  192 + | Graphviz -> output_string out_chan (ENIAMsubsyntaxGraphOf.token_list tokens ^ "\n\n")));
  193 + flush out_chan;
  194 + main_loop in_chan out_chan)
  195 +
  196 +let _ =
  197 + prerr_endline message;
  198 + Arg.parse spec_list anon_fun usage_msg;
  199 + Gc.compact ();
  200 + prerr_endline "Ready!";
  201 + if !comm_stdio then main_loop stdin stdout
  202 + else
  203 + let sockaddr = Unix.ADDR_INET(Unix.inet_addr_any,!port) in
  204 + Unix.establish_server main_loop sockaddr
  205 +*)
... ...
LCGlexicon/makefile
... ... @@ -42,6 +42,9 @@ test2: test2.ml
42 42 mkdir -p results
43 43 $(OCAMLOPT) -o test2 $(OCAMLOPTFLAGS2) test2.ml
44 44  
  # Builds interface.ml into the executable `parser` (removed again by `clean`).
  # NOTE(review): the sibling targets test2 and print_lexicon run
  # `mkdir -p results` first; interface.ml also writes into results/, so this
  # target probably needs the same step - confirm.
  45 +interface: interface.ml
  46 + $(OCAMLOPT) -o parser $(OCAMLOPTFLAGS2) interface.ml
  47 +
45 48 print_lexicon: ENIAM_LCGlexiconLatexOf.ml
46 49 mkdir -p results
47 50 $(OCAMLOPT) -o print_lexicon $(OCAMLOPTFLAGS) ENIAM_LCGlexiconLatexOf.ml
... ... @@ -67,4 +70,4 @@ print_lexicon: ENIAM_LCGlexiconLatexOf.ml
67 70 $(OCAMLOPT) $(OCAMLOPTFLAGS) -c $<
68 71  
69 72 clean:
70   - rm -f *~ *.cm[aoix] *.o *.so *.cmxa *.a test test2 print_lexicon
  73 + rm -f *~ *.cm[aoix] *.o *.so *.cmxa *.a test test2 parser print_lexicon
... ...
NKJP2/ENIAM_NKJP.ml
... ... @@ -158,14 +158,72 @@ let load_morphosyntax path name =
158 158 List.rev (Xlist.rev_map entries load_morph_entry)
159 159 | _ -> failwith "load_morphosyntax"
160 160  
161   -let rec merge_entries rev = function
  (* [parse_seg_corresp corresp] decodes a segment's reference into text.xml.
     The expected form is "text.xml#string-range(ID,BEG,LEN)": the prefix
     "text.xml#string-range(" and the suffix ")" are checked and stripped, and
     the remainder is split on "," into exactly three fields. ID is decoded
     with parse_id and must have an empty corref, prefix "txt", exactly two
     numbers and suffix "ab"; BEG and LEN are converted with int_of_string.
     Returns (id_div, id_ab, beg, len), where id_div and id_ab are the two
     numbers of ID.
     Raises Failure "parse_seg_corresp" on any structural mismatch;
     int_of_string raises its own Failure when BEG or LEN is not an integer. *)
  161 +let parse_seg_corresp corresp =
  162 + if not (Xstring.check_prefix "text.xml#string-range(" corresp) then failwith "parse_seg_corresp" else
  163 + if not (Xstring.check_sufix ")" corresp) then failwith "parse_seg_corresp" else
  164 + let corresp = Xstring.cut_prefix "text.xml#string-range(" corresp in
  165 + let corresp = Xstring.cut_sufix ")" corresp in
  166 + let id,beg,len = match Xstring.split "," corresp with
  167 + [id;beg;len] -> parse_id id, int_of_string beg, int_of_string len
  168 + | _ -> failwith "parse_seg_corresp" in
  169 + let id_div,id_ab = match id with
  170 + {corref=""; prefix="txt"; numbers=[id_div;id_ab]; suffix="ab"} -> id_div,id_ab
  171 + | _ -> failwith "parse_seg_corresp" in
  172 + id_div,id_ab,beg,len
  173 +
  (* Set of tag names that parse_disamb accepts in the "pos" position of a
     disamb string; any other value there makes parse_disamb fail. *)
  174 +let pos_set = StringSet.of_list
  175 + ["subst";"depr";"ppron12";"ppron3";"siebie";"prep";"adj";"adjc";"adjp";"adja";"num";
  176 + "adv";"ger";"pact";"ppas";"fin";"bedzie";"praet";"winien";"impt";
  177 + "imps";"pred";"aglt";"inf";"pcon";"pant";"qub";"comp";"conj";"interj";"burk";"interp";
  178 + "brev";"xxx";"numcol"]
  179 +
  (* [parse_disamb disamb] splits a colon-separated "lemma:pos:interp..." string
     into (lemma, pos, interp), where interp is the list of remaining fields.
     Lemmas that themselves contain ":" are handled by special cases, tried in
     this order:
     - the literals "::interp" and ":-):interp" give lemma ":" and ":-)";
     - if the third field is "subst", the first two fields are rejoined with
       ":" to form the lemma;
     - if the fourth field is "subst", the first three fields are rejoined.
     Otherwise the second field is taken as pos and must belong to pos_set.
     Raises Failure ("parse_disamb: " ^ disamb) when pos is not in pos_set, and
     Failure "parse_disamb" when the string has fewer than two fields. *)
  180 +let parse_disamb disamb =
  181 + if disamb = "::interp" then ":","interp",[] else
  182 + if disamb = ":-):interp" then ":-)","interp",[] else
  183 + (* if Xstring.check_sufix ":interp" disamb then Xstring.cut_sufix ":interp" disamb, "interp", [] else *)
  184 + match Xstring.split_delim ":" disamb with
  185 + lemma1 :: lemma2 :: "subst" :: interp -> lemma1 ^ ":" ^ lemma2,"subst",interp
  186 + | lemma1 :: lemma2 :: lemma3 :: "subst" :: interp -> lemma1 ^ ":" ^ lemma2 ^ ":" ^ lemma3,"subst",interp
  187 + | lemma :: pos :: interp ->
  188 + if StringSet.mem pos_set pos then lemma,pos,interp
  189 + else failwith ("parse_disamb: " ^ disamb)
  190 + | _ -> failwith "parse_disamb"
  191 +
  (* [merge_tokens name id_p rev (segmentation, morphosyntax)] zips the token
     lists of one sentence from the segmentation and morphosyntax layers.
     Both lists are consumed in lock-step. For each pair the two numbers of the
     segmentation id, of the morphosyntax corref (into ann_segmentation.xml) and
     of the morphosyntax id must agree; the segment's corresp is decoded with
     parse_seg_corresp and the disamb string with parse_disamb.
     Results are accumulated in [rev] in reverse order and reversed at the end,
     giving a list of
       (id_div, id_ab, beg, nps, len, orth, lemma, cat, interp)
     tuples in input order.
     [name] and [id_p] are only used by the commented-out diagnostics; they are
     otherwise just passed along.
     Raises Failure "merge_tokens 2" / "merge_tokens 3" when the first / second
     id numbers disagree, and Failure "merge_tokens 1" when the lists differ in
     length or an id does not have the expected prefix/suffix shape. *)
  192 +let rec merge_tokens name id_p rev = function
  193 + (corresp,nps,{corref=""; prefix="segm"; numbers=[id_segm_p;id_segm_s]; suffix="seg"}) :: segmentation,
  194 + ({corref="ann_segmentation.xml"; prefix="segm"; numbers=[c_segm_p;c_segm_s]; suffix="seg"},
  195 + {corref=""; prefix="morph"; numbers=[id_morph_p;id_morph_s]; suffix="seg"},orth,disamb) :: morphosyntax ->
  196 + (* if id_p <> id_segm_p then Printf.printf "merge_tokens inconsistent numbering: %s segm_%d-p segm_%d.%d-s\n" name id_p id_segm_p id_segm_s; *)
  197 + if id_segm_p <> c_segm_p || id_segm_p <> id_morph_p then failwith "merge_tokens 2" else
  198 + if id_segm_s <> c_segm_s || c_segm_s <> id_morph_s then failwith "merge_tokens 3" else
  199 + let id_div,id_ab,beg,len = parse_seg_corresp corresp in(
  200 + (* if id_div <> id_p then (*failwith*)print_endline (Printf.sprintf "merge_tokens 4: %s %d %s" name id_p corresp); (*else*) *)
  201 + let lemma,cat,interp = parse_disamb disamb in
  202 + merge_tokens name id_p ((id_div,id_ab,beg,nps,len,orth,lemma,cat,interp) :: rev) (segmentation,morphosyntax))
  203 + | [],[] -> List.rev rev
  204 + | _ -> failwith "merge_tokens 1"
  205 +
  (* [merge_sentences name id_p rev (segmentation, morphosyntax)] zips the
     sentence lists of one paragraph from the segmentation and morphosyntax
     layers. Both lists are consumed in lock-step. For each pair the two
     numbers of the segmentation id, of the morphosyntax corref (into
     ann_segmentation.xml) and of the morphosyntax id must agree; the tokens of
     the two sentences are then merged with merge_tokens.
     Results are accumulated in [rev] in reverse order and reversed at the end,
     giving a list of (id_segm_p, id_segm_s, tokens) triples in input order.
     Raises Failure "merge_sentences 2" / "merge_sentences 3" when the first /
     second id numbers disagree, and Failure "merge_sentences" when the lists
     differ in length or an id does not have the expected prefix/suffix shape.
     Failures raised by merge_tokens propagate unchanged. *)
  206 +let rec merge_sentences name id_p rev = function
  207 + ({corref=""; prefix="segm"; numbers=[id_segm_p;id_segm_s]; suffix="s"},segm_tokens) :: segmentation,
  208 + ({corref="ann_segmentation.xml"; prefix="segm"; numbers=[c_segm_p;c_segm_s]; suffix="s"},
  209 + {corref=""; prefix="morph"; numbers=[id_morph_p;id_morph_s]; suffix="s"},morph_tokens) :: morphosyntax ->
  210 + (* if id_p <> id_segm_p then Printf.printf "merge_sentences inconsistent numbering: %s segm_%d-p segm_%d.%d-s\n" name id_p id_segm_p id_segm_s; *)
  211 + if id_segm_p <> c_segm_p || id_segm_p <> id_morph_p then failwith "merge_sentences 2" else
  212 + if id_segm_s <> c_segm_s || c_segm_s <> id_morph_s then failwith "merge_sentences 3" else
  213 + let tokens = merge_tokens name id_p [] (segm_tokens,morph_tokens) in
  214 + merge_sentences name id_p ((id_segm_p,id_segm_s,tokens) :: rev) (segmentation,morphosyntax)
  215 + | [],[] -> List.rev rev
  216 + | _ -> failwith "merge_sentences"
  217 +
  218 +let rec merge_entries name rev = function
162 219 ({corref=""; prefix="txt"; numbers=[id_div]; suffix="div"},paragraphs) :: text,
163 220 ({corref="text.xml"; prefix="txt"; numbers=[c_div]; suffix="div"},
164 221 {corref=""; prefix="segm"; numbers=[id_segm_p]; suffix="p"},segm_sentences) :: segmentation,
165 222 ({corref="ann_segmentation.xml"; prefix="segm"; numbers=[c_segm_p]; suffix="p"},
166 223 {corref=""; prefix="morph"; numbers=[id_morph_p]; suffix="p"},morph_sentences) :: morphosyntax ->
167 224 if id_div <> c_div || c_div <> id_segm_p || id_segm_p <> c_segm_p || c_segm_p <> id_morph_p then failwith "merge_entries 2" else
168   - merge_entries ((id_div,paragraphs,segm_sentences,morph_sentences) :: rev) (text,segmentation,morphosyntax)
  225 + let sentences = merge_sentences name id_div [] (segm_sentences,morph_sentences) in
  226 + merge_entries name ((id_div,paragraphs,sentences) :: rev) (text,segmentation,morphosyntax)
169 227 | [],[],[] -> List.rev rev
170 228 | _ -> failwith "merge_entries"
171 229  
... ... @@ -174,7 +232,7 @@ let nkjp_path = &quot;../../NLP resources/NKJP-PodkorpusMilionowy-1.2/&quot;
174 232 let _ =
175 233 let names = get_folders nkjp_path in
176 234 Xlist.iter names (fun name ->
177   - print_endline name;
  235 + (* print_endline name; *)
178 236 let typ,channel = load_header nkjp_path name in
179 237 (* print_endline typ; *)
180 238 (* print_endline channel; *)
... ... @@ -182,7 +240,7 @@ let _ =
182 240 let text = load_text nkjp_path name in
183 241 let segmentation = load_segmentation nkjp_path name in
184 242 let morphosyntax = load_morphosyntax nkjp_path name in
185   - let entries = merge_entries [] (text,segmentation,morphosyntax) in
  243 + let entries = merge_entries name [] (text,segmentation,morphosyntax) in
186 244 ())
187 245  
188 246 (*
... ...
subsyntax/ENIAM_MWE.ml
... ... @@ -30,7 +30,7 @@ let load_dict dict filename =
30 30  
31 31 let mwe_dict =
32 32 let dict = load_dict StringMap.empty brev_filename in
33   - let dict = load_dict dict fixed_filename in
  33 + let dict = try load_dict dict fixed_filename with _ -> (prerr_endline ("ENIAMsubsyntax file " ^ fixed_filename ^ " not found"); dict) in
34 34 (* let dict = load_dict dict complete_entries_filename in*)
35 35 let dict = load_dict dict mwe_filename in
36 36 dict
... ...
subsyntax/ENIAMsubsyntax.ml
... ... @@ -200,7 +200,7 @@ let select_tokens paths =
200 200 (* | Dig(value,cat) -> t :: paths *)
201 201 | Other orth -> t :: paths
202 202 | Lemma(lemma,pos,interp) -> if pos = "brev" then paths else t :: paths
203   - | Proper(lemma,pos,interp,cat) -> t :: paths
  203 + | Proper(lemma,pos,interp,cat) -> if pos = "brev" then paths else t :: paths
204 204 (* | Compound _ -> t :: paths *)
205 205 | _ -> paths))
206 206  
... ... @@ -213,6 +213,7 @@ let load_proper_name proper = function
213 213 let proper_names =
214 214 let proper = File.fold_tab proper_names_filename StringMap.empty load_proper_name in
215 215 let proper = File.fold_tab proper_names_filename2 proper load_proper_name in
  216 + let proper = File.fold_tab proper_names_filename3 proper load_proper_name in
216 217 proper
217 218  
218 219 let remove l s =
... ...
subsyntax/ENIAMsubsyntaxTypes.ml
... ... @@ -55,6 +55,7 @@ let lemma_frequencies_filename = resource_path ^ &quot;/subsyntax/NKJP1M-lemma-freq.t
55 55 let proper_names_filename2 = resource_path ^ "/subsyntax/proper_names.tab" *)
56 56 let proper_names_filename = resource_path ^ "/subsyntax/proper_names_sgjp_polimorf_20151020.tab"
57 57 let proper_names_filename2 = resource_path ^ "/subsyntax/proper_names_20160104.tab"
  58 +let proper_names_filename3 = resource_path ^ "/subsyntax/ne.tab"
58 59  
59 60 let int_of_mode = function
60 61 Raw -> 0
... ...
subsyntax/resources/ne.tab 0 → 100644
  1 +Akademia Sztuki ORGANIZACJA
  2 +Atelier Bizio + Ligierko ORGANIZACJA
  3 +Instytut Architektury i Planowania Przestrzennego ORGANIZACJA
  4 +Katedra Architektury Współczesnej Teorii i Metodologii Projektowania ORGANIZACJA
  5 +VII Liceum Ogólnokształcące im. K.K. Baczyńskiego ORGANIZACJA
  6 +IV Liceum Ogólnokształcące im. L. Szenwalda ORGANIZACJA
  7 +Muzeum Narodowe ORGANIZACJA
  8 +Nagroda Artystyczna m. Szczecina WYRÓŻNIENIE
  9 +Zachodniopomorski Nobel WYRÓŻNIENIE
  10 +Politechnika Krakowska ORGANIZACJA
  11 +Politechnika Szczecińska ORGANIZACJA
  12 +Pracownia Podstaw Projektowania ORGANIZACJA
  13 +Przegląd Teatrów Małych Form „Kontrapunkt” ORGANIZACJA
  14 +Mistrzowska Szkoła Reżyserii Filmowej Andrzeja Wajdy ORGANIZACJA
  15 +Uniwersytet im. M. Kopernika ORGANIZACJA
  16 +Zachodniopomorski Uniwersytet Technologiczny ORGANIZACJA
  17 +Wydział Budownictwa i Architektury ORGANIZACJA
  18 +Wydział Stuk Wizualnych ORGANIZACJA
  19 +Zakład Teorii Architektury, Historii i Konserwacji Zabytków ORGANIZACJA
  20 +Festiwal Polskich Sztuk Współczesnych R@Port WYDARZENIE
  21 +Sosnowiec MIASTO
  22 +Stefan IMIĘ
  23 +Józefa IMIĘ
  24 +Szczecin MIASTO
  25 +Waldemar IMIĘ
  26 +Marzęcki NAZWISKO
  27 +Austria KRAJ
  28 +Czechy KRAJ
  29 +Niemcy KRAJ
  30 +Francja KRAJ
  31 +Litwa KRAJ
  32 +USA KRAJ
  33 +Rosja KRAJ
  34 +
... ...
tokenizer/ENIAMacronyms.ml
... ... @@ -21,7 +21,7 @@ open ENIAMtokenizerTypes
21 21  
22 22 let mte_patterns =
23 23 let lines = try File.load_lines mte_filename
24   - with _ -> (print_endline ("ENIAMtokenizer mte file " ^ mte_filename ^ " not found"); []) in
  24 + with _ -> (prerr_endline ("ENIAMtokenizer mte file " ^ mte_filename ^ " not found"); []) in
25 25 let l = List.rev (Xlist.rev_map lines (fun line ->
26 26 match Str.split (Str.regexp "\t") line with
27 27 [orths; lemma; interp] -> Str.split (Str.regexp " ") orths, lemma, interp
... ...
tokenizer/ENIAMtokenizerTypes.ml
... ... @@ -72,4 +72,5 @@ let resource_path =
72 72 if Sys.file_exists "/usr/local/share/eniam" then "/usr/local/share/eniam" else
73 73 failwith "resource directory does not exists"
74 74  
75   -let mte_filename = resource_path ^ "/tokenizer/mte.tab"
  75 +(* let mte_filename = resource_path ^ "/tokenizer/mte.tab" *)
  76 +let mte_filename = resource_path ^ "/tokenizer/mte_20151215.tab"
... ...
tokenizer/makefile
... ... @@ -18,7 +18,7 @@ install: all
18 18 mkdir -p /usr/share/eniam/tokenizer
19 19 cp resources/mte_20151215.tab /usr/share/eniam/tokenizer/mte_20151215.tab
20 20 cp resources/README /usr/share/eniam/tokenizer/README
21   - ln -s /usr/share/eniam/tokenizer/mte_20151215.tab /usr/share/eniam/tokenizer/mte.tab
  21 +# ln -s /usr/share/eniam/tokenizer/mte_20151215.tab /usr/share/eniam/tokenizer/mte.tab
22 22  
23 23 install-local: all
24 24 mkdir -p $(INSTALLDIR)
... ... @@ -28,7 +28,7 @@ install-local: all
28 28 mkdir -p /usr/local/share/eniam/tokenizer
29 29 cp resources/mte_20151215.tab /usr/local/share/eniam/tokenizer/mte_20151215.tab
30 30 cp resources/README /usr/local/share/eniam/tokenizer/README
31   - ln -s /usr/local/share/eniam/tokenizer/mte_20151215.tab /usr/local/share/eniam/tokenizer/mte.tab
  31 +# ln -s /usr/local/share/eniam/tokenizer/mte_20151215.tab /usr/local/share/eniam/tokenizer/mte.tab
32 32  
33 33 eniam-tokenizer.cma: $(SOURCES)
34 34 ocamlc -linkall -a -o eniam-tokenizer.cma $(OCAMLFLAGS) $^
... ...