poprawki w interfejsie subsyntax

Wojciech Jaworski
1 parent caeb305a
Showing 10 changed files with 314 additions and 11 deletions
LCGlexicon/interface.ml
LCGlexicon/makefile
NKJP2/ENIAM_NKJP.ml
subsyntax/ENIAM_MWE.ml
subsyntax/ENIAMsubsyntax.ml
subsyntax/ENIAMsubsyntaxTypes.ml
subsyntax/resources/ne.tab
tokenizer/ENIAMacronyms.ml
tokenizer/ENIAMtokenizerTypes.ml
tokenizer/makefile
+(*
+ *  ENIAM_LCGlexicon is a library that provides LCG lexicon for Polish
+ *  Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
+ *  Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
+ *
+ *  This library is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU Lesser General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *)
+
+open ENIAM_LCGlexiconTypes
+open ENIAM_LCGtypes
+open ENIAMsubsyntaxTypes
+
+(* Lexicon rules, built once when this module is initialised by calling
+   ENIAM_LCGlexicon.make_rules on ENIAM_LCGlexiconTypes.rules_filename. *)
+let rules = ENIAM_LCGlexicon.make_rules ENIAM_LCGlexiconTypes.rules_filename
+
+(* Test inputs as (name, text) pairs.  The name becomes the prefix of the
+   files written to "results/" for that text; entries inside comments are
+   disabled examples. *)
+let examples = [
+  (* "Szpak","Szpak śpiewa.";*)
+  (* "miał","Miałem miał."; *)
+(*  "Ala","Ala ma kota.";
+  "Ale","Ale mają kota:"; *)
+  (*  "zima","Szpak frunie zimą.";*)
+  (* "październik","Kot miauczy w październiku."; *)
+(*  "Szpak-Kot","Szpak frunie. Kot miauczy.";
+    "powiedział","Szpak powiedział: „Frunę. Kiszę.”";*)
+    (* "teraz","Teraz frunie jakiś szpak.";
+      "chłopcy","Chłopcy mają ulicę kwiatami."; *)
+     (*  "arabia","Arabia Saudyjska biegnie.";*)
+(*  "Tom","Tom idzie."; *)
+  "liceum","W 1984-89 uczęszczał do VII Liceum Ogólnokształcącego im. K.K. Baczyńskiego w Szczecinie.";
+  "studia","Następnie studiował architekturę na Politechnice Szczecińskiej, dyplom uzyskał w 1994.";
+]
+
+(* Computes the categories of [token] for the given [senses] by delegating
+   to ENIAMcategoriesPL.clarify_categories.
+   - Lemma tokens: one call per interpretation, with the flag set to false;
+   - Proper tokens: one call per interpretation, with the flag set to true;
+   - Interp tokens: a single call with part of speech "interp" and an empty
+     interpretation, with the flag set to false;
+   - any other token: the empty list. *)
+let clarify_categories senses token =
+  let clarify flag lemma pos interp =
+    ENIAMcategoriesPL.clarify_categories flag senses (lemma,pos,interp) in
+  let clarify_all flag lemma pos interps =
+    List.flatten (Xlist.map interps (fun interp -> clarify flag lemma pos interp)) in
+  match token.ENIAMtokenizerTypes.token with
+  | ENIAMtokenizerTypes.Lemma(lemma,pos,interps) -> clarify_all false lemma pos interps
+  | ENIAMtokenizerTypes.Proper(lemma,pos,interps,_) -> clarify_all true lemma pos interps
+  | ENIAMtokenizerTypes.Interp lemma -> clarify false lemma "interp" []
+  | _ -> []
+
+(* Builds the initial chart for one sentence.
+   [paths] is a list of (id,lnode,rnode) edges; for each of them the lexicon
+   entries of token [id] are created from [rules] and added to the chart
+   between [lnode] and [rnode].  The chart is created with
+   ENIAM_LCGchart.make [last].
+   The renderer's variable numbers are reset once per chart; before each
+   token reset_variable_names and add_variable_numbers are called. *)
+let create_chart tokens lex_sems paths last =
+  ENIAM_LCGrenderer.reset_variable_numbers ();
+  let add_edge chart (id,lnode,rnode) =
+    let token = ExtArray.get tokens id in
+    let lex_sem = ExtArray.get lex_sems id in
+    ENIAM_LCGrenderer.reset_variable_names ();
+    ENIAM_LCGrenderer.add_variable_numbers ();
+    let cats = clarify_categories ["X"] token in
+    let entries =
+      ENIAM_LCGlexicon.create_entries rules id token.ENIAMtokenizerTypes.orth cats
+        lex_sem.ENIAMlexSemanticsTypes.schemata in
+    ENIAM_LCGchart.add_inc_list chart lnode rnode entries 0 in
+  let empty_chart = ENIAM_LCGchart.make last in
+  Xlist.fold paths empty_chart add_edge
+
+(* Parses one sentence and dumps every intermediate stage into "results/",
+   using [name] as the file-name prefix: the initial chart, the lazified
+   chart and its references, the parsed chart and its references, the linear
+   term and the successive dependency trees.
+   Prints "not parsed" when the chart is not parsed and "not reduced" when
+   the dependency tree is not reduced; later stages are then skipped. *)
+let test_example name tokens lex_sems paths last =
+  let dir = "results/" in
+  let out stage = name ^ stage in
+  ENIAM_LCGreductions.reset_variant_label ();
+  let chart = create_chart tokens lex_sems paths last in
+  ENIAM_LCGlatexOf.print_chart dir (out "1_chart") "a1" chart;
+  let chart,references = ENIAM_LCGchart.lazify chart in
+  ENIAM_LCGlatexOf.print_chart dir (out "2_chart") "a4" chart;
+  ENIAM_LCGlatexOf.print_references dir (out "2_references") "a4" references;
+  (* note: parse implicitly mutates references *)
+  let chart = ENIAM_LCGchart.parse chart references 30. Sys.time in
+  ENIAM_LCGlatexOf.print_chart dir (out "3_chart") "a4" chart;
+  ENIAM_LCGlatexOf.print_references dir (out "3_references") "a4" references;
+  if not (ENIAM_LCGchart.is_parsed chart) then print_endline "not parsed" else begin
+    let term = ENIAM_LCGchart.get_parsed_term chart in
+    Xlatex.latex_file_out dir (out "4_term") "a4" false (fun file ->
+        Printf.fprintf file "\\[%s\\]\n" (ENIAM_LCGlatexOf.linear_term 0 term));
+    Xlatex.latex_compile_and_clean dir (out "4_term");
+    let dependency_tree = ENIAM_LCGreductions.reduce term references in
+    ENIAM_LCGlatexOf.print_dependency_tree dir (out "4_dependency_tree") "a0" dependency_tree;
+    if not (ENIAM_LCGreductions.is_reduced_dependency_tree dependency_tree) then
+      print_endline "not reduced"
+    else begin
+      (* note: assign_labels implicitly mutates dependency_tree *)
+      ENIAM_LCGreductions.assign_labels dependency_tree;
+      ENIAM_LCGlatexOf.print_dependency_tree dir (out "5_dependency_tree") "a4" dependency_tree;
+      (* note: remove_cuts implicitly mutates dependency_tree *)
+      ENIAM_LCGreductions.remove_cuts dependency_tree;
+      ENIAM_LCGlatexOf.print_dependency_tree dir (out "6_dependency_tree") "a4" dependency_tree;
+      ENIAM_LCGgraphOf.print_dependency_tree dir (out "6_dependency_tree") dependency_tree;
+      ENIAM_LCGgraphOf.print_simplified_dependency_tree dir (out "6_simple_dependency_tree") dependency_tree;
+      ()
+    end
+  end
+
+(* Runs test_example on every StructSentence reachable from a sentence.
+   [id] is the running sentence counter: it is appended to [name] (followed
+   by "_") to form the output prefix and is incremented after each
+   StructSentence.  Raw and dependency sentences are skipped; quoted and
+   alternative sentences are traversed recursively.
+   Returns the counter value to use for the next sentence. *)
+let rec parse_sentence name id tokens lex_sems sentence =
+  match sentence with
+  | RawSentence _ -> id
+  | StructSentence(paths,last) ->
+    let prefix = name ^ string_of_int id ^ "_" in
+    test_example prefix tokens lex_sems paths last;
+    id + 1
+  | DepSentence _ -> id
+  | QuotedSentences sentences ->
+    Xlist.fold sentences id (fun next p ->
+        parse_sentence name next tokens lex_sems p.sentence)
+  | AltSentence alternatives ->
+    Xlist.fold alternatives id (fun next (_,alternative) ->
+        parse_sentence name next tokens lex_sems alternative)
+
+(* Threads the sentence counter [id] through a paragraph: raw paragraphs are
+   skipped, structured paragraphs pass each of their sentences to
+   parse_sentence, and alternative paragraphs are traversed recursively.
+   Returns the counter value after the paragraph. *)
+let rec parse_paragraph name id tokens lex_sems paragraph =
+  match paragraph with
+  | RawParagraph _ -> id
+  | StructParagraph sentences ->
+    Xlist.fold sentences id (fun next p ->
+        parse_sentence name next tokens lex_sems p.sentence)
+  | AltParagraph alternatives ->
+    Xlist.fold alternatives id (fun next (_,alternative) ->
+        parse_paragraph name next tokens lex_sems alternative)
+
+(* Threads the sentence counter [id] through a whole text: raw texts are
+   skipped, structured texts pass each paragraph to parse_paragraph, and
+   alternative texts are traversed recursively.
+   Returns the counter value after the text. *)
+let rec parse_text name id tokens lex_sems text =
+  match text with
+  | RawText _ -> id
+  | StructText paragraphs ->
+    Xlist.fold paragraphs id (fun next paragraph ->
+        parse_paragraph name next tokens lex_sems paragraph)
+  | AltText alternatives ->
+    Xlist.fold alternatives id (fun next (_,alternative) ->
+        parse_text name next tokens lex_sems alternative)
+
+
+(* Entry point: for every example, runs ENIAMsubsyntax.parse_text and
+   ENIAMlexSemantics.assign, then parses the resulting text with sentence
+   numbering starting at 1.  The final counter value is discarded. *)
+let () =
+  let run_example (name,example) =
+    let text,tokens = ENIAMsubsyntax.parse_text example in
+    let lex_sems = ENIAMlexSemantics.assign tokens text in
+    let _next_id : int = parse_text name 1 tokens lex_sems text in
+    () in
+  Xlist.iter examples run_example
+
+(*
+type output = Text | Xml | Html | Marsh | Graphviz
+
+let output = ref Text
+let comm_stdio = ref true
+let sentence_split = ref true
+let port = ref 0
+
+let spec_list = [
+  "-s", Arg.Unit (fun () -> sentence_split:=true), "Split input into sentences (default)";
+  "-n", Arg.Unit (fun () -> sentence_split:=false), "Do not split input into sentences";
+  "-i", Arg.Unit (fun () -> comm_stdio:=true), "Communication using stdio (default)";
+  "-p", Arg.Int (fun p -> comm_stdio:=false; port:=p), "<port> Communication using sockets on given port number";
+  "-t", Arg.Unit (fun () -> output:=Text), "Output as plain text (default)";
+  "-x", Arg.Unit (fun () -> output:=Xml), "Output as XML";
+  "-m", Arg.Unit (fun () -> output:=Marsh), "Output as marshalled Ocaml data structure";
+  "-h", Arg.Unit (fun () -> output:=Html), "Output as HTML";
+  "-g", Arg.Unit (fun () -> output:=Graphviz; sentence_split:=false), "Output as graphviz dot file; turns sentence split off";
+  (* "-r", Arg.String (fun p ->
+        ENIAMtokenizerTypes.set_resource_path p;
+        ENIAMmorphologyTypes.set_resource_path p;
+        ENIAMsubsyntaxTypes.set_resource_path p), "<path> Set resource path"; *)
+  ]
+
+let usage_msg =
+  "Usage: subsyntax <options>\nInput is a sequence of lines. Empty line ends the sequence and invoke parsing. Double empty line shutdown parser.\nOptions are:"
+
+let message = "ENIAMsubsyntax: MWE, abbreviation and sentence detecion for Polish\n\
+Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>\n\
+Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences"
+
+let anon_fun s = raise (Arg.Bad ("invalid argument: " ^ s))
+
+let input_text channel =
+  let s = ref (try input_line channel with End_of_file -> "") in
+  let lines = ref [] in
+  while !s <> "" do
+    lines := !s :: !lines;
+    s := try input_line channel with End_of_file -> ""
+  done;
+  String.concat "\n" (List.rev !lines)
+
+let rec main_loop in_chan out_chan =
+  let text = input_text in_chan in
+  if text = "" then () else (
+    (* print_endline "input text begin";
+    print_endline text;
+    print_endline "input text end"; *)
+    (if !sentence_split then
+       let text,tokens = ENIAMsubsyntax.parse_text text in
+       (match !output with
+          Text -> output_string out_chan (ENIAMsubsyntaxStringOf.text "" tokens text ^ "\n" ^ ENIAMsubsyntaxStringOf.token_extarray tokens ^ "\n\n")
+        | Xml -> output_string out_chan (Xml.to_string (ENIAMsubsyntaxXMLof.text_and_tokens text tokens) ^ "\n\n")
+        | Html -> output_string out_chan (ENIAMsubsyntaxHTMLof.text_and_tokens text tokens ^ "\n\n")
+        | Marsh -> Marshal.to_channel out_chan (text,tokens) []
+        | Graphviz -> failwith "main_loop: ni")
+    else
+      let tokens = ENIAMsubsyntax.parse text in
+      (match !output with
+         Text -> output_string out_chan (ENIAMsubsyntaxStringOf.token_list tokens ^ "\n\n")
+       | Xml -> output_string out_chan (Xml.to_string (ENIAMsubsyntaxXMLof.token_list tokens) ^ "\n\n")
+       | Html -> output_string out_chan (ENIAMsubsyntaxHTMLof.token_list tokens ^ "\n\n")
+       | Marsh -> Marshal.to_channel out_chan tokens []
+       | Graphviz -> output_string out_chan (ENIAMsubsyntaxGraphOf.token_list tokens ^ "\n\n")));
+    flush out_chan;
+    main_loop in_chan out_chan)
+
+let _ =
+  prerr_endline message;
+  Arg.parse spec_list anon_fun usage_msg;
+  Gc.compact ();
+  prerr_endline "Ready!";
+  if !comm_stdio then main_loop stdin stdout
+  else
+    let sockaddr = Unix.ADDR_INET(Unix.inet_addr_any,!port) in
+    Unix.establish_server main_loop sockaddr
+*)
@@ -42,6 +42,9 @@ test2: test2.ml
 	mkdir -p results
 	$(OCAMLOPT) -o test2 $(OCAMLOPTFLAGS2) test2.ml
+# Builds the `parser` executable from interface.ml.  The program writes its
+# output into results/, so the directory is created here, as the test2 and
+# print_lexicon targets do.
+interface: interface.ml
+	mkdir -p results
+	$(OCAMLOPT) -o parser $(OCAMLOPTFLAGS2) interface.ml
+
 print_lexicon: ENIAM_LCGlexiconLatexOf.ml
 	mkdir -p results
 	$(OCAMLOPT) -o print_lexicon $(OCAMLOPTFLAGS) ENIAM_LCGlexiconLatexOf.ml
@@ -67,4 +70,4 @@ print_lexicon: ENIAM_LCGlexiconLatexOf.ml
 	$(OCAMLOPT) $(OCAMLOPTFLAGS) -c $<
 clean:
-	rm -f *~ *.cm[aoix] *.o *.so *.cmxa *.a test test2 print_lexicon
+	rm -f *~ *.cm[aoix] *.o *.so *.cmxa *.a test test2 parser print_lexicon
@@ -158,14 +158,72 @@ let load_morphosyntax path name =
         List.rev (Xlist.rev_map entries load_morph_entry)
     | _ -> failwith "load_morphosyntax"
-let rec merge_entries rev = function
+(* Parses a segment reference of the form
+   "text.xml#string-range(<id>,<beg>,<len>)" and returns
+   (id_div, id_ab, beg, len).  <id> is decoded with parse_id and must have an
+   empty corref, prefix "txt", suffix "ab" and exactly two numbers; <beg> and
+   <len> are converted with int_of_string.
+   Fails with "parse_seg_corresp" when the wrapper, the number of arguments
+   or the identifier shape is wrong; parse_id and int_of_string may raise
+   their own exceptions. *)
+let parse_seg_corresp corresp =
+  let opening = "text.xml#string-range(" in
+  let closing = ")" in
+  let well_formed =
+    Xstring.check_prefix opening corresp && Xstring.check_sufix closing corresp in
+  if not well_formed then failwith "parse_seg_corresp" else
+  let args = Xstring.cut_sufix closing (Xstring.cut_prefix opening corresp) in
+  let id,beg,len = match Xstring.split "," args with
+    | [id;beg;len] -> parse_id id, int_of_string beg, int_of_string len
+    | _ -> failwith "parse_seg_corresp" in
+  match id with
+  | {corref=""; prefix="txt"; numbers=[id_div;id_ab]; suffix="ab"} -> id_div,id_ab,beg,len
+  | _ -> failwith "parse_seg_corresp"
+
+(* Part-of-speech tags that parse_disamb accepts in the second field of a
+   disambiguated interpretation; any other tag there makes it fail. *)
+let pos_set = StringSet.of_list
+         ["subst";"depr";"ppron12";"ppron3";"siebie";"prep";"adj";"adjc";"adjp";"adja";"num";
+          "adv";"ger";"pact";"ppas";"fin";"bedzie";"praet";"winien";"impt";
+          "imps";"pred";"aglt";"inf";"pcon";"pant";"qub";"comp";"conj";"interj";"burk";"interp";
+          "brev";"xxx";"numcol"]
+
+(* Splits a disambiguated interpretation "lemma:pos:tag:..." into
+   (lemma, pos, tags).
+   The colon-only lemmas "::interp" and ":-):interp" are handled literally.
+   A lemma that itself contains one or two colons is recognised only when the
+   part of speech is "subst".  Otherwise the second field must belong to
+   pos_set.
+   Fails with "parse_disamb: <input>" on an unknown part of speech and with
+   "parse_disamb" when there are fewer than two fields. *)
+let parse_disamb disamb =
+  match disamb with
+  | "::interp" -> ":","interp",[]
+  | ":-):interp" -> ":-)","interp",[]
+  (* if Xstring.check_sufix ":interp" disamb then  Xstring.cut_sufix ":interp" disamb, "interp", [] else *)
+  | _ ->
+    (match Xstring.split_delim ":" disamb with
+     | lemma1 :: lemma2 :: "subst" :: interp ->
+       String.concat ":" [lemma1; lemma2],"subst",interp
+     | lemma1 :: lemma2 :: lemma3 :: "subst" :: interp ->
+       String.concat ":" [lemma1; lemma2; lemma3],"subst",interp
+     | lemma :: pos :: interp ->
+       if not (StringSet.mem pos_set pos) then failwith ("parse_disamb: " ^ disamb)
+       else lemma,pos,interp
+     | _ -> failwith "parse_disamb")
+
+(* Merges the token lists of one sentence taken from the segmentation and
+   morphosyntax layers, given as a pair (segmentation, morphosyntax).
+   The two lists are walked in lockstep.  For each pair the two numbers of
+   the segmentation id, of the morphosyntax back-reference and of the
+   morphosyntax id must agree (otherwise "merge_tokens 2" / "merge_tokens 3").
+   The segment's corresp is decoded with parse_seg_corresp and the
+   disambiguated interpretation with parse_disamb.
+   Returns, in input order, tuples
+   (id_div,id_ab,beg,nps,len,orth,lemma,cat,interp); [rev] is the reversed
+   accumulator.
+   Fails with "merge_tokens 1" when the lists differ in length or an element
+   does not have the expected identifier shape.
+   [name] and [id_p] are only used by the disabled diagnostics below. *)
+let rec merge_tokens name id_p rev = function
+    (corresp,nps,{corref=""; prefix="segm"; numbers=[id_segm_p;id_segm_s]; suffix="seg"}) :: segmentation,
+    ({corref="ann_segmentation.xml"; prefix="segm"; numbers=[c_segm_p;c_segm_s]; suffix="seg"},
+     {corref=""; prefix="morph"; numbers=[id_morph_p;id_morph_s]; suffix="seg"},orth,disamb) :: morphosyntax ->
+        (* if id_p <> id_segm_p then Printf.printf "merge_tokens inconsistent numbering: %s segm_%d-p segm_%d.%d-s\n" name id_p id_segm_p id_segm_s; *)
+        if id_segm_p <> c_segm_p || id_segm_p <> id_morph_p then failwith "merge_tokens 2" else
+        if id_segm_s <> c_segm_s || c_segm_s <> id_morph_s then failwith "merge_tokens 3" else
+        let id_div,id_ab,beg,len = parse_seg_corresp corresp in(
+        (* if id_div <> id_p then (*failwith*)print_endline (Printf.sprintf "merge_tokens 4: %s %d %s" name id_p corresp); (*else*) *)
+        let lemma,cat,interp = parse_disamb disamb in
+        merge_tokens name id_p ((id_div,id_ab,beg,nps,len,orth,lemma,cat,interp) :: rev) (segmentation,morphosyntax))
+  | [],[] -> List.rev rev
+  | _ -> failwith "merge_tokens 1"
+
+(* Merges the sentence lists of one paragraph taken from the segmentation and
+   morphosyntax layers, given as a pair (segmentation, morphosyntax).
+   The lists are walked in lockstep; the two numbers of the segmentation id,
+   of the morphosyntax back-reference and of the morphosyntax id must agree
+   (otherwise "merge_sentences 2" / "merge_sentences 3").  The tokens of each
+   sentence pair are merged with merge_tokens.
+   Returns, in input order, triples (id_segm_p,id_segm_s,tokens); [rev] is
+   the reversed accumulator.
+   Fails with "merge_sentences" when the lists differ in length or an element
+   does not have the expected identifier shape.
+   [name] and [id_p] are passed on to merge_tokens and otherwise only used by
+   the disabled diagnostic below. *)
+let rec merge_sentences name id_p rev = function
+    ({corref=""; prefix="segm"; numbers=[id_segm_p;id_segm_s]; suffix="s"},segm_tokens) :: segmentation,
+    ({corref="ann_segmentation.xml"; prefix="segm"; numbers=[c_segm_p;c_segm_s]; suffix="s"},
+     {corref=""; prefix="morph"; numbers=[id_morph_p;id_morph_s]; suffix="s"},morph_tokens) :: morphosyntax ->
+        (* if id_p <> id_segm_p then Printf.printf "merge_sentences inconsistent numbering: %s segm_%d-p segm_%d.%d-s\n" name id_p id_segm_p id_segm_s; *)
+        if id_segm_p <> c_segm_p || id_segm_p <> id_morph_p then failwith "merge_sentences 2" else
+        if id_segm_s <> c_segm_s || c_segm_s <> id_morph_s then failwith "merge_sentences 3" else
+        let tokens = merge_tokens name id_p [] (segm_tokens,morph_tokens) in
+        merge_sentences name id_p ((id_segm_p,id_segm_s,tokens) :: rev) (segmentation,morphosyntax)
+  | [],[] -> List.rev rev
+  | _ -> failwith "merge_sentences"
+
+(* Merges the text, segmentation and morphosyntax layers of one document,
+   given as a triple of lists walked in lockstep.  The single number carried
+   by the text id, the segmentation ids and the morphosyntax ids of each
+   triple must be the same (otherwise "merge_entries 2").  The sentences of
+   each entry are merged with merge_sentences.
+   Returns, in input order, triples (id_div,paragraphs,sentences); [rev] is
+   the reversed accumulator.
+   Fails with "merge_entries" when the lists differ in length or an element
+   does not have the expected identifier shape. *)
+let rec merge_entries name rev = function
     ({corref=""; prefix="txt"; numbers=[id_div]; suffix="div"},paragraphs) :: text,
     ({corref="text.xml"; prefix="txt"; numbers=[c_div]; suffix="div"},
      {corref=""; prefix="segm"; numbers=[id_segm_p]; suffix="p"},segm_sentences) :: segmentation,
     ({corref="ann_segmentation.xml"; prefix="segm"; numbers=[c_segm_p]; suffix="p"},
      {corref=""; prefix="morph"; numbers=[id_morph_p]; suffix="p"},morph_sentences) :: morphosyntax ->
         if id_div <> c_div || c_div <> id_segm_p || id_segm_p <> c_segm_p || c_segm_p <> id_morph_p then failwith "merge_entries 2" else
-        merge_entries ((id_div,paragraphs,segm_sentences,morph_sentences) :: rev) (text,segmentation,morphosyntax)
+        let sentences = merge_sentences name id_div [] (segm_sentences,morph_sentences) in
+        merge_entries name ((id_div,paragraphs,sentences) :: rev) (text,segmentation,morphosyntax)
   | [],[],[] -> List.rev rev
   | _ -> failwith "merge_entries"
@@ -174,7 +232,7 @@ let nkjp_path = "../../NLP resources/NKJP-PodkorpusMilionowy-1.2/"
 let _ =
   let names = get_folders nkjp_path in
   Xlist.iter names (fun name ->
-    print_endline name;
+    (* print_endline name; *)
     let typ,channel = load_header nkjp_path name in
     (* print_endline typ; *)
     (* print_endline channel; *)
@@ -182,7 +240,7 @@ let _ =
     let text = load_text nkjp_path name in
     let segmentation = load_segmentation nkjp_path name in
     let morphosyntax = load_morphosyntax nkjp_path name in
-    let entries = merge_entries [] (text,segmentation,morphosyntax) in
+    let entries = merge_entries name [] (text,segmentation,morphosyntax) in
     ())
 (*
@@ -30,7 +30,7 @@ let load_dict dict filename =
+(* Dictionary built by loading brev_filename, then fixed_filename, then
+   mwe_filename into one map with load_dict.  A failure while loading
+   fixed_filename is reported on stderr and that file is skipped.
+   NOTE(review): the handler catches every exception, so any error raised by
+   load_dict (not only a missing file) is reported as "not found" - confirm
+   this is intended. *)
 let mwe_dict =
   let dict = load_dict StringMap.empty brev_filename in
-  let dict = load_dict dict fixed_filename in
+  let dict = try load_dict dict fixed_filename with _ -> (prerr_endline ("ENIAMsubsyntax file " ^ fixed_filename ^ " not found"); dict) in
 (*    let dict = load_dict dict complete_entries_filename in*)
   let dict = load_dict dict mwe_filename in
   dict
@@ -200,7 +200,7 @@ let select_tokens paths =
 (*     | Dig(value,cat) -> t :: paths *)
     | Other orth -> t :: paths
     | Lemma(lemma,pos,interp) -> if pos = "brev" then paths else t :: paths
-    | Proper(lemma,pos,interp,cat) -> t :: paths
+    | Proper(lemma,pos,interp,cat) -> if pos = "brev" then paths else t :: paths
 (*     | Compound _ -> t :: paths *)
     | _ -> paths))
@@ -213,6 +213,7 @@ let load_proper_name proper = function
+(* Proper-name map obtained by folding load_proper_name with File.fold_tab
+   over proper_names_filename, proper_names_filename2 and
+   proper_names_filename3, in that order, starting from an empty map. *)
 let proper_names =
   let proper = File.fold_tab proper_names_filename StringMap.empty load_proper_name in
   let proper = File.fold_tab proper_names_filename2 proper load_proper_name in
+  let proper = File.fold_tab proper_names_filename3 proper load_proper_name in
   proper
 let remove l s =
@@ -55,6 +55,7 @@ let lemma_frequencies_filename = resource_path ^ "/subsyntax/NKJP1M-lemma-freq.t
 let proper_names_filename2 = resource_path ^ "/subsyntax/proper_names.tab" *)
 let proper_names_filename = resource_path ^ "/subsyntax/proper_names_sgjp_polimorf_20151020.tab"
 let proper_names_filename2 = resource_path ^ "/subsyntax/proper_names_20160104.tab"
+let proper_names_filename3 = resource_path ^ "/subsyntax/ne.tab"
 let int_of_mode = function
     Raw -> 0
+Akademia Sztuki	ORGANIZACJA
+Atelier Bizio + Ligierko	ORGANIZACJA
+Instytut Architektury i Planowania Przestrzennego	ORGANIZACJA
+Katedra Architektury Współczesnej Teorii i Metodologii Projektowania	ORGANIZACJA
+VII Liceum Ogólnokształcące im. K.K. Baczyńskiego	ORGANIZACJA
+IV Liceum Ogólnokształcące im. L. Szenwalda	ORGANIZACJA
+Muzeum Narodowe	ORGANIZACJA
+Nagroda Artystyczna m. Szczecina	WYRÓŻNIENIE
+Zachodniopomorski Nobel	WYRÓŻNIENIE
+Politechnika Krakowska	ORGANIZACJA
+Politechnika Szczecińska	ORGANIZACJA
+Pracownia Podstaw Projektowania	ORGANIZACJA
+Przegląd Teatrów Małych Form „Kontrapunkt”	ORGANIZACJA
+Mistrzowska Szkoła Reżyserii Filmowej Andrzeja Wajdy	ORGANIZACJA
+Uniwersytet im. M. Kopernika	ORGANIZACJA
+Zachodniopomorski Uniwersytet Technologiczny	ORGANIZACJA
+Wydział Budownictwa i Architektury	ORGANIZACJA
+Wydział Sztuk Wizualnych	ORGANIZACJA
+Zakład Teorii Architektury, Historii i Konserwacji Zabytków	ORGANIZACJA
+Festiwal Polskich Sztuk Współczesnych R@Port	WYDARZENIE
+Sosnowiec	MIASTO
+Stefan	IMIĘ
+Józefa	IMIĘ
+Szczecin	MIASTO
+Waldemar	IMIĘ
+Marzęcki	NAZWISKO
+Austria	KRAJ
+Czechy	KRAJ
+Niemcy	KRAJ
+Francja	KRAJ
+Litwa	KRAJ
+USA	KRAJ
+Rosja	KRAJ
+
@@ -21,7 +21,7 @@ open ENIAMtokenizerTypes
 let mte_patterns =
   let lines = try File.load_lines mte_filename
-   with _ -> (print_endline ("ENIAMtokenizer mte file " ^ mte_filename ^ " not found"); []) in
+   with _ -> (prerr_endline ("ENIAMtokenizer mte file " ^ mte_filename ^ " not found"); []) in
   let l = List.rev (Xlist.rev_map lines (fun line ->
     match Str.split (Str.regexp "\t") line with
       [orths; lemma; interp] -> Str.split (Str.regexp " ") orths, lemma, interp
@@ -72,4 +72,5 @@ let resource_path =
     if Sys.file_exists "/usr/local/share/eniam" then "/usr/local/share/eniam" else
     failwith "resource directory does not exists"
-let mte_filename = resource_path ^ "/tokenizer/mte.tab"
+(* let mte_filename = resource_path ^ "/tokenizer/mte.tab" *)
+let mte_filename = resource_path ^ "/tokenizer/mte_20151215.tab"
@@ -18,7 +18,7 @@ install: all
 	mkdir -p /usr/share/eniam/tokenizer
 	cp resources/mte_20151215.tab  /usr/share/eniam/tokenizer/mte_20151215.tab
 	cp resources/README  /usr/share/eniam/tokenizer/README
-	ln -s /usr/share/eniam/tokenizer/mte_20151215.tab /usr/share/eniam/tokenizer/mte.tab
+#	ln -s /usr/share/eniam/tokenizer/mte_20151215.tab /usr/share/eniam/tokenizer/mte.tab
 install-local: all
 	mkdir -p $(INSTALLDIR)
@@ -28,7 +28,7 @@ install-local: all
 	mkdir -p /usr/local/share/eniam/tokenizer
 	cp resources/mte_20151215.tab  /usr/local/share/eniam/tokenizer/mte_20151215.tab
 	cp resources/README  /usr/local/share/eniam/tokenizer/README
-	ln -s /usr/local/share/eniam/tokenizer/mte_20151215.tab /usr/local/share/eniam/tokenizer/mte.tab
+#	ln -s /usr/local/share/eniam/tokenizer/mte_20151215.tab /usr/local/share/eniam/tokenizer/mte.tab
 eniam-tokenizer.cma: $(SOURCES)
 	ocamlc -linkall -a -o eniam-tokenizer.cma $(OCAMLFLAGS) $^