Commit b369a30acf75ced421d7d50d287a58cac987ec2d

Authored by Wojciech Jaworski
1 parent 2f308cb1

Parser gramatyk semantycznych

LCGlexicon/ENIAM_LCGlexiconTypes.ml
... ... @@ -83,7 +83,13 @@ let resource_path =
83 83 if Sys.file_exists "/usr/local/share/eniam" then "/usr/local/share/eniam" else
84 84 failwith "resource directory does not exists"
85 85  
  86 +let data_path = (* Directory for user data files: value of the ENIAM_USER_DATA_PATH environment variable, or the relative path "data" when the variable is not set. *)
  87 + try Sys.getenv "ENIAM_USER_DATA_PATH"
  88 + with Not_found -> "data"
  89 +
86 90 let rules_filename = resource_path ^ "/LCGlexicon/lexicon-pl.dic"
  91 +let user_lexicon_filename = data_path ^ "/lexicon.dic"
  92 +let user_senses_filename = data_path ^ "/senses.tab"
87 93  
88 94 let subst_uncountable_lexemes_filename = resource_path ^ "/LCGlexicon/subst_uncountable.dat"
89 95 let subst_uncountable_lexemes_filename2 = resource_path ^ "/LCGlexicon/subst_uncountable_stare.dat"
... ... @@ -91,7 +97,4 @@ let subst_container_lexemes_filename = resource_path ^ "/LCGlexicon/subst_contai
91 97 let subst_numeral_lexemes_filename = resource_path ^ "/LCGlexicon/subst_numeral.dat"
92 98 let subst_time_lexemes_filename = resource_path ^ "/LCGlexicon/subst_time.dat"
93 99  
94   -(*let proper_names_filename = resource_path ^ "/lexSemantics/proper_names_sgjp_polimorf.tab"
95   - let proper_names_filename2 = resource_path ^ "/lexSemantics/proper_names.tab"*)
96   -
97 100 let adv_modes_filename = resource_path ^ "/Walenty/adv_modes.tab"
... ...
LCGlexicon/ENIAMcategoriesPL.ml
... ... @@ -36,7 +36,7 @@ let selector_values = Xlist.fold [
36 36 "match-result";"url";"email";"obj-id";"adj";"adjc";"adjp";"adja";
37 37 "adv";"ger";"pact";"ppas";"fin";"bedzie";"praet";"winien";"impt";
38 38 "imps";"pred";"aglt";"inf";"pcon";"pant";"qub";"comp";"conj";"interj";
39   - "sinterj";"burk";"interp";"unk"];
  39 + "sinterj";"burk";"interp";"unk";"html-tag"];
40 40 Pos2, [];
41 41 Cat, [];
42 42 Number, all_numbers;
... ... @@ -74,22 +74,26 @@ let split_voc cases =
74 74 "voc" -> cases, "voc" :: voc
75 75 | s -> s :: cases, voc)
76 76  
77   -let subst_uncountable_lexemes = StringSet.of_list (File.load_lines subst_uncountable_lexemes_filename)
78   -let subst_uncountable_lexemes2 = StringSet.of_list (File.load_lines subst_uncountable_lexemes_filename2)
79   -let subst_container_lexemes = StringSet.of_list (File.load_lines subst_container_lexemes_filename)
80   -let subst_numeral_lexemes = StringSet.of_list (File.load_lines subst_numeral_lexemes_filename)
81   -let subst_time_lexemes = StringSet.of_list (File.load_lines subst_time_lexemes_filename)
  77 +let load_subst_data filename _ = (* Loads the lines of [filename] into a string set. The second argument is ignored; it is only there so that the partial application can be passed to File.catch_no_file together with a default value, as done below. *)
  78 + StringSet.of_list (File.load_lines filename)
  79 +
  80 +let subst_uncountable_lexemes = File.catch_no_file (load_subst_data subst_uncountable_lexemes_filename) StringSet.empty
  81 +let subst_uncountable_lexemes2 = File.catch_no_file (load_subst_data subst_uncountable_lexemes_filename2) StringSet.empty
  82 +let subst_container_lexemes = File.catch_no_file (load_subst_data subst_container_lexemes_filename) StringSet.empty
  83 +let subst_numeral_lexemes = File.catch_no_file (load_subst_data subst_numeral_lexemes_filename) StringSet.empty
  84 +let subst_time_lexemes = File.catch_no_file (load_subst_data subst_time_lexemes_filename) StringSet.empty
82 85  
83 86 let subst_pronoun_lexemes = StringSet.of_list ["co"; "kto"; "cokolwiek"; "ktokolwiek"; "nic"; "nikt"; "coś"; "ktoś"; "to"]
84 87 let adj_pronoun_lexemes = StringSet.of_list ["czyj"; "jaki"; "który"; "jakiś"; "ten"; "taki"]
85 88  
86 89 (* let adj_quant_lexemes = StringSet.of_list ["każdy"; "wszelki"; "wszystek"; "żaden"; "jakiś"; "pewien"; "niektóry"; "jedyny"; "sam"] *)
87 90  
88   -let adv_modes =
89   - try File.fold_tab adv_modes_filename StringMap.empty (fun adv_modes -> function
  91 +let load_adv_modes filename adv_modes =
  92 + File.fold_tab filename adv_modes (fun adv_modes -> function
90 93 [adv;mode] -> StringMap.add_inc adv_modes adv [mode] (fun l -> mode :: l)
91 94 | _ -> failwith "adv_modes")
92   - with _ -> (prerr_endline ("ENIAMlexicon adv_modes file " ^ adv_modes_filename ^ " not found"); StringMap.empty)
  95 +
  96 +let adv_modes = File.catch_no_file (load_adv_modes adv_modes_filename) StringMap.empty
93 97  
94 98 let noun_type proper lemma pos =
95 99 let nsyn =
... ... @@ -347,6 +351,7 @@ let clarify_categories proper cat = function
347 351 | lemma,"interp",[] -> [{empty_cats with lemma=lemma; pos="interp"; pos2="interp"}]
348 352 | lemma,"unk",[] ->
349 353 [{empty_cats with lemma=lemma; pos="unk"; pos2="noun"; numbers=all_numbers; cases=all_cases; genders=all_genders; persons=["ter"]}]
  354 + | lemma,"html-tag",[] -> [{empty_cats with lemma=lemma; pos="html-tag"; pos2="html-tag"}]
350 355 | lemma,c,l -> failwith ("clarify_categories: " ^ lemma ^ ":" ^ c ^ ":" ^ (String.concat ":" (Xlist.map l (String.concat "."))))
351 356  
352 357 (* FIXME: przenieść gdzieś indziej *)
... ... @@ -547,4 +552,5 @@ let pos_categories = Xlist.fold [
547 552 "burk",[Lemma;];
548 553 "interp",[Lemma;];
549 554 "unk",[Lemma;Number;Case;Gender;Person;];
  555 + "html-tag",[Lemma;];
550 556 ] StringMap.empty (fun map (k,l) -> StringMap.add map k l)
... ...
LCGlexicon/TODO
  1 +- poprawić parser.ml tak by łączył się sieciowo z subsyntax
1 2  
2 3 "Można było" - brakuje uzgodnienia rodzaju przymiotnika w przypadku predykatywnym, i ogólnie kontroli składniowej
3 4  
... ...
LCGlexicon/makefile
... ... @@ -4,7 +4,8 @@ OCAMLDEP=ocamldep
4 4 INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I +eniam
5 5 OCAMLFLAGS=$(INCLUDES) -g
6 6 OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-lcg-parser.cmxa eniam-lcg-lexicon.cmxa
7   -OCAMLOPTFLAGS2=$(OCAMLOPTFLAGS) eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa eniam-lexSemantics.cmxa
  7 +OCAMLOPTFLAGS2=$(OCAMLOPTFLAGS) eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa
  8 +OCAMLOPTFLAGS3=$(OCAMLOPTFLAGS2) eniam-lexSemantics.cmxa
8 9 INSTALLDIR=`ocamlc -where`/eniam
9 10  
10 11 SOURCES= ENIAM_LCGlexiconTypes.ml ENIAMcategoriesPL.ml ENIAM_LCGlexiconParser.ml ENIAM_LCGlexicon.ml
... ... @@ -40,10 +41,14 @@ test: test.ml
40 41  
41 42 test2: test2.ml
42 43 mkdir -p results
43   - $(OCAMLOPT) -o test2 $(OCAMLOPTFLAGS2) test2.ml
  44 + $(OCAMLOPT) -o test2 $(OCAMLOPTFLAGS3) test2.ml
44 45  
45 46 interface: interface.ml
46   - $(OCAMLOPT) -o parser $(OCAMLOPTFLAGS2) interface.ml
  47 + $(OCAMLOPT) -o parser $(OCAMLOPTFLAGS3) interface.ml
  48 +
  49 +parser: parser.ml # NOTE(review): the interface target above also links an executable named parser (from interface.ml), so both targets write the same output file - confirm this is intended
  50 + mkdir -p results
  51 + $(OCAMLOPT) -o parser $(OCAMLOPTFLAGS2) parser.ml
47 52  
48 53 print_lexicon: ENIAM_LCGlexiconLatexOf.ml
49 54 mkdir -p results
... ...
LCGlexicon/parser.ml 0 → 100644
  1 +open Xstd
  2 +open ENIAMsubsyntaxTypes
  3 +
  4 +let rules = ENIAM_LCGlexicon.make_rules false ENIAM_LCGlexiconTypes.user_lexicon_filename (* Rule set returned by make_rules for the user lexicon file; evaluated once when the module is initialised and used by create_chart. *)
  5 +
  6 +let load_senses_map filename = (* Folds over the rows of [filename] and builds a map from lemma to the list of categories given for it; every row must have exactly two fields. *)
  7 + File.fold_tab filename StringMap.empty (fun map -> function
  8 + [lemma;cat] -> StringMap.add_inc map lemma [cat] (fun l -> cat :: l) (* presumably: a new lemma is bound to [cat], an already present one gets cat prepended - confirm against StringMap.add_inc *)
  9 + | l -> failwith ("load_senses_map: " ^ String.concat "\t" l)) (* any other row shape aborts, reporting the offending row *)
  10 +
  11 +let senses_map = load_senses_map ENIAM_LCGlexiconTypes.user_senses_filename (* Lemma-to-senses map, loaded when the module is initialised. NOTE(review): unlike the other loaders changed in this commit, this call is not wrapped in File.catch_no_file - confirm that a missing user senses file should abort start-up. *)
  12 +
  13 +
  14 +let examples = [ (* (name, sentence) pairs; in the visible code they are only referenced from the commented-out driver further down. *)
  15 + (* "liceum","W 1984-89 uczęszczał do VII Liceum Ogólnokształcącego im. K.K. Baczyńskiego w Szczecinie."; *)
  16 + "studia","Następnie studiował architekturę na Politechnice Szczecińskiej, dyplom uzyskał w 1994.";
  17 +]
  18 +
  19 +let clarify_categories token = (* Turns a tokenizer token into the list of category records returned by ENIAMcategoriesPL.clarify_categories; token kinds other than Lemma, Proper and Interp give the empty list. *)
  20 + match token.ENIAMtokenizerTypes.token with
  21 + ENIAMtokenizerTypes.Lemma(lemma,pos,interp) ->
  22 + let senses = try StringMap.find senses_map lemma with Not_found -> ["X"] in (* lemmas missing from senses_map get the single placeholder sense "X" *)
  23 + List.flatten (Xlist.map interp (fun interp -> ENIAMcategoriesPL.clarify_categories false senses (lemma,pos,interp))) (* one call per interpretation, results concatenated *)
  24 + | ENIAMtokenizerTypes.Proper(lemma,pos,interp,senses) -> List.flatten (Xlist.map interp (fun interp -> ENIAMcategoriesPL.clarify_categories true senses (lemma,pos,interp))) (* Proper tokens carry their own senses; the first argument is true only in this case *)
  25 + | ENIAMtokenizerTypes.Interp lemma -> ENIAMcategoriesPL.clarify_categories false ["X"] (lemma,"interp",[])
  26 + | _ -> []
  27 +
  28 +let create_chart tokens paths last = (* Creates a chart with ENIAM_LCGchart.make last and, for every (id,lnode,rnode) in [paths], adds the lexicon entries of token [id] between nodes lnode and rnode. *)
  29 + ENIAM_LCGrenderer.reset_variable_numbers (); (* renderer state is reset once per chart ... *)
  30 + let chart = ENIAM_LCGchart.make last in
  31 + let chart = Xlist.fold paths chart (fun chart (id,lnode,rnode) ->
  32 + let t = ExtArray.get tokens id in
  33 + ENIAM_LCGrenderer.reset_variable_names (); (* ... and adjusted again for every token *)
  34 + ENIAM_LCGrenderer.add_variable_numbers ();
  35 + let cats = clarify_categories t in
  36 + let l = ENIAM_LCGlexicon.create_entries rules id t.ENIAMtokenizerTypes.orth cats [] in
  37 + ENIAM_LCGchart.add_inc_list chart lnode rnode l 0) in
  38 + chart
  39 +
  40 +let test_example name tokens paths last = (* Parses one sentence given as token paths and writes the intermediate charts, references, term and dependency trees under results/ with file names prefixed by [name]; prints "not parsed" or "not reduced" when the corresponding stage fails. *)
  41 + ENIAM_LCGreductions.reset_variant_label ();
  42 + let chart = create_chart tokens paths last in
  43 + ENIAM_LCGlatexOf.print_chart "results/" (name^"1_chart") "a1" chart;
  44 + let chart,references = ENIAM_LCGchart.lazify chart in
  45 + ENIAM_LCGlatexOf.print_chart "results/" (name^"2_chart") "a4" chart;
  46 + ENIAM_LCGlatexOf.print_references "results/" (name^"2_references") "a4" references;
  47 + let chart = ENIAM_LCGchart.parse chart references 30. Sys.time in (* note: references is implicitly mutated in place; NOTE(review): 30. is presumably a time limit measured with Sys.time - confirm *)
  48 + ENIAM_LCGlatexOf.print_chart "results/" (name^"3_chart") "a4" chart;
  49 + ENIAM_LCGlatexOf.print_references "results/" (name^"3_references") "a4" references;
  50 + if ENIAM_LCGchart.is_parsed chart then (
  51 + let term = ENIAM_LCGchart.get_parsed_term chart in
  52 + Xlatex.latex_file_out "results/" (name^"4_term") "a4" false (fun file ->
  53 + Printf.fprintf file "\\[%s\\]\n" (ENIAM_LCGlatexOf.linear_term 0 term));
  54 + Xlatex.latex_compile_and_clean "results/" (name^"4_term");
  55 + let dependency_tree = ENIAM_LCGreductions.reduce term references in
  56 + ENIAM_LCGlatexOf.print_dependency_tree "results/" (name^"4_dependency_tree") "a0" dependency_tree;
  57 + if ENIAM_LCGreductions.is_reduced_dependency_tree dependency_tree then (
  58 + ENIAM_LCGreductions.assign_labels dependency_tree; (* note: dependency_tree is implicitly mutated in place *)
  59 + ENIAM_LCGlatexOf.print_dependency_tree "results/" (name^"5_dependency_tree") "a4" dependency_tree;
  60 + ENIAM_LCGreductions.remove_cuts dependency_tree; (* note: dependency_tree is implicitly mutated in place *)
  61 + ENIAM_LCGlatexOf.print_dependency_tree "results/" (name^"6_dependency_tree") "a4" dependency_tree;
  62 + ENIAM_LCGgraphOf.print_dependency_tree "results/" (name^"6_dependency_tree") dependency_tree;
  63 + ENIAM_LCGgraphOf.print_simplified_dependency_tree "results/" (name^"6_simple_dependency_tree") dependency_tree;
  64 + ())
  65 + else print_endline "not reduced")
  66 + else print_endline "not parsed"
  67 +
  68 +let rec parse_sentence name id tokens = function (* Runs test_example on every StructSentence contained in the given sentence and returns the id following the last one used. *)
  69 + RawSentence s -> id (* raw sentences are skipped, id unchanged *)
  70 + | StructSentence(paths,last) ->
  71 + test_example (name ^ string_of_int id ^ "_") tokens paths last; (* output files are prefixed with name, the current id and an underscore *)
  72 + id + 1
  73 + | DepSentence(paths) -> id (* dependency sentences are skipped, id unchanged *)
  74 + | QuotedSentences sentences ->
  75 + Xlist.fold sentences id (fun id p ->
  76 + parse_sentence name id tokens p.sentence)
  77 + | AltSentence l ->
  78 + Xlist.fold l id (fun id (mode,sentence) -> (* every alternative is processed, whatever its mode *)
  79 + parse_sentence name id tokens sentence)
  80 +
  81 +let rec parse_paragraph name id tokens = function (* Applies parse_sentence to every sentence of the paragraph, threading the running id through; returns the resulting id. *)
  82 + RawParagraph s -> id (* raw paragraphs are skipped, id unchanged *)
  83 + | StructParagraph sentences ->
  84 + Xlist.fold sentences id (fun id p ->
  85 + parse_sentence name id tokens p.sentence)
  86 + | AltParagraph l ->
  87 + Xlist.fold l id (fun id (mode,paragraph) -> (* every alternative is processed, whatever its mode *)
  88 + parse_paragraph name id tokens paragraph)
  89 +
  90 +let rec parse_text name id tokens = function (* Applies parse_paragraph to every paragraph of the text, threading the running id through; returns the resulting id. *)
  91 + RawText s -> id (* raw text is skipped, id unchanged *)
  92 + | StructText paragraphs ->
  93 + Xlist.fold paragraphs id (fun id paragraph ->
  94 + parse_paragraph name id tokens paragraph)
  95 + | AltText l ->
  96 + Xlist.fold l id (fun id (mode,text) -> (* every alternative is processed, whatever its mode *)
  97 + parse_text name id tokens text)
  98 +
  99 +
  100 +(* let _ =
  101 + Xlist.iter examples (fun (name,example) ->
  102 + let text,tokens = ENIAMsubsyntax.parse_text example in
  103 + ignore(parse_text name 1 tokens text)) *)
  104 +
  105 +(*
  106 +type entry = {title: string; info:string; biogram:string; (*primary:string; secondary:string;*) author:string}
  107 +
  108 +let process_xml = function
  109 + Xml.Element("entries",[],entries) ->
  110 + List.rev (Xlist.rev_map entries (function
  111 + Xml.Element("entry",[],[title;info;biogram(*;primary;secondary*);author]) ->
  112 + {title=Xml.to_string title; info=Xml.to_string info; biogram=Xml.to_string biogram;
  113 + (*primary=Xml.to_string primary; secondary=Xml.to_string secondary;*) author=Xml.to_string author}
  114 + | _ -> failwith "process_xml 1"))
  115 + | _ -> failwith "process_xml 2"
  116 +
  117 +
  118 +let load_ppibl filename =
  119 + let ppibl = File.load_file_gen ("data/" ^ filename) in
  120 + process_xml (Xml.parse_string ppibl)
  121 +
  122 +let named_entities =
  123 + File.fold_tab "data/ne.tab" StringMap.empty (fun map -> function
  124 + [lemma;cat] -> StringMap.add_inc map lemma [cat] (fun l -> cat :: l)
  125 + | _ -> failwith "named_entities")
  126 +
  127 +let assign_named_entities t =
  128 + match t.token with
  129 + Lemma(lemma,"subst",interp) ->
  130 + (try
  131 + let cat = StringMap.find named_entities lemma in
  132 + {t with token=Proper(lemma,"subst",interp,cat)}
  133 + with Not_found -> t)
  134 + | Proper(lemma,"subst",interp,_) ->
  135 + (try
  136 + let cat = StringMap.find named_entities lemma in
  137 + {t with token=Proper(lemma,"subst",interp,cat)}
  138 + with Not_found -> t)
  139 + | _ -> t
  140 +
  141 +let test_strings = [
  142 + (* "Debiutował opowiadaniem pt. <i>Zlecenie na dostawę</i>."; *)
  143 + "W 1984-89 uczęszczał do VII Liceum Ogólnokształcącego im. K.K. Baczyńskiego w Szczecinie.";
  144 + (* "Następnie studiował architekturę na Politechnice Szczecińskiej, dyplom uzyskał w 1994." *)
  145 + (* "W 2003 obronił doktorat nauk technicznych w zakresie architektury i urbanistyki na Politechnice Krakowskiej i został adiunktem w Zakładzie Teorii Architektury, Historii i Konserwacji Zabytków IAiPP." *)
  146 +]
  147 +
  148 +(* let _ =
  149 + let entries = load_ppibl "ak322269.xml" in
  150 + Xlist.iter entries (fun entry -> print_endline entry.biogram) *)
  151 +
  152 +(*
  153 +let test_strings = [
  154 + "Szpak frunie.";
  155 + "Kot np. miauczy.";
  156 + "Ala ma kota.";
  157 + "Ale mają kota:"
  158 + ]
  159 +
  160 +let test_strings2 = [
  161 + "Szpak frunie. Kot miauczy.";
  162 + "Szpak powiedział: „Frunę. Kiszę.”";
  163 + ]
  164 +*)
  165 +
  166 +let grammar = [
  167 + "pos=year", Basic "year",symbol_weight;
  168 + "pos=year-interval", Basic "year-interval",symbol_weight;
  169 + "lemma=w,pos=prep,case=loc", Basic "time/(year+year-interval)",0.;
  170 + "lemma=w,pos=prep,case=loc", Basic "locat/np*MIASTO*T*loc*T",0.;
  171 +
  172 + "lemma=uczęszczać,pos=praet|fin,person=ter,negation=aff,mood=indicative", Basic "ip*number*gender{|(1+time),|(1+pp*ORGANIZACJA*do*gen),|(1+locat)}",0.;
  173 + "lemma=do,pos=prep,case=gen", Basic "pp*sense*lemma*case/np*sense*T*case*T",0.;
  174 +
  175 +]
  176 +
  177 +let _ =
  178 + print_endline "Testy wbudowane";
  179 + Xlist.iter test_strings (fun s ->
  180 + print_endline ("\nTEST: " ^ s);
  181 + let paths = ENIAMsubsyntax.parse s in
  182 + let paths = Xlist.map paths assign_named_entities in
  183 + (* print_endline (ENIAMtokenizer.xml_of tokens); *)
  184 + print_endline (ENIAMpaths.to_string (paths,0)));
  185 +(* Xlist.iter test_strings2 (fun s ->
  186 + print_endline ("\nTEST: " ^ s);
  187 + let text,tokens = ENIAMsubsyntax.parse_text s in
  188 + (* print_endline (ENIAMtokenizer.xml_of tokens); *)
  189 + print_endline (ENIAMsubsyntaxStringOf.tokens tokens);
  190 + print_endline "";
  191 + print_endline (ENIAMsubsyntaxStringOf.text "" tokens text));*)
  192 +(* print_endline "Testy użytkownika.";
  193 + print_endline "Wpisz tekst i naciśnij ENTER, pusty tekst kończy.";
  194 + let s = ref (read_line ()) in
  195 + while !s <> "" do
  196 + let tokens = ENIAMtokenizer.parse !s in
  197 + (* print_endline (ENIAMtokenizer.xml_of tokens); *)
  198 + Xlist.iter tokens (fun token -> print_endline (ENIAMtokenizer.string_of 0 token));
  199 + print_endline "Wpisz tekst i naciśnij ENTER, pusty tekst kończy.";
  200 + s := read_line ()
  201 + done;*)
  202 + ()
  203 +
  204 +open ENIAM_LCGlexiconTypes
  205 +open ENIAM_LCGtypes
  206 +
  207 +
  208 +(*
  209 +type output = Text | Xml | Html | Marsh | Graphviz
  210 +
  211 +let output = ref Text
  212 +let comm_stdio = ref true
  213 +let sentence_split = ref true
  214 +let port = ref 0
  215 +
  216 +let spec_list = [
  217 + "-s", Arg.Unit (fun () -> sentence_split:=true), "Split input into sentences (default)";
  218 + "-n", Arg.Unit (fun () -> sentence_split:=false), "Do not split input into sentences";
  219 + "-i", Arg.Unit (fun () -> comm_stdio:=true), "Communication using stdio (default)";
  220 + "-p", Arg.Int (fun p -> comm_stdio:=false; port:=p), "<port> Communication using sockets on given port number";
  221 + "-t", Arg.Unit (fun () -> output:=Text), "Output as plain text (default)";
  222 + "-x", Arg.Unit (fun () -> output:=Xml), "Output as XML";
  223 + "-m", Arg.Unit (fun () -> output:=Marsh), "Output as marshalled Ocaml data structure";
  224 + "-h", Arg.Unit (fun () -> output:=Html), "Output as HTML";
  225 + "-g", Arg.Unit (fun () -> output:=Graphviz; sentence_split:=false), "Output as graphviz dot file; turns sentence split off";
  226 + (* "-r", Arg.String (fun p ->
  227 + ENIAMtokenizerTypes.set_resource_path p;
  228 + ENIAMmorphologyTypes.set_resource_path p;
  229 + ENIAMsubsyntaxTypes.set_resource_path p), "<path> Set resource path"; *)
  230 + ]
  231 +
  232 +let usage_msg =
  233 + "Usage: subsyntax <options>\nInput is a sequence of lines. Empty line ends the sequence and invoke parsing. Double empty line shutdown parser.\nOptions are:"
  234 +*)*)
  235 +let message = "ENIAM_LCGparser, a parser for Logical Categorial Grammar formalism\n\
  236 +Copyright (C) 2017 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>\n\
  237 +Copyright (C) 2017 Institute of Computer Science Polish Academy of Sciences" (* Banner text; written to stderr by the entry point at the end of this file. *)
  238 +(*
  239 +let anon_fun s = raise (Arg.Bad ("invalid argument: " ^ s))
  240 +*)
  241 +let input_text channel = (* Reads lines from [channel] up to the first empty line or end of file and returns them joined with newlines; the result is "" when nothing was read. *)
  242 + let s = ref (try input_line channel with End_of_file -> "") in (* end of file is treated like an empty line *)
  243 + let lines = ref [] in
  244 + while !s <> "" do
  245 + lines := !s :: !lines; (* accumulated in reverse order, reversed below *)
  246 + s := try input_line channel with End_of_file -> ""
  247 + done;
  248 + String.concat "\n" (List.rev !lines)
  249 +
  250 +let rec main_loop in_chan out_chan = (* Repeatedly reads a block of text from [in_chan] with input_text and parses it; stops when the block is empty. In the live code nothing is written to [out_chan], it is only flushed. *)
  251 + let text = input_text in_chan in
  252 + if text = "" then () else ( (* an empty block ends the loop *)
  253 + let text,tokens = ENIAMsubsyntax.parse_text text in
  254 + ignore(parse_text "E"(*name*) 1 tokens text) (* NOTE(review): every block is parsed with name "E" and start id 1, so result files written by test_example for an earlier block are overwritten by later ones - confirm this is intended *)
  255 + (* print_endline "input text begin";
  256 + print_endline text;
  257 + print_endline "input text end"; *)
  258 + (*if !sentence_split then
  259 + let text,tokens = ENIAMsubsyntax.parse_text text in
  260 + (match !output with
  261 + Text -> output_string out_chan (ENIAMsubsyntaxStringOf.text "" tokens text ^ "\n" ^ ENIAMsubsyntaxStringOf.token_extarray tokens ^ "\n\n")
  262 + | Xml -> output_string out_chan (Xml.to_string (ENIAMsubsyntaxXMLof.text_and_tokens text tokens) ^ "\n\n")
  263 + | Html -> output_string out_chan (ENIAMsubsyntaxHTMLof.text_and_tokens text tokens ^ "\n\n")
  264 + | Marsh -> Marshal.to_channel out_chan (text,tokens) []
  265 + | Graphviz -> failwith "main_loop: ni")
  266 + else
  267 + let tokens = ENIAMsubsyntax.parse text in
  268 + (match !output with
  269 + Text -> output_string out_chan (ENIAMsubsyntaxStringOf.token_list tokens ^ "\n\n")
  270 + | Xml -> output_string out_chan (Xml.to_string (ENIAMsubsyntaxXMLof.token_list tokens) ^ "\n\n")
  271 + | Html -> output_string out_chan (ENIAMsubsyntaxHTMLof.token_list tokens ^ "\n\n")
  272 + | Marsh -> Marshal.to_channel out_chan tokens []
  273 + | Graphviz -> output_string out_chan (ENIAMsubsyntaxGraphOf.token_list tokens ^ "\n\n"))*);
  274 + flush out_chan;
  275 + main_loop in_chan out_chan)
  276 +
  277 +let _ = (* Program entry point: prints the banner and a readiness line to stderr, then serves requests from stdin. *)
  278 + prerr_endline message;
  279 + (* Arg.parse spec_list anon_fun usage_msg; *)
  280 + Gc.compact ();
  281 + prerr_endline "Ready!";
  282 + (*if !comm_stdio then*) main_loop stdin stdout (* command-line options and the socket mode are commented out, so stdio is the only communication mode *)
  283 + (*else
  284 + let sockaddr = Unix.ADDR_INET(Unix.inet_addr_any,!port) in
  285 + Unix.establish_server main_loop sockaddr*)
... ...
subsyntax/ENIAM_MWE.ml
... ... @@ -40,7 +40,7 @@ let process_interp lemma interp =
40 40 | s -> if String.get s 0 = '$' then failwith ("process_interp: " ^ s) else V s))
41 41 | _ -> failwith "process_interp"
42 42  
43   -let load_mwe_dict dict filename =
  43 +let load_mwe_dict filename dict =
44 44 File.fold_tab filename dict (fun dict -> function
45 45 [orths; lemma; interp] ->
46 46 let orths = Xstring.split " " orths in
... ... @@ -60,7 +60,7 @@ let process_orth = function
60 60 | [Lexer.B("{","}",l)] -> O(Lexer.string_of_token_list l)
61 61 | tokens -> failwith ("process_orth: " ^ Lexer.string_of_token_list tokens)
62 62  
63   -let load_mwe_dict2 (dict,dict2) filename =
  63 +let load_mwe_dict2 filename (dict,dict2) =
64 64 File.fold_tab filename (dict,dict2) (fun (dict,dict2) -> function
65 65 [orths; lemma] ->
66 66 (* print_endline (orths ^ "\t" ^ lemma); *)
... ... @@ -84,12 +84,13 @@ let load_mwe_dict2 (dict,dict2) filename =
84 84 | l -> failwith ("load_mwe_dict2 '" ^ String.concat "\t" l ^ "'"))
85 85  
86 86 let mwe_dict,mwe_dict2 =
87   - let dict = load_mwe_dict StringMap.empty brev_filename in
88   - let dict = try load_mwe_dict dict fixed_filename with _ -> (prerr_endline ("ENIAMsubsyntax file " ^ fixed_filename ^ " not found"); dict) in
89   - let dict = load_mwe_dict dict mwe_filename in
90   - let dict,dict2 = load_mwe_dict2 (dict,StringMap.empty) sejf_filename in
91   - let dict,dict2 = load_mwe_dict2 (dict,dict2) sejfek_filename in
92   - let dict,dict2 = load_mwe_dict2 (dict,dict2) sawa_filename in
  87 + let dict = File.catch_no_file (load_mwe_dict brev_filename) StringMap.empty in
  88 + let dict = File.catch_no_file (load_mwe_dict fixed_filename) dict in
  89 + let dict = File.catch_no_file (load_mwe_dict mwe_filename) dict in
  90 + let dict,dict2 = File.catch_no_file (load_mwe_dict2 sejf_filename) (dict,StringMap.empty) in
  91 + let dict,dict2 = File.catch_no_file (load_mwe_dict2 sejfek_filename) (dict,dict2) in
  92 + let dict,dict2 = File.catch_no_file (load_mwe_dict2 sawa_filename) (dict,dict2) in
  93 + let dict,dict2 = File.catch_no_file (load_mwe_dict2 mwe2_filename) (dict,dict2) in
93 94 dict,dict2
94 95  
95 96 let get_orths paths =
... ... @@ -223,7 +224,7 @@ let create_token (matching:token_env list) sels lemma cat interp = (* FIXME: pro
223 224 next=t.next;
224 225 token=Lemma(lemma,cat,[Xlist.map interp (function
225 226 S s -> (try Xlist.assoc sels s with Not_found -> ["_"])
226   - | V s -> [s]
  227 + | V s -> Xstring.split "\\." s
227 228 | G -> ["_"])]);
228 229 weight=0.; (* FIXME: dodać wagi do konkretnych reguł i uwzględnić wagi maczowanych tokenów *)
229 230 attrs=ENIAMtokens.merge_attrs l}
... ...
subsyntax/ENIAMsubsyntax.ml
... ... @@ -21,16 +21,16 @@ open ENIAMsubsyntaxTypes
21 21 open ENIAMtokenizerTypes
22 22 open Xstd
23 23  
24   -let load_lemma_frequencies filename =
  24 +let load_lemma_frequencies filename map =
25 25 let l = Str.split_delim (Str.regexp "\n") (File.load_file filename) in
26   - Xlist.fold l StringMap.empty (fun map line ->
  26 + Xlist.fold l map (fun map line ->
27 27 if String.length line = 0 then map else
28 28 if String.get line 0 = '#' then map else
29 29 match Str.split_delim (Str.regexp "\t") line with
30 30 [count; lemma; cat] -> StringMap.add map (lemma ^ "\t" ^ cat) (log10 (float_of_string count +. 1.))
31 31 | _ -> failwith ("load_lemma_frequencies: " ^ line))
32 32  
33   -let lemma_frequencies = load_lemma_frequencies lemma_frequencies_filename
  33 +let lemma_frequencies = File.catch_no_file (load_lemma_frequencies lemma_frequencies_filename) StringMap.empty
34 34  
35 35 let modify_weights paths =
36 36 List.rev (Xlist.fold paths [] (fun paths t ->
... ... @@ -210,10 +210,13 @@ let load_proper_name proper = function
210 210 StringMap.add_inc proper lemma types (fun types2 -> types @ types2)
211 211 | l -> failwith ("proper_names: " ^ String.concat " " l)
212 212  
  213 +let load_proper_names filename proper = (* Folds load_proper_name over the rows that File.fold_tab reads from [filename], starting from the map [proper]; the argument order lets the partial application be passed to File.catch_no_file. *)
  214 + File.fold_tab filename proper load_proper_name
  215 +
213 216 let proper_names =
214   - let proper = File.fold_tab proper_names_filename StringMap.empty load_proper_name in
215   - let proper = File.fold_tab proper_names_filename2 proper load_proper_name in
216   - let proper = File.fold_tab proper_names_filename3 proper load_proper_name in
  217 + let proper = File.catch_no_file (load_proper_names proper_names_filename) StringMap.empty in
  218 + let proper = File.catch_no_file (load_proper_names proper_names_filename2) proper in
  219 + let proper = File.catch_no_file (load_proper_names proper_names_filename3) proper in
217 220 proper
218 221  
219 222 let remove l s =
... ...
subsyntax/ENIAMsubsyntaxTypes.ml
... ... @@ -44,10 +44,15 @@ type text =
44 44 | StructText of paragraph list (* * token_record ExtArray.t*) (* akapity * tokeny *)
45 45 | AltText of (mode * text) list
46 46  
  47 +let data_path = (* Directory for user data files: value of the ENIAM_USER_DATA_PATH environment variable, or the relative path "data" when the variable is not set. *)
  48 + try Sys.getenv "ENIAM_USER_DATA_PATH"
  49 + with Not_found -> "data"
  50 +
47 51 let brev_filename = resource_path ^ "/subsyntax/brev.tab"
48 52 let fixed_filename = resource_path ^ "/Walenty/fixed.tab"
49   -let complete_entries_filename = resource_path ^ "/subsyntax/complete_entries.tab"
50   -let mwe_filename = resource_path ^ "/subsyntax/mwe.tab"
  53 +(* let complete_entries_filename = resource_path ^ "/subsyntax/complete_entries.tab" *)
  54 +let mwe_filename = data_path ^ "/mwe.tab"
  55 +let mwe2_filename = data_path ^ "/mwe2.tab"
51 56 let sawa_filename = resource_path ^ "/subsyntax/SAWA.dic"
52 57 let sejf_filename = resource_path ^ "/subsyntax/SEJF.dic"
53 58 let sejfek_filename = resource_path ^ "/subsyntax/SEJFEK.dic"
... ... @@ -58,7 +63,7 @@ let lemma_frequencies_filename = resource_path ^ &quot;/subsyntax/NKJP1M-lemma-freq.t
58 63 let proper_names_filename2 = resource_path ^ "/subsyntax/proper_names.tab" *)
59 64 let proper_names_filename = resource_path ^ "/subsyntax/proper_names_sgjp_polimorf_20151020.tab"
60 65 let proper_names_filename2 = resource_path ^ "/subsyntax/proper_names_20160104.tab"
61   -let proper_names_filename3 = resource_path ^ "/subsyntax/ne.tab"
  66 +let proper_names_filename3 = data_path ^ "/ne.tab"
62 67  
63 68 let int_of_mode = function
64 69 Raw -> 0
... ...
subsyntax/makefile
... ... @@ -3,7 +3,7 @@ OCAMLOPT=ocamlopt
3 3 OCAMLDEP=ocamldep
4 4 INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I +eniam
5 5 OCAMLFLAGS=$(INCLUDES) -g
6   -OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa #eniam-subsyntax.cmxa
  6 +OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa
7 7 INSTALLDIR=`ocamlc -where`/eniam
8 8  
9 9 SOURCES= ENIAMsubsyntaxTypes.ml ENIAMsubsyntaxStringOf.ml ENIAMsubsyntaxHTMLof.ml ENIAMsubsyntaxXMLof.ml ENIAMsubsyntaxGraphOf.ml ENIAMpaths.ml ENIAM_MWE.ml ENIAMsentences.ml ENIAMsubsyntax.ml
... ... @@ -32,8 +32,8 @@ eniam-subsyntax.cma: $(SOURCES)
32 32 eniam-subsyntax.cmxa: $(SOURCES)
33 33 ocamlopt -linkall -a -o eniam-subsyntax.cmxa $(INCLUDES) $^
34 34  
35   -test: $(SOURCES) test.ml
36   - $(OCAMLOPT) -o test $(OCAMLOPTFLAGS) $(SOURCES) test.ml
  35 +test: test.ml
  36 + $(OCAMLOPT) -o test $(OCAMLOPTFLAGS) test.ml
37 37  
38 38 interface: interface.ml
39 39 $(OCAMLOPT) -o subsyntax $(OCAMLOPTFLAGS) interface.ml
... ...
subsyntax/resources/mwe.tab deleted
1   -Akademii Sztuki w Szczecinie Akademia Sztuki w Szczecinie subst:sg:gen.dat.loc:f
2   -Atelier Bizio + Ligierko Atelier Bizio + Ligierko subst:sg:_:n2
3   -Instytucie Architektury i Planowania Przestrzennego Instytut Architektury i Planowania Przestrzennego subst:sg:loc.voc:m3
4   -Katedrze Architektury Współczesnej Teorii i Metodologii Projektowania Katedra Architektury Współczesnej Teorii i Metodologii Projektowania subst:sg:dat.loc:f
5   -VII Liceum Ogólnokształcącego im . K . K . Baczyńskiego VII Liceum Ogólnokształcące im. K.K. Baczyńskiego subst:sg:gen:m3
6   -IV Liceum Ogólnokształcącego im . L . Szenwalda IV Liceum Ogólnokształcące im. L. Szenwalda subst:sg:gen:m3
7   -Muzeum Narodowym Muzeum Narodowe subst:sg:inst.loc:n2
8   -Nagrodę Artystyczną m . Szczecina Nagroda Artystyczna m. Szczecina subst:sg:acc:f
9   -Zachodniopomorskiego Nobla Zachodniopomorski Nobel subst:sg:acc.gen:m3
10   -Politechnice Krakowskiej Politechnika Krakowska subst:sg:dat.loc:f
11   -Politechnice Szczecińskiej Politechnika Szczecińska subst:sg:dat.loc:f
12   -Politechniki Szczecińskiej Politechnika Szczecińska subst:sg:gen:f
13   -Pracowni Podstaw Projektowania Pracownia Podstaw Projektowania subst:sg:gen.dat.loc:f
14   -Przeglądu Teatrów Małych Form „ Kontrapunkt ” Przegląd Teatrów Małych Form „Kontrapunkt” subst:sg:gen:m3
15   -Mistrzowską Szkołę Reżyserii Filmowej Andrzeja Wajdy Mistrzowska Szkoła Reżyserii Filmowej Andrzeja Wajdy subst:sg:acc:f
16   -Uniwersytecie im . M . Kopernika Uniwersytet im. M. Kopernika subst:sg:loc.voc:m3
17   -Zachodniopomorski Uniwersytet Technologiczny Zachodniopomorski Uniwersytet Technologiczny subst:sg:acc.nom:m3
18   -Wydziale Budownictwa i Architektury Wydział Budownictwa i Architektury subst:sg:loc.voc:m3
19   -Wydziale Stuk Wizualnych Wydział Stuk Wizualnych subst:sg:loc.voc:m3
20   -Zakładzie Teorii Architektury , Historii i Konserwacji Zabytków Zakład Teorii Architektury, Historii i Konserwacji Zabytków subst:sg:loc.voc:m3
21   -Festiwalu Polskich Sztuk Współczesnych R @ Port Festiwalu Polskich Sztuk Współczesnych R@Port subst:sg:gen.loc.voc:m3
22   -Arabia Saudyjska Arabia Saudyjska subst:sg:nom:f
subsyntax/resources/ne.tab deleted
1   -Akademia Sztuki ORGANIZACJA
2   -Atelier Bizio + Ligierko ORGANIZACJA
3   -Instytut Architektury i Planowania Przestrzennego ORGANIZACJA
4   -Katedra Architektury Współczesnej Teorii i Metodologii Projektowania ORGANIZACJA
5   -VII Liceum Ogólnokształcące im. K.K. Baczyńskiego ORGANIZACJA
6   -IV Liceum Ogólnokształcące im. L. Szenwalda ORGANIZACJA
7   -Muzeum Narodowe ORGANIZACJA
8   -Nagroda Artystyczna m. Szczecina WYRÓŻNIENIE
9   -Zachodniopomorski Nobel WYRÓŻNIENIE
10   -Politechnika Krakowska ORGANIZACJA
11   -Politechnika Szczecińska ORGANIZACJA
12   -Pracownia Podstaw Projektowania ORGANIZACJA
13   -Przegląd Teatrów Małych Form „Kontrapunkt” ORGANIZACJA
14   -Mistrzowska Szkoła Reżyserii Filmowej Andrzeja Wajdy ORGANIZACJA
15   -Uniwersytet im. M. Kopernika ORGANIZACJA
16   -Zachodniopomorski Uniwersytet Technologiczny ORGANIZACJA
17   -Wydział Budownictwa i Architektury ORGANIZACJA
18   -Wydział Stuk Wizualnych ORGANIZACJA
19   -Zakład Teorii Architektury, Historii i Konserwacji Zabytków ORGANIZACJA
20   -Festiwal Polskich Sztuk Współczesnych R@Port WYDARZENIE
21   -Sosnowiec MIASTO
22   -Stefan IMIĘ
23   -Józefa IMIĘ
24   -Szczecin MIASTO
25   -Waldemar IMIĘ
26   -Marzęcki NAZWISKO
27   -Austria KRAJ
28   -Czechy KRAJ
29   -Niemcy KRAJ
30   -Francja KRAJ
31   -Litwa KRAJ
32   -USA KRAJ
33   -Rosja KRAJ
34   -
tokenizer/ENIAMacronyms.ml
... ... @@ -19,9 +19,10 @@
19 19  
20 20 open ENIAMtokenizerTypes
21 21  
  22 +let load_mte mte_filename _ = File.load_lines mte_filename (* Returns the lines of [mte_filename]; the second argument is ignored and only gives the function the shape expected by File.catch_no_file in mte_patterns. *)
  23 +
22 24 let mte_patterns =
23   - let lines = try File.load_lines mte_filename
24   - with _ -> (prerr_endline ("ENIAMtokenizer mte file " ^ mte_filename ^ " not found"); []) in
  25 + let lines = File.catch_no_file (load_mte mte_filename) [] in
25 26 let l = List.rev (Xlist.rev_map lines (fun line ->
26 27 match Str.split (Str.regexp "\t") line with
27 28 [orths; lemma; interp] -> Str.split (Str.regexp " ") orths, lemma, interp
... ...