From b369a30acf75ced421d7d50d287a58cac987ec2d Mon Sep 17 00:00:00 2001 From: Wojciech Jaworski <wjaworski@mimuw.edu.pl> Date: Sat, 15 Apr 2017 21:24:46 +0200 Subject: [PATCH] Parser gramatyk semantycznych --- LCGlexicon/ENIAM_LCGlexiconTypes.ml | 9 ++++++--- LCGlexicon/ENIAMcategoriesPL.ml | 24 +++++++++++++++--------- LCGlexicon/TODO | 1 + LCGlexicon/makefile | 11 ++++++++--- LCGlexicon/parser.ml | 285 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ subsyntax/ENIAM_MWE.ml | 19 ++++++++++--------- subsyntax/ENIAMsubsyntax.ml | 15 +++++++++------ subsyntax/ENIAMsubsyntaxTypes.ml | 11 ++++++++--- subsyntax/makefile | 6 +++--- subsyntax/resources/mwe.tab | 22 ---------------------- subsyntax/resources/ne.tab | 34 ---------------------------------- tokenizer/ENIAMacronyms.ml | 5 +++-- 12 files changed, 348 insertions(+), 94 deletions(-) create mode 100644 LCGlexicon/parser.ml delete mode 100644 subsyntax/resources/mwe.tab delete mode 100644 subsyntax/resources/ne.tab diff --git a/LCGlexicon/ENIAM_LCGlexiconTypes.ml b/LCGlexicon/ENIAM_LCGlexiconTypes.ml index fc76e0c..41a1f47 100644 --- a/LCGlexicon/ENIAM_LCGlexiconTypes.ml +++ b/LCGlexicon/ENIAM_LCGlexiconTypes.ml @@ -83,7 +83,13 @@ let resource_path = if Sys.file_exists "/usr/local/share/eniam" then "/usr/local/share/eniam" else failwith "resource directory does not exists" +let data_path = + try Sys.getenv "ENIAM_USER_DATA_PATH" + with Not_found -> "data" + let rules_filename = resource_path ^ "/LCGlexicon/lexicon-pl.dic" +let user_lexicon_filename = data_path ^ "/lexicon.dic" +let user_senses_filename = data_path ^ "/senses.tab" let subst_uncountable_lexemes_filename = resource_path ^ "/LCGlexicon/subst_uncountable.dat" let subst_uncountable_lexemes_filename2 = resource_path ^ "/LCGlexicon/subst_uncountable_stare.dat" @@ -91,7 +97,4 @@ let subst_container_lexemes_filename = resource_path ^ "/LCGlexicon/subst_contai let subst_numeral_lexemes_filename = resource_path ^ "/LCGlexicon/subst_numeral.dat" let subst_time_lexemes_filename = resource_path ^ "/LCGlexicon/subst_time.dat" -(*let proper_names_filename = resource_path ^ "/lexSemantics/proper_names_sgjp_polimorf.tab" - let proper_names_filename2 = resource_path ^ "/lexSemantics/proper_names.tab"*) - let adv_modes_filename = resource_path ^ "/Walenty/adv_modes.tab" diff --git a/LCGlexicon/ENIAMcategoriesPL.ml b/LCGlexicon/ENIAMcategoriesPL.ml index ce02473..0a7f725 100644 --- a/LCGlexicon/ENIAMcategoriesPL.ml +++ b/LCGlexicon/ENIAMcategoriesPL.ml @@ -36,7 +36,7 @@ let selector_values = Xlist.fold [ "match-result";"url";"email";"obj-id";"adj";"adjc";"adjp";"adja"; "adv";"ger";"pact";"ppas";"fin";"bedzie";"praet";"winien";"impt"; "imps";"pred";"aglt";"inf";"pcon";"pant";"qub";"comp";"conj";"interj"; - "sinterj";"burk";"interp";"unk"]; + "sinterj";"burk";"interp";"unk";"html-tag"]; Pos2, []; Cat, []; Number, all_numbers; @@ -74,22 +74,26 @@ let split_voc cases = "voc" -> cases, "voc" :: voc | s -> s :: cases, voc) -let subst_uncountable_lexemes = StringSet.of_list (File.load_lines subst_uncountable_lexemes_filename) -let subst_uncountable_lexemes2 = StringSet.of_list (File.load_lines subst_uncountable_lexemes_filename2) -let subst_container_lexemes = StringSet.of_list (File.load_lines subst_container_lexemes_filename) -let subst_numeral_lexemes = StringSet.of_list (File.load_lines subst_numeral_lexemes_filename) -let subst_time_lexemes = StringSet.of_list (File.load_lines subst_time_lexemes_filename) +let load_subst_data filename _ = + StringSet.of_list (File.load_lines filename) + +let subst_uncountable_lexemes = File.catch_no_file (load_subst_data subst_uncountable_lexemes_filename) StringSet.empty +let subst_uncountable_lexemes2 = File.catch_no_file (load_subst_data subst_uncountable_lexemes_filename2) StringSet.empty +let subst_container_lexemes = File.catch_no_file (load_subst_data subst_container_lexemes_filename) StringSet.empty +let subst_numeral_lexemes = File.catch_no_file (load_subst_data subst_numeral_lexemes_filename) StringSet.empty +let subst_time_lexemes = File.catch_no_file (load_subst_data subst_time_lexemes_filename) StringSet.empty let subst_pronoun_lexemes = StringSet.of_list ["co"; "kto"; "cokolwiek"; "ktokolwiek"; "nic"; "nikt"; "coś"; "ktoś"; "to"] let adj_pronoun_lexemes = StringSet.of_list ["czyj"; "jaki"; "który"; "jakiś"; "ten"; "taki"] (* let adj_quant_lexemes = StringSet.of_list ["każdy"; "wszelki"; "wszystek"; "żaden"; "jakiś"; "pewien"; "niektóry"; "jedyny"; "sam"] *) -let adv_modes = - try File.fold_tab adv_modes_filename StringMap.empty (fun adv_modes -> function +let load_adv_modes filename adv_modes = + File.fold_tab filename adv_modes (fun adv_modes -> function [adv;mode] -> StringMap.add_inc adv_modes adv [mode] (fun l -> mode :: l) | _ -> failwith "adv_modes") - with _ -> (prerr_endline ("ENIAMlexicon adv_modes file " ^ adv_modes_filename ^ " not found"); StringMap.empty) + +let adv_modes = File.catch_no_file (load_adv_modes adv_modes_filename) StringMap.empty let noun_type proper lemma pos = let nsyn = @@ -347,6 +351,7 @@ let clarify_categories proper cat = function | lemma,"interp",[] -> [{empty_cats with lemma=lemma; pos="interp"; pos2="interp"}] | lemma,"unk",[] -> [{empty_cats with lemma=lemma; pos="unk"; pos2="noun"; numbers=all_numbers; cases=all_cases; genders=all_genders; persons=["ter"]}] + | lemma,"html-tag",[] -> [{empty_cats with lemma=lemma; pos="html-tag"; pos2="html-tag"}] | lemma,c,l -> failwith ("clarify_categories: " ^ lemma ^ ":" ^ c ^ ":" ^ (String.concat ":" (Xlist.map l (String.concat ".")))) (* FIXME: przenieść gdzieś indziej *) @@ -547,4 +552,5 @@ let pos_categories = Xlist.fold [ "burk",[Lemma;]; "interp",[Lemma;]; "unk",[Lemma;Number;Case;Gender;Person;]; + "html-tag",[Lemma;]; ] StringMap.empty (fun map (k,l) -> StringMap.add map k l) diff --git a/LCGlexicon/TODO b/LCGlexicon/TODO index f817dd6..d5b4b96 100644 --- a/LCGlexicon/TODO +++ b/LCGlexicon/TODO @@ -1,3 +1,4 @@ +- poprawić parser.ml tak by łączył się sieciowo z subsyntax "Można było" - brakuje uzgodnienia rodzaju przymiotnika w przypadku predykatywnym, i ogólnie kontroli składniowej diff --git a/LCGlexicon/makefile b/LCGlexicon/makefile index 37e64ec..9fc5308 100755 --- a/LCGlexicon/makefile +++ b/LCGlexicon/makefile @@ -4,7 +4,8 @@ OCAMLDEP=ocamldep INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I +eniam OCAMLFLAGS=$(INCLUDES) -g OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-lcg-parser.cmxa eniam-lcg-lexicon.cmxa -OCAMLOPTFLAGS2=$(OCAMLOPTFLAGS) eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa eniam-lexSemantics.cmxa +OCAMLOPTFLAGS2=$(OCAMLOPTFLAGS) eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa +OCAMLOPTFLAGS3=$(OCAMLOPTFLAGS2) eniam-lexSemantics.cmxa INSTALLDIR=`ocamlc -where`/eniam SOURCES= ENIAM_LCGlexiconTypes.ml ENIAMcategoriesPL.ml ENIAM_LCGlexiconParser.ml ENIAM_LCGlexicon.ml @@ -40,10 +41,14 @@ test: test.ml test2: test2.ml mkdir -p results - $(OCAMLOPT) -o test2 $(OCAMLOPTFLAGS2) test2.ml + $(OCAMLOPT) -o test2 $(OCAMLOPTFLAGS3) test2.ml interface: interface.ml - $(OCAMLOPT) -o parser $(OCAMLOPTFLAGS2) interface.ml + $(OCAMLOPT) -o parser $(OCAMLOPTFLAGS3) interface.ml + +parser: parser.ml + mkdir -p results + $(OCAMLOPT) -o parser $(OCAMLOPTFLAGS2) parser.ml print_lexicon: ENIAM_LCGlexiconLatexOf.ml mkdir -p results diff --git a/LCGlexicon/parser.ml b/LCGlexicon/parser.ml new file mode 100644 index 0000000..5527050 --- /dev/null +++ b/LCGlexicon/parser.ml @@ -0,0 +1,285 @@ +open Xstd +open ENIAMsubsyntaxTypes + +let rules = ENIAM_LCGlexicon.make_rules false ENIAM_LCGlexiconTypes.user_lexicon_filename + +let load_senses_map filename = + File.fold_tab filename StringMap.empty (fun map -> function + [lemma;cat] -> StringMap.add_inc map lemma [cat] (fun l -> cat :: l) + | l -> failwith ("load_senses_map: " ^ String.concat "\t" l)) + +let senses_map = load_senses_map ENIAM_LCGlexiconTypes.user_senses_filename + + +let examples = [ + (* "liceum","W 1984-89 uczęszczał do VII Liceum Ogólnokształcącego im. K.K. Baczyńskiego w Szczecinie."; *) + "studia","Następnie studiował architekturę na Politechnice Szczecińskiej, dyplom uzyskał w 1994."; +] + +let clarify_categories token = + match token.ENIAMtokenizerTypes.token with + ENIAMtokenizerTypes.Lemma(lemma,pos,interp) -> + let senses = try StringMap.find senses_map lemma with Not_found -> ["X"] in + List.flatten (Xlist.map interp (fun interp -> ENIAMcategoriesPL.clarify_categories false senses (lemma,pos,interp))) + | ENIAMtokenizerTypes.Proper(lemma,pos,interp,senses) -> List.flatten (Xlist.map interp (fun interp -> ENIAMcategoriesPL.clarify_categories true senses (lemma,pos,interp))) + | ENIAMtokenizerTypes.Interp lemma -> ENIAMcategoriesPL.clarify_categories false ["X"] (lemma,"interp",[]) + | _ -> [] + +let create_chart tokens paths last = + ENIAM_LCGrenderer.reset_variable_numbers (); + let chart = ENIAM_LCGchart.make last in + let chart = Xlist.fold paths chart (fun chart (id,lnode,rnode) -> + let t = ExtArray.get tokens id in + ENIAM_LCGrenderer.reset_variable_names (); + ENIAM_LCGrenderer.add_variable_numbers (); + let cats = clarify_categories t in + let l = ENIAM_LCGlexicon.create_entries rules id t.ENIAMtokenizerTypes.orth cats [] in + ENIAM_LCGchart.add_inc_list chart lnode rnode l 0) in + chart + +let test_example name tokens paths last = + ENIAM_LCGreductions.reset_variant_label (); + let chart = create_chart tokens paths last in + ENIAM_LCGlatexOf.print_chart "results/" (name^"1_chart") "a1" chart; + let chart,references = ENIAM_LCGchart.lazify chart in + ENIAM_LCGlatexOf.print_chart "results/" (name^"2_chart") "a4" chart; + ENIAM_LCGlatexOf.print_references "results/" (name^"2_references") "a4" references; + let chart = ENIAM_LCGchart.parse chart references 30. Sys.time in (* uwaga: niejawna zmiana imperatywna w references *) + ENIAM_LCGlatexOf.print_chart "results/" (name^"3_chart") "a4" chart; + ENIAM_LCGlatexOf.print_references "results/" (name^"3_references") "a4" references; + if ENIAM_LCGchart.is_parsed chart then ( + let term = ENIAM_LCGchart.get_parsed_term chart in + Xlatex.latex_file_out "results/" (name^"4_term") "a4" false (fun file -> + Printf.fprintf file "\\[%s\\]\n" (ENIAM_LCGlatexOf.linear_term 0 term)); + Xlatex.latex_compile_and_clean "results/" (name^"4_term"); + let dependency_tree = ENIAM_LCGreductions.reduce term references in + ENIAM_LCGlatexOf.print_dependency_tree "results/" (name^"4_dependency_tree") "a0" dependency_tree; + if ENIAM_LCGreductions.is_reduced_dependency_tree dependency_tree then ( + ENIAM_LCGreductions.assign_labels dependency_tree; (* uwaga: niejawna zmiana imperatywna w dependency_tree *) + ENIAM_LCGlatexOf.print_dependency_tree "results/" (name^"5_dependency_tree") "a4" dependency_tree; + ENIAM_LCGreductions.remove_cuts dependency_tree; (* uwaga: niejawna zmiana imperatywna w dependency_tree *) + ENIAM_LCGlatexOf.print_dependency_tree "results/" (name^"6_dependency_tree") "a4" dependency_tree; + ENIAM_LCGgraphOf.print_dependency_tree "results/" (name^"6_dependency_tree") dependency_tree; + ENIAM_LCGgraphOf.print_simplified_dependency_tree "results/" (name^"6_simple_dependency_tree") dependency_tree; + ()) + else print_endline "not reduced") + else print_endline "not parsed" + +let rec parse_sentence name id tokens = function + RawSentence s -> id + | StructSentence(paths,last) -> + test_example (name ^ string_of_int id ^ "_") tokens paths last; + id + 1 + | DepSentence(paths) -> id + | QuotedSentences sentences -> + Xlist.fold sentences id (fun id p -> + parse_sentence name id tokens p.sentence) + | AltSentence l -> + Xlist.fold l id (fun id (mode,sentence) -> + parse_sentence name id tokens sentence) + +let rec parse_paragraph name id tokens = function + RawParagraph s -> id + | StructParagraph sentences -> + Xlist.fold sentences id (fun id p -> + parse_sentence name id tokens p.sentence) + | AltParagraph l -> + Xlist.fold l id (fun id (mode,paragraph) -> + parse_paragraph name id tokens paragraph) + +let rec parse_text name id tokens = function + RawText s -> id + | StructText paragraphs -> + Xlist.fold paragraphs id (fun id paragraph -> + parse_paragraph name id tokens paragraph) + | AltText l -> + Xlist.fold l id (fun id (mode,text) -> + parse_text name id tokens text) + + +(* let _ = + Xlist.iter examples (fun (name,example) -> + let text,tokens = ENIAMsubsyntax.parse_text example in + ignore(parse_text name 1 tokens text)) *) + +(* +type entry = {title: string; info:string; biogram:string; (*primary:string; secondary:string;*) author:string} + +let process_xml = function + Xml.Element("entries",[],entries) -> + List.rev (Xlist.rev_map entries (function + Xml.Element("entry",[],[title;info;biogram(*;primary;secondary*);author]) -> + {title=Xml.to_string title; info=Xml.to_string info; biogram=Xml.to_string biogram; + (*primary=Xml.to_string primary; secondary=Xml.to_string secondary;*) author=Xml.to_string author} + | _ -> failwith "process_xml 1")) + | _ -> failwith "process_xml 2" + + +let load_ppibl filename = + let ppibl = File.load_file_gen ("data/" ^ filename) in + process_xml (Xml.parse_string ppibl) + +let named_entities = + File.fold_tab "data/ne.tab" StringMap.empty (fun map -> function + [lemma;cat] -> StringMap.add_inc map lemma [cat] (fun l -> cat :: l) + | _ -> failwith "named_entities") + +let assign_named_entities t = + match t.token with + Lemma(lemma,"subst",interp) -> + (try + let cat = StringMap.find named_entities lemma in + {t with token=Proper(lemma,"subst",interp,cat)} + with Not_found -> t) + | Proper(lemma,"subst",interp,_) -> + (try + let cat = StringMap.find named_entities lemma in + {t with token=Proper(lemma,"subst",interp,cat)} + with Not_found -> t) + | _ -> t + +let test_strings = [ + (* "Debiutował opowiadaniem pt. <i>Zlecenie na dostawę</i>."; *) + "W 1984-89 uczęszczał do VII Liceum Ogólnokształcącego im. K.K. Baczyńskiego w Szczecinie."; + (* "Następnie studiował architekturę na Politechnice Szczecińskiej, dyplom uzyskał w 1994." *) + (* "W 2003 obronił doktorat nauk technicznych w zakresie architektury i urbanistyki na Politechnice Krakowskiej i został adiunktem w Zakładzie Teorii Architektury, Historii i Konserwacji Zabytków IAiPP." *) +] + +(* let _ = + let entries = load_ppibl "ak322269.xml" in + Xlist.iter entries (fun entry -> print_endline entry.biogram) *) + +(* +let test_strings = [ + "Szpak frunie."; + "Kot np. miauczy."; + "Ala ma kota."; + "Ale mają kota:" + ] + +let test_strings2 = [ + "Szpak frunie. Kot miauczy."; + "Szpak powiedział: „Frunę. Kiszę.”"; + ] +*) + +let grammar = [ + "pos=year", Basic "year",symbol_weight; + "pos=year-interval", Basic "year-interval",symbol_weight; + "lemma=w,pos=prep,case=loc", Basic "time/(year+year-interval)",0.; + "lemma=w,pos=prep,case=loc", Basic "locat/np*MIASTO*T*loc*T",0.; + + "lemma=uczęszczać,pos=praet|fin,person=ter,negation=aff,mood=indicative", Basic "ip*number*gender{|(1+time),|(1+pp*ORGANIZACJA*do*gen),|(1+locat)}",0.; + "lemma=do,pos=prep,case=gen", Basic "pp*sense*lemma*case/np*sense*T*case*T",0.; + +] + +let _ = + print_endline "Testy wbudowane"; + Xlist.iter test_strings (fun s -> + print_endline ("\nTEST: " ^ s); + let paths = ENIAMsubsyntax.parse s in + let paths = Xlist.map paths assign_named_entities in + (* print_endline (ENIAMtokenizer.xml_of tokens); *) + print_endline (ENIAMpaths.to_string (paths,0))); +(* Xlist.iter test_strings2 (fun s -> + print_endline ("\nTEST: " ^ s); + let text,tokens = ENIAMsubsyntax.parse_text s in + (* print_endline (ENIAMtokenizer.xml_of tokens); *) + print_endline (ENIAMsubsyntaxStringOf.tokens tokens); + print_endline ""; + print_endline (ENIAMsubsyntaxStringOf.text "" tokens text));*) +(* print_endline "Testy użytkownika."; + print_endline "Wpisz tekst i naciśnij ENTER, pusty tekst kończy."; + let s = ref (read_line ()) in + while !s <> "" do + let tokens = ENIAMtokenizer.parse !s in + (* print_endline (ENIAMtokenizer.xml_of tokens); *) + Xlist.iter tokens (fun token -> print_endline (ENIAMtokenizer.string_of 0 token)); + print_endline "Wpisz tekst i naciśnij ENTER, pusty tekst kończy."; + s := read_line () + done;*) + () + +open ENIAM_LCGlexiconTypes +open ENIAM_LCGtypes + + +(* +type output = Text | Xml | Html | Marsh | Graphviz + +let output = ref Text +let comm_stdio = ref true +let sentence_split = ref true +let port = ref 0 + +let spec_list = [ + "-s", Arg.Unit (fun () -> sentence_split:=true), "Split input into sentences (default)"; + "-n", Arg.Unit (fun () -> sentence_split:=false), "Do not split input into sentences"; + "-i", Arg.Unit (fun () -> comm_stdio:=true), "Communication using stdio (default)"; + "-p", Arg.Int (fun p -> comm_stdio:=false; port:=p), "<port> Communication using sockets on given port number"; + "-t", Arg.Unit (fun () -> output:=Text), "Output as plain text (default)"; + "-x", Arg.Unit (fun () -> output:=Xml), "Output as XML"; + "-m", Arg.Unit (fun () -> output:=Marsh), "Output as marshalled Ocaml data structure"; + "-h", Arg.Unit (fun () -> output:=Html), "Output as HTML"; + "-g", Arg.Unit (fun () -> output:=Graphviz; sentence_split:=false), "Output as graphviz dot file; turns sentence split off"; + (* "-r", Arg.String (fun p -> + ENIAMtokenizerTypes.set_resource_path p; + ENIAMmorphologyTypes.set_resource_path p; + ENIAMsubsyntaxTypes.set_resource_path p), "<path> Set resource path"; *) + ] + +let usage_msg = + "Usage: subsyntax <options>\nInput is a sequence of lines. Empty line ends the sequence and invoke parsing. Double empty line shutdown parser.\nOptions are:" +*)*) +let message = "ENIAM_LCGparser, a parser for Logical Categorial Grammar formalism\n\ +Copyright (C) 2017 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>\n\ +Copyright (C) 2017 Institute of Computer Science Polish Academy of Sciences" +(* +let anon_fun s = raise (Arg.Bad ("invalid argument: " ^ s)) +*) +let input_text channel = + let s = ref (try input_line channel with End_of_file -> "") in + let lines = ref [] in + while !s <> "" do + lines := !s :: !lines; + s := try input_line channel with End_of_file -> "" + done; + String.concat "\n" (List.rev !lines) + +let rec main_loop in_chan out_chan = + let text = input_text in_chan in + if text = "" then () else ( + let text,tokens = ENIAMsubsyntax.parse_text text in + ignore(parse_text "E"(*name*) 1 tokens text) + (* print_endline "input text begin"; + print_endline text; + print_endline "input text end"; *) + (*if !sentence_split then + let text,tokens = ENIAMsubsyntax.parse_text text in + (match !output with + Text -> output_string out_chan (ENIAMsubsyntaxStringOf.text "" tokens text ^ "\n" ^ ENIAMsubsyntaxStringOf.token_extarray tokens ^ "\n\n") + | Xml -> output_string out_chan (Xml.to_string (ENIAMsubsyntaxXMLof.text_and_tokens text tokens) ^ "\n\n") + | Html -> output_string out_chan (ENIAMsubsyntaxHTMLof.text_and_tokens text tokens ^ "\n\n") + | Marsh -> Marshal.to_channel out_chan (text,tokens) [] + | Graphviz -> failwith "main_loop: ni") + else + let tokens = ENIAMsubsyntax.parse text in + (match !output with + Text -> output_string out_chan (ENIAMsubsyntaxStringOf.token_list tokens ^ "\n\n") + | Xml -> output_string out_chan (Xml.to_string (ENIAMsubsyntaxXMLof.token_list tokens) ^ "\n\n") + | Html -> output_string out_chan (ENIAMsubsyntaxHTMLof.token_list tokens ^ "\n\n") + | Marsh -> Marshal.to_channel out_chan tokens [] + | Graphviz -> output_string out_chan (ENIAMsubsyntaxGraphOf.token_list tokens ^ "\n\n"))*); + flush out_chan; + main_loop in_chan out_chan) + +let _ = + prerr_endline message; + (* Arg.parse spec_list anon_fun usage_msg; *) + Gc.compact (); + prerr_endline "Ready!"; + (*if !comm_stdio then*) main_loop stdin stdout + (*else + let sockaddr = Unix.ADDR_INET(Unix.inet_addr_any,!port) in + Unix.establish_server main_loop sockaddr*) diff --git a/subsyntax/ENIAM_MWE.ml b/subsyntax/ENIAM_MWE.ml index 356a779..25a157e 100644 --- a/subsyntax/ENIAM_MWE.ml +++ b/subsyntax/ENIAM_MWE.ml @@ -40,7 +40,7 @@ let process_interp lemma interp = | s -> if String.get s 0 = '$' then failwith ("process_interp: " ^ s) else V s)) | _ -> failwith "process_interp" -let load_mwe_dict dict filename = +let load_mwe_dict filename dict = File.fold_tab filename dict (fun dict -> function [orths; lemma; interp] -> let orths = Xstring.split " " orths in @@ -60,7 +60,7 @@ let process_orth = function | [Lexer.B("{","}",l)] -> O(Lexer.string_of_token_list l) | tokens -> failwith ("process_orth: " ^ Lexer.string_of_token_list tokens) -let load_mwe_dict2 (dict,dict2) filename = +let load_mwe_dict2 filename (dict,dict2) = File.fold_tab filename (dict,dict2) (fun (dict,dict2) -> function [orths; lemma] -> (* print_endline (orths ^ "\t" ^ lemma); *) @@ -84,12 +84,13 @@ let load_mwe_dict2 (dict,dict2) filename = | l -> failwith ("load_mwe_dict2 '" ^ String.concat "\t" l ^ "'")) let mwe_dict,mwe_dict2 = - let dict = load_mwe_dict StringMap.empty brev_filename in - let dict = try load_mwe_dict dict fixed_filename with _ -> (prerr_endline ("ENIAMsubsyntax file " ^ fixed_filename ^ " not found"); dict) in - let dict = load_mwe_dict dict mwe_filename in - let dict,dict2 = load_mwe_dict2 (dict,StringMap.empty) sejf_filename in - let dict,dict2 = load_mwe_dict2 (dict,dict2) sejfek_filename in - let dict,dict2 = load_mwe_dict2 (dict,dict2) sawa_filename in + let dict = File.catch_no_file (load_mwe_dict brev_filename) StringMap.empty in + let dict = File.catch_no_file (load_mwe_dict fixed_filename) dict in + let dict = File.catch_no_file (load_mwe_dict mwe_filename) dict in + let dict,dict2 = File.catch_no_file (load_mwe_dict2 sejf_filename) (dict,StringMap.empty) in + let dict,dict2 = File.catch_no_file (load_mwe_dict2 sejfek_filename) (dict,dict2) in + let dict,dict2 = File.catch_no_file (load_mwe_dict2 sawa_filename) (dict,dict2) in + let dict,dict2 = File.catch_no_file (load_mwe_dict2 mwe2_filename) (dict,dict2) in dict,dict2 let get_orths paths = @@ -223,7 +224,7 @@ let create_token (matching:token_env list) sels lemma cat interp = (* FIXME: pro next=t.next; token=Lemma(lemma,cat,[Xlist.map interp (function S s -> (try Xlist.assoc sels s with Not_found -> ["_"]) - | V s -> [s] + | V s -> Xstring.split "\\." s | G -> ["_"])]); weight=0.; (* FIXME: dodać wagi do konkretnych reguł i uwzględnić wagi maczowanych tokenów *) attrs=ENIAMtokens.merge_attrs l} diff --git a/subsyntax/ENIAMsubsyntax.ml b/subsyntax/ENIAMsubsyntax.ml index e0eef8c..07d144f 100644 --- a/subsyntax/ENIAMsubsyntax.ml +++ b/subsyntax/ENIAMsubsyntax.ml @@ -21,16 +21,16 @@ open ENIAMsubsyntaxTypes open ENIAMtokenizerTypes open Xstd -let load_lemma_frequencies filename = +let load_lemma_frequencies filename map = let l = Str.split_delim (Str.regexp "\n") (File.load_file filename) in - Xlist.fold l StringMap.empty (fun map line -> + Xlist.fold l map (fun map line -> if String.length line = 0 then map else if String.get line 0 = '#' then map else match Str.split_delim (Str.regexp "\t") line with [count; lemma; cat] -> StringMap.add map (lemma ^ "\t" ^ cat) (log10 (float_of_string count +. 1.)) | _ -> failwith ("load_lemma_frequencies: " ^ line)) -let lemma_frequencies = load_lemma_frequencies lemma_frequencies_filename +let lemma_frequencies = File.catch_no_file (load_lemma_frequencies lemma_frequencies_filename) StringMap.empty let modify_weights paths = List.rev (Xlist.fold paths [] (fun paths t -> @@ -210,10 +210,13 @@ let load_proper_name proper = function StringMap.add_inc proper lemma types (fun types2 -> types @ types2) | l -> failwith ("proper_names: " ^ String.concat " " l) +let load_proper_names filename proper = + File.fold_tab filename proper load_proper_name + let proper_names = - let proper = File.fold_tab proper_names_filename StringMap.empty load_proper_name in - let proper = File.fold_tab proper_names_filename2 proper load_proper_name in - let proper = File.fold_tab proper_names_filename3 proper load_proper_name in + let proper = File.catch_no_file (load_proper_names proper_names_filename) StringMap.empty in + let proper = File.catch_no_file (load_proper_names proper_names_filename2) proper in + let proper = File.catch_no_file (load_proper_names proper_names_filename3) proper in proper let remove l s = diff --git a/subsyntax/ENIAMsubsyntaxTypes.ml b/subsyntax/ENIAMsubsyntaxTypes.ml index 954ed9b..5e216b6 100644 --- a/subsyntax/ENIAMsubsyntaxTypes.ml +++ b/subsyntax/ENIAMsubsyntaxTypes.ml @@ -44,10 +44,15 @@ type text = | StructText of paragraph list (* * token_record ExtArray.t*) (* akapity * tokeny *) | AltText of (mode * text) list +let data_path = + try Sys.getenv "ENIAM_USER_DATA_PATH" + with Not_found -> "data" + let brev_filename = resource_path ^ "/subsyntax/brev.tab" let fixed_filename = resource_path ^ "/Walenty/fixed.tab" -let complete_entries_filename = resource_path ^ "/subsyntax/complete_entries.tab" -let mwe_filename = resource_path ^ "/subsyntax/mwe.tab" +(* let complete_entries_filename = resource_path ^ "/subsyntax/complete_entries.tab" *) +let mwe_filename = data_path ^ "/mwe.tab" +let mwe2_filename = data_path ^ "/mwe2.tab" let sawa_filename = resource_path ^ "/subsyntax/SAWA.dic" let sejf_filename = resource_path ^ "/subsyntax/SEJF.dic" let sejfek_filename = resource_path ^ "/subsyntax/SEJFEK.dic" @@ -58,7 +63,7 @@ let lemma_frequencies_filename = resource_path ^ "/subsyntax/NKJP1M-lemma-freq.t let proper_names_filename2 = resource_path ^ "/subsyntax/proper_names.tab" *) let proper_names_filename = resource_path ^ "/subsyntax/proper_names_sgjp_polimorf_20151020.tab" let proper_names_filename2 = resource_path ^ "/subsyntax/proper_names_20160104.tab" -let proper_names_filename3 = resource_path ^ "/subsyntax/ne.tab" +let proper_names_filename3 = data_path ^ "/ne.tab" let int_of_mode = function Raw -> 0 diff --git a/subsyntax/makefile b/subsyntax/makefile index 3d028f9..fa331bb 100755 --- a/subsyntax/makefile +++ b/subsyntax/makefile @@ -3,7 +3,7 @@ OCAMLOPT=ocamlopt OCAMLDEP=ocamldep INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I +eniam OCAMLFLAGS=$(INCLUDES) -g -OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa #eniam-subsyntax.cmxa +OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa INSTALLDIR=`ocamlc -where`/eniam SOURCES= ENIAMsubsyntaxTypes.ml ENIAMsubsyntaxStringOf.ml ENIAMsubsyntaxHTMLof.ml ENIAMsubsyntaxXMLof.ml ENIAMsubsyntaxGraphOf.ml ENIAMpaths.ml ENIAM_MWE.ml ENIAMsentences.ml ENIAMsubsyntax.ml @@ -32,8 +32,8 @@ eniam-subsyntax.cma: $(SOURCES) eniam-subsyntax.cmxa: $(SOURCES) ocamlopt -linkall -a -o eniam-subsyntax.cmxa $(INCLUDES) $^ -test: $(SOURCES) test.ml - $(OCAMLOPT) -o test $(OCAMLOPTFLAGS) $(SOURCES) test.ml +test: test.ml + $(OCAMLOPT) -o test $(OCAMLOPTFLAGS) test.ml interface: interface.ml $(OCAMLOPT) -o subsyntax $(OCAMLOPTFLAGS) interface.ml diff --git a/subsyntax/resources/mwe.tab b/subsyntax/resources/mwe.tab deleted file mode 100644 index f70de00..0000000 --- a/subsyntax/resources/mwe.tab +++ /dev/null @@ -1,22 +0,0 @@ -Akademii Sztuki w Szczecinie Akademia Sztuki w Szczecinie subst:sg:gen.dat.loc:f -Atelier Bizio + Ligierko Atelier Bizio + Ligierko subst:sg:_:n2 -Instytucie Architektury i Planowania Przestrzennego Instytut Architektury i Planowania Przestrzennego subst:sg:loc.voc:m3 -Katedrze Architektury Współczesnej Teorii i Metodologii Projektowania Katedra Architektury Współczesnej Teorii i Metodologii Projektowania subst:sg:dat.loc:f -VII Liceum Ogólnokształcącego im . K . K . Baczyńskiego VII Liceum Ogólnokształcące im. K.K. Baczyńskiego subst:sg:gen:m3 -IV Liceum Ogólnokształcącego im . L . Szenwalda IV Liceum Ogólnokształcące im. L. Szenwalda subst:sg:gen:m3 -Muzeum Narodowym Muzeum Narodowe subst:sg:inst.loc:n2 -Nagrodę Artystyczną m . Szczecina Nagroda Artystyczna m. Szczecina subst:sg:acc:f -Zachodniopomorskiego Nobla Zachodniopomorski Nobel subst:sg:acc.gen:m3 -Politechnice Krakowskiej Politechnika Krakowska subst:sg:dat.loc:f -Politechnice Szczecińskiej Politechnika Szczecińska subst:sg:dat.loc:f -Politechniki Szczecińskiej Politechnika Szczecińska subst:sg:gen:f -Pracowni Podstaw Projektowania Pracownia Podstaw Projektowania subst:sg:gen.dat.loc:f -Przeglądu Teatrów Małych Form „ Kontrapunkt ” Przegląd Teatrów Małych Form „Kontrapunkt” subst:sg:gen:m3 -Mistrzowską Szkołę Reżyserii Filmowej Andrzeja Wajdy Mistrzowska Szkoła Reżyserii Filmowej Andrzeja Wajdy subst:sg:acc:f -Uniwersytecie im . M . Kopernika Uniwersytet im. M. Kopernika subst:sg:loc.voc:m3 -Zachodniopomorski Uniwersytet Technologiczny Zachodniopomorski Uniwersytet Technologiczny subst:sg:acc.nom:m3 -Wydziale Budownictwa i Architektury Wydział Budownictwa i Architektury subst:sg:loc.voc:m3 -Wydziale Stuk Wizualnych Wydział Stuk Wizualnych subst:sg:loc.voc:m3 -Zakładzie Teorii Architektury , Historii i Konserwacji Zabytków Zakład Teorii Architektury, Historii i Konserwacji Zabytków subst:sg:loc.voc:m3 -Festiwalu Polskich Sztuk Współczesnych R @ Port Festiwalu Polskich Sztuk Współczesnych R@Port subst:sg:gen.loc.voc:m3 -Arabia Saudyjska Arabia Saudyjska subst:sg:nom:f diff --git a/subsyntax/resources/ne.tab b/subsyntax/resources/ne.tab deleted file mode 100644 index 1f7410b..0000000 --- a/subsyntax/resources/ne.tab +++ /dev/null @@ -1,34 +0,0 @@ -Akademia Sztuki ORGANIZACJA -Atelier Bizio + Ligierko ORGANIZACJA -Instytut Architektury i Planowania Przestrzennego ORGANIZACJA -Katedra Architektury Współczesnej Teorii i Metodologii Projektowania ORGANIZACJA -VII Liceum Ogólnokształcące im. K.K. Baczyńskiego ORGANIZACJA -IV Liceum Ogólnokształcące im. L. Szenwalda ORGANIZACJA -Muzeum Narodowe ORGANIZACJA -Nagroda Artystyczna m. Szczecina WYRÓŻNIENIE -Zachodniopomorski Nobel WYRÓŻNIENIE -Politechnika Krakowska ORGANIZACJA -Politechnika Szczecińska ORGANIZACJA -Pracownia Podstaw Projektowania ORGANIZACJA -Przegląd Teatrów Małych Form „Kontrapunkt” ORGANIZACJA -Mistrzowska Szkoła Reżyserii Filmowej Andrzeja Wajdy ORGANIZACJA -Uniwersytet im. M. Kopernika ORGANIZACJA -Zachodniopomorski Uniwersytet Technologiczny ORGANIZACJA -Wydział Budownictwa i Architektury ORGANIZACJA -Wydział Stuk Wizualnych ORGANIZACJA -Zakład Teorii Architektury, Historii i Konserwacji Zabytków ORGANIZACJA -Festiwal Polskich Sztuk Współczesnych R@Port WYDARZENIE -Sosnowiec MIASTO -Stefan IMIĘ -Józefa IMIĘ -Szczecin MIASTO -Waldemar IMIĘ -Marzęcki NAZWISKO -Austria KRAJ -Czechy KRAJ -Niemcy KRAJ -Francja KRAJ -Litwa KRAJ -USA KRAJ -Rosja KRAJ - diff --git a/tokenizer/ENIAMacronyms.ml b/tokenizer/ENIAMacronyms.ml index 754bcc0..3c53de7 100644 --- a/tokenizer/ENIAMacronyms.ml +++ b/tokenizer/ENIAMacronyms.ml @@ -19,9 +19,10 @@ open ENIAMtokenizerTypes +let load_mte mte_filename _ = File.load_lines mte_filename + let mte_patterns = - let lines = try File.load_lines mte_filename - with _ -> (prerr_endline ("ENIAMtokenizer mte file " ^ mte_filename ^ " not found"); []) in + let lines = File.catch_no_file (load_mte mte_filename) [] in let l = List.rev (Xlist.rev_map lines (fun line -> match Str.split (Str.regexp "\t") line with [orths; lemma; interp] -> Str.split (Str.regexp " ") orths, lemma, interp -- libgit2 0.22.2