Commit b369a30acf75ced421d7d50d287a58cac987ec2d
1 parent
2f308cb1
Parser gramatyk semantycznych
Showing
12 changed files
with
348 additions
and
94 deletions
LCGlexicon/ENIAM_LCGlexiconTypes.ml
@@ -83,7 +83,13 @@ let resource_path = | @@ -83,7 +83,13 @@ let resource_path = | ||
83 | if Sys.file_exists "/usr/local/share/eniam" then "/usr/local/share/eniam" else | 83 | if Sys.file_exists "/usr/local/share/eniam" then "/usr/local/share/eniam" else |
84 | failwith "resource directory does not exists" | 84 | failwith "resource directory does not exists" |
85 | 85 | ||
86 | +let data_path = | ||
87 | + try Sys.getenv "ENIAM_USER_DATA_PATH" | ||
88 | + with Not_found -> "data" | ||
89 | + | ||
86 | let rules_filename = resource_path ^ "/LCGlexicon/lexicon-pl.dic" | 90 | let rules_filename = resource_path ^ "/LCGlexicon/lexicon-pl.dic" |
91 | +let user_lexicon_filename = data_path ^ "/lexicon.dic" | ||
92 | +let user_senses_filename = data_path ^ "/senses.tab" | ||
87 | 93 | ||
88 | let subst_uncountable_lexemes_filename = resource_path ^ "/LCGlexicon/subst_uncountable.dat" | 94 | let subst_uncountable_lexemes_filename = resource_path ^ "/LCGlexicon/subst_uncountable.dat" |
89 | let subst_uncountable_lexemes_filename2 = resource_path ^ "/LCGlexicon/subst_uncountable_stare.dat" | 95 | let subst_uncountable_lexemes_filename2 = resource_path ^ "/LCGlexicon/subst_uncountable_stare.dat" |
@@ -91,7 +97,4 @@ let subst_container_lexemes_filename = resource_path ^ "/LCGlexicon/subst_contai | @@ -91,7 +97,4 @@ let subst_container_lexemes_filename = resource_path ^ "/LCGlexicon/subst_contai | ||
91 | let subst_numeral_lexemes_filename = resource_path ^ "/LCGlexicon/subst_numeral.dat" | 97 | let subst_numeral_lexemes_filename = resource_path ^ "/LCGlexicon/subst_numeral.dat" |
92 | let subst_time_lexemes_filename = resource_path ^ "/LCGlexicon/subst_time.dat" | 98 | let subst_time_lexemes_filename = resource_path ^ "/LCGlexicon/subst_time.dat" |
93 | 99 | ||
94 | -(*let proper_names_filename = resource_path ^ "/lexSemantics/proper_names_sgjp_polimorf.tab" | ||
95 | - let proper_names_filename2 = resource_path ^ "/lexSemantics/proper_names.tab"*) | ||
96 | - | ||
97 | let adv_modes_filename = resource_path ^ "/Walenty/adv_modes.tab" | 100 | let adv_modes_filename = resource_path ^ "/Walenty/adv_modes.tab" |
LCGlexicon/ENIAMcategoriesPL.ml
@@ -36,7 +36,7 @@ let selector_values = Xlist.fold [ | @@ -36,7 +36,7 @@ let selector_values = Xlist.fold [ | ||
36 | "match-result";"url";"email";"obj-id";"adj";"adjc";"adjp";"adja"; | 36 | "match-result";"url";"email";"obj-id";"adj";"adjc";"adjp";"adja"; |
37 | "adv";"ger";"pact";"ppas";"fin";"bedzie";"praet";"winien";"impt"; | 37 | "adv";"ger";"pact";"ppas";"fin";"bedzie";"praet";"winien";"impt"; |
38 | "imps";"pred";"aglt";"inf";"pcon";"pant";"qub";"comp";"conj";"interj"; | 38 | "imps";"pred";"aglt";"inf";"pcon";"pant";"qub";"comp";"conj";"interj"; |
39 | - "sinterj";"burk";"interp";"unk"]; | 39 | + "sinterj";"burk";"interp";"unk";"html-tag"]; |
40 | Pos2, []; | 40 | Pos2, []; |
41 | Cat, []; | 41 | Cat, []; |
42 | Number, all_numbers; | 42 | Number, all_numbers; |
@@ -74,22 +74,26 @@ let split_voc cases = | @@ -74,22 +74,26 @@ let split_voc cases = | ||
74 | "voc" -> cases, "voc" :: voc | 74 | "voc" -> cases, "voc" :: voc |
75 | | s -> s :: cases, voc) | 75 | | s -> s :: cases, voc) |
76 | 76 | ||
77 | -let subst_uncountable_lexemes = StringSet.of_list (File.load_lines subst_uncountable_lexemes_filename) | ||
78 | -let subst_uncountable_lexemes2 = StringSet.of_list (File.load_lines subst_uncountable_lexemes_filename2) | ||
79 | -let subst_container_lexemes = StringSet.of_list (File.load_lines subst_container_lexemes_filename) | ||
80 | -let subst_numeral_lexemes = StringSet.of_list (File.load_lines subst_numeral_lexemes_filename) | ||
81 | -let subst_time_lexemes = StringSet.of_list (File.load_lines subst_time_lexemes_filename) | 77 | +let load_subst_data filename _ = |
78 | + StringSet.of_list (File.load_lines filename) | ||
79 | + | ||
80 | +let subst_uncountable_lexemes = File.catch_no_file (load_subst_data subst_uncountable_lexemes_filename) StringSet.empty | ||
81 | +let subst_uncountable_lexemes2 = File.catch_no_file (load_subst_data subst_uncountable_lexemes_filename2) StringSet.empty | ||
82 | +let subst_container_lexemes = File.catch_no_file (load_subst_data subst_container_lexemes_filename) StringSet.empty | ||
83 | +let subst_numeral_lexemes = File.catch_no_file (load_subst_data subst_numeral_lexemes_filename) StringSet.empty | ||
84 | +let subst_time_lexemes = File.catch_no_file (load_subst_data subst_time_lexemes_filename) StringSet.empty | ||
82 | 85 | ||
83 | let subst_pronoun_lexemes = StringSet.of_list ["co"; "kto"; "cokolwiek"; "ktokolwiek"; "nic"; "nikt"; "coś"; "ktoś"; "to"] | 86 | let subst_pronoun_lexemes = StringSet.of_list ["co"; "kto"; "cokolwiek"; "ktokolwiek"; "nic"; "nikt"; "coś"; "ktoś"; "to"] |
84 | let adj_pronoun_lexemes = StringSet.of_list ["czyj"; "jaki"; "który"; "jakiś"; "ten"; "taki"] | 87 | let adj_pronoun_lexemes = StringSet.of_list ["czyj"; "jaki"; "który"; "jakiś"; "ten"; "taki"] |
85 | 88 | ||
86 | (* let adj_quant_lexemes = StringSet.of_list ["każdy"; "wszelki"; "wszystek"; "żaden"; "jakiś"; "pewien"; "niektóry"; "jedyny"; "sam"] *) | 89 | (* let adj_quant_lexemes = StringSet.of_list ["każdy"; "wszelki"; "wszystek"; "żaden"; "jakiś"; "pewien"; "niektóry"; "jedyny"; "sam"] *) |
87 | 90 | ||
88 | -let adv_modes = | ||
89 | - try File.fold_tab adv_modes_filename StringMap.empty (fun adv_modes -> function | 91 | +let load_adv_modes filename adv_modes = |
92 | + File.fold_tab filename adv_modes (fun adv_modes -> function | ||
90 | [adv;mode] -> StringMap.add_inc adv_modes adv [mode] (fun l -> mode :: l) | 93 | [adv;mode] -> StringMap.add_inc adv_modes adv [mode] (fun l -> mode :: l) |
91 | | _ -> failwith "adv_modes") | 94 | | _ -> failwith "adv_modes") |
92 | - with _ -> (prerr_endline ("ENIAMlexicon adv_modes file " ^ adv_modes_filename ^ " not found"); StringMap.empty) | 95 | + |
96 | +let adv_modes = File.catch_no_file (load_adv_modes adv_modes_filename) StringMap.empty | ||
93 | 97 | ||
94 | let noun_type proper lemma pos = | 98 | let noun_type proper lemma pos = |
95 | let nsyn = | 99 | let nsyn = |
@@ -347,6 +351,7 @@ let clarify_categories proper cat = function | @@ -347,6 +351,7 @@ let clarify_categories proper cat = function | ||
347 | | lemma,"interp",[] -> [{empty_cats with lemma=lemma; pos="interp"; pos2="interp"}] | 351 | | lemma,"interp",[] -> [{empty_cats with lemma=lemma; pos="interp"; pos2="interp"}] |
348 | | lemma,"unk",[] -> | 352 | | lemma,"unk",[] -> |
349 | [{empty_cats with lemma=lemma; pos="unk"; pos2="noun"; numbers=all_numbers; cases=all_cases; genders=all_genders; persons=["ter"]}] | 353 | [{empty_cats with lemma=lemma; pos="unk"; pos2="noun"; numbers=all_numbers; cases=all_cases; genders=all_genders; persons=["ter"]}] |
354 | + | lemma,"html-tag",[] -> [{empty_cats with lemma=lemma; pos="html-tag"; pos2="html-tag"}] | ||
350 | | lemma,c,l -> failwith ("clarify_categories: " ^ lemma ^ ":" ^ c ^ ":" ^ (String.concat ":" (Xlist.map l (String.concat ".")))) | 355 | | lemma,c,l -> failwith ("clarify_categories: " ^ lemma ^ ":" ^ c ^ ":" ^ (String.concat ":" (Xlist.map l (String.concat ".")))) |
351 | 356 | ||
352 | (* FIXME: przenieść gdzieś indziej *) | 357 | (* FIXME: przenieść gdzieś indziej *) |
@@ -547,4 +552,5 @@ let pos_categories = Xlist.fold [ | @@ -547,4 +552,5 @@ let pos_categories = Xlist.fold [ | ||
547 | "burk",[Lemma;]; | 552 | "burk",[Lemma;]; |
548 | "interp",[Lemma;]; | 553 | "interp",[Lemma;]; |
549 | "unk",[Lemma;Number;Case;Gender;Person;]; | 554 | "unk",[Lemma;Number;Case;Gender;Person;]; |
555 | + "html-tag",[Lemma;]; | ||
550 | ] StringMap.empty (fun map (k,l) -> StringMap.add map k l) | 556 | ] StringMap.empty (fun map (k,l) -> StringMap.add map k l) |
LCGlexicon/TODO
1 | +- poprawić parser.ml tak by łączył się sieciowo z subsyntax | ||
1 | 2 | ||
2 | "Można było" - brakuje uzgodnienia rodzaju przymiotnika w przypadku predykatywnym, i ogólnie kontroli składniowej | 3 | "Można było" - brakuje uzgodnienia rodzaju przymiotnika w przypadku predykatywnym, i ogólnie kontroli składniowej |
3 | 4 |
LCGlexicon/makefile
@@ -4,7 +4,8 @@ OCAMLDEP=ocamldep | @@ -4,7 +4,8 @@ OCAMLDEP=ocamldep | ||
4 | INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I +eniam | 4 | INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I +eniam |
5 | OCAMLFLAGS=$(INCLUDES) -g | 5 | OCAMLFLAGS=$(INCLUDES) -g |
6 | OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-lcg-parser.cmxa eniam-lcg-lexicon.cmxa | 6 | OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-lcg-parser.cmxa eniam-lcg-lexicon.cmxa |
7 | -OCAMLOPTFLAGS2=$(OCAMLOPTFLAGS) eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa eniam-lexSemantics.cmxa | 7 | +OCAMLOPTFLAGS2=$(OCAMLOPTFLAGS) eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa |
8 | +OCAMLOPTFLAGS3=$(OCAMLOPTFLAGS2) eniam-lexSemantics.cmxa | ||
8 | INSTALLDIR=`ocamlc -where`/eniam | 9 | INSTALLDIR=`ocamlc -where`/eniam |
9 | 10 | ||
10 | SOURCES= ENIAM_LCGlexiconTypes.ml ENIAMcategoriesPL.ml ENIAM_LCGlexiconParser.ml ENIAM_LCGlexicon.ml | 11 | SOURCES= ENIAM_LCGlexiconTypes.ml ENIAMcategoriesPL.ml ENIAM_LCGlexiconParser.ml ENIAM_LCGlexicon.ml |
@@ -40,10 +41,14 @@ test: test.ml | @@ -40,10 +41,14 @@ test: test.ml | ||
40 | 41 | ||
41 | test2: test2.ml | 42 | test2: test2.ml |
42 | mkdir -p results | 43 | mkdir -p results |
43 | - $(OCAMLOPT) -o test2 $(OCAMLOPTFLAGS2) test2.ml | 44 | + $(OCAMLOPT) -o test2 $(OCAMLOPTFLAGS3) test2.ml |
44 | 45 | ||
45 | interface: interface.ml | 46 | interface: interface.ml |
46 | - $(OCAMLOPT) -o parser $(OCAMLOPTFLAGS2) interface.ml | 47 | + $(OCAMLOPT) -o parser $(OCAMLOPTFLAGS3) interface.ml |
48 | + | ||
49 | +parser: parser.ml | ||
50 | + mkdir -p results | ||
51 | + $(OCAMLOPT) -o parser $(OCAMLOPTFLAGS2) parser.ml | ||
47 | 52 | ||
48 | print_lexicon: ENIAM_LCGlexiconLatexOf.ml | 53 | print_lexicon: ENIAM_LCGlexiconLatexOf.ml |
49 | mkdir -p results | 54 | mkdir -p results |
LCGlexicon/parser.ml
0 → 100644
1 | +open Xstd | ||
2 | +open ENIAMsubsyntaxTypes | ||
3 | + | ||
(* LCG lexicon rules, built once at module initialisation from the user
   lexicon file [ENIAM_LCGlexiconTypes.user_lexicon_filename].
   NOTE(review): the meaning of the [false] flag is defined by
   [ENIAM_LCGlexicon.make_rules] and is not visible from this file - confirm
   before changing it. *)
let rules = ENIAM_LCGlexicon.make_rules false ENIAM_LCGlexiconTypes.user_lexicon_filename
5 | + | ||
(* [load_senses_map filename] folds over the rows of [filename] with
   [File.fold_tab] and returns a map from a lemma to the list of categories
   listed for it (one [lemma; cat] row per category, a lemma may repeat).
   Fails with "load_senses_map: <row>" when a row does not consist of exactly
   two fields. *)
let load_senses_map filename =
  File.fold_tab filename StringMap.empty (fun map -> function
      [lemma;cat] -> StringMap.add_inc map lemma [cat] (fun l -> cat :: l)
    | l -> failwith ("load_senses_map: " ^ String.concat "\t" l))

(* Sense dictionary read from the user data directory at module initialisation.
   The file is optional user data, so it is loaded through [File.catch_no_file]
   exactly like the other optional resources of the project; when the file is
   absent every lemma simply falls back to the default sense used by
   [clarify_categories].
   NOTE(review): [File.catch_no_file load default] is assumed to return
   [default] when the file does not exist (this is how it replaces the former
   [try ... with _ -> default] loaders) - confirm against xlib. *)
let senses_map =
  File.catch_no_file
    (fun _ -> load_senses_map ENIAM_LCGlexiconTypes.user_senses_filename)
    StringMap.empty
12 | + | ||
13 | + | ||
(* Built-in test sentences as [(name, text)] pairs; [name] is the prefix that a
   driver passes on to [parse_text]. In this file the list is referenced only
   by a commented-out driver, and the first entry is disabled. *)
let examples = [
  (* "liceum","W 1984-89 uczęszczał do VII Liceum Ogólnokształcącego im. K.K. Baczyńskiego w Szczecinie."; *)
  "studia","Następnie studiował architekturę na Politechnice Szczecińskiej, dyplom uzyskał w 1994.";
]
18 | + | ||
(* [clarify_categories token] returns the category records of a token:
   - [Lemma]: senses come from [senses_map] (default [["X"]] when the lemma is
     not listed) and every interpretation is expanded as a common word;
   - [Proper]: the senses carried by the token are used and every
     interpretation is expanded as a proper name;
   - [Interp]: a single "interp" entry with the default sense;
   - any other token kind yields no categories. *)
let clarify_categories token =
  (* Expand each interpretation separately and concatenate the results. *)
  let expand proper senses lemma pos interps =
    List.flatten (Xlist.map interps (fun interp ->
      ENIAMcategoriesPL.clarify_categories proper senses (lemma,pos,interp))) in
  match token.ENIAMtokenizerTypes.token with
  | ENIAMtokenizerTypes.Lemma(lemma,pos,interps) ->
      let senses = try StringMap.find senses_map lemma with Not_found -> ["X"] in
      expand false senses lemma pos interps
  | ENIAMtokenizerTypes.Proper(lemma,pos,interps,senses) ->
      expand true senses lemma pos interps
  | ENIAMtokenizerTypes.Interp lemma ->
      ENIAMcategoriesPL.clarify_categories false ["X"] (lemma,"interp",[])
  | _ -> []
27 | + | ||
(* [create_chart tokens paths last] builds the initial LCG chart of a sentence.
   [paths] is a list of [(id, lnode, rnode)] edges, [id] indexes the token in
   the [tokens] array and [last] is passed to [ENIAM_LCGchart.make].
   For every edge the lexicon entries produced by [rules] for the token's
   categories are added between [lnode] and [rnode].
   The renderer's variable numbering is reset once per chart, while variable
   names are reset and numbers advanced once per edge, before the entries
   are created. *)
let create_chart tokens paths last =
  ENIAM_LCGrenderer.reset_variable_numbers ();
  let add_edge chart (id,lnode,rnode) =
    let token = ExtArray.get tokens id in
    ENIAM_LCGrenderer.reset_variable_names ();
    ENIAM_LCGrenderer.add_variable_numbers ();
    let cats = clarify_categories token in
    let entries =
      ENIAM_LCGlexicon.create_entries rules id token.ENIAMtokenizerTypes.orth cats [] in
    ENIAM_LCGchart.add_inc_list chart lnode rnode entries 0 in
  Xlist.fold paths (ENIAM_LCGchart.make last) add_edge
39 | + | ||
(* [test_example name tokens paths last] parses one sentence and dumps every
   intermediate stage into the "results/" directory, using [name] as the
   file-name prefix:
   1. the initial chart,
   2. the lazified chart and its references,
   3. the chart and references after parsing,
   4. the parsed term and the dependency tree obtained by reduction,
   5. the dependency tree after label assignment,
   6. the dependency tree after cut removal (LaTeX and graph renderings).
   Prints "not parsed" when the chart has no parse and "not reduced" when the
   reduction does not give a reduced dependency tree.
   NOTE(review): [30.] together with [Sys.time] looks like a time limit in
   seconds for [ENIAM_LCGchart.parse] - confirm. *)
let test_example name tokens paths last =
  ENIAM_LCGreductions.reset_variant_label ();
  let chart = create_chart tokens paths last in
  ENIAM_LCGlatexOf.print_chart "results/" (name^"1_chart") "a1" chart;
  let chart,references = ENIAM_LCGchart.lazify chart in
  ENIAM_LCGlatexOf.print_chart "results/" (name^"2_chart") "a4" chart;
  ENIAM_LCGlatexOf.print_references "results/" (name^"2_references") "a4" references;
  let chart = ENIAM_LCGchart.parse chart references 30. Sys.time in (* note: [references] is implicitly mutated in place *)
  ENIAM_LCGlatexOf.print_chart "results/" (name^"3_chart") "a4" chart;
  ENIAM_LCGlatexOf.print_references "results/" (name^"3_references") "a4" references;
  if ENIAM_LCGchart.is_parsed chart then (
    let term = ENIAM_LCGchart.get_parsed_term chart in
    Xlatex.latex_file_out "results/" (name^"4_term") "a4" false (fun file ->
      Printf.fprintf file "\\[%s\\]\n" (ENIAM_LCGlatexOf.linear_term 0 term));
    Xlatex.latex_compile_and_clean "results/" (name^"4_term");
    let dependency_tree = ENIAM_LCGreductions.reduce term references in
    ENIAM_LCGlatexOf.print_dependency_tree "results/" (name^"4_dependency_tree") "a0" dependency_tree;
    if ENIAM_LCGreductions.is_reduced_dependency_tree dependency_tree then (
      ENIAM_LCGreductions.assign_labels dependency_tree; (* note: [dependency_tree] is implicitly mutated in place *)
      ENIAM_LCGlatexOf.print_dependency_tree "results/" (name^"5_dependency_tree") "a4" dependency_tree;
      ENIAM_LCGreductions.remove_cuts dependency_tree; (* note: [dependency_tree] is implicitly mutated in place *)
      ENIAM_LCGlatexOf.print_dependency_tree "results/" (name^"6_dependency_tree") "a4" dependency_tree;
      ENIAM_LCGgraphOf.print_dependency_tree "results/" (name^"6_dependency_tree") dependency_tree;
      ENIAM_LCGgraphOf.print_simplified_dependency_tree "results/" (name^"6_simple_dependency_tree") dependency_tree;
      ())
    else print_endline "not reduced")
  else print_endline "not parsed"
67 | + | ||
(* [parse_sentence name id tokens sentence] runs [test_example] on every
   [StructSentence] found in [sentence], descending into quoted and
   alternative sentences. [id] numbers the parsed sentences: each one is
   dumped under the prefix [name ^ string_of_int id ^ "_"] and the counter
   is then incremented. Raw and dependency sentences are skipped.
   Returns the next free counter value. *)
let rec parse_sentence name id tokens sentence =
  let parse_nested id nested = parse_sentence name id tokens nested in
  match sentence with
  | RawSentence _ | DepSentence _ -> id
  | StructSentence(paths,last) ->
      test_example (name ^ string_of_int id ^ "_") tokens paths last;
      id + 1
  | QuotedSentences sentences ->
      Xlist.fold sentences id (fun id p -> parse_nested id p.sentence)
  | AltSentence alternatives ->
      Xlist.fold alternatives id (fun id (_,nested) -> parse_nested id nested)
80 | + | ||
(* [parse_paragraph name id tokens paragraph] parses every sentence of a
   structured paragraph with [parse_sentence], threading the sentence counter
   [id] through, and descends into alternative paragraphs. Raw paragraphs are
   skipped. Returns the next free counter value. *)
let rec parse_paragraph name id tokens paragraph =
  match paragraph with
  | RawParagraph _ -> id
  | StructParagraph sentences ->
      Xlist.fold sentences id (fun id p ->
        parse_sentence name id tokens p.sentence)
  | AltParagraph alternatives ->
      Xlist.fold alternatives id (fun id (_,nested) ->
        parse_paragraph name id tokens nested)
89 | + | ||
(* [parse_text name id tokens text] parses every paragraph of a structured
   text with [parse_paragraph], threading the sentence counter [id] through,
   and descends into alternative texts. Raw text is skipped.
   Returns the next free counter value. *)
let rec parse_text name id tokens text =
  match text with
  | RawText _ -> id
  | StructText paragraphs ->
      Xlist.fold paragraphs id (fun id paragraph ->
        parse_paragraph name id tokens paragraph)
  | AltText alternatives ->
      Xlist.fold alternatives id (fun id (_,nested) ->
        parse_text name id tokens nested)
98 | + | ||
99 | + | ||
100 | +(* let _ = | ||
101 | + Xlist.iter examples (fun (name,example) -> | ||
102 | + let text,tokens = ENIAMsubsyntax.parse_text example in | ||
103 | + ignore(parse_text name 1 tokens text)) *) | ||
104 | + | ||
105 | +(* | ||
106 | +type entry = {title: string; info:string; biogram:string; (*primary:string; secondary:string;*) author:string} | ||
107 | + | ||
108 | +let process_xml = function | ||
109 | + Xml.Element("entries",[],entries) -> | ||
110 | + List.rev (Xlist.rev_map entries (function | ||
111 | + Xml.Element("entry",[],[title;info;biogram(*;primary;secondary*);author]) -> | ||
112 | + {title=Xml.to_string title; info=Xml.to_string info; biogram=Xml.to_string biogram; | ||
113 | + (*primary=Xml.to_string primary; secondary=Xml.to_string secondary;*) author=Xml.to_string author} | ||
114 | + | _ -> failwith "process_xml 1")) | ||
115 | + | _ -> failwith "process_xml 2" | ||
116 | + | ||
117 | + | ||
118 | +let load_ppibl filename = | ||
119 | + let ppibl = File.load_file_gen ("data/" ^ filename) in | ||
120 | + process_xml (Xml.parse_string ppibl) | ||
121 | + | ||
122 | +let named_entities = | ||
123 | + File.fold_tab "data/ne.tab" StringMap.empty (fun map -> function | ||
124 | + [lemma;cat] -> StringMap.add_inc map lemma [cat] (fun l -> cat :: l) | ||
125 | + | _ -> failwith "named_entities") | ||
126 | + | ||
127 | +let assign_named_entities t = | ||
128 | + match t.token with | ||
129 | + Lemma(lemma,"subst",interp) -> | ||
130 | + (try | ||
131 | + let cat = StringMap.find named_entities lemma in | ||
132 | + {t with token=Proper(lemma,"subst",interp,cat)} | ||
133 | + with Not_found -> t) | ||
134 | + | Proper(lemma,"subst",interp,_) -> | ||
135 | + (try | ||
136 | + let cat = StringMap.find named_entities lemma in | ||
137 | + {t with token=Proper(lemma,"subst",interp,cat)} | ||
138 | + with Not_found -> t) | ||
139 | + | _ -> t | ||
140 | + | ||
141 | +let test_strings = [ | ||
142 | + (* "Debiutował opowiadaniem pt. <i>Zlecenie na dostawę</i>."; *) | ||
143 | + "W 1984-89 uczęszczał do VII Liceum Ogólnokształcącego im. K.K. Baczyńskiego w Szczecinie."; | ||
144 | + (* "Następnie studiował architekturę na Politechnice Szczecińskiej, dyplom uzyskał w 1994." *) | ||
145 | + (* "W 2003 obronił doktorat nauk technicznych w zakresie architektury i urbanistyki na Politechnice Krakowskiej i został adiunktem w Zakładzie Teorii Architektury, Historii i Konserwacji Zabytków IAiPP." *) | ||
146 | +] | ||
147 | + | ||
148 | +(* let _ = | ||
149 | + let entries = load_ppibl "ak322269.xml" in | ||
150 | + Xlist.iter entries (fun entry -> print_endline entry.biogram) *) | ||
151 | + | ||
152 | +(* | ||
153 | +let test_strings = [ | ||
154 | + "Szpak frunie."; | ||
155 | + "Kot np. miauczy."; | ||
156 | + "Ala ma kota."; | ||
157 | + "Ale mają kota:" | ||
158 | + ] | ||
159 | + | ||
160 | +let test_strings2 = [ | ||
161 | + "Szpak frunie. Kot miauczy."; | ||
162 | + "Szpak powiedział: „Frunę. Kiszę.”"; | ||
163 | + ] | ||
164 | +*) | ||
165 | + | ||
166 | +let grammar = [ | ||
167 | + "pos=year", Basic "year",symbol_weight; | ||
168 | + "pos=year-interval", Basic "year-interval",symbol_weight; | ||
169 | + "lemma=w,pos=prep,case=loc", Basic "time/(year+year-interval)",0.; | ||
170 | + "lemma=w,pos=prep,case=loc", Basic "locat/np*MIASTO*T*loc*T",0.; | ||
171 | + | ||
172 | + "lemma=uczęszczać,pos=praet|fin,person=ter,negation=aff,mood=indicative", Basic "ip*number*gender{|(1+time),|(1+pp*ORGANIZACJA*do*gen),|(1+locat)}",0.; | ||
173 | + "lemma=do,pos=prep,case=gen", Basic "pp*sense*lemma*case/np*sense*T*case*T",0.; | ||
174 | + | ||
175 | +] | ||
176 | + | ||
177 | +let _ = | ||
178 | + print_endline "Testy wbudowane"; | ||
179 | + Xlist.iter test_strings (fun s -> | ||
180 | + print_endline ("\nTEST: " ^ s); | ||
181 | + let paths = ENIAMsubsyntax.parse s in | ||
182 | + let paths = Xlist.map paths assign_named_entities in | ||
183 | + (* print_endline (ENIAMtokenizer.xml_of tokens); *) | ||
184 | + print_endline (ENIAMpaths.to_string (paths,0))); | ||
185 | +(* Xlist.iter test_strings2 (fun s -> | ||
186 | + print_endline ("\nTEST: " ^ s); | ||
187 | + let text,tokens = ENIAMsubsyntax.parse_text s in | ||
188 | + (* print_endline (ENIAMtokenizer.xml_of tokens); *) | ||
189 | + print_endline (ENIAMsubsyntaxStringOf.tokens tokens); | ||
190 | + print_endline ""; | ||
191 | + print_endline (ENIAMsubsyntaxStringOf.text "" tokens text));*) | ||
192 | +(* print_endline "Testy użytkownika."; | ||
193 | + print_endline "Wpisz tekst i naciśnij ENTER, pusty tekst kończy."; | ||
194 | + let s = ref (read_line ()) in | ||
195 | + while !s <> "" do | ||
196 | + let tokens = ENIAMtokenizer.parse !s in | ||
197 | + (* print_endline (ENIAMtokenizer.xml_of tokens); *) | ||
198 | + Xlist.iter tokens (fun token -> print_endline (ENIAMtokenizer.string_of 0 token)); | ||
199 | + print_endline "Wpisz tekst i naciśnij ENTER, pusty tekst kończy."; | ||
200 | + s := read_line () | ||
201 | + done;*) | ||
202 | + () | ||
203 | + | ||
204 | +open ENIAM_LCGlexiconTypes | ||
205 | +open ENIAM_LCGtypes | ||
206 | + | ||
207 | + | ||
208 | +(* | ||
209 | +type output = Text | Xml | Html | Marsh | Graphviz | ||
210 | + | ||
211 | +let output = ref Text | ||
212 | +let comm_stdio = ref true | ||
213 | +let sentence_split = ref true | ||
214 | +let port = ref 0 | ||
215 | + | ||
216 | +let spec_list = [ | ||
217 | + "-s", Arg.Unit (fun () -> sentence_split:=true), "Split input into sentences (default)"; | ||
218 | + "-n", Arg.Unit (fun () -> sentence_split:=false), "Do not split input into sentences"; | ||
219 | + "-i", Arg.Unit (fun () -> comm_stdio:=true), "Communication using stdio (default)"; | ||
220 | + "-p", Arg.Int (fun p -> comm_stdio:=false; port:=p), "<port> Communication using sockets on given port number"; | ||
221 | + "-t", Arg.Unit (fun () -> output:=Text), "Output as plain text (default)"; | ||
222 | + "-x", Arg.Unit (fun () -> output:=Xml), "Output as XML"; | ||
223 | + "-m", Arg.Unit (fun () -> output:=Marsh), "Output as marshalled Ocaml data structure"; | ||
224 | + "-h", Arg.Unit (fun () -> output:=Html), "Output as HTML"; | ||
225 | + "-g", Arg.Unit (fun () -> output:=Graphviz; sentence_split:=false), "Output as graphviz dot file; turns sentence split off"; | ||
226 | + (* "-r", Arg.String (fun p -> | ||
227 | + ENIAMtokenizerTypes.set_resource_path p; | ||
228 | + ENIAMmorphologyTypes.set_resource_path p; | ||
229 | + ENIAMsubsyntaxTypes.set_resource_path p), "<path> Set resource path"; *) | ||
230 | + ] | ||
231 | + | ||
232 | +let usage_msg = | ||
233 | + "Usage: subsyntax <options>\nInput is a sequence of lines. Empty line ends the sequence and invoke parsing. Double empty line shutdown parser.\nOptions are:" | ||
234 | +*)*) | ||
(* Start-up banner (program name and copyright notice); the entry point of this
   file writes it to stderr. The trailing backslashes continue the string
   literal on the next line without adding a line break of their own. *)
let message = "ENIAM_LCGparser, a parser for Logical Categorial Grammar formalism\n\
Copyright (C) 2017 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>\n\
Copyright (C) 2017 Institute of Computer Science Polish Academy of Sciences"
238 | +(* | ||
239 | +let anon_fun s = raise (Arg.Bad ("invalid argument: " ^ s)) | ||
240 | +*) | ||
(* [input_text channel] reads consecutive lines from [channel] up to the first
   empty line or the end of input (the terminating empty line is consumed)
   and returns them joined with "\n". The result is "" when the very first
   line is empty or the channel is already exhausted. *)
let input_text channel =
  (* End of input is reported as an empty line, i.e. as a terminator. *)
  let next_line () = try input_line channel with End_of_file -> "" in
  let rec collect acc =
    match next_line () with
    | "" -> String.concat "\n" (List.rev acc)
    | line -> collect (line :: acc) in
  collect []
249 | + | ||
(* [main_loop in_chan out_chan] repeatedly reads one block of text from
   [in_chan] with [input_text], runs subsyntax on it and parses every sentence
   with [parse_text] (file-name prefix "E", sentence counter starting at 1),
   then flushes [out_chan]. The loop ends when an empty block is read. *)
let rec main_loop in_chan out_chan =
  match input_text in_chan with
  | "" -> ()
  | raw ->
      let text,tokens = ENIAMsubsyntax.parse_text raw in
      ignore (parse_text "E"(*name*) 1 tokens text);
      (* Disabled debugging and output code, kept for reference: *)
      (* print_endline "input text begin";
         print_endline text;
         print_endline "input text end"; *)
      (*if !sentence_split then
          let text,tokens = ENIAMsubsyntax.parse_text text in
          (match !output with
             Text -> output_string out_chan (ENIAMsubsyntaxStringOf.text "" tokens text ^ "\n" ^ ENIAMsubsyntaxStringOf.token_extarray tokens ^ "\n\n")
           | Xml -> output_string out_chan (Xml.to_string (ENIAMsubsyntaxXMLof.text_and_tokens text tokens) ^ "\n\n")
           | Html -> output_string out_chan (ENIAMsubsyntaxHTMLof.text_and_tokens text tokens ^ "\n\n")
           | Marsh -> Marshal.to_channel out_chan (text,tokens) []
           | Graphviz -> failwith "main_loop: ni")
        else
          let tokens = ENIAMsubsyntax.parse text in
          (match !output with
             Text -> output_string out_chan (ENIAMsubsyntaxStringOf.token_list tokens ^ "\n\n")
           | Xml -> output_string out_chan (Xml.to_string (ENIAMsubsyntaxXMLof.token_list tokens) ^ "\n\n")
           | Html -> output_string out_chan (ENIAMsubsyntaxHTMLof.token_list tokens ^ "\n\n")
           | Marsh -> Marshal.to_channel out_chan tokens []
           | Graphviz -> output_string out_chan (ENIAMsubsyntaxGraphOf.token_list tokens ^ "\n\n"))*)
      flush out_chan;
      main_loop in_chan out_chan
276 | + | ||
(* Program entry point: prints the banner to stderr, compacts the heap,
   announces readiness on stderr and serves requests on stdin/stdout.
   Command-line parsing and the socket server are disabled for now. *)
let () =
  prerr_endline message;
  (* Arg.parse spec_list anon_fun usage_msg; *)
  Gc.compact ();
  prerr_endline "Ready!";
  (*if !comm_stdio then*)
  main_loop stdin stdout
  (*else
    let sockaddr = Unix.ADDR_INET(Unix.inet_addr_any,!port) in
    Unix.establish_server main_loop sockaddr*)
subsyntax/ENIAM_MWE.ml
@@ -40,7 +40,7 @@ let process_interp lemma interp = | @@ -40,7 +40,7 @@ let process_interp lemma interp = | ||
40 | | s -> if String.get s 0 = '$' then failwith ("process_interp: " ^ s) else V s)) | 40 | | s -> if String.get s 0 = '$' then failwith ("process_interp: " ^ s) else V s)) |
41 | | _ -> failwith "process_interp" | 41 | | _ -> failwith "process_interp" |
42 | 42 | ||
43 | -let load_mwe_dict dict filename = | 43 | +let load_mwe_dict filename dict = |
44 | File.fold_tab filename dict (fun dict -> function | 44 | File.fold_tab filename dict (fun dict -> function |
45 | [orths; lemma; interp] -> | 45 | [orths; lemma; interp] -> |
46 | let orths = Xstring.split " " orths in | 46 | let orths = Xstring.split " " orths in |
@@ -60,7 +60,7 @@ let process_orth = function | @@ -60,7 +60,7 @@ let process_orth = function | ||
60 | | [Lexer.B("{","}",l)] -> O(Lexer.string_of_token_list l) | 60 | | [Lexer.B("{","}",l)] -> O(Lexer.string_of_token_list l) |
61 | | tokens -> failwith ("process_orth: " ^ Lexer.string_of_token_list tokens) | 61 | | tokens -> failwith ("process_orth: " ^ Lexer.string_of_token_list tokens) |
62 | 62 | ||
63 | -let load_mwe_dict2 (dict,dict2) filename = | 63 | +let load_mwe_dict2 filename (dict,dict2) = |
64 | File.fold_tab filename (dict,dict2) (fun (dict,dict2) -> function | 64 | File.fold_tab filename (dict,dict2) (fun (dict,dict2) -> function |
65 | [orths; lemma] -> | 65 | [orths; lemma] -> |
66 | (* print_endline (orths ^ "\t" ^ lemma); *) | 66 | (* print_endline (orths ^ "\t" ^ lemma); *) |
@@ -84,12 +84,13 @@ let load_mwe_dict2 (dict,dict2) filename = | @@ -84,12 +84,13 @@ let load_mwe_dict2 (dict,dict2) filename = | ||
84 | | l -> failwith ("load_mwe_dict2 '" ^ String.concat "\t" l ^ "'")) | 84 | | l -> failwith ("load_mwe_dict2 '" ^ String.concat "\t" l ^ "'")) |
85 | 85 | ||
86 | let mwe_dict,mwe_dict2 = | 86 | let mwe_dict,mwe_dict2 = |
87 | - let dict = load_mwe_dict StringMap.empty brev_filename in | ||
88 | - let dict = try load_mwe_dict dict fixed_filename with _ -> (prerr_endline ("ENIAMsubsyntax file " ^ fixed_filename ^ " not found"); dict) in | ||
89 | - let dict = load_mwe_dict dict mwe_filename in | ||
90 | - let dict,dict2 = load_mwe_dict2 (dict,StringMap.empty) sejf_filename in | ||
91 | - let dict,dict2 = load_mwe_dict2 (dict,dict2) sejfek_filename in | ||
92 | - let dict,dict2 = load_mwe_dict2 (dict,dict2) sawa_filename in | 87 | + let dict = File.catch_no_file (load_mwe_dict brev_filename) StringMap.empty in |
88 | + let dict = File.catch_no_file (load_mwe_dict fixed_filename) dict in | ||
89 | + let dict = File.catch_no_file (load_mwe_dict mwe_filename) dict in | ||
90 | + let dict,dict2 = File.catch_no_file (load_mwe_dict2 sejf_filename) (dict,StringMap.empty) in | ||
91 | + let dict,dict2 = File.catch_no_file (load_mwe_dict2 sejfek_filename) (dict,dict2) in | ||
92 | + let dict,dict2 = File.catch_no_file (load_mwe_dict2 sawa_filename) (dict,dict2) in | ||
93 | + let dict,dict2 = File.catch_no_file (load_mwe_dict2 mwe2_filename) (dict,dict2) in | ||
93 | dict,dict2 | 94 | dict,dict2 |
94 | 95 | ||
95 | let get_orths paths = | 96 | let get_orths paths = |
@@ -223,7 +224,7 @@ let create_token (matching:token_env list) sels lemma cat interp = (* FIXME: pro | @@ -223,7 +224,7 @@ let create_token (matching:token_env list) sels lemma cat interp = (* FIXME: pro | ||
223 | next=t.next; | 224 | next=t.next; |
224 | token=Lemma(lemma,cat,[Xlist.map interp (function | 225 | token=Lemma(lemma,cat,[Xlist.map interp (function |
225 | S s -> (try Xlist.assoc sels s with Not_found -> ["_"]) | 226 | S s -> (try Xlist.assoc sels s with Not_found -> ["_"]) |
226 | - | V s -> [s] | 227 | + | V s -> Xstring.split "\\." s |
227 | | G -> ["_"])]); | 228 | | G -> ["_"])]); |
228 | weight=0.; (* FIXME: dodać wagi do konkretnych reguł i uwzględnić wagi maczowanych tokenów *) | 229 | weight=0.; (* FIXME: dodać wagi do konkretnych reguł i uwzględnić wagi maczowanych tokenów *) |
229 | attrs=ENIAMtokens.merge_attrs l} | 230 | attrs=ENIAMtokens.merge_attrs l} |
subsyntax/ENIAMsubsyntax.ml
@@ -21,16 +21,16 @@ open ENIAMsubsyntaxTypes | @@ -21,16 +21,16 @@ open ENIAMsubsyntaxTypes | ||
21 | open ENIAMtokenizerTypes | 21 | open ENIAMtokenizerTypes |
22 | open Xstd | 22 | open Xstd |
23 | 23 | ||
24 | -let load_lemma_frequencies filename = | 24 | +let load_lemma_frequencies filename map = |
25 | let l = Str.split_delim (Str.regexp "\n") (File.load_file filename) in | 25 | let l = Str.split_delim (Str.regexp "\n") (File.load_file filename) in |
26 | - Xlist.fold l StringMap.empty (fun map line -> | 26 | + Xlist.fold l map (fun map line -> |
27 | if String.length line = 0 then map else | 27 | if String.length line = 0 then map else |
28 | if String.get line 0 = '#' then map else | 28 | if String.get line 0 = '#' then map else |
29 | match Str.split_delim (Str.regexp "\t") line with | 29 | match Str.split_delim (Str.regexp "\t") line with |
30 | [count; lemma; cat] -> StringMap.add map (lemma ^ "\t" ^ cat) (log10 (float_of_string count +. 1.)) | 30 | [count; lemma; cat] -> StringMap.add map (lemma ^ "\t" ^ cat) (log10 (float_of_string count +. 1.)) |
31 | | _ -> failwith ("load_lemma_frequencies: " ^ line)) | 31 | | _ -> failwith ("load_lemma_frequencies: " ^ line)) |
32 | 32 | ||
33 | -let lemma_frequencies = load_lemma_frequencies lemma_frequencies_filename | 33 | +let lemma_frequencies = File.catch_no_file (load_lemma_frequencies lemma_frequencies_filename) StringMap.empty |
34 | 34 | ||
35 | let modify_weights paths = | 35 | let modify_weights paths = |
36 | List.rev (Xlist.fold paths [] (fun paths t -> | 36 | List.rev (Xlist.fold paths [] (fun paths t -> |
@@ -210,10 +210,13 @@ let load_proper_name proper = function | @@ -210,10 +210,13 @@ let load_proper_name proper = function | ||
210 | StringMap.add_inc proper lemma types (fun types2 -> types @ types2) | 210 | StringMap.add_inc proper lemma types (fun types2 -> types @ types2) |
211 | | l -> failwith ("proper_names: " ^ String.concat " " l) | 211 | | l -> failwith ("proper_names: " ^ String.concat " " l) |
212 | 212 | ||
213 | +let load_proper_names filename proper = | ||
214 | + File.fold_tab filename proper load_proper_name | ||
215 | + | ||
213 | let proper_names = | 216 | let proper_names = |
214 | - let proper = File.fold_tab proper_names_filename StringMap.empty load_proper_name in | ||
215 | - let proper = File.fold_tab proper_names_filename2 proper load_proper_name in | ||
216 | - let proper = File.fold_tab proper_names_filename3 proper load_proper_name in | 217 | + let proper = File.catch_no_file (load_proper_names proper_names_filename) StringMap.empty in |
218 | + let proper = File.catch_no_file (load_proper_names proper_names_filename2) proper in | ||
219 | + let proper = File.catch_no_file (load_proper_names proper_names_filename3) proper in | ||
217 | proper | 220 | proper |
218 | 221 | ||
219 | let remove l s = | 222 | let remove l s = |
subsyntax/ENIAMsubsyntaxTypes.ml
@@ -44,10 +44,15 @@ type text = | @@ -44,10 +44,15 @@ type text = | ||
44 | | StructText of paragraph list (* * token_record ExtArray.t*) (* akapity * tokeny *) | 44 | | StructText of paragraph list (* * token_record ExtArray.t*) (* akapity * tokeny *) |
45 | | AltText of (mode * text) list | 45 | | AltText of (mode * text) list |
46 | 46 | ||
47 | +let data_path = | ||
48 | + try Sys.getenv "ENIAM_USER_DATA_PATH" | ||
49 | + with Not_found -> "data" | ||
50 | + | ||
47 | let brev_filename = resource_path ^ "/subsyntax/brev.tab" | 51 | let brev_filename = resource_path ^ "/subsyntax/brev.tab" |
48 | let fixed_filename = resource_path ^ "/Walenty/fixed.tab" | 52 | let fixed_filename = resource_path ^ "/Walenty/fixed.tab" |
49 | -let complete_entries_filename = resource_path ^ "/subsyntax/complete_entries.tab" | ||
50 | -let mwe_filename = resource_path ^ "/subsyntax/mwe.tab" | 53 | +(* let complete_entries_filename = resource_path ^ "/subsyntax/complete_entries.tab" *) |
54 | +let mwe_filename = data_path ^ "/mwe.tab" | ||
55 | +let mwe2_filename = data_path ^ "/mwe2.tab" | ||
51 | let sawa_filename = resource_path ^ "/subsyntax/SAWA.dic" | 56 | let sawa_filename = resource_path ^ "/subsyntax/SAWA.dic" |
52 | let sejf_filename = resource_path ^ "/subsyntax/SEJF.dic" | 57 | let sejf_filename = resource_path ^ "/subsyntax/SEJF.dic" |
53 | let sejfek_filename = resource_path ^ "/subsyntax/SEJFEK.dic" | 58 | let sejfek_filename = resource_path ^ "/subsyntax/SEJFEK.dic" |
@@ -58,7 +63,7 @@ let lemma_frequencies_filename = resource_path ^ "/subsyntax/NKJP1M-lemma-freq.t | @@ -58,7 +63,7 @@ let lemma_frequencies_filename = resource_path ^ "/subsyntax/NKJP1M-lemma-freq.t | ||
58 | let proper_names_filename2 = resource_path ^ "/subsyntax/proper_names.tab" *) | 63 | let proper_names_filename2 = resource_path ^ "/subsyntax/proper_names.tab" *) |
59 | let proper_names_filename = resource_path ^ "/subsyntax/proper_names_sgjp_polimorf_20151020.tab" | 64 | let proper_names_filename = resource_path ^ "/subsyntax/proper_names_sgjp_polimorf_20151020.tab" |
60 | let proper_names_filename2 = resource_path ^ "/subsyntax/proper_names_20160104.tab" | 65 | let proper_names_filename2 = resource_path ^ "/subsyntax/proper_names_20160104.tab" |
61 | -let proper_names_filename3 = resource_path ^ "/subsyntax/ne.tab" | 66 | +let proper_names_filename3 = data_path ^ "/ne.tab" |
62 | 67 | ||
63 | let int_of_mode = function | 68 | let int_of_mode = function |
64 | Raw -> 0 | 69 | Raw -> 0 |
subsyntax/makefile
@@ -3,7 +3,7 @@ OCAMLOPT=ocamlopt | @@ -3,7 +3,7 @@ OCAMLOPT=ocamlopt | ||
3 | OCAMLDEP=ocamldep | 3 | OCAMLDEP=ocamldep |
4 | INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I +eniam | 4 | INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I +eniam |
5 | OCAMLFLAGS=$(INCLUDES) -g | 5 | OCAMLFLAGS=$(INCLUDES) -g |
6 | -OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa #eniam-subsyntax.cmxa | 6 | +OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa |
7 | INSTALLDIR=`ocamlc -where`/eniam | 7 | INSTALLDIR=`ocamlc -where`/eniam |
8 | 8 | ||
9 | SOURCES= ENIAMsubsyntaxTypes.ml ENIAMsubsyntaxStringOf.ml ENIAMsubsyntaxHTMLof.ml ENIAMsubsyntaxXMLof.ml ENIAMsubsyntaxGraphOf.ml ENIAMpaths.ml ENIAM_MWE.ml ENIAMsentences.ml ENIAMsubsyntax.ml | 9 | SOURCES= ENIAMsubsyntaxTypes.ml ENIAMsubsyntaxStringOf.ml ENIAMsubsyntaxHTMLof.ml ENIAMsubsyntaxXMLof.ml ENIAMsubsyntaxGraphOf.ml ENIAMpaths.ml ENIAM_MWE.ml ENIAMsentences.ml ENIAMsubsyntax.ml |
@@ -32,8 +32,8 @@ eniam-subsyntax.cma: $(SOURCES) | @@ -32,8 +32,8 @@ eniam-subsyntax.cma: $(SOURCES) | ||
32 | eniam-subsyntax.cmxa: $(SOURCES) | 32 | eniam-subsyntax.cmxa: $(SOURCES) |
33 | ocamlopt -linkall -a -o eniam-subsyntax.cmxa $(INCLUDES) $^ | 33 | ocamlopt -linkall -a -o eniam-subsyntax.cmxa $(INCLUDES) $^ |
34 | 34 | ||
35 | -test: $(SOURCES) test.ml | ||
36 | - $(OCAMLOPT) -o test $(OCAMLOPTFLAGS) $(SOURCES) test.ml | 35 | +test: test.ml |
36 | + $(OCAMLOPT) -o test $(OCAMLOPTFLAGS) test.ml | ||
37 | 37 | ||
38 | interface: interface.ml | 38 | interface: interface.ml |
39 | $(OCAMLOPT) -o subsyntax $(OCAMLOPTFLAGS) interface.ml | 39 | $(OCAMLOPT) -o subsyntax $(OCAMLOPTFLAGS) interface.ml |
subsyntax/resources/mwe.tab deleted
1 | -Akademii Sztuki w Szczecinie Akademia Sztuki w Szczecinie subst:sg:gen.dat.loc:f | ||
2 | -Atelier Bizio + Ligierko Atelier Bizio + Ligierko subst:sg:_:n2 | ||
3 | -Instytucie Architektury i Planowania Przestrzennego Instytut Architektury i Planowania Przestrzennego subst:sg:loc.voc:m3 | ||
4 | -Katedrze Architektury Współczesnej Teorii i Metodologii Projektowania Katedra Architektury Współczesnej Teorii i Metodologii Projektowania subst:sg:dat.loc:f | ||
5 | -VII Liceum Ogólnokształcącego im . K . K . Baczyńskiego VII Liceum Ogólnokształcące im. K.K. Baczyńskiego subst:sg:gen:m3 | ||
6 | -IV Liceum Ogólnokształcącego im . L . Szenwalda IV Liceum Ogólnokształcące im. L. Szenwalda subst:sg:gen:m3 | ||
7 | -Muzeum Narodowym Muzeum Narodowe subst:sg:inst.loc:n2 | ||
8 | -Nagrodę Artystyczną m . Szczecina Nagroda Artystyczna m. Szczecina subst:sg:acc:f | ||
9 | -Zachodniopomorskiego Nobla Zachodniopomorski Nobel subst:sg:acc.gen:m3 | ||
10 | -Politechnice Krakowskiej Politechnika Krakowska subst:sg:dat.loc:f | ||
11 | -Politechnice Szczecińskiej Politechnika Szczecińska subst:sg:dat.loc:f | ||
12 | -Politechniki Szczecińskiej Politechnika Szczecińska subst:sg:gen:f | ||
13 | -Pracowni Podstaw Projektowania Pracownia Podstaw Projektowania subst:sg:gen.dat.loc:f | ||
14 | -Przeglądu Teatrów Małych Form „ Kontrapunkt ” Przegląd Teatrów Małych Form „Kontrapunkt” subst:sg:gen:m3 | ||
15 | -Mistrzowską Szkołę Reżyserii Filmowej Andrzeja Wajdy Mistrzowska Szkoła Reżyserii Filmowej Andrzeja Wajdy subst:sg:acc:f | ||
16 | -Uniwersytecie im . M . Kopernika Uniwersytet im. M. Kopernika subst:sg:loc.voc:m3 | ||
17 | -Zachodniopomorski Uniwersytet Technologiczny Zachodniopomorski Uniwersytet Technologiczny subst:sg:acc.nom:m3 | ||
18 | -Wydziale Budownictwa i Architektury Wydział Budownictwa i Architektury subst:sg:loc.voc:m3 | ||
19 | -Wydziale Stuk Wizualnych Wydział Stuk Wizualnych subst:sg:loc.voc:m3 | ||
20 | -Zakładzie Teorii Architektury , Historii i Konserwacji Zabytków Zakład Teorii Architektury, Historii i Konserwacji Zabytków subst:sg:loc.voc:m3 | ||
21 | -Festiwalu Polskich Sztuk Współczesnych R @ Port Festiwalu Polskich Sztuk Współczesnych R@Port subst:sg:gen.loc.voc:m3 | ||
22 | -Arabia Saudyjska Arabia Saudyjska subst:sg:nom:f |
subsyntax/resources/ne.tab deleted
1 | -Akademia Sztuki ORGANIZACJA | ||
2 | -Atelier Bizio + Ligierko ORGANIZACJA | ||
3 | -Instytut Architektury i Planowania Przestrzennego ORGANIZACJA | ||
4 | -Katedra Architektury Współczesnej Teorii i Metodologii Projektowania ORGANIZACJA | ||
5 | -VII Liceum Ogólnokształcące im. K.K. Baczyńskiego ORGANIZACJA | ||
6 | -IV Liceum Ogólnokształcące im. L. Szenwalda ORGANIZACJA | ||
7 | -Muzeum Narodowe ORGANIZACJA | ||
8 | -Nagroda Artystyczna m. Szczecina WYRÓŻNIENIE | ||
9 | -Zachodniopomorski Nobel WYRÓŻNIENIE | ||
10 | -Politechnika Krakowska ORGANIZACJA | ||
11 | -Politechnika Szczecińska ORGANIZACJA | ||
12 | -Pracownia Podstaw Projektowania ORGANIZACJA | ||
13 | -Przegląd Teatrów Małych Form „Kontrapunkt” ORGANIZACJA | ||
14 | -Mistrzowska Szkoła Reżyserii Filmowej Andrzeja Wajdy ORGANIZACJA | ||
15 | -Uniwersytet im. M. Kopernika ORGANIZACJA | ||
16 | -Zachodniopomorski Uniwersytet Technologiczny ORGANIZACJA | ||
17 | -Wydział Budownictwa i Architektury ORGANIZACJA | ||
18 | -Wydział Stuk Wizualnych ORGANIZACJA | ||
19 | -Zakład Teorii Architektury, Historii i Konserwacji Zabytków ORGANIZACJA | ||
20 | -Festiwal Polskich Sztuk Współczesnych R@Port WYDARZENIE | ||
21 | -Sosnowiec MIASTO | ||
22 | -Stefan IMIĘ | ||
23 | -Józefa IMIĘ | ||
24 | -Szczecin MIASTO | ||
25 | -Waldemar IMIĘ | ||
26 | -Marzęcki NAZWISKO | ||
27 | -Austria KRAJ | ||
28 | -Czechy KRAJ | ||
29 | -Niemcy KRAJ | ||
30 | -Francja KRAJ | ||
31 | -Litwa KRAJ | ||
32 | -USA KRAJ | ||
33 | -Rosja KRAJ | ||
34 | - |
tokenizer/ENIAMacronyms.ml
@@ -19,9 +19,10 @@ | @@ -19,9 +19,10 @@ | ||
19 | 19 | ||
20 | open ENIAMtokenizerTypes | 20 | open ENIAMtokenizerTypes |
21 | 21 | ||
22 | +let load_mte mte_filename _ = File.load_lines mte_filename | ||
23 | + | ||
22 | let mte_patterns = | 24 | let mte_patterns = |
23 | - let lines = try File.load_lines mte_filename | ||
24 | - with _ -> (prerr_endline ("ENIAMtokenizer mte file " ^ mte_filename ^ " not found"); []) in | 25 | + let lines = File.catch_no_file (load_mte mte_filename) [] in |
25 | let l = List.rev (Xlist.rev_map lines (fun line -> | 26 | let l = List.rev (Xlist.rev_map lines (fun line -> |
26 | match Str.split (Str.regexp "\t") line with | 27 | match Str.split (Str.regexp "\t") line with |
27 | [orths; lemma; interp] -> Str.split (Str.regexp " ") orths, lemma, interp | 28 | [orths; lemma; interp] -> Str.split (Str.regexp " ") orths, lemma, interp |