Commit c420af232462c36c07b7ddbfae7daab85645724d
1 parent 9fe1accb
fix test_conll - print diagnostic data
Showing 12 changed files with 154 additions and 412 deletions
LCGparser/ENIAM_LCGrenderer.ml
... | ... | @@ -164,7 +164,7 @@ let rec make_raised_term_imp inner_node outer_node arg_symbol = function |
164 | 164 | | Tensor l -> |
165 | 165 | if outer_node.lemma="" then inner_node else |
166 | 166 | Node (add_args outer_node [Cut(SetAttr("ARG_SYMBOL",arg_symbol,inner_node))]) |
167 | - | _ -> failwith "make_raised_term_imp" | |
167 | + | c -> (print_endline (ENIAM_LCGstringOf.grammar_symbol 0 c); failwith "make_raised_term_imp") | |
168 | 168 | |
169 | 169 | let is_raised = function |
170 | 170 | [_,Imp(_,_,_)] -> true |
... | ... |
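The change above swaps the opaque catch-all failwith for a branch that first prints the unmatched grammar symbol, so the diagnostic output names the value that broke the match. Below is a minimal standalone sketch of the same pattern; the symbol type and string_of_symbol printer are hypothetical stand-ins for the grammar symbol type and ENIAM_LCGstringOf.grammar_symbol.

(* Sketch: print a readable rendering of the unexpected value before failing,
   so the log identifies the actual input that was not handled. *)
type symbol = Tensor of string list | Imp of string | Other of string

let string_of_symbol = function
  | Tensor l -> "Tensor(" ^ String.concat "*" l ^ ")"
  | Imp s -> "Imp(" ^ s ^ ")"
  | Other s -> s

let handle = function
  | Tensor l -> String.concat " " l
  | c ->
      (* print the offending constructor, then fail as before *)
      print_endline (string_of_symbol c);
      failwith "handle"

let () = print_endline (handle (Tensor ["np"; "nom"]))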
corpora/README
0 → 100644
1 | +ENIAMcorpora Version 1.0: | |
2 | +----------------------- | |
3 | + | |
4 | +ENIAMcorpora is a library that | |
5 | +- parses corpora into the CONLL format; | |
6 | +- converts dependency structures; | |
7 | +- tests the results of conversion. | |
8 | + | |
9 | +Install | |
10 | +------- | |
11 | + | |
12 | +ENIAMcorpora requires the OCaml compiler version 4.02.3 | |
13 | +together with Xlib library version 3.2 or later, | |
14 | +ENIAMtokenizer library version 1.1, ENIAMmorphology library version 1.1, | |
15 | +ENIAMsubsyntax library version 1.1, ENIAMintegration library version 1.0, | |
16 | +ENIAM_LCGparser library version 2.0, ENIAM_LCGlexicon library version 1.0, | |
17 | +ENIAMsemValence library version 1.0. | |
18 | + | |
19 | +In order to install, type: | |
20 | + | |
21 | +make install | |
22 | + | |
23 | +By default, ENIAMcorpora is installed in the 'ocamlc -where'/eniam directory. | |
24 | +You can change this by editing the Makefile. | |
25 | + | |
26 | +In order to test the library, type (Graphviz must be installed): | |
27 | +make test | |
28 | +./test | |
29 | + | |
30 | +By default, ENIAMcorpora looks for resources in the /usr/share/eniam directory. | |
31 | +However, this behaviour may be changed by setting and exporting the ENIAM_RESOURCE_PATH | |
32 | +environment variable. | |
33 | + | |
34 | +Credits | |
35 | +------- | |
36 | +Copyright © 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl> | |
37 | +Copyright © 2016 Daniel Oklesinski <oklesinski dot daniel atSPAMfree gmail dot com> | |
38 | +Copyright © 2016 Institute of Computer Science, Polish Academy of Sciences | |
39 | + | |
40 | +The library uses the following licensed resources: | |
41 | + | |
42 | +NKJP1M: the manually annotated 1-million word subcorpus sampled | |
43 | +from texts of a subset of the National Corpus of Polish. | |
44 | +version 1.2 | |
45 | + | |
46 | +SGJP: Grammatical Dictionary of Polish, version 20151020 | |
47 | +Copyright © 2007–2015 Zygmunt Saloni, Włodzimierz Gruszczyński, Marcin | |
48 | +Woliński, Robert Wołosz, Danuta Skowrońska | |
49 | + | |
50 | +Licence | |
51 | +------- | |
52 | + | |
53 | +This library is free software: you can redistribute it and/or modify | |
54 | +it under the terms of the GNU Lesser General Public License as published by | |
55 | +the Free Software Foundation, either version 3 of the License, or | |
56 | +(at your option) any later version. | |
57 | + | |
58 | +This library is distributed in the hope that it will be useful, | |
59 | +but WITHOUT ANY WARRANTY; without even the implied warranty of | |
60 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
61 | +GNU Lesser General Public License for more details. | |
62 | + | |
63 | +You should have received a copy of the GNU Lesser General Public License | |
64 | +along with this program. If not, see <http://www.gnu.org/licenses/>. | |
... | ... |
corpora/XmlPrinter.ml deleted
1 | -open Xstd | |
2 | -open WalTypes | |
3 | -open LCGtypes | |
4 | - | |
5 | -let gf_of_string = function | |
6 | - "subj" -> SUBJ | |
7 | - | "obj" -> OBJ | |
8 | - | "arg"(*""*) -> ARG | |
9 | - | "core" -> CORE | |
10 | - | "nosem" -> NOSEM | |
11 | - | "nogf" -> NOGF | |
12 | - | "adjunct" -> ADJUNCT | |
13 | - | "raised" -> RAISED | |
14 | - | "clause" -> CLAUSE | |
15 | - | "sentence" -> SENTENCE | |
16 | - | s -> prerr_endline s; SUBJ | |
17 | -(* | s -> failwith ("gf_of_string:" ^ s) *) | |
18 | - | |
19 | -(*let morf_of_string s = | |
20 | - let s = Str.split (Str.regexp "[()]") s in | |
21 | - WalParser.parse_morf_single (List.hd s, List.tl s)*) | |
22 | - | |
23 | -let rec lt_of_xml = function | |
24 | - Xml.Element("node",["pred",pred;"cat",cat;"weight",weight;"id",id],[ | |
25 | - Xml.Element("gs",[],[gs]); | |
26 | - Xml.Element("agf",[],[Xml.PCData agf]); | |
27 | - Xml.Element("amorf",[],[amorf]); | |
28 | - Xml.Element("attrs",[],attrs); | |
29 | - Xml.Element("args",[],[args])]) -> | |
30 | - Node{pred=pred; cat=cat; weight=float_of_string weight; id=int_of_string id; | |
31 | - gs = lt_of_xml gs; | |
32 | - agf = gf_of_string agf; (* FIXME *) | |
33 | - amorf = WalTypes.Phrase(WalTypes.Null); (* FIXME *) | |
34 | - arole = ""; (* FIXME *) | |
35 | - arole_attr = ""; (* FIXME *) | |
36 | - meaning = ""; (* FIXME *) | |
37 | - hipero = StringSet.empty; (* FIXME *) | |
38 | - meaning_weight = -1.; (* FIXME *) | |
39 | - position = WalTypes.{gf = WalTypes.SUBJ; role = ""; role_attr = ""; sel_prefs = []; | |
40 | - cr = []; ce = []; dir = WalTypes.Both; morfs = []}; (* FIXME *) | |
41 | - attrs=List.map (function Xml.Element("attr",["label",e],[t]) -> e,lt_of_xml t | _ -> failwith "lt_of_xml") attrs; | |
42 | - args=lt_of_xml args;} | |
43 | - | Xml.Element("tuple",[],l) -> Tuple(List.map lt_of_xml l) | |
44 | - | Xml.Element("val",[],[Xml.PCData s]) -> Val s | |
45 | - | Xml.Element("variants",["label",e],l) -> Variant(e,List.map (function Xml.Element("variant",["id",i],[t]) -> i, lt_of_xml t | _ -> failwith "lt_of_xml") l) | |
46 | - | Xml.Element("dot",[],[]) -> Dot | |
47 | - | Xml.Element("ref",["id",i],[]) -> Ref(int_of_string i) | |
48 | - | xml -> print_endline (Xml.to_string_fmt xml); failwith "lt_of_xml" | |
49 | - | |
50 | -let graph_of_xml xml = | |
51 | - let establish_indexs graph = | |
52 | - let max = Xlist.fold graph 0 (fun acc (n, _) -> if n > acc then n else acc) in | |
53 | - let table = Array.make (max+1) Dot in | |
54 | - Xlist.iter graph (fun (n,x) -> table.(n) <- x); table in | |
55 | - match xml with | |
56 | - Xml.Element("graph",[],l) -> | |
57 | - establish_indexs @@ List.map (function Xml.Element("graph_node",["id",i],[xml]) -> int_of_string i, lt_of_xml xml | _ -> failwith "graph_of_xml") l | |
58 | - | _ -> failwith "graph_of_xml" | |
59 | - | |
60 | -let print_xml path name xml = | |
61 | - let graph = graph_of_xml xml in | |
62 | - Visualization.print_dependency_tree path name graph | |
63 | - | |
64 | -let load_and_print_xml path name filename = | |
65 | - print_xml path name @@ Xml.parse_file filename | |
66 | - | |
67 | -(*let _ = | |
68 | - load_and_print_xml "xml_test/" "test1.0" "xml_test/sentence1.0.xml"*) |
corpora/conllParser.ml
... | ... | @@ -22,7 +22,7 @@ open Types |
22 | 22 | |
23 | 23 | let skladnica_zaleznosciowa_filename = "../../NLP resources/skladnica_zaleznosciowa.conll" |
24 | 24 | |
25 | -let oc = open_out "../corpora/info_sentences.txt" | |
25 | +let oc = open_out @@ resource_path ^ "/info_sentences.txt" | |
26 | 26 | |
27 | 27 | let empty_token = { c_id = 0; c_orth = ""; c_lemma = ""; c_cat = ""; |
28 | 28 | c_interp = []; c_super = 0; c_label = ""; c_beg = 0; c_len = 0} |
... | ... |
corpora/depTree.ml deleted
1 | -open Xstd | |
2 | -open PreTypes | |
3 | - | |
4 | -let tuple_it taglist = | |
5 | - match List.length taglist with | |
6 | - 0 -> Xml.Element("dot",[],[]) | |
7 | - | 1 -> List.hd taglist | |
8 | - | _ -> Xml.Element("tuple",[],taglist) | |
9 | - | |
10 | -let get_amorf_basic token_r = "empty" (* FIXME *) | |
11 | - | |
12 | -let get_amorf token_r = "empty" (* FIXME *) | |
13 | - | |
14 | -let get_vals token_r cat interp = get_amorf_basic token_r :: | |
15 | - match cat with | |
16 | - "subst" -> List.rev ("ter" :: (List.rev interp)) | |
17 | - | _ -> interp (* FIXME *) | |
18 | - | |
19 | -let get_basic_attrs token_r = ["A","a";"B","b"] (* FIXME *) | |
20 | - | |
21 | -let get_attrs token_r = | |
22 | - let attrs = get_basic_attrs token_r in | |
23 | - List.map (fun (label, value) -> | |
24 | - Xml.Element("attr",["label",label],[ | |
25 | - Xml.Element("val",[],[Xml.PCData value])])) attrs | |
26 | - | |
27 | -let xml_of_gs token_r cat interp = | |
28 | - let vals = get_vals token_r cat interp in (** **) | |
29 | - let vals = List.map (fun x -> Xml.Element("val",[],[Xml.PCData x])) vals in | |
30 | - Xml.Element("gs",[],[tuple_it vals]) | |
31 | - | |
32 | -let xml_of_agf token_r = Xml.Element("agf",[],[Xml.PCData token_r.conll_label]) | |
33 | - | |
34 | -let xml_of_amorf token_r = Xml.Element("amorf",[],[Xml.PCData (get_amorf token_r)]) | |
35 | - | |
36 | -let xml_of_attrs token_r = Xml.Element("attrs",[],get_attrs token_r) (* FIXME *) | |
37 | - | |
38 | -let xml_of_args token_rs token_r = | |
39 | - let children = List.filter (fun pom -> pom.conll_super = token_r.conll_id) token_rs in | |
40 | - let children_to_graph = List.map (fun pom -> | |
41 | - Xml.Element("ref",["id", pom.conll_id],[])) children in | |
42 | - Xml.Element("args",[],[tuple_it children_to_graph]) | |
43 | - | |
44 | -let xml_of_token_r token_rs token_r = | |
45 | - let pred, cat, interp = match token_r.token with | |
46 | - | Lemma(a,b,c) -> a, b, Xlist.map (List.hd c) (fun x -> List.hd x) | |
47 | - | _ -> failwith ("xml_of_token_r: not Lemma") in | |
48 | - Xml.Element("graph_node",["id", token_r.conll_id],[ | |
49 | - Xml.Element("node",["pred",pred;"cat",cat;"weight","0";"id", token_r.conll_id], | |
50 | - (xml_of_gs token_r cat interp) :: (** **) | |
51 | - (xml_of_agf token_r) :: | |
52 | - (xml_of_amorf token_r) :: (** **) | |
53 | - (xml_of_attrs token_r) :: (** **) | |
54 | - [xml_of_args token_rs token_r] | |
55 | - ) ]) | |
56 | - | |
57 | -let conll_to_xml token_rs = | |
58 | - Xml.Element("graph",[],List.map (xml_of_token_r token_rs) token_rs) | |
59 | - | |
60 | - | |
61 | -(***************************************************************************************************) | |
62 | - | |
63 | -let get_info i = function | |
64 | - AltText[Raw,RawText text1;CONLL,StructText([StructParagraph[ | |
65 | - {pid = id; pbeg = beg; plen = len; psentence = | |
66 | - AltSentence[Raw, RawSentence text2; CONLL, StructSentence(_,token_rs,-1)]}]],-1)] -> token_rs, id | |
67 | - | StructText([StructParagraph[{pid = id; pbeg = -1; plen = -1; psentence = | |
68 | - StructSentence(_,token_rs,-1)}]],-1) -> token_rs, "id_not_found" ^ (string_of_int i) | |
69 | - | _ -> failwith "get_info" | |
70 | - | |
71 | -let print_corpus filename = | |
72 | - let corpus = File.file_in filename (fun file -> CONLL.match_corpus (CONLL.load_corpus file)) in | |
73 | - List.mapi (fun i x -> | |
74 | - let token_rs, id = get_info i x in | |
75 | - let xml = conll_to_xml token_rs in | |
76 | - let id = Str.global_replace (Str.regexp "/") "_" id in | |
77 | - let oc = open_out ("xml_test/"^id^".xml") in | |
78 | - output_string oc (Xml.to_string_fmt xml); | |
79 | - flush oc; | |
80 | - XmlPrinter.print_xml "xml_test/" id xml) corpus | |
81 | - | |
82 | -(*let _ = | |
83 | - print_corpus "xml_test/sentence1.conll"*) |
corpora/generate.ml
corpora/makefile
... | ... | @@ -4,17 +4,37 @@ OCAMLDEP=ocamldep |
4 | 4 | INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I +eniam |
5 | 5 | OCAMLFLAGS=$(INCLUDES) -g |
6 | 6 | OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa eniam-integration.cmxa eniam-lcg-parser.cmxa eniam-lcg-lexicon.cmxa eniam-lexSemantics.cmxa |
7 | +INSTALLDIR=`ocamlc -where`/eniam | |
7 | 8 | |
8 | -MODS= ../pre/walTypes.ml ../pre/preTypes.ml types.ml CONLL.ml | |
9 | +SOURCES= types.ml CONLL.ml CONLL_adapter.ml resources.ml conllParser.ml interpsInCorpus.ml generate.ml | |
9 | 10 | |
10 | -all: | |
11 | - $(OCAMLOPT) -o generate $(OCAMLOPTFLAGS) $(MODS) resources.ml conllParser.ml interpsInCorpus.ml generate.ml | |
11 | +all: eniam-corpora.cma eniam-corpora.cmxa freq_test | |
12 | + $(OCAMLOPT) -o generate $(OCAMLOPTFLAGS) $(SOURCES) | |
12 | 13 | |
13 | -lib: | |
14 | - $(OCAMLOPT) -linkall -a -o corpora.cmxa $(INCLUDES) $(MODS) | |
14 | +install: all | |
15 | + mkdir -p $(INSTALLDIR) | |
16 | + cp eniam-corpora.cmxa eniam-corpora.a eniam-corpora.cma $(INSTALLDIR) | |
17 | + cp types.cmi CONLL.cmi CONLL_adapter.cmi resources.cmi conllParser.cmi interpsInCorpus.cmi generate.cmi $(INSTALLDIR) | |
18 | + cp types.cmx CONLL.cmx CONLL_adapter.cmx resources.cmx conllParser.cmx interpsInCorpus.cmx generate.cmx $(INSTALLDIR) | |
19 | + mkdir -p /usr/share/eniam/corpora | |
20 | + cp info_sentences* /usr/share/eniam/corpora | |
21 | + | |
22 | +install-local: all | |
23 | + mkdir -p $(INSTALLDIR) | |
24 | + cp eniam-corpora.cmxa eniam-corpora.a eniam-corpora.cma $(INSTALLDIR) | |
25 | + cp types.cmi CONLL.cmi CONLL_adapter.cmi resources.cmi conllParser.cmi interpsInCorpus.cmi generate.cmi $(INSTALLDIR) | |
26 | + cp types.cmx CONLL.cmx CONLL_adapter.cmx resources.cmx conllParser.cmx interpsInCorpus.cmx generate.cmx $(INSTALLDIR) | |
27 | + mkdir -p /usr/local/share/eniam/corpora | |
28 | + cp info_sentences* /usr/local/share/eniam/corpora | |
29 | + | |
30 | +eniam-corpora.cma: $(SOURCES) | |
31 | + ocamlc -linkall -a -o eniam-corpora.cma $(OCAMLFLAGS) $^ | |
32 | + | |
33 | +eniam-corpora.cmxa: $(SOURCES) | |
34 | + $(OCAMLOPT) -linkall -a -o eniam-corpora.cmxa $(INCLUDES) $(SOURCES) | |
15 | 35 | |
16 | 36 | freq_test: |
17 | - $(OCAMLOPT) -o freq_test $(OCAMLOPTFLAGS) $(MODS) freq_test.ml | |
37 | + $(OCAMLOPT) -o freq_test $(OCAMLOPTFLAGS) $(SOURCES) freq_test.ml | |
18 | 38 | |
19 | 39 | test: CONLL.ml CONLL_adapter.ml test_conll.ml |
20 | 40 | mkdir -p results |
... | ... |
corpora/resources.ml
... | ... | @@ -97,7 +97,7 @@ let conll_info () = Xlist.fold (data_conll ()) InfoMap.empty |
97 | 97 | (fun map sentence -> InfoMap.add (List.map (fun token -> token.c_orth) sentence.s_tokens) sentence map) |
98 | 98 | |
99 | 99 | let info_file () = |
100 | - let oc = open_out "../corpora/info_sentences2.txt" in | |
100 | + let oc = open_out @@ resource_path ^ "/info_sentences2.txt" in | |
101 | 101 | List.iter (fun (key, sentence) -> |
102 | 102 | output_string oc (sentence.s_id^"\n"^sentence.s_text^"\n"^(String.concat " " key)^"\n\n"); |
103 | 103 | flush oc) (InfoMap.bindings (conll_info())) |
... | ... |
corpora/test_conll.ml
... | ... | @@ -207,26 +207,34 @@ let process_conll_corpus filename = |
207 | 207 | let corpus = File.file_in filename (fun file -> CONLL.match_corpus (ENIAM_CONLL.load_corpus file)) in |
208 | 208 | print_endline "process_conll_corpus"; |
209 | 209 | (* let corpus = [List.hd corpus] in *) |
210 | - Xlist.iter corpus (fun query -> | |
211 | - let id = process_id (get_query_id query) in | |
212 | - let path = "results/" ^ id ^ "/" in | |
213 | - ignore (Sys.command ("mkdir -p " ^ path)); | |
214 | - match query with | |
215 | - | AltText[Raw,RawText query;CONLL,StructText[ | |
216 | - StructParagraph[{sentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence dep_paths]} as p]]],tokens -> | |
217 | - print_endline ("\n" ^ text ^ "\n"); | |
218 | - (* let m_dep_paths = Array.map (fun (id,_,_) -> id,-1,"") dep_paths in *) | |
219 | - let conll = StructParagraph[{p with sentence = AltSentence([Raw, RawSentence text; CONLL, DepSentence dep_paths] | |
220 | - (*@ if Paths.config.Paths.mate_parser_enabled then [Mate, DepSentence m_dep_paths] else []*))}] in | |
221 | - let text,tokens = ENIAMsubsyntax.parse_text_tokens tokens query in | |
222 | - let sentences = match text with | |
223 | - AltText[Raw,RawText _; Struct,StructText[AltParagraph[Raw,RawParagraph _; Struct,StructParagraph sentences]]] -> sentences | |
224 | - | _ -> failwith "process_conll_corpus 1" in | |
225 | - let text = AltText[Raw,RawText query; Struct, StructText([ | |
226 | - AltParagraph[Raw,RawParagraph query; ENIAM, StructParagraph sentences; CONLL, conll]])] in | |
227 | - let lex_sems = ENIAMlexSemantics.assign tokens text in | |
228 | - ignore(parse_text id 1 tokens lex_sems text) | |
229 | - | _ -> failwith "process_conll_corpus 2") | |
210 | + Xlist.iter corpus (fun query -> try | |
211 | + let id = process_id (get_query_id query) in | |
212 | + let path = "results/" ^ id ^ "/" in | |
213 | + ignore (Sys.command ("mkdir -p " ^ path)); | |
214 | + match query with | |
215 | + | AltText[Raw,RawText query;CONLL,StructText[ | |
216 | + StructParagraph[{sentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence dep_paths]} as p]]],tokens -> | |
217 | + print_endline ("\nPróba sparsowania zdania:\n" ^ text ^ "\n"); | |
218 | + (* let m_dep_paths = Array.map (fun (id,_,_) -> id,-1,"") dep_paths in *) | |
219 | + let conll = StructParagraph[{p with sentence = AltSentence([Raw, RawSentence text; CONLL, DepSentence dep_paths] | |
220 | + (*@ if Paths.config.Paths.mate_parser_enabled then [Mate, DepSentence m_dep_paths] else []*))}] in | |
221 | + let text,tokens = ENIAMsubsyntax.parse_text_tokens tokens query in | |
222 | + let sentences = match text with | |
223 | + AltText[Raw,RawText _; Struct,StructText[AltParagraph[Raw,RawParagraph _; Struct,StructParagraph sentences]]] -> sentences | |
224 | + | _ -> failwith "process_conll_corpus 1" in | |
225 | + let text = AltText[Raw,RawText query; Struct, StructText([ | |
226 | + AltParagraph[Raw,RawParagraph query; ENIAM, StructParagraph sentences; CONLL, conll]])] in | |
227 | + let lex_sems = ENIAMlexSemantics.assign tokens text in | |
228 | + ignore(parse_text id 1 tokens lex_sems text) | |
229 | + | _ -> failwith "process_conll_corpus 2" | |
230 | + with | |
231 | + Failure e -> print_endline ("Failure " ^ e) | |
232 | + | e -> print_endline (Printexc.get_backtrace () ^ "\n" ^ (Printexc.to_string e))) | |
230 | 233 | |
231 | 234 | let _ = |
232 | - process_conll_corpus "../testy/skladnica-test1-Failure.conll" | |
235 | + Printexc.record_backtrace true; | |
236 | + (* LCGfields.reset (); *) | |
237 | + process_conll_corpus "../../NLP resources/skladnica_zaleznosciowa.conll"; | |
238 | + (* process_conll_corpus "../testy/skladnica-test1.conll"; *) | |
239 | + (* process_conll_corpus "../testy/skladnica-test1-Failure.conll"; *) | |
240 | + (* LCGfields.print_results () *) | |
... | ... |
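The test_conll.ml change above wraps the per-sentence processing in try ... with, printing either the Failure message or the exception together with its backtrace (enabled via Printexc.record_backtrace) instead of aborting the whole corpus run. Below is a standalone sketch of that wrapper; process_item and items are hypothetical stand-ins for the corpus iteration.

(* Sketch: keep iterating over the corpus and print diagnostics for each
   item that raises, mirroring the Failure / backtrace handlers added above. *)
let items = ["ok"; "fails"]

let process_item s =
  if s = "fails" then failwith ("cannot parse: " ^ s)
  else print_endline ("parsed " ^ s)

let () =
  Printexc.record_backtrace true;
  List.iter
    (fun item ->
       try process_item item with
       | Failure e -> print_endline ("Failure " ^ e)
       | e -> print_endline (Printexc.get_backtrace () ^ "\n" ^ Printexc.to_string e))
    items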
corpora/test_conll2.ml deleted
1 | -(* | |
2 | - * ENIAMcorpora is a library that integrates ENIAM with corpora in CONLL format | |
3 | - * Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl> | |
4 | - * Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences | |
5 | - * | |
6 | - * This library is free software: you can redistribute it and/or modify | |
7 | - * it under the terms of the GNU Lesser General Public License as published by | |
8 | - * the Free Software Foundation, either version 3 of the License, or | |
9 | - * (at your option) any later version. | |
10 | - * | |
11 | - * This library is distributed in the hope that it will be useful, | |
12 | - * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | - * GNU Lesser General Public License for more details. | |
15 | - * | |
16 | - * You should have received a copy of the GNU Lesser General Public License | |
17 | - * along with this program. If not, see <http://www.gnu.org/licenses/>. | |
18 | - *) | |
19 | - | |
20 | -open Xstd | |
21 | -open ENIAM_LCGlexiconTypes | |
22 | -open ENIAM_LCGtypes | |
23 | -open ENIAMsubsyntaxTypes | |
24 | - | |
25 | -let rules = ENIAM_LCGlexicon.make_rules false ENIAM_LCGlexiconTypes.rules_filename | |
26 | -let dep_rules = ENIAM_LCGlexicon.make_rules true ENIAM_LCGlexiconTypes.rules_filename | |
27 | - | |
28 | -let examples = [ | |
29 | - (* "Szpak","Szpak śpiewa.";*) | |
30 | - (* "miał","Miałem miał."; *) | |
31 | - (* "Ala","Ala ma kota."; | |
32 | - "Ale","Ale mają kota:"; *) | |
33 | - (* "zima","Szpak frunie zimą.";*) | |
34 | - (* "październik","Kot miauczy w październiku."; *) | |
35 | - (* "Szpak-Kot","Szpak frunie. Kot miauczy."; | |
36 | - "powiedział","Szpak powiedział: „Frunę. Kiszę.”";*) | |
37 | - "teraz","Teraz frunie jakiś szpak."; | |
38 | - "chłopcy","Chłopcy mają ulicę kwiatami."; | |
39 | - (* "arabia","Arabia Saudyjska biegnie.";*) | |
40 | - (* "Tom","Tom idzie."; *) | |
41 | -] | |
42 | - | |
43 | -let clarify_categories senses token = | |
44 | - match token.ENIAMtokenizerTypes.token with | |
45 | - ENIAMtokenizerTypes.Lemma(lemma,pos,interp) -> List.flatten (Xlist.map interp (fun interp -> ENIAMcategoriesPL.clarify_categories false senses (lemma,pos,interp))) | |
46 | - | ENIAMtokenizerTypes.Proper(lemma,pos,interp,_) -> List.flatten (Xlist.map interp (fun interp -> ENIAMcategoriesPL.clarify_categories true senses (lemma,pos,interp))) | |
47 | - | ENIAMtokenizerTypes.Interp lemma -> ENIAMcategoriesPL.clarify_categories false senses (lemma,"interp",[]) | |
48 | - | _ -> [] | |
49 | - | |
50 | -let create_chart tokens lex_sems paths last = | |
51 | - ENIAM_LCGrenderer.reset_variable_numbers (); | |
52 | - let chart = ENIAM_LCGchart.make last in | |
53 | - let chart = Xlist.fold paths chart (fun chart (id,lnode,rnode) -> | |
54 | - let t = ExtArray.get tokens id in | |
55 | - let s = ExtArray.get lex_sems id in | |
56 | - ENIAM_LCGrenderer.reset_variable_names (); | |
57 | - ENIAM_LCGrenderer.add_variable_numbers (); | |
58 | - let cats = clarify_categories ["X"] t in | |
59 | - let l = ENIAM_LCGlexicon.create_entries rules id t.ENIAMtokenizerTypes.orth cats s.ENIAMlexSemanticsTypes.schemata in | |
60 | - ENIAM_LCGchart.add_inc_list chart lnode rnode l 0) in | |
61 | - chart | |
62 | - | |
63 | -let rec split_sons left id right = function | |
64 | - [] -> List.rev (List.sort compare left), List.sort compare right | |
65 | - | x :: l -> if x < id then split_sons (x :: left) id right l else split_sons left id (x :: right) l | |
66 | - | |
67 | -let rec dep_create_rec nodes sons conll_id = | |
68 | - let node = IntMap.find nodes conll_id in | |
69 | - let l = try IntMap.find sons conll_id with Not_found -> [] in | |
70 | - let left,right = split_sons [] conll_id [] l in | |
71 | - (* Printf.printf "dep_create_rec [%s] %d [%s]\n" (String.concat ";" (Xlist.map left string_of_int)) conll_id (String.concat ";" (Xlist.map right string_of_int)); *) | |
72 | - DepNode(conll_id, Xlist.map left (dep_create_rec nodes sons), node, Xlist.map right (dep_create_rec nodes sons)) | |
73 | - | |
74 | -let create_dep_chart tokens lex_sems paths = | |
75 | - let sons = Int.fold 1 (Array.length paths - 1) IntMap.empty (fun sons i -> | |
76 | - let _,super,_ = paths.(i) in | |
77 | - IntMap.add_inc sons super [i] (fun l -> i :: l)) in | |
78 | - let nodes = Int.fold 0 (Array.length paths - 1) IntMap.empty (fun nodes i -> | |
79 | - let id,_,_ = paths.(i) in | |
80 | - let t = ExtArray.get tokens id in | |
81 | - let s = ExtArray.get lex_sems id in | |
82 | - ENIAM_LCGrenderer.reset_variable_names (); | |
83 | - ENIAM_LCGrenderer.add_variable_numbers (); | |
84 | - let cats = clarify_categories ["X"] t in | |
85 | - let l = ENIAM_LCGlexicon.create_entries dep_rules id t.ENIAMtokenizerTypes.orth cats s.ENIAMlexSemanticsTypes.schemata in | |
86 | - IntMap.add nodes i l) in | |
87 | - dep_create_rec nodes sons 0 | |
88 | - | |
89 | -let test_example path id tokens lex_sems paths last = | |
90 | - ENIAM_LCGreductions.reset_variant_label (); | |
91 | - let chart = create_chart tokens lex_sems paths last in | |
92 | - ENIAM_LCGlatexOf.print_chart path (id^"1_chart") "a1" chart; | |
93 | - let chart,references = ENIAM_LCGchart.lazify chart in | |
94 | - ENIAM_LCGlatexOf.print_chart path (id^"2_chart") "a4" chart; | |
95 | - ENIAM_LCGlatexOf.print_references path (id^"2_references") "a4" references; | |
96 | - let chart = ENIAM_LCGchart.parse chart references 30. Sys.time in (* uwaga: niejawna zmiana imperatywna w references *) | |
97 | - ENIAM_LCGlatexOf.print_chart path (id^"3_chart") "a4" chart; | |
98 | - ENIAM_LCGlatexOf.print_references path (id^"3_references") "a4" references; | |
99 | - if ENIAM_LCGchart.is_parsed chart then ( | |
100 | - let term = ENIAM_LCGchart.get_parsed_term chart in | |
101 | - Xlatex.latex_file_out path (id^"4_term") "a4" false (fun file -> | |
102 | - Printf.fprintf file "\\[%s\\]\n" (ENIAM_LCGlatexOf.linear_term 0 term)); | |
103 | - Xlatex.latex_compile_and_clean path (id^"4_term"); | |
104 | - let dependency_tree = ENIAM_LCGreductions.reduce term references in | |
105 | - ENIAM_LCGlatexOf.print_dependency_tree path (id^"4_dependency_tree") "a0" dependency_tree; | |
106 | - if ENIAM_LCGreductions.is_reduced_dependency_tree dependency_tree then ( | |
107 | - ENIAM_LCGreductions.assign_labels dependency_tree; (* uwaga: niejawna zmiana imperatywna w dependency_tree *) | |
108 | - ENIAM_LCGlatexOf.print_dependency_tree path (id^"5_dependency_tree") "a4" dependency_tree; | |
109 | - ENIAM_LCGreductions.remove_cuts dependency_tree; (* uwaga: niejawna zmiana imperatywna w dependency_tree *) | |
110 | - ENIAM_LCGlatexOf.print_dependency_tree path (id^"6_dependency_tree") "a4" dependency_tree; | |
111 | - ENIAM_LCGgraphOf.print_dependency_tree path (id^"6_dependency_tree") dependency_tree; | |
112 | - ENIAM_LCGgraphOf.print_simplified_dependency_tree path (id^"6_simple_dependency_tree") dependency_tree; | |
113 | - ()) | |
114 | - else print_endline "not reduced") | |
115 | - else print_endline "not parsed" | |
116 | - | |
117 | -let test_dep_example path id tokens lex_sems paths = | |
118 | - try | |
119 | - ENIAM_LCGreductions.reset_variant_label (); | |
120 | - let paths = CONLL_adapter.convert_dep_tree id (*first_try*) true paths tokens in | |
121 | - ENIAMsubsyntaxHTMLof.print_dep_sentence path (id^"1_paths") tokens paths; | |
122 | - let chart = create_dep_chart tokens lex_sems paths in | |
123 | - ENIAM_LCGlatexOf.print_dep_chart path (id^"1_chart") "a1" chart; | |
124 | - let chart,references = ENIAM_LCGchart.dep_lazify chart in | |
125 | - ENIAM_LCGlatexOf.print_dep_chart path (id^"2_chart") "a4" chart; | |
126 | - ENIAM_LCGlatexOf.print_references path (id^"2_references") "a4" references; | |
127 | - let chart = ENIAM_LCGchart.dep_parse chart references 30. Sys.time in (* uwaga: niejawna zmiana imperatywna w references *) | |
128 | - (* ENIAM_LCGlatexOf.print_chart path (id^"3_chart") "a4" chart; *) | |
129 | - ENIAM_LCGlatexOf.print_references path (id^"3_references") "a4" references; | |
130 | - if ENIAM_LCGchart.is_dep_parsed chart then ( | |
131 | - let term = ENIAM_LCGchart.get_dep_parsed_term chart in | |
132 | - Xlatex.latex_file_out path (id^"4_term") "a4" false (fun file -> | |
133 | - Printf.fprintf file "\\[%s\\]\n" (ENIAM_LCGlatexOf.linear_term 0 term)); | |
134 | - Xlatex.latex_compile_and_clean path (id^"4_term"); | |
135 | - let dependency_tree = ENIAM_LCGreductions.reduce term references in | |
136 | - ENIAM_LCGlatexOf.print_dependency_tree path (id^"4_dependency_tree") "a0" dependency_tree; | |
137 | - if ENIAM_LCGreductions.is_reduced_dependency_tree dependency_tree then ( | |
138 | - ENIAM_LCGreductions.assign_labels dependency_tree; (* uwaga: niejawna zmiana imperatywna w dependency_tree *) | |
139 | - ENIAM_LCGlatexOf.print_dependency_tree path (id^"5_dependency_tree") "a4" dependency_tree; | |
140 | - ENIAM_LCGreductions.remove_cuts dependency_tree; (* uwaga: niejawna zmiana imperatywna w dependency_tree *) | |
141 | - ENIAM_LCGlatexOf.print_dependency_tree path (id^"6_dependency_tree") "a4" dependency_tree; | |
142 | - ENIAM_LCGgraphOf.print_dependency_tree path (id^"6_dependency_tree") dependency_tree; | |
143 | - ENIAM_LCGgraphOf.print_simplified_dependency_tree path (id^"6_simple_dependency_tree") dependency_tree; | |
144 | - ()) | |
145 | - else print_endline "not reduced") | |
146 | - else print_endline "not parsed" | |
147 | - with NotDepParsed(id_ndp,left,l,right) -> ( | |
148 | - print_endline "not parsed 2"; | |
149 | - ENIAM_LCGlatexOf.print_not_parsed_dep_chart path (id^"3_not_parsed_chart") "a2" (id_ndp,left,l,right)) | |
150 | - | |
151 | -let rec parse_sentence name id tokens lex_sems = function | |
152 | - RawSentence s -> id | |
153 | - | StructSentence(paths,last) -> | |
154 | - (* test_example ("results/" ^ name^"/") (string_of_int id ^ "_") tokens lex_sems paths last; *) | |
155 | - id + 1 | |
156 | - | DepSentence(paths) -> | |
157 | - test_dep_example ("results/" ^ name ^ "/") (string_of_int id ^ "_") tokens lex_sems paths; | |
158 | - id + 1 | |
159 | - | QuotedSentences sentences -> | |
160 | - Xlist.fold sentences id (fun id p -> | |
161 | - parse_sentence name id tokens lex_sems p.sentence) | |
162 | - | AltSentence l -> | |
163 | - Xlist.fold l id (fun id (mode,sentence) -> | |
164 | - parse_sentence name id tokens lex_sems sentence) | |
165 | - | |
166 | -let rec parse_paragraph name id tokens lex_sems = function | |
167 | - RawParagraph s -> id | |
168 | - | StructParagraph sentences -> | |
169 | - Xlist.fold sentences id (fun id p -> | |
170 | - parse_sentence name id tokens lex_sems p.sentence) | |
171 | - | AltParagraph l -> | |
172 | - Xlist.fold l id (fun id (mode,paragraph) -> | |
173 | - parse_paragraph name id tokens lex_sems paragraph) | |
174 | - | |
175 | -let rec parse_text name id tokens lex_sems = function | |
176 | - RawText s -> id | |
177 | - | StructText paragraphs -> | |
178 | - Xlist.fold paragraphs id (fun id paragraph -> | |
179 | - parse_paragraph name id tokens lex_sems paragraph) | |
180 | - | AltText l -> | |
181 | - Xlist.fold l id (fun id (mode,text) -> | |
182 | - parse_text name id tokens lex_sems text) | |
183 | - | |
184 | -let id_counter = ref 0 | |
185 | - | |
186 | -let get_id () = | |
187 | - incr id_counter; | |
188 | - "ID_" ^ (string_of_int !id_counter) | |
189 | - | |
190 | -let get_query_id = function | |
191 | - AltText[_;CONLL,StructText[StructParagraph[p]]],_ -> if p.id = "" then get_id () else p.id | |
192 | - | AltText[CONLL,StructText[StructParagraph[p]]],_ -> if p.id = "" then get_id () else p.id | |
193 | - | _ -> failwith "get_query_id" | |
194 | - | |
195 | -let process_id s = | |
196 | - if Xstring.check_prefix "ID_" s then s else | |
197 | - let a,b,c = match Xstring.split_delim "/" s with | |
198 | - [a;b;c] -> a,b,c | |
199 | - | _ -> failwith ("process_id: " ^ s) in | |
200 | - if Xstring.check_prefix "NKJP_1M_" a && Xstring.check_prefix "morph_" b && Xstring.check_sufix "-p" b && | |
201 | - Xstring.check_prefix "morph_" c && Xstring.check_sufix "-s" c then | |
202 | - Xstring.cut_prefix "NKJP_1M_" a ^ "." ^ Xstring.cut_sufix "-s" (Xstring.cut_prefix "morph_" c) | |
203 | - else failwith ("process_id: " ^ s) | |
204 | - | |
205 | -let process_conll_corpus filename = | |
206 | - let corpus = File.file_in filename (fun file -> CONLL.match_corpus (ENIAM_CONLL.load_corpus file)) in | |
207 | - print_endline "process_conll_corpus"; | |
208 | - let corpus = [List.hd corpus] in | |
209 | - Xlist.iter corpus (fun query -> | |
210 | - let id = process_id (get_query_id query) in | |
211 | - let path = "results/" ^ id ^ "/" in | |
212 | - ignore (Sys.command ("mkdir -p " ^ path)); | |
213 | - match query with | |
214 | - | AltText[Raw,RawText query;CONLL,StructText[ | |
215 | - StructParagraph[{sentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence dep_paths]} as p]]],tokens -> | |
216 | - (* let m_dep_paths = Array.map (fun (id,_,_) -> id,-1,"") dep_paths in *) | |
217 | - let conll = StructParagraph[{p with sentence = AltSentence([Raw, RawSentence text; CONLL, DepSentence dep_paths] | |
218 | - (*@ if Paths.config.Paths.mate_parser_enabled then [Mate, DepSentence m_dep_paths] else []*))}] in | |
219 | - let text,tokens = ENIAMsubsyntax.parse_text_tokens tokens query in | |
220 | - let sentences = match text with | |
221 | - AltText[Raw,RawText _; Struct,StructText[AltParagraph[Raw,RawParagraph _; Struct,StructParagraph sentences]]] -> sentences | |
222 | - | _ -> failwith "process_conll_corpus 1" in | |
223 | - let text = AltText[Raw,RawText query; Struct, StructText([ | |
224 | - AltParagraph[Raw,RawParagraph query; ENIAM, StructParagraph sentences; CONLL, conll]])] in | |
225 | - let lex_sems = ENIAMlexSemantics.assign tokens text in | |
226 | - ignore(parse_text id 1 tokens lex_sems text) | |
227 | - | _ -> failwith "process_conll_corpus 2") | |
228 | - | |
229 | -let _ = | |
230 | - process_conll_corpus "../testy/skladnica-test1.conll" |
corpora/types.ml
... | ... | @@ -27,3 +27,10 @@ type conll_sentence = |
27 | 27 | |
28 | 28 | type info_sentence = |
29 | 29 | {i_id:string; i_text:string; i_tokens:string list} |
30 | + | |
31 | +let resource_path = | |
32 | + try Sys.getenv "ENIAM_RESOURCE_PATH" | |
33 | + with Not_found -> | |
34 | + if Sys.file_exists "/usr/share/eniam" then "/usr/share/eniam" else | |
35 | + if Sys.file_exists "/usr/local/share/eniam" then "/usr/local/share/eniam" else | |
36 | + failwith "resource directory does not exist" | |
... | ... |
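The resource_path value added to types.ml above resolves the resource directory from the ENIAM_RESOURCE_PATH environment variable, falling back to the shared system directories, and the conllParser.ml and resources.ml changes build their output paths from it instead of hard-coded relative paths. Below is a minimal sketch of that lookup together with a caller; write_info is a hypothetical helper mirroring how info_sentences.txt is opened.

(* Sketch: resolve the resource directory, then build file paths under it. *)
let resource_path =
  try Sys.getenv "ENIAM_RESOURCE_PATH"
  with Not_found ->
    if Sys.file_exists "/usr/share/eniam" then "/usr/share/eniam"
    else if Sys.file_exists "/usr/local/share/eniam" then "/usr/local/share/eniam"
    else failwith "resource directory does not exist"

let write_info lines =
  let oc = open_out (resource_path ^ "/info_sentences.txt") in
  List.iter (fun l -> output_string oc (l ^ "\n")) lines;
  close_out oc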
testy/skladnica-test1-Failure.conll
1 | +1 - - interp interp _ 3 punct _ _ | |
2 | +2 Panowie pan subst subst pl|nom|m1 3 subj _ _ | |
3 | +3 przyszli przyjść praet praet pl|m1|perf 0 pred _ _ | |
4 | +4 . . interp interp _ 3 punct _ _ | |
5 | + | |
6 | +1 O o prep prep loc 12 comp _ _ | |
7 | +2 klasztornym klasztorny adj adj sg|loc|n|pos 3 adjunct _ _ | |
8 | +3 piekle piekło subst subst sg|loc|n 1 comp _ _ | |
9 | +4 , , interp interp _ 3 punct _ _ | |
10 | +5 zgotowanym zgotować ppas ppas sg|loc|n|perf|aff 3 adjunct _ _ | |
11 | +6 przez przez prep prep acc|nwok 5 comp_ag _ _ | |
12 | +7 trzy trzy num num pl|acc|m2|congr 6 comp _ _ | |
13 | +8 potwory potwór subst subst pl|acc|m2 7 comp _ _ | |
14 | +9 w w prep prep loc|nwok 8 adjunct _ _ | |
15 | +10 habitach habit subst subst pl|loc|m3 9 comp _ _ | |
16 | +11 , , interp interp _ 3 punct _ _ | |
17 | +12 pisali pisać praet praet pl|m1|imperf 0 pred _ _ | |
18 | +13 śmy być aglt aglt pl|pri|imperf|nwok 12 aglt _ _ | |
19 | +14 w w prep prep loc|nwok 12 adjunct _ _ | |
20 | +15 kwietniu kwiecień subst subst sg|loc|m3 14 comp _ _ | |
21 | +16 br bieżący_rok brev brev pun 15 ne _ _ | |
22 | +17 . . interp interp _ 12 punct _ _ | |
23 | + | |
1 | 24 | 1 Następnie następnie adv adv _ 2 adjunct _ _ |
2 | 25 | 2 rozłożyła rozłożyć praet praet sg|f|perf 10 conjunct _ _ |
3 | 26 | 3 wysoki wysoki adj adj sg|acc|m3|pos 4 adjunct _ _ |
... | ... |