Commit c420af232462c36c07b7ddbfae7daab85645724d

Authored by Daniel Oklesiński
1 parent 9fe1accb

improve test_conll - print diagnostic data

LCGparser/ENIAM_LCGrenderer.ml
... ... @@ -164,7 +164,7 @@ let rec make_raised_term_imp inner_node outer_node arg_symbol = function
164 164 | Tensor l ->
165 165 if outer_node.lemma="" then inner_node else
166 166 Node (add_args outer_node [Cut(SetAttr("ARG_SYMBOL",arg_symbol,inner_node))])
167   - | _ -> failwith "make_raised_term_imp"
  167 + | c -> (print_endline (ENIAM_LCGstringOf.grammar_symbol 0 c); failwith "make_raised_term_imp")
168 168  
169 169 let is_raised = function
170 170 [_,Imp(_,_,_)] -> true
... ...
corpora/README 0 → 100644
  1 +ENIAMcorpora Version 1.0 :
  2 +-----------------------
  3 +
  4 +ENIAMcorpora is a library that
  5 +- parses corpora into CONLL format;
  6 +- converts dependency structures;
  7 +- tests the results of conversion.
  8 +
  9 +Install
  10 +-------
  11 +
  12 +ENIAMcorpora requires the OCaml compiler version 4.02.3
  13 +together with the Xlib library version 3.2 or later,
  14 +the ENIAMtokenizer library version 1.1, the ENIAMmorphology library version 1.1,
  15 +the ENIAMsubsyntax library version 1.1, the ENIAMintegration library version 1.0,
  16 +the ENIAM_LCGparser library version 2.0, the ENIAM_LCGlexicon library version 1.0,
  17 +and the ENIAMsemValence library version 1.0.
  18 +
  19 +In order to install, type:
  20 +
  21 +make install
  22 +
  23 +By default, ENIAMcorpora is installed in the 'ocamlc -where'/eniam directory.
  24 +You can change this by editing the Makefile.
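     +For example, to install under a different prefix, the INSTALLDIR line of the
     +Makefile could be changed as follows (the path is only an illustration):
     +
     +# example path only - adjust to your setup
     +INSTALLDIR=/usr/local/lib/eniam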
  25 +
  26 +In order to test the library, type (graphviz must be installed):
  27 +make test
  28 +./test
  29 +
  30 +By default ENIAMcorpora looks for resources in the /usr/share/eniam directory.
  31 +However, this behaviour may be changed by setting and exporting the
  32 +ENIAM_RESOURCE_PATH environment variable.
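     +For example (the path below is only an illustration; point it at the
     +directory that actually holds the resources):
     +
     +# example location only
     +export ENIAM_RESOURCE_PATH=$HOME/eniam_resources
     +./test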
  33 +
  34 +Credits
  35 +-------
  36 +Copyright © 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
  37 +Copyright © 2016 Daniel Oklesinski <oklesinski dot daniel atSPAMfree gmail dot com>
  38 +Copyright © 2016 Institute of Computer Science Polish Academy of Sciences
  39 +
  40 +The library uses the following licensed resources:
  41 +
  42 +NKJP1M: the manually annotated 1-million word subcorpus sampled
  43 +from texts of a subset of the National Corpus of Polish.
  44 +version 1.2
  45 +
  46 +SGJP: Grammatical Dictionary of Polish, version 20151020
  47 +Copyright © 2007–2015 Zygmunt Saloni, Włodzimierz Gruszczyński, Marcin
  48 +Woliński, Robert Wołosz, Danuta Skowrońska
  49 +
  50 +Licence
  51 +-------
  52 +
  53 +This library is free software: you can redistribute it and/or modify
  54 +it under the terms of the GNU Lesser General Public License as published by
  55 +the Free Software Foundation, either version 3 of the License, or
  56 +(at your option) any later version.
  57 +
  58 +This library is distributed in the hope that it will be useful,
  59 +but WITHOUT ANY WARRANTY; without even the implied warranty of
  60 +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  61 +GNU Lesser General Public License for more details.
  62 +
  63 +You should have received a copy of the GNU Lesser General Public License
  64 +along with this program. If not, see <http://www.gnu.org/licenses/>.
... ...
corpora/XmlPrinter.ml deleted
1   -open Xstd
2   -open WalTypes
3   -open LCGtypes
4   -
5   -let gf_of_string = function
6   - "subj" -> SUBJ
7   - | "obj" -> OBJ
8   - | "arg"(*""*) -> ARG
9   - | "core" -> CORE
10   - | "nosem" -> NOSEM
11   - | "nogf" -> NOGF
12   - | "adjunct" -> ADJUNCT
13   - | "raised" -> RAISED
14   - | "clause" -> CLAUSE
15   - | "sentence" -> SENTENCE
16   - | s -> prerr_endline s; SUBJ
17   -(* | s -> failwith ("gf_of_string:" ^ s) *)
18   -
19   -(*let morf_of_string s =
20   - let s = Str.split (Str.regexp "[()]") s in
21   - WalParser.parse_morf_single (List.hd s, List.tl s)*)
22   -
23   -let rec lt_of_xml = function
24   - Xml.Element("node",["pred",pred;"cat",cat;"weight",weight;"id",id],[
25   - Xml.Element("gs",[],[gs]);
26   - Xml.Element("agf",[],[Xml.PCData agf]);
27   - Xml.Element("amorf",[],[amorf]);
28   - Xml.Element("attrs",[],attrs);
29   - Xml.Element("args",[],[args])]) ->
30   - Node{pred=pred; cat=cat; weight=float_of_string weight; id=int_of_string id;
31   - gs = lt_of_xml gs;
32   - agf = gf_of_string agf; (* FIXME *)
33   - amorf = WalTypes.Phrase(WalTypes.Null); (* FIXME *)
34   - arole = ""; (* FIXME *)
35   - arole_attr = ""; (* FIXME *)
36   - meaning = ""; (* FIXME *)
37   - hipero = StringSet.empty; (* FIXME *)
38   - meaning_weight = -1.; (* FIXME *)
39   - position = WalTypes.{gf = WalTypes.SUBJ; role = ""; role_attr = ""; sel_prefs = [];
40   - cr = []; ce = []; dir = WalTypes.Both; morfs = []}; (* FIXME *)
41   - attrs=List.map (function Xml.Element("attr",["label",e],[t]) -> e,lt_of_xml t | _ -> failwith "lt_of_xml") attrs;
42   - args=lt_of_xml args;}
43   - | Xml.Element("tuple",[],l) -> Tuple(List.map lt_of_xml l)
44   - | Xml.Element("val",[],[Xml.PCData s]) -> Val s
45   - | Xml.Element("variants",["label",e],l) -> Variant(e,List.map (function Xml.Element("variant",["id",i],[t]) -> i, lt_of_xml t | _ -> failwith "lt_of_xml") l)
46   - | Xml.Element("dot",[],[]) -> Dot
47   - | Xml.Element("ref",["id",i],[]) -> Ref(int_of_string i)
48   - | xml -> print_endline (Xml.to_string_fmt xml); failwith "lt_of_xml"
49   -
50   -let graph_of_xml xml =
51   - let establish_indexs graph =
52   - let max = Xlist.fold graph 0 (fun acc (n, _) -> if n > acc then n else acc) in
53   - let table = Array.make (max+1) Dot in
54   - Xlist.iter graph (fun (n,x) -> table.(n) <- x); table in
55   - match xml with
56   - Xml.Element("graph",[],l) ->
57   - establish_indexs @@ List.map (function Xml.Element("graph_node",["id",i],[xml]) -> int_of_string i, lt_of_xml xml | _ -> failwith "graph_of_xml") l
58   - | _ -> failwith "graph_of_xml"
59   -
60   -let print_xml path name xml =
61   - let graph = graph_of_xml xml in
62   - Visualization.print_dependency_tree path name graph
63   -
64   -let load_and_print_xml path name filename =
65   - print_xml path name @@ Xml.parse_file filename
66   -
67   -(*let _ =
68   - load_and_print_xml "xml_test/" "test1.0" "xml_test/sentence1.0.xml"*)
corpora/conllParser.ml
... ... @@ -22,7 +22,7 @@ open Types
22 22  
23 23 let skladnica_zaleznosciowa_filename = "../../NLP resources/skladnica_zaleznosciowa.conll"
24 24  
25   -let oc = open_out "../corpora/info_sentences.txt"
  25 +let oc = open_out @@ resource_path ^ "/info_sentences.txt"
26 26  
27 27 let empty_token = { c_id = 0; c_orth = ""; c_lemma = ""; c_cat = "";
28 28 c_interp = []; c_super = 0; c_label = ""; c_beg = 0; c_len = 0}
... ...
corpora/depTree.ml deleted
1   -open Xstd
2   -open PreTypes
3   -
4   -let tuple_it taglist =
5   - match List.length taglist with
6   - 0 -> Xml.Element("dot",[],[])
7   - | 1 -> List.hd taglist
8   - | _ -> Xml.Element("tuple",[],taglist)
9   -
10   -let get_amorf_basic token_r = "empty" (* FIXME *)
11   -
12   -let get_amorf token_r = "empty" (* FIXME *)
13   -
14   -let get_vals token_r cat interp = get_amorf_basic token_r ::
15   - match cat with
16   - "subst" -> List.rev ("ter" :: (List.rev interp))
17   - | _ -> interp (* FIXME *)
18   -
19   -let get_basic_attrs token_r = ["A","a";"B","b"] (* FIXME *)
20   -
21   -let get_attrs token_r =
22   - let attrs = get_basic_attrs token_r in
23   - List.map (fun (label, value) ->
24   - Xml.Element("attr",["label",label],[
25   - Xml.Element("val",[],[Xml.PCData value])])) attrs
26   -
27   -let xml_of_gs token_r cat interp =
28   - let vals = get_vals token_r cat interp in (** **)
29   - let vals = List.map (fun x -> Xml.Element("val",[],[Xml.PCData x])) vals in
30   - Xml.Element("gs",[],[tuple_it vals])
31   -
32   -let xml_of_agf token_r = Xml.Element("agf",[],[Xml.PCData token_r.conll_label])
33   -
34   -let xml_of_amorf token_r = Xml.Element("amorf",[],[Xml.PCData (get_amorf token_r)])
35   -
36   -let xml_of_attrs token_r = Xml.Element("attrs",[],get_attrs token_r) (* FIXME *)
37   -
38   -let xml_of_args token_rs token_r =
39   - let children = List.filter (fun pom -> pom.conll_super = token_r.conll_id) token_rs in
40   - let children_to_graph = List.map (fun pom ->
41   - Xml.Element("ref",["id", pom.conll_id],[])) children in
42   - Xml.Element("args",[],[tuple_it children_to_graph])
43   -
44   -let xml_of_token_r token_rs token_r =
45   - let pred, cat, interp = match token_r.token with
46   - | Lemma(a,b,c) -> a, b, Xlist.map (List.hd c) (fun x -> List.hd x)
47   - | _ -> failwith ("xml_of_token_r: not Lemma") in
48   - Xml.Element("graph_node",["id", token_r.conll_id],[
49   - Xml.Element("node",["pred",pred;"cat",cat;"weight","0";"id", token_r.conll_id],
50   - (xml_of_gs token_r cat interp) :: (** **)
51   - (xml_of_agf token_r) ::
52   - (xml_of_amorf token_r) :: (** **)
53   - (xml_of_attrs token_r) :: (** **)
54   - [xml_of_args token_rs token_r]
55   - ) ])
56   -
57   -let conll_to_xml token_rs =
58   - Xml.Element("graph",[],List.map (xml_of_token_r token_rs) token_rs)
59   -
60   -
61   -(***************************************************************************************************)
62   -
63   -let get_info i = function
64   - AltText[Raw,RawText text1;CONLL,StructText([StructParagraph[
65   - {pid = id; pbeg = beg; plen = len; psentence =
66   - AltSentence[Raw, RawSentence text2; CONLL, StructSentence(_,token_rs,-1)]}]],-1)] -> token_rs, id
67   - | StructText([StructParagraph[{pid = id; pbeg = -1; plen = -1; psentence =
68   - StructSentence(_,token_rs,-1)}]],-1) -> token_rs, "id_not_found" ^ (string_of_int i)
69   - | _ -> failwith "get_info"
70   -
71   -let print_corpus filename =
72   - let corpus = File.file_in filename (fun file -> CONLL.match_corpus (CONLL.load_corpus file)) in
73   - List.mapi (fun i x ->
74   - let token_rs, id = get_info i x in
75   - let xml = conll_to_xml token_rs in
76   - let id = Str.global_replace (Str.regexp "/") "_" id in
77   - let oc = open_out ("xml_test/"^id^".xml") in
78   - output_string oc (Xml.to_string_fmt xml);
79   - flush oc;
80   - XmlPrinter.print_xml "xml_test/" id xml) corpus
81   -
82   -(*let _ =
83   - print_corpus "xml_test/sentence1.conll"*)
corpora/generate.ml
... ... @@ -29,4 +29,5 @@ let _ =
29 29  
30 30 (* Generowanie pliku ../../NLP resources/krzaki_interp_statistics.txt na podstawie krzaków *)
31 31 let _ =
32   - InterpsInCorpus.print_diagnose ()
  32 + (* InterpsInCorpus.print_diagnose () *)
  33 + ()
... ...
corpora/makefile
... ... @@ -4,17 +4,37 @@ OCAMLDEP=ocamldep
4 4 INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I +eniam
5 5 OCAMLFLAGS=$(INCLUDES) -g
6 6 OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa eniam-integration.cmxa eniam-lcg-parser.cmxa eniam-lcg-lexicon.cmxa eniam-lexSemantics.cmxa
  7 +INSTALLDIR=`ocamlc -where`/eniam
7 8  
8   -MODS= ../pre/walTypes.ml ../pre/preTypes.ml types.ml CONLL.ml
  9 +SOURCES= types.ml CONLL.ml CONLL_adapter.ml resources.ml conllParser.ml interpsInCorpus.ml generate.ml
9 10  
10   -all:
11   - $(OCAMLOPT) -o generate $(OCAMLOPTFLAGS) $(MODS) resources.ml conllParser.ml interpsInCorpus.ml generate.ml
  11 +all: eniam-corpora.cma eniam-corpora.cmxa freq_test
  12 + $(OCAMLOPT) -o generate $(OCAMLOPTFLAGS) $(SOURCES)
12 13  
13   -lib:
14   - $(OCAMLOPT) -linkall -a -o corpora.cmxa $(INCLUDES) $(MODS)
  14 +install: all
  15 + mkdir -p $(INSTALLDIR)
  16 + cp eniam-corpora.cmxa eniam-corpora.a eniam-corpora.cma $(INSTALLDIR)
  17 + cp types.cmi CONLL.cmi CONLL_adapter.cmi resources.cmi conllParser.cmi interpsInCorpus.cmi generate.cmi $(INSTALLDIR)
  18 + cp types.cmx CONLL.cmx CONLL_adapter.cmx resources.cmx conllParser.cmx interpsInCorpus.cmx generate.cmx $(INSTALLDIR)
  19 + mkdir -p /usr/share/eniam/corpora
  20 + cp info_sentences* /usr/share/eniam/corpora
  21 +
  22 +install-local: all
  23 + mkdir -p $(INSTALLDIR)
  24 + cp eniam-corpora.cmxa eniam-corpora.a eniam-corpora.cma $(INSTALLDIR)
  25 + cp types.cmi CONLL.cmi CONLL_adapter.cmi resources.cmi conllParser.cmi interpsInCorpus.cmi generate.cmi $(INSTALLDIR)
  26 + cp types.cmx CONLL.cmx CONLL_adapter.cmx resources.cmx conllParser.cmx interpsInCorpus.cmx generate.cmx $(INSTALLDIR)
  27 + mkdir -p /usr/local/share/eniam/corpora
  28 + cp info_sentences* /usr/local/share/eniam/corpora
  29 +
  30 +eniam-corpora.cma: $(SOURCES)
  31 + ocamlc -linkall -a -o eniam-corpora.cma $(OCAMLFLAGS) $^
  32 +
  33 +eniam-corpora.cmxa: $(SOURCES)
  34 + $(OCAMLOPT) -linkall -a -o eniam-corpora.cmxa $(INCLUDES) $(SOURCES)
15 35  
16 36 freq_test:
17   - $(OCAMLOPT) -o freq_test $(OCAMLOPTFLAGS) $(MODS) freq_test.ml
  37 + $(OCAMLOPT) -o freq_test $(OCAMLOPTFLAGS) $(SOURCES) freq_test.ml
18 38  
19 39 test: CONLL.ml CONLL_adapter.ml test_conll.ml
20 40 mkdir -p results
... ...
corpora/resources.ml
... ... @@ -97,7 +97,7 @@ let conll_info () = Xlist.fold (data_conll ()) InfoMap.empty
97 97 (fun map sentence -> InfoMap.add (List.map (fun token -> token.c_orth) sentence.s_tokens) sentence map)
98 98  
99 99 let info_file () =
100   - let oc = open_out "../corpora/info_sentences2.txt" in
  100 + let oc = open_out @@ resource_path ^ "/info_sentences2.txt" in
101 101 List.iter (fun (key, sentence) ->
102 102 output_string oc (sentence.s_id^"\n"^sentence.s_text^"\n"^(String.concat " " key)^"\n\n");
103 103 flush oc) (InfoMap.bindings (conll_info()))
... ...
corpora/test_conll.ml
... ... @@ -207,26 +207,34 @@ let process_conll_corpus filename =
207 207 let corpus = File.file_in filename (fun file -> CONLL.match_corpus (ENIAM_CONLL.load_corpus file)) in
208 208 print_endline "process_conll_corpus";
209 209 (* let corpus = [List.hd corpus] in *)
210   - Xlist.iter corpus (fun query ->
211   - let id = process_id (get_query_id query) in
212   - let path = "results/" ^ id ^ "/" in
213   - ignore (Sys.command ("mkdir -p " ^ path));
214   - match query with
215   - | AltText[Raw,RawText query;CONLL,StructText[
216   - StructParagraph[{sentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence dep_paths]} as p]]],tokens ->
217   - print_endline ("\n" ^ text ^ "\n");
218   - (* let m_dep_paths = Array.map (fun (id,_,_) -> id,-1,"") dep_paths in *)
219   - let conll = StructParagraph[{p with sentence = AltSentence([Raw, RawSentence text; CONLL, DepSentence dep_paths]
220   - (*@ if Paths.config.Paths.mate_parser_enabled then [Mate, DepSentence m_dep_paths] else []*))}] in
221   - let text,tokens = ENIAMsubsyntax.parse_text_tokens tokens query in
222   - let sentences = match text with
223   - AltText[Raw,RawText _; Struct,StructText[AltParagraph[Raw,RawParagraph _; Struct,StructParagraph sentences]]] -> sentences
224   - | _ -> failwith "process_conll_corpus 1" in
225   - let text = AltText[Raw,RawText query; Struct, StructText([
226   - AltParagraph[Raw,RawParagraph query; ENIAM, StructParagraph sentences; CONLL, conll]])] in
227   - let lex_sems = ENIAMlexSemantics.assign tokens text in
228   - ignore(parse_text id 1 tokens lex_sems text)
229   - | _ -> failwith "process_conll_corpus 2")
  210 + Xlist.iter corpus (fun query -> try
  211 + let id = process_id (get_query_id query) in
  212 + let path = "results/" ^ id ^ "/" in
  213 + ignore (Sys.command ("mkdir -p " ^ path));
  214 + match query with
  215 + | AltText[Raw,RawText query;CONLL,StructText[
  216 + StructParagraph[{sentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence dep_paths]} as p]]],tokens ->
  217 + print_endline ("\nPróba sparsowania zdania:\n" ^ text ^ "\n");
  218 + (* let m_dep_paths = Array.map (fun (id,_,_) -> id,-1,"") dep_paths in *)
  219 + let conll = StructParagraph[{p with sentence = AltSentence([Raw, RawSentence text; CONLL, DepSentence dep_paths]
  220 + (*@ if Paths.config.Paths.mate_parser_enabled then [Mate, DepSentence m_dep_paths] else []*))}] in
  221 + let text,tokens = ENIAMsubsyntax.parse_text_tokens tokens query in
  222 + let sentences = match text with
  223 + AltText[Raw,RawText _; Struct,StructText[AltParagraph[Raw,RawParagraph _; Struct,StructParagraph sentences]]] -> sentences
  224 + | _ -> failwith "process_conll_corpus 1" in
  225 + let text = AltText[Raw,RawText query; Struct, StructText([
  226 + AltParagraph[Raw,RawParagraph query; ENIAM, StructParagraph sentences; CONLL, conll]])] in
  227 + let lex_sems = ENIAMlexSemantics.assign tokens text in
  228 + ignore(parse_text id 1 tokens lex_sems text)
  229 + | _ -> failwith "process_conll_corpus 2"
  230 + with
  231 + Failure e -> print_endline ("Failure " ^ e)
  232 + | e -> print_endline (Printexc.get_backtrace () ^ "\n" ^ (Printexc.to_string e)))
230 233  
231 234 let _ =
232   - process_conll_corpus "../testy/skladnica-test1-Failure.conll"
  235 + Printexc.record_backtrace true;
  236 + (* LCGfields.reset (); *)
  237 + process_conll_corpus "../../NLP resources/skladnica_zaleznosciowa.conll";
  238 + (* process_conll_corpus "../testy/skladnica-test1.conll"; *)
  239 + (* process_conll_corpus "../testy/skladnica-test1-Failure.conll"; *)
  240 + (* LCGfields.print_results () *)
... ...
corpora/test_conll2.ml deleted
1   -(*
2   - * ENIAMcorpora is a library that integrates ENIAM with corpora in CONLL format
3   - * Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
4   - * Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
5   - *
6   - * This library is free software: you can redistribute it and/or modify
7   - * it under the terms of the GNU Lesser General Public License as published by
8   - * the Free Software Foundation, either version 3 of the License, or
9   - * (at your option) any later version.
10   - *
11   - * This library is distributed in the hope that it will be useful,
12   - * but WITHOUT ANY WARRANTY; without even the implied warranty of
13   - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14   - * GNU Lesser General Public License for more details.
15   - *
16   - * You should have received a copy of the GNU Lesser General Public License
17   - * along with this program. If not, see <http://www.gnu.org/licenses/>.
18   - *)
19   -
20   -open Xstd
21   -open ENIAM_LCGlexiconTypes
22   -open ENIAM_LCGtypes
23   -open ENIAMsubsyntaxTypes
24   -
25   -let rules = ENIAM_LCGlexicon.make_rules false ENIAM_LCGlexiconTypes.rules_filename
26   -let dep_rules = ENIAM_LCGlexicon.make_rules true ENIAM_LCGlexiconTypes.rules_filename
27   -
28   -let examples = [
29   - (* "Szpak","Szpak śpiewa.";*)
30   - (* "miał","Miałem miał."; *)
31   - (* "Ala","Ala ma kota.";
32   - "Ale","Ale mają kota:"; *)
33   - (* "zima","Szpak frunie zimą.";*)
34   - (* "październik","Kot miauczy w październiku."; *)
35   - (* "Szpak-Kot","Szpak frunie. Kot miauczy.";
36   - "powiedział","Szpak powiedział: „Frunę. Kiszę.”";*)
37   - "teraz","Teraz frunie jakiś szpak.";
38   - "chłopcy","Chłopcy mają ulicę kwiatami.";
39   - (* "arabia","Arabia Saudyjska biegnie.";*)
40   - (* "Tom","Tom idzie."; *)
41   -]
42   -
43   -let clarify_categories senses token =
44   - match token.ENIAMtokenizerTypes.token with
45   - ENIAMtokenizerTypes.Lemma(lemma,pos,interp) -> List.flatten (Xlist.map interp (fun interp -> ENIAMcategoriesPL.clarify_categories false senses (lemma,pos,interp)))
46   - | ENIAMtokenizerTypes.Proper(lemma,pos,interp,_) -> List.flatten (Xlist.map interp (fun interp -> ENIAMcategoriesPL.clarify_categories true senses (lemma,pos,interp)))
47   - | ENIAMtokenizerTypes.Interp lemma -> ENIAMcategoriesPL.clarify_categories false senses (lemma,"interp",[])
48   - | _ -> []
49   -
50   -let create_chart tokens lex_sems paths last =
51   - ENIAM_LCGrenderer.reset_variable_numbers ();
52   - let chart = ENIAM_LCGchart.make last in
53   - let chart = Xlist.fold paths chart (fun chart (id,lnode,rnode) ->
54   - let t = ExtArray.get tokens id in
55   - let s = ExtArray.get lex_sems id in
56   - ENIAM_LCGrenderer.reset_variable_names ();
57   - ENIAM_LCGrenderer.add_variable_numbers ();
58   - let cats = clarify_categories ["X"] t in
59   - let l = ENIAM_LCGlexicon.create_entries rules id t.ENIAMtokenizerTypes.orth cats s.ENIAMlexSemanticsTypes.schemata in
60   - ENIAM_LCGchart.add_inc_list chart lnode rnode l 0) in
61   - chart
62   -
63   -let rec split_sons left id right = function
64   - [] -> List.rev (List.sort compare left), List.sort compare right
65   - | x :: l -> if x < id then split_sons (x :: left) id right l else split_sons left id (x :: right) l
66   -
67   -let rec dep_create_rec nodes sons conll_id =
68   - let node = IntMap.find nodes conll_id in
69   - let l = try IntMap.find sons conll_id with Not_found -> [] in
70   - let left,right = split_sons [] conll_id [] l in
71   - (* Printf.printf "dep_create_rec [%s] %d [%s]\n" (String.concat ";" (Xlist.map left string_of_int)) conll_id (String.concat ";" (Xlist.map right string_of_int)); *)
72   - DepNode(conll_id, Xlist.map left (dep_create_rec nodes sons), node, Xlist.map right (dep_create_rec nodes sons))
73   -
74   -let create_dep_chart tokens lex_sems paths =
75   - let sons = Int.fold 1 (Array.length paths - 1) IntMap.empty (fun sons i ->
76   - let _,super,_ = paths.(i) in
77   - IntMap.add_inc sons super [i] (fun l -> i :: l)) in
78   - let nodes = Int.fold 0 (Array.length paths - 1) IntMap.empty (fun nodes i ->
79   - let id,_,_ = paths.(i) in
80   - let t = ExtArray.get tokens id in
81   - let s = ExtArray.get lex_sems id in
82   - ENIAM_LCGrenderer.reset_variable_names ();
83   - ENIAM_LCGrenderer.add_variable_numbers ();
84   - let cats = clarify_categories ["X"] t in
85   - let l = ENIAM_LCGlexicon.create_entries dep_rules id t.ENIAMtokenizerTypes.orth cats s.ENIAMlexSemanticsTypes.schemata in
86   - IntMap.add nodes i l) in
87   - dep_create_rec nodes sons 0
88   -
89   -let test_example path id tokens lex_sems paths last =
90   - ENIAM_LCGreductions.reset_variant_label ();
91   - let chart = create_chart tokens lex_sems paths last in
92   - ENIAM_LCGlatexOf.print_chart path (id^"1_chart") "a1" chart;
93   - let chart,references = ENIAM_LCGchart.lazify chart in
94   - ENIAM_LCGlatexOf.print_chart path (id^"2_chart") "a4" chart;
95   - ENIAM_LCGlatexOf.print_references path (id^"2_references") "a4" references;
96   - let chart = ENIAM_LCGchart.parse chart references 30. Sys.time in (* uwaga: niejawna zmiana imperatywna w references *)
97   - ENIAM_LCGlatexOf.print_chart path (id^"3_chart") "a4" chart;
98   - ENIAM_LCGlatexOf.print_references path (id^"3_references") "a4" references;
99   - if ENIAM_LCGchart.is_parsed chart then (
100   - let term = ENIAM_LCGchart.get_parsed_term chart in
101   - Xlatex.latex_file_out path (id^"4_term") "a4" false (fun file ->
102   - Printf.fprintf file "\\[%s\\]\n" (ENIAM_LCGlatexOf.linear_term 0 term));
103   - Xlatex.latex_compile_and_clean path (id^"4_term");
104   - let dependency_tree = ENIAM_LCGreductions.reduce term references in
105   - ENIAM_LCGlatexOf.print_dependency_tree path (id^"4_dependency_tree") "a0" dependency_tree;
106   - if ENIAM_LCGreductions.is_reduced_dependency_tree dependency_tree then (
107   - ENIAM_LCGreductions.assign_labels dependency_tree; (* uwaga: niejawna zmiana imperatywna w dependency_tree *)
108   - ENIAM_LCGlatexOf.print_dependency_tree path (id^"5_dependency_tree") "a4" dependency_tree;
109   - ENIAM_LCGreductions.remove_cuts dependency_tree; (* uwaga: niejawna zmiana imperatywna w dependency_tree *)
110   - ENIAM_LCGlatexOf.print_dependency_tree path (id^"6_dependency_tree") "a4" dependency_tree;
111   - ENIAM_LCGgraphOf.print_dependency_tree path (id^"6_dependency_tree") dependency_tree;
112   - ENIAM_LCGgraphOf.print_simplified_dependency_tree path (id^"6_simple_dependency_tree") dependency_tree;
113   - ())
114   - else print_endline "not reduced")
115   - else print_endline "not parsed"
116   -
117   -let test_dep_example path id tokens lex_sems paths =
118   - try
119   - ENIAM_LCGreductions.reset_variant_label ();
120   - let paths = CONLL_adapter.convert_dep_tree id (*first_try*) true paths tokens in
121   - ENIAMsubsyntaxHTMLof.print_dep_sentence path (id^"1_paths") tokens paths;
122   - let chart = create_dep_chart tokens lex_sems paths in
123   - ENIAM_LCGlatexOf.print_dep_chart path (id^"1_chart") "a1" chart;
124   - let chart,references = ENIAM_LCGchart.dep_lazify chart in
125   - ENIAM_LCGlatexOf.print_dep_chart path (id^"2_chart") "a4" chart;
126   - ENIAM_LCGlatexOf.print_references path (id^"2_references") "a4" references;
127   - let chart = ENIAM_LCGchart.dep_parse chart references 30. Sys.time in (* uwaga: niejawna zmiana imperatywna w references *)
128   - (* ENIAM_LCGlatexOf.print_chart path (id^"3_chart") "a4" chart; *)
129   - ENIAM_LCGlatexOf.print_references path (id^"3_references") "a4" references;
130   - if ENIAM_LCGchart.is_dep_parsed chart then (
131   - let term = ENIAM_LCGchart.get_dep_parsed_term chart in
132   - Xlatex.latex_file_out path (id^"4_term") "a4" false (fun file ->
133   - Printf.fprintf file "\\[%s\\]\n" (ENIAM_LCGlatexOf.linear_term 0 term));
134   - Xlatex.latex_compile_and_clean path (id^"4_term");
135   - let dependency_tree = ENIAM_LCGreductions.reduce term references in
136   - ENIAM_LCGlatexOf.print_dependency_tree path (id^"4_dependency_tree") "a0" dependency_tree;
137   - if ENIAM_LCGreductions.is_reduced_dependency_tree dependency_tree then (
138   - ENIAM_LCGreductions.assign_labels dependency_tree; (* uwaga: niejawna zmiana imperatywna w dependency_tree *)
139   - ENIAM_LCGlatexOf.print_dependency_tree path (id^"5_dependency_tree") "a4" dependency_tree;
140   - ENIAM_LCGreductions.remove_cuts dependency_tree; (* uwaga: niejawna zmiana imperatywna w dependency_tree *)
141   - ENIAM_LCGlatexOf.print_dependency_tree path (id^"6_dependency_tree") "a4" dependency_tree;
142   - ENIAM_LCGgraphOf.print_dependency_tree path (id^"6_dependency_tree") dependency_tree;
143   - ENIAM_LCGgraphOf.print_simplified_dependency_tree path (id^"6_simple_dependency_tree") dependency_tree;
144   - ())
145   - else print_endline "not reduced")
146   - else print_endline "not parsed"
147   - with NotDepParsed(id_ndp,left,l,right) -> (
148   - print_endline "not parsed 2";
149   - ENIAM_LCGlatexOf.print_not_parsed_dep_chart path (id^"3_not_parsed_chart") "a2" (id_ndp,left,l,right))
150   -
151   -let rec parse_sentence name id tokens lex_sems = function
152   - RawSentence s -> id
153   - | StructSentence(paths,last) ->
154   - (* test_example ("results/" ^ name^"/") (string_of_int id ^ "_") tokens lex_sems paths last; *)
155   - id + 1
156   - | DepSentence(paths) ->
157   - test_dep_example ("results/" ^ name ^ "/") (string_of_int id ^ "_") tokens lex_sems paths;
158   - id + 1
159   - | QuotedSentences sentences ->
160   - Xlist.fold sentences id (fun id p ->
161   - parse_sentence name id tokens lex_sems p.sentence)
162   - | AltSentence l ->
163   - Xlist.fold l id (fun id (mode,sentence) ->
164   - parse_sentence name id tokens lex_sems sentence)
165   -
166   -let rec parse_paragraph name id tokens lex_sems = function
167   - RawParagraph s -> id
168   - | StructParagraph sentences ->
169   - Xlist.fold sentences id (fun id p ->
170   - parse_sentence name id tokens lex_sems p.sentence)
171   - | AltParagraph l ->
172   - Xlist.fold l id (fun id (mode,paragraph) ->
173   - parse_paragraph name id tokens lex_sems paragraph)
174   -
175   -let rec parse_text name id tokens lex_sems = function
176   - RawText s -> id
177   - | StructText paragraphs ->
178   - Xlist.fold paragraphs id (fun id paragraph ->
179   - parse_paragraph name id tokens lex_sems paragraph)
180   - | AltText l ->
181   - Xlist.fold l id (fun id (mode,text) ->
182   - parse_text name id tokens lex_sems text)
183   -
184   -let id_counter = ref 0
185   -
186   -let get_id () =
187   - incr id_counter;
188   - "ID_" ^ (string_of_int !id_counter)
189   -
190   -let get_query_id = function
191   - AltText[_;CONLL,StructText[StructParagraph[p]]],_ -> if p.id = "" then get_id () else p.id
192   - | AltText[CONLL,StructText[StructParagraph[p]]],_ -> if p.id = "" then get_id () else p.id
193   - | _ -> failwith "get_query_id"
194   -
195   -let process_id s =
196   - if Xstring.check_prefix "ID_" s then s else
197   - let a,b,c = match Xstring.split_delim "/" s with
198   - [a;b;c] -> a,b,c
199   - | _ -> failwith ("process_id: " ^ s) in
200   - if Xstring.check_prefix "NKJP_1M_" a && Xstring.check_prefix "morph_" b && Xstring.check_sufix "-p" b &&
201   - Xstring.check_prefix "morph_" c && Xstring.check_sufix "-s" c then
202   - Xstring.cut_prefix "NKJP_1M_" a ^ "." ^ Xstring.cut_sufix "-s" (Xstring.cut_prefix "morph_" c)
203   - else failwith ("process_id: " ^ s)
204   -
205   -let process_conll_corpus filename =
206   - let corpus = File.file_in filename (fun file -> CONLL.match_corpus (ENIAM_CONLL.load_corpus file)) in
207   - print_endline "process_conll_corpus";
208   - let corpus = [List.hd corpus] in
209   - Xlist.iter corpus (fun query ->
210   - let id = process_id (get_query_id query) in
211   - let path = "results/" ^ id ^ "/" in
212   - ignore (Sys.command ("mkdir -p " ^ path));
213   - match query with
214   - | AltText[Raw,RawText query;CONLL,StructText[
215   - StructParagraph[{sentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence dep_paths]} as p]]],tokens ->
216   - (* let m_dep_paths = Array.map (fun (id,_,_) -> id,-1,"") dep_paths in *)
217   - let conll = StructParagraph[{p with sentence = AltSentence([Raw, RawSentence text; CONLL, DepSentence dep_paths]
218   - (*@ if Paths.config.Paths.mate_parser_enabled then [Mate, DepSentence m_dep_paths] else []*))}] in
219   - let text,tokens = ENIAMsubsyntax.parse_text_tokens tokens query in
220   - let sentences = match text with
221   - AltText[Raw,RawText _; Struct,StructText[AltParagraph[Raw,RawParagraph _; Struct,StructParagraph sentences]]] -> sentences
222   - | _ -> failwith "process_conll_corpus 1" in
223   - let text = AltText[Raw,RawText query; Struct, StructText([
224   - AltParagraph[Raw,RawParagraph query; ENIAM, StructParagraph sentences; CONLL, conll]])] in
225   - let lex_sems = ENIAMlexSemantics.assign tokens text in
226   - ignore(parse_text id 1 tokens lex_sems text)
227   - | _ -> failwith "process_conll_corpus 2")
228   -
229   -let _ =
230   - process_conll_corpus "../testy/skladnica-test1.conll"
corpora/types.ml
... ... @@ -27,3 +27,10 @@ type conll_sentence =
27 27  
28 28 type info_sentence =
29 29 {i_id:string; i_text:string; i_tokens:string list}
  30 +
  31 +let resource_path =
  32 + try Sys.getenv "ENIAM_RESOURCE_PATH"
  33 + with Not_found ->
  34 + if Sys.file_exists "/usr/share/eniam" then "/usr/share/eniam" else
  35 + if Sys.file_exists "/usr/local/share/eniam" then "/usr/local/share/eniam" else
  36 + failwith "resource directory does not exists"
... ...
testy/skladnica-test1-Failure.conll
  1 +1 - - interp interp _ 3 punct _ _
  2 +2 Panowie pan subst subst pl|nom|m1 3 subj _ _
  3 +3 przyszli przyjść praet praet pl|m1|perf 0 pred _ _
  4 +4 . . interp interp _ 3 punct _ _
  5 +
  6 +1 O o prep prep loc 12 comp _ _
  7 +2 klasztornym klasztorny adj adj sg|loc|n|pos 3 adjunct _ _
  8 +3 piekle piekło subst subst sg|loc|n 1 comp _ _
  9 +4 , , interp interp _ 3 punct _ _
  10 +5 zgotowanym zgotować ppas ppas sg|loc|n|perf|aff 3 adjunct _ _
  11 +6 przez przez prep prep acc|nwok 5 comp_ag _ _
  12 +7 trzy trzy num num pl|acc|m2|congr 6 comp _ _
  13 +8 potwory potwór subst subst pl|acc|m2 7 comp _ _
  14 +9 w w prep prep loc|nwok 8 adjunct _ _
  15 +10 habitach habit subst subst pl|loc|m3 9 comp _ _
  16 +11 , , interp interp _ 3 punct _ _
  17 +12 pisali pisać praet praet pl|m1|imperf 0 pred _ _
  18 +13 śmy być aglt aglt pl|pri|imperf|nwok 12 aglt _ _
  19 +14 w w prep prep loc|nwok 12 adjunct _ _
  20 +15 kwietniu kwiecień subst subst sg|loc|m3 14 comp _ _
  21 +16 br bieżący_rok brev brev pun 15 ne _ _
  22 +17 . . interp interp _ 12 punct _ _
  23 +
1 24 1 Następnie następnie adv adv _ 2 adjunct _ _
2 25 2 rozłożyła rozłożyć praet praet sg|f|perf 10 conjunct _ _
3 26 3 wysoki wysoki adj adj sg|acc|m3|pos 4 adjunct _ _
... ...