Commit 2a9dd7b32ac2d5c233f52b81c2c69f20348c9955
1 parent
3abd3b23
konwersja drzew zależnościowych do struktury ENIAM-a, wersja wstępna
Showing
3 changed files
with
156 additions
and
3 deletions
corpora/CONLL.ml
... | ... | @@ -26,7 +26,7 @@ let string_of_token mode token = match mode with |
26 | 26 | String.concat "." y)) |
27 | 27 | | _ -> failwith ("string_of_token: not Lemma") in |
28 | 28 | String.concat "\t" [token.conll_id; |
29 | - token.orth; lemma; cat; cat; interp; "_"; "_"] | |
29 | + token.orth; lemma; lemma; cat; cat; interp; interp; "_"; "_"; "_"; "_"; "_"; "_"] | |
30 | 30 | | _ -> failwith "string_of_token: ni" |
31 | 31 | |
32 | 32 | let rec string_of_sentence mode = function |
... | ... | @@ -136,9 +136,14 @@ let load_token in_channel = |
136 | 136 | else |
137 | 137 | match Xstring.split "\t" line with |
138 | 138 | [id; orth; lemma; cat; cat2; interp; super; label; "_"; "_"] -> |
139 | - (if cat <> cat2 then fail line; n_token id orth lemma cat interp super label) | |
139 | + (if cat <> cat2 then fail line; | |
140 | + n_token id orth lemma cat interp super label) | |
141 | + | [id; orth; lemma; lemma2; cat; cat2; interp; interp2; "-1"; super; "_"; label; "_"; "_"] -> | |
142 | + (if (cat, lemma, interp) <> (cat2, lemma2, interp2) then fail line; | |
143 | + n_token id orth lemma cat interp super label) | |
140 | 144 | | id :: orth :: lemma :: cat :: cat2 :: interp :: e -> |
141 | - (fail line; n_token id orth lemma cat interp "" "") (* FIXME: "" "" trzeba na coś zmienic *) | |
145 | + (fail line; | |
146 | + n_token id orth lemma cat interp "" "") (* FIXME: "" "" trzeba na coś zmienic *) | |
142 | 147 | | _ -> failwith ("load_token: " ^ line) |
143 | 148 | (* {c_id = List.nth pom 1; |
144 | 149 | c_lemma = List.nth pom 2; |
... | ... |
corpora/XmlPrinter.ml
0 → 100644
1 | +open Xstd | |
2 | +open WalTypes | |
3 | +open LCGtypes | |
4 | + | |
(** [gf_of_string name] maps the textual name of a grammatical function to
    the matching constructor ([SUBJ], [OBJ], [ARG], ...).

    An unrecognised name is not treated as an error: the name is written to
    stderr and [SUBJ] is returned as a fallback. *)
let gf_of_string name =
  match name with
  | "subj" -> SUBJ
  | "obj" -> OBJ
  | "arg" (*""*) -> ARG
  | "core" -> CORE
  | "nosem" -> NOSEM
  | "nogf" -> NOGF
  | "adjunct" -> ADJUNCT
  | "raised" -> RAISED
  | "clause" -> CLAUSE
  | "sentence" -> SENTENCE
  | unknown ->
      prerr_endline unknown;
      SUBJ
(* Stricter alternative, currently disabled:
   | s -> failwith ("gf_of_string:" ^ s) *)
18 | + | |
19 | +(*let morf_of_string s = | |
20 | + let s = Str.split (Str.regexp "[()]") s in | |
21 | + WalParser.parse_morf_single (List.hd s, List.tl s)*) | |
22 | + | |
(** [lt_of_xml xml] converts an XML tree into a term built from the [Node],
    [Tuple], [Val], [Variant], [Dot] and [Ref] constructors.

    Recognised shapes:
    - a [node] element with attributes [pred], [cat], [weight], [id] and
      exactly the children [gs], [agf], [amorf], [attrs], [args], in that
      order, becomes a [Node];
    - a [tuple] element becomes a [Tuple] of its converted children;
    - a [val] element holding text becomes a [Val];
    - a [variants] element (attribute [label]) whose children are [variant]
      elements (attribute [id]) becomes a [Variant];
    - an empty [dot] element becomes [Dot];
    - an empty [ref] element (attribute [id]) becomes a [Ref].

    Attribute lists are matched positionally, so the attribute order in the
    input matters.

    Only [pred], [cat], [weight], [id], [gs], [agf], [attrs] and [args] are
    read from a [node]; the remaining record fields are filled with
    placeholder values (marked FIXME below).

    @raise Failure with message ["lt_of_xml"] when the XML has any other
    shape; for the outermost mismatch the offending XML is dumped to stderr
    first. Failures of [float_of_string] / [int_of_string] on the [weight]
    and [id] attributes propagate unchanged. *)
let rec lt_of_xml = function
  | Xml.Element ("node", ["pred", pred; "cat", cat; "weight", weight; "id", id],
      [ Xml.Element ("gs", [], [gs]);
        Xml.Element ("agf", [], [Xml.PCData agf]);
        (* The amorf payload must be present but is not interpreted yet. *)
        Xml.Element ("amorf", [], [_amorf]);
        Xml.Element ("attrs", [], attrs);
        Xml.Element ("args", [], [args]) ]) ->
      Node {
        pred = pred;
        cat = cat;
        weight = float_of_string weight;
        id = int_of_string id;
        gs = lt_of_xml gs;
        agf = gf_of_string agf; (* FIXME *)
        amorf = WalTypes.Phrase (WalTypes.Null); (* FIXME *)
        arole = ""; (* FIXME *)
        arole_attr = ""; (* FIXME *)
        meaning = ""; (* FIXME *)
        hipero = StringSet.empty; (* FIXME *)
        meaning_weight = -1.; (* FIXME *)
        position = WalTypes.{gf = WalTypes.SUBJ; role = ""; role_attr = ""; sel_prefs = [];
                             cr = []; ce = []; dir = WalTypes.Both; morfs = []}; (* FIXME *)
        attrs =
          List.map
            (function
              | Xml.Element ("attr", ["label", e], [t]) -> e, lt_of_xml t
              | _ -> failwith "lt_of_xml")
            attrs;
        args = lt_of_xml args;
      }
  | Xml.Element ("tuple", [], l) -> Tuple (List.map lt_of_xml l)
  | Xml.Element ("val", [], [Xml.PCData s]) -> Val s
  | Xml.Element ("variants", ["label", e], l) ->
      Variant
        ( e,
          List.map
            (function
              | Xml.Element ("variant", ["id", i], [t]) -> i, lt_of_xml t
              | _ -> failwith "lt_of_xml")
            l )
  | Xml.Element ("dot", [], []) -> Dot
  | Xml.Element ("ref", ["id", i], []) -> Ref (int_of_string i)
  | xml ->
      (* Diagnostics go to stderr, like the fallback in gf_of_string, so they
         do not mix with regular program output. *)
      prerr_endline (Xml.to_string_fmt xml);
      failwith "lt_of_xml"
49 | + | |
(** [graph_of_xml xml] expects a [graph] element without attributes whose
    children are [graph_node] elements, each with a single [id] attribute and
    a single child. Every child is converted with [lt_of_xml]; the node id
    itself is currently ignored.

    @raise Failure with message ["graph_of_xml"] on any other shape. *)
let graph_of_xml xml =
  let term_of_graph_node = function
    | Xml.Element ("graph_node", ["id", _id], [body]) ->
        (*int_of_string i,*) lt_of_xml body
    | _ -> failwith "graph_of_xml" in
  match xml with
  | Xml.Element ("graph", [], nodes) -> List.map term_of_graph_node nodes
  | _ -> failwith "graph_of_xml"
54 | + | |
(** [print_xml path name xml] converts [xml] to an array of terms via
    [graph_of_xml] and hands it to [Visualization.print_graph] together with
    [path] and [name]. *)
let print_xml path name xml =
  let graph = xml |> graph_of_xml |> Array.of_list in
  Visualization.print_graph path name graph
58 | + | |
(** [load_and_print_xml path name filename] parses the XML file [filename]
    and passes the result to [print_xml path name]. *)
let load_and_print_xml path name filename =
  let xml = Xml.parse_file filename in
  print_xml path name xml
61 | + | |
(* Runs when the module is initialised: renders the hard-coded sample file. *)
let _ =
  let dir = "xml_test/" in
  load_and_print_xml dir "test1.0" "xml_test/sentence1.0.xml"
... | ... |
corpora/depTree.ml
0 → 100644
1 | +open Xstd | |
2 | +open PreTypes | |
3 | + | |
(** [tuple_it taglist] wraps a list of XML elements into a single element:
    - the empty list becomes an empty [dot] element;
    - a one-element list is returned as that element, unwrapped;
    - anything longer is placed inside a [tuple] element.

    The list is inspected by pattern matching, so its length is never
    computed and no partial function such as [List.hd] is needed. *)
let tuple_it = function
  | [] -> Xml.Element ("dot", [], [])
  | [single] -> single
  | taglist -> Xml.Element ("tuple", [], taglist)
9 | + | |
(** Placeholder: ignores its argument and always returns ["empty"]. *)
let get_amorf_basic _token_r = "empty" (* FIXME *)
11 | + | |
(** Placeholder: ignores its argument and always returns ["empty"]. *)
let get_amorf _token_r = "empty" (* FIXME *)
13 | + | |
(** [get_vals token_r cat interp] returns the list of values for a token:
    the result of [get_amorf_basic token_r] followed by [interp]. When [cat]
    is ["subst"], the string ["ter"] is additionally appended at the end
    (presumably a default person value for nouns; to be confirmed). *)
let get_vals token_r cat interp =
  let tags =
    match cat with
    | "subst" -> List.rev_append (List.rev interp) ["ter"]
    | _ -> interp (* FIXME *) in
  get_amorf_basic token_r :: tags
18 | + | |
(** Placeholder: ignores its argument and returns a fixed list of
    (label, value) pairs. *)
let get_basic_attrs _token_r = [("A", "a"); ("B", "b")] (* FIXME *)
20 | + | |
(** [get_attrs token_r] turns every (label, value) pair produced by
    [get_basic_attrs token_r] into an [attr] element that carries the label
    as its [label] attribute and wraps the value in a [val] child. *)
let get_attrs token_r =
  let attr_element (label, value) =
    let payload = Xml.Element ("val", [], [Xml.PCData value]) in
    Xml.Element ("attr", ["label", label], [payload]) in
  List.map attr_element (get_basic_attrs token_r)
26 | + | |
(** [xml_of_gs token_r cat interp] builds the [gs] element: each string from
    [get_vals token_r cat interp] is wrapped in a [val] element and the
    resulting list is combined with [tuple_it]. *)
let xml_of_gs token_r cat interp =
  let val_element text = Xml.Element ("val", [], [Xml.PCData text]) in
  let children = get_vals token_r cat interp |> List.map val_element in
  Xml.Element ("gs", [], [tuple_it children])
31 | + | |
(** [xml_of_agf token_r] builds the [agf] element whose text content is the
    [conll_label] field of the token. *)
let xml_of_agf token_r =
  let label = token_r.conll_label in
  Xml.Element ("agf", [], [Xml.PCData label])
33 | + | |
(** [xml_of_amorf token_r] builds the [amorf] element whose text content is
    the result of [get_amorf token_r]. *)
let xml_of_amorf token_r =
  let amorf = get_amorf token_r in
  Xml.Element ("amorf", [], [Xml.PCData amorf])
35 | + | |
(** [xml_of_attrs token_r] builds the [attrs] element whose children are the
    elements returned by [get_attrs token_r]. *)
let xml_of_attrs token_r =
  let children = get_attrs token_r in
  Xml.Element ("attrs", [], children) (* FIXME *)
37 | + | |
(** [xml_of_args token_rs token_r] builds the [args] element of [token_r].
    The children are the tokens of [token_rs] whose [conll_super] equals the
    [conll_id] of [token_r]; each is emitted as an empty [ref] element
    carrying the child id, and the list is combined with [tuple_it]. *)
let xml_of_args token_rs token_r =
  let is_child candidate = candidate.conll_super = token_r.conll_id in
  let ref_element child = Xml.Element ("ref", ["id", child.conll_id], []) in
  let refs = token_rs |> List.filter is_child |> List.map ref_element in
  Xml.Element ("args", [], [tuple_it refs])
43 | + | |
(** [xml_of_token_r token_rs token_r] builds the [graph_node] element for a
    single token. The token must be a [Lemma]; only the first interpretation
    is used, and from each of its components only the first value is kept.

    The inner [node] element carries the attributes [pred], [cat], a constant
    [weight] of ["0"] and the token id, and has the children [gs], [agf],
    [amorf], [attrs] and [args], in that order.

    @raise Failure when the token is not a [Lemma], or (from [List.hd]) when
    the interpretation list or one of its components is empty. *)
let xml_of_token_r token_rs token_r =
  let pred, cat, interp =
    match token_r.token with
    | Lemma (lemma, pos, interps) ->
        lemma, pos, Xlist.map (List.hd interps) List.hd
    | _ -> failwith ("xml_of_token_r: not Lemma") in
  let id = token_r.conll_id in
  let children = [
    xml_of_gs token_r cat interp; (** **)
    xml_of_agf token_r;
    xml_of_amorf token_r; (** **)
    xml_of_attrs token_r; (** **)
    xml_of_args token_rs token_r;
  ] in
  let node =
    Xml.Element ("node", ["pred", pred; "cat", cat; "weight", "0"; "id", id],
                 children) in
  Xml.Element ("graph_node", ["id", id], [node])
56 | + | |
(** [conll_to_xml token_rs] builds the [graph] element for a sentence: one
    [graph_node] per token, each produced by [xml_of_token_r] with the whole
    token list available for child lookup. *)
let conll_to_xml token_rs =
  let nodes = List.map (xml_of_token_r token_rs) token_rs in
  Xml.Element ("graph", [], nodes)
59 | + | |
60 | + | |
61 | +(***************************************************************************************************) | |
62 | + | |
(** [get_info i text] extracts the token list and an identifier from a
    single-paragraph, single-sentence text.

    Two shapes are accepted:
    - an [AltText] holding a raw variant and a CONLL variant, where the CONLL
      paragraph contains an [AltSentence] with a raw and a CONLL sentence;
      the identifier is the paragraph [pid];
    - a bare [StructText] whose paragraph has [pbeg] and [plen] equal to -1;
      the identifier is ["id_not_found"] followed by [i].

    @raise Failure with message ["get_info"] for any other shape. *)
let get_info i = function
  | AltText [
      Raw, RawText _;
      CONLL, StructText ([StructParagraph [
        {pid = id; pbeg = _; plen = _; psentence =
           AltSentence [
             Raw, RawSentence _;
             CONLL, StructSentence (token_rs, -1)]}]], -1)] ->
      token_rs, id
  | StructText ([StructParagraph [
      {pid = _; pbeg = -1; plen = -1; psentence =
         StructSentence (token_rs, -1)}]], -1) ->
      token_rs, "id_not_found" ^ (string_of_int i)
  | _ -> failwith "get_info"
70 | + | |
(** [print_corpus filename] loads the CONLL corpus stored in [filename] and,
    for every entry, in order:
    - extracts the tokens and an identifier with [get_info];
    - converts the tokens to XML with [conll_to_xml];
    - writes the formatted XML to a file named after the identifier (with
      every slash replaced by an underscore) inside the [xml_test/]
      directory;
    - passes the same XML to [XmlPrinter.print_xml].

    Returns the list of results of [XmlPrinter.print_xml], one per entry.

    The output channel is closed after each file is written, also when the
    write fails, so that no file descriptor is leaked per sentence.

    @raise Sys_error when an output file cannot be opened or written. *)
let print_corpus filename =
  let corpus =
    File.file_in filename (fun file ->
      CONLL.match_corpus (CONLL.load_corpus file)) in
  (* Compiled once instead of once per sentence. *)
  let slash = Str.regexp "/" in
  List.mapi (fun i x ->
    let token_rs, id = get_info i x in
    let xml = conll_to_xml token_rs in
    let id = Str.global_replace slash "_" id in
    let oc = open_out ("xml_test/"^id^".xml") in
    (try output_string oc (Xml.to_string_fmt xml)
     with e -> close_out_noerr oc; raise e);
    (* close_out flushes pending output before releasing the descriptor. *)
    close_out oc;
    XmlPrinter.print_xml "xml_test/" id xml) corpus
81 | + | |
(* Runs when the module is initialised: converts the hard-coded sample
   corpus. The list returned by print_corpus is discarded. *)
let _ =
  let sample = "xml_test/sentence1.conll" in
  print_corpus sample
84 | + | |
85 | + | |
... | ... |