Commit 2a9dd7b32ac2d5c233f52b81c2c69f20348c9955

Authored by Daniel Oklesiński
1 parent 3abd3b23

konwersja drzew zależnościowych do struktury ENIAM-a, wersja wstępna

corpora/CONLL.ml
... ... @@ -26,7 +26,7 @@ let string_of_token mode token = match mode with
26 26 String.concat "." y))
27 27 | _ -> failwith ("string_of_token: not Lemma") in
28 28 String.concat "\t" [token.conll_id;
29   - token.orth; lemma; cat; cat; interp; "_"; "_"]
  29 + token.orth; lemma; lemma; cat; cat; interp; interp; "_"; "_"; "_"; "_"; "_"; "_"]
30 30 | _ -> failwith "string_of_token: ni"
31 31  
32 32 let rec string_of_sentence mode = function
... ... @@ -136,9 +136,14 @@ let load_token in_channel =
136 136 else
137 137 match Xstring.split "\t" line with
138 138 [id; orth; lemma; cat; cat2; interp; super; label; "_"; "_"] ->
139   - (if cat <> cat2 then fail line; n_token id orth lemma cat interp super label)
  139 + (if cat <> cat2 then fail line;
  140 + n_token id orth lemma cat interp super label)
  141 + | [id; orth; lemma; lemma2; cat; cat2; interp; interp2; "-1"; super; "_"; label; "_"; "_"] ->
  142 + (if (cat, lemma, interp) <> (cat2, lemma2, interp2) then fail line;
  143 + n_token id orth lemma cat interp super label)
140 144 | id :: orth :: lemma :: cat :: cat2 :: interp :: e ->
141   - (fail line; n_token id orth lemma cat interp "" "") (* FIXME: "" "" trzeba na coś zmienic *)
  145 + (fail line;
  146 + n_token id orth lemma cat interp "" "") (* FIXME: "" "" trzeba na coś zmienic *)
142 147 | _ -> failwith ("load_token: " ^ line)
143 148 (* {c_id = List.nth pom 1;
144 149 c_lemma = List.nth pom 2;
... ...
corpora/XmlPrinter.ml 0 → 100644
  1 +open Xstd
  2 +open WalTypes
  3 +open LCGtypes
  4 +
  5 +let gf_of_string = function
  6 + "subj" -> SUBJ
  7 + | "obj" -> OBJ
  8 + | "arg"(*""*) -> ARG
  9 + | "core" -> CORE
  10 + | "nosem" -> NOSEM
  11 + | "nogf" -> NOGF
  12 + | "adjunct" -> ADJUNCT
  13 + | "raised" -> RAISED
  14 + | "clause" -> CLAUSE
  15 + | "sentence" -> SENTENCE
  16 + | s -> prerr_endline s; SUBJ
  17 +(* | s -> failwith ("gf_of_string:" ^ s) *)
  18 +
  19 +(*let morf_of_string s =
  20 + let s = Str.split (Str.regexp "[()]") s in
  21 + WalParser.parse_morf_single (List.hd s, List.tl s)*)
  22 +
  23 +let rec lt_of_xml = function
  24 + Xml.Element("node",["pred",pred;"cat",cat;"weight",weight;"id",id],[
  25 + Xml.Element("gs",[],[gs]);
  26 + Xml.Element("agf",[],[Xml.PCData agf]);
  27 + Xml.Element("amorf",[],[amorf]);
  28 + Xml.Element("attrs",[],attrs);
  29 + Xml.Element("args",[],[args])]) ->
  30 + Node{pred=pred; cat=cat; weight=float_of_string weight; id=int_of_string id;
  31 + gs = lt_of_xml gs;
  32 + agf = gf_of_string agf; (* FIXME *)
  33 + amorf = WalTypes.Phrase(WalTypes.Null); (* FIXME *)
  34 + arole = ""; (* FIXME *)
  35 + arole_attr = ""; (* FIXME *)
  36 + meaning = ""; (* FIXME *)
  37 + hipero = StringSet.empty; (* FIXME *)
  38 + meaning_weight = -1.; (* FIXME *)
  39 + position = WalTypes.{gf = WalTypes.SUBJ; role = ""; role_attr = ""; sel_prefs = [];
  40 + cr = []; ce = []; dir = WalTypes.Both; morfs = []}; (* FIXME *)
  41 + attrs=List.map (function Xml.Element("attr",["label",e],[t]) -> e,lt_of_xml t | _ -> failwith "lt_of_xml") attrs;
  42 + args=lt_of_xml args;}
  43 + | Xml.Element("tuple",[],l) -> Tuple(List.map lt_of_xml l)
  44 + | Xml.Element("val",[],[Xml.PCData s]) -> Val s
  45 + | Xml.Element("variants",["label",e],l) -> Variant(e,List.map (function Xml.Element("variant",["id",i],[t]) -> i, lt_of_xml t | _ -> failwith "lt_of_xml") l)
  46 + | Xml.Element("dot",[],[]) -> Dot
  47 + | Xml.Element("ref",["id",i],[]) -> Ref(int_of_string i)
  48 + | xml -> print_endline (Xml.to_string_fmt xml); failwith "lt_of_xml"
  49 +
  50 +let graph_of_xml = function
  51 + Xml.Element("graph",[],l) ->
  52 + List.map (function Xml.Element("graph_node",["id",i],[xml]) -> (*int_of_string i,*) lt_of_xml xml | _ -> failwith "graph_of_xml") l
  53 + | _ -> failwith "graph_of_xml"
  54 +
  55 +let print_xml path name xml =
  56 + let graph = Array.of_list @@ graph_of_xml xml in
  57 + Visualization.print_graph path name graph
  58 +
  59 +let load_and_print_xml path name filename =
  60 + print_xml path name @@ Xml.parse_file filename
  61 +
  62 +let _ =
  63 + load_and_print_xml "xml_test/" "test1.0" "xml_test/sentence1.0.xml"
... ...
corpora/depTree.ml 0 → 100644
  1 +open Xstd
  2 +open PreTypes
  3 +
  4 +let tuple_it taglist =
  5 + match List.length taglist with
  6 + 0 -> Xml.Element("dot",[],[])
  7 + | 1 -> List.hd taglist
  8 + | _ -> Xml.Element("tuple",[],taglist)
  9 +
  10 +let get_amorf_basic token_r = "empty" (* FIXME *)
  11 +
  12 +let get_amorf token_r = "empty" (* FIXME *)
  13 +
  14 +let get_vals token_r cat interp = get_amorf_basic token_r ::
  15 + match cat with
  16 + "subst" -> List.rev ("ter" :: (List.rev interp))
  17 + | _ -> interp (* FIXME *)
  18 +
  19 +let get_basic_attrs token_r = ["A","a";"B","b"] (* FIXME *)
  20 +
  21 +let get_attrs token_r =
  22 + let attrs = get_basic_attrs token_r in
  23 + List.map (fun (label, value) ->
  24 + Xml.Element("attr",["label",label],[
  25 + Xml.Element("val",[],[Xml.PCData value])])) attrs
  26 +
  27 +let xml_of_gs token_r cat interp =
  28 + let vals = get_vals token_r cat interp in (** **)
  29 + let vals = List.map (fun x -> Xml.Element("val",[],[Xml.PCData x])) vals in
  30 + Xml.Element("gs",[],[tuple_it vals])
  31 +
  32 +let xml_of_agf token_r = Xml.Element("agf",[],[Xml.PCData token_r.conll_label])
  33 +
  34 +let xml_of_amorf token_r = Xml.Element("amorf",[],[Xml.PCData (get_amorf token_r)])
  35 +
  36 +let xml_of_attrs token_r = Xml.Element("attrs",[],get_attrs token_r) (* FIXME *)
  37 +
  38 +let xml_of_args token_rs token_r =
  39 + let children = List.filter (fun pom -> pom.conll_super = token_r.conll_id) token_rs in
  40 + let children_to_graph = List.map (fun pom ->
  41 + Xml.Element("ref",["id", pom.conll_id],[])) children in
  42 + Xml.Element("args",[],[tuple_it children_to_graph])
  43 +
  44 +let xml_of_token_r token_rs token_r =
  45 + let pred, cat, interp = match token_r.token with
  46 + | Lemma(a,b,c) -> a, b, Xlist.map (List.hd c) (fun x -> List.hd x)
  47 + | _ -> failwith ("xml_of_token_r: not Lemma") in
  48 + Xml.Element("graph_node",["id", token_r.conll_id],[
  49 + Xml.Element("node",["pred",pred;"cat",cat;"weight","0";"id", token_r.conll_id],
  50 + (xml_of_gs token_r cat interp) :: (** **)
  51 + (xml_of_agf token_r) ::
  52 + (xml_of_amorf token_r) :: (** **)
  53 + (xml_of_attrs token_r) :: (** **)
  54 + [xml_of_args token_rs token_r]
  55 + ) ])
  56 +
  57 +let conll_to_xml token_rs =
  58 + Xml.Element("graph",[],List.map (xml_of_token_r token_rs) token_rs)
  59 +
  60 +
  61 +(***************************************************************************************************)
  62 +
  63 +let get_info i = function
  64 + AltText[Raw,RawText text1;CONLL,StructText([StructParagraph[
  65 + {pid = id; pbeg = beg; plen = len; psentence =
  66 + AltSentence[Raw, RawSentence text2; CONLL, StructSentence(token_rs,-1)]}]],-1)] -> token_rs, id
  67 + | StructText([StructParagraph[{pid = id; pbeg = -1; plen = -1; psentence =
  68 + StructSentence(token_rs,-1)}]],-1) -> token_rs, "id_not_found" ^ (string_of_int i)
  69 + | _ -> failwith "get_info"
  70 +
  71 +let print_corpus filename =
  72 + let corpus = File.file_in filename (fun file -> CONLL.match_corpus (CONLL.load_corpus file)) in
  73 + List.mapi (fun i x ->
  74 + let token_rs, id = get_info i x in
  75 + let xml = conll_to_xml token_rs in
  76 + let id = Str.global_replace (Str.regexp "/") "_" id in
  77 + let oc = open_out ("xml_test/"^id^".xml") in
  78 + output_string oc (Xml.to_string_fmt xml);
  79 + flush oc;
  80 + XmlPrinter.print_xml "xml_test/" id xml) corpus
  81 +
  82 +let _ =
  83 + print_corpus "xml_test/sentence1.conll"
  84 +
  85 +
... ...