Commit 1e9e955b5e16d923924ddd77a24eab68db47017f
Merge branch 'corpora' of ssh://git.nlp.ipipan.waw.pl:8888/wojciech.jaworski/ENIAM into corpora
rozjechało się
Showing
4 changed files
with
26 additions
and
8 deletions
corpora/CONLL.ml
... | ... | @@ -52,6 +52,15 @@ let rec string_of_text mode = function |
52 | 52 | |
53 | 53 | (******************) |
54 | 54 | |
(* [establish_next rev_tokens] returns the tokens of [rev_tokens] in the
   opposite order, each with its [next] field filled in.
   The first token taken from [rev_tokens] gets [next = beg + len] (its own
   end); every later one gets the [beg] of the token consed onto the result
   just before it.
   NOTE(review): presumably [rev_tokens] is in reverse sentence order, so the
   result is in sentence order and [next] is the start of the following
   token - confirm against [establish_for_token]. *)
55 | +let establish_next rev_tokens = | |
(* [pom res rest]: [res] accumulates the already processed tokens, the most
   recently processed one at its head. *)
56 | + let rec pom res = function | |
57 | + h :: t -> let next = if res = [] | |
(* Nothing processed yet: fall back to this token's own end position. *)
58 | + then h.beg+h.len | |
(* [List.hd] cannot fail here: [res] was just checked to be non-empty. *)
59 | + else (List.hd res).beg in | |
60 | + pom ({h with next = next} :: res) t | |
61 | + | [] -> res in | |
62 | + pom [] rev_tokens | |
63 | + | |
55 | 64 | let rec establish_for_token i res text = function |
56 | 65 | h :: t -> if Xstring.check_prefix " " text |
57 | 66 | then establish_for_token (i+100) res (Xstring.cut_prefix " " text) (h :: t) |
... | ... | @@ -66,7 +75,8 @@ let rec establish_for_token i res text = function |
66 | 75 | let rec establish_lengths text = function |
67 | 76 | RawSentence text -> failwith ("establish_lengths: " ^ text) |
68 | 77 | | StructSentence (tokens, n) -> let pbeg, plen, rev_tokens = establish_for_token 100 [] text tokens in |
69 | - pbeg, plen, StructSentence (List.rev rev_tokens, n) | |
78 | + let tokens = establish_next rev_tokens in | |
79 | + pbeg, plen-100, StructSentence (tokens, plen) | |
70 | 80 | | ORSentence (_,_,_,_) -> failwith ("establish_lengths: ORSentence") |
71 | 81 | | AltSentence alts -> if List.exists (fun (mode, s) -> mode = CONLL) alts |
72 | 82 | then establish_lengths text (snd (List.find (fun (mode, s) -> mode = CONLL) alts)) |
... | ... | @@ -141,9 +151,10 @@ let load_token in_channel = |
141 | 151 | | [id; orth; lemma; lemma2; cat; cat2; interp; interp2; "-1"; super; "_"; label; "_"; "_"] -> |
142 | 152 | (if (cat, lemma, interp) <> (cat2, lemma2, interp2) then fail line; |
143 | 153 | n_token id orth lemma cat interp super label) |
144 | - | id :: orth :: lemma :: cat :: cat2 :: interp :: e -> | |
145 | - (fail line; | |
146 | - n_token id orth lemma cat interp "" "") (* FIXME: "" "" trzeba na coś zmienic *) | |
154 | + | [id; orth; lemma; cat; cat2; interp; super; label_err; "_"] -> | |
155 | + (if cat <> cat2 && Xstring.check_sufix "_" label_err then fail line; | |
156 | + let label = Xstring.cut_sufix "_" label_err in | |
157 | + n_token id orth lemma cat interp super label) | |
147 | 158 | | _ -> failwith ("load_token: " ^ line) |
148 | 159 | (* {c_id = List.nth pom 1; |
149 | 160 | c_lemma = List.nth pom 2; |
... | ... |
corpora/XmlPrinter.ml
... | ... | @@ -47,13 +47,18 @@ let rec lt_of_xml = function |
47 | 47 | | Xml.Element("ref",["id",i],[]) -> Ref(int_of_string i) |
48 | 48 | | xml -> print_endline (Xml.to_string_fmt xml); failwith "lt_of_xml" |
49 | 49 | |
(* [graph_of_xml xml] (the version on the [+] lines) turns an
   [Xml.Element("graph",[],l)] into an array of terms indexed by node id.
   Each child of the form [Xml.Element("graph_node",["id",i],[x])] gives the
   pair [(int_of_string i, lt_of_xml x)]; the pairs are then stored in an
   array at the index given by their id.
   Fails with ["graph_of_xml"] if [xml] is not such a "graph" element or if
   any child is not such a "graph_node" element.
   The version on the [-] lines returned a plain list of terms and ignored
   the ids. *)
50 | -let graph_of_xml = function | |
50 | +let graph_of_xml xml = | |
(* [establish_indexs graph] builds the array from the [(id, term)] pairs. *)
51 | + let establish_indexs graph = | |
(* Largest id in the list, starting from 0 - assuming [Xlist.fold] is an
   ordinary fold with initial accumulator 0; TODO confirm. *)
52 | + let max = Xlist.fold graph 0 (fun acc (n, _) -> if n > acc then n else acc) in | |
(* Every slot starts as [Dot]; ids that never occur keep that value. *)
53 | + let table = Array.make (max+1) Dot in | |
54 | + Xlist.iter graph (fun (n,x) -> table.(n) <- x); table in | |
55 | + match xml with | |
51 | 56 | Xml.Element("graph",[],l) -> |
52 | - List.map (function Xml.Element("graph_node",["id",i],[xml]) -> (*int_of_string i,*) lt_of_xml xml | _ -> failwith "graph_of_xml") l | |
57 | + establish_indexs @@ List.map (function Xml.Element("graph_node",["id",i],[xml]) -> int_of_string i, lt_of_xml xml | _ -> failwith "graph_of_xml") l | |
53 | 58 | | _ -> failwith "graph_of_xml" |
54 | 59 | |
(* [print_xml path name xml] builds the graph for [xml] with [graph_of_xml]
   and passes it, together with [path] and [name], to
   [Visualization.print_graph].
   The [-] line converted the result with [Array.of_list]; the [+] line uses
   the result of [graph_of_xml] directly. *)
55 | 60 | let print_xml path name xml = |
56 | - let graph = Array.of_list @@ graph_of_xml xml in | |
61 | + let graph = graph_of_xml xml in | |
57 | 62 | Visualization.print_graph path name graph |
58 | 63 | |
59 | 64 | let load_and_print_xml path name filename = |
... | ... |
dependencyParser/basic/mate-tools/README.txt
parser/exec.ml
... | ... | @@ -203,7 +203,7 @@ let rec parse_sentence timeout test_only_flag mode next_id = function |
203 | 203 | let dep_graph,references,next_reference = LCGchart.dep_parse timeout dep_graph references next_reference time_fun in |
204 | 204 | (* FIXME: dodać dalsze przetwarzanie dep_graph *) |
205 | 205 | let xml = DepTree.conll_to_xml paths in |
206 | - let graph = Array.of_list (XmlPrinter.graph_of_xml xml) in (* FIXME: do poprawy *) | |
206 | + let graph = XmlPrinter.graph_of_xml xml in (* FIXME: do poprawy *) | |
207 | 207 | Visualization.print_graph "results/" "term_conll" graph; |
208 | 208 | let result = {empty_eniam_parse_result with status=Parsed; term=graph} in |
209 | 209 | ENIAMSentence result, next_id |
... | ... |