Commit 8bc083e57ed28269233702785365375207c3b6cf
1 parent
5007a782
dodanie wstępnej diagnostyki sparsowanych zdań
Showing
5 changed files
with
145 additions
and
24 deletions
diagnostics/LCGfields.ml
0 → 100644
1 | +open LCGtypes | |
2 | +open Xstd | |
3 | +open ExecTypes | |
4 | + | |
(** Mode key used in [field_map] for statistics gathered from ENIAM parses. *)
let eniam : string = "eniam"

(** Mode key used in [field_map] for statistics gathered from CONLL parses. *)
let conll : string = "conll"
7 | + | |
(** Ordered string keys for [StrMap].

    [String.compare] replaces [Pervasives.compare]: the [Pervasives] module
    is deprecated (and no longer present in OCaml 5), and the monomorphic
    comparison gives the same ordering on strings without going through the
    generic polymorphic comparison. *)
module Strings =
  struct
    type t = string
    let compare (a : string) (b : string) = String.compare a b
  end

(** Finite maps whose keys are strings, ordered by [Strings.compare]. *)
module StrMap = Map.Make(Strings)
15 | + | |
(** Global statistics store. It maps each mode key ([eniam], [conll]) to a
    mutable map that starts out empty; [add_to_field_map] fills it with
    per-field maps of occurrence counters. *)
let field_map =
  List.fold_left
    (fun modes mode -> StrMap.add mode (ref StrMap.empty) modes)
    StrMap.empty
    [eniam; conll]
17 | + | |
(** [add_to_field_map str_mode field content] records one more occurrence of
    the value [content] for [field] under the mode [str_mode]. The per-field
    map and the counter are created on first use.
    @raise Not_found if [str_mode] is not a key of [field_map]. *)
let add_to_field_map str_mode field content =
  let fields_of_mode = StrMap.find str_mode field_map in
  let counters =
    try StrMap.find field !fields_of_mode
    with Not_found ->
      (* first value seen for this field: register a fresh counter map *)
      let fresh = ref StrMap.empty in
      fields_of_mode := StrMap.add field fresh !fields_of_mode;
      fresh in
  try incr (StrMap.find content !counters)
  with Not_found -> counters := StrMap.add content (ref 1) !counters
27 | + | |
(** Prints the contents of [field_map] to standard output: each mode, then
    each of its fields (indented by one tab), then every value of the field
    with its count (indented by two tabs), followed by the sum of the counts
    for that field. A blank line is printed at the very end. *)
let print_field_map () =
  let print_field field contents =
    print_endline ("\t" ^ field);
    (* print every value while accumulating the total for the field *)
    let total =
      StrMap.fold (fun content count acc ->
        print_endline ("\t\t" ^ content ^ "\t\t" ^ string_of_int !count);
        acc + !count) !contents 0 in
    print_endline ("\tsum: " ^ string_of_int total) in
  let print_mode mode fields =
    print_endline mode;
    StrMap.iter print_field !fields in
  StrMap.iter print_mode field_map;
  print_newline ()
42 | + | |
43 | + | |
(** [field_of_node str_mode n field] returns the value of [field] in node
    [n] and records it in the statistics for [str_mode]. Only ["arole"] is
    supported; an empty [arole] is reported as ["null"].
    @raise Failure for any other field name. *)
let field_of_node str_mode n field =
  match field with
  | "arole" ->
      let content = match n.arole with "" -> "null" | role -> role in
      add_to_field_map str_mode "arole" content;
      content
  | _ -> failwith "field_of_node: ni"
48 | + | |
(** [field_of_linear_term str_node field term] extracts [field] from [term]
    by delegating to [field_of_node]; [str_node] is passed on as the mode
    key.
    @raise Failure if [term] is not a [Node]. *)
let field_of_linear_term str_node field term =
  match term with
  | Node n -> field_of_node str_node n field
  | _ -> failwith "field_of_linear_term: ni"
52 | + | |
(** [field_of_dependency_tree str_node fields dep_tree] renders, for every
    field in [fields], the value of that field in each element of the array
    [dep_tree]. Each value is followed by ["\n\t\t"], and the per-field
    renderings are joined with ["\n"]. Values are recorded in the statistics
    as a side effect of [field_of_linear_term].

    The per-field text is accumulated in a [Buffer] rather than with
    repeated [^], which copied the whole accumulator for every element. *)
let field_of_dependency_tree str_node fields dep_tree =
  let render_field field =
    let buf = Buffer.create 256 in
    Array.iter (fun term ->
      Buffer.add_string buf (field_of_linear_term str_node field term);
      Buffer.add_string buf "\n\t\t") dep_tree;
    Buffer.contents buf in
  String.concat "\n" (Xlist.map fields render_field)
57 | + | |
(** [field_of_eniam_sentence fields tokens result] returns a textual label
    for the status of an ENIAM parse result. For a [Parsed] result the
    dependency tree is walked first so that the values of [fields] are
    recorded under the [eniam] mode; the label returned is ["Parsed\n"].
    [tokens] is not used here.
    @raise Failure for any status that is not listed explicitly. *)
let field_of_eniam_sentence fields tokens (result : eniam_parse_result) =
  match result.status with
  | Parsed ->
      (* the rendered text is discarded: the call is made only to update
         the statistics *)
      ignore (field_of_dependency_tree eniam fields result.dependency_tree : string);
      "Parsed\n"
  | Idle -> "Idle"
  | LexiconError -> "LexiconError"
  | ParseError -> "ParseError"
  | ParseTimeout -> "ParseTimeout"
  | NotParsed -> "NotParsed"
  | ReductionError -> "ReductionError"
  | TooManyNodes -> "TooManyNodes"
  | NotReduced -> "NotReduced"
  | SemError -> "SemError"
  (* PreprocessingError and NotTranslated are not handled explicitly and end
     up in the failing branch below *)
  | _ -> failwith "field_of_eniam_sentence"
73 | + | |
(** [field_of_conll_sentence fields tokens result] returns a textual label
    for the status of a CONLL parse result. For a [Parsed] result the
    dependency tree is walked first so that the values of [fields] are
    recorded under the [conll] mode; the label returned is ["Parsed\n"].
    [tokens] is not used here.
    @raise Failure for any status that is not listed explicitly. *)
let field_of_conll_sentence fields tokens (result : conll_parse_result) =
  match result.status with
  | Parsed ->
      (* the rendered text is discarded: the call is made only to update
         the statistics *)
      ignore (field_of_dependency_tree conll fields result.dependency_tree : string);
      "Parsed\n"
  | Idle -> "Idle"
  | LexiconError -> "LexiconError"
  | ParseError -> "ParseError"
  | ParseTimeout -> "ParseTimeout"
  | NotParsed -> "NotParsed"
  | ReductionError -> "ReductionError"
  | TooManyNodes -> "TooManyNodes"
  | NotReduced -> "NotReduced"
  | SemError -> "SemError"
  (* PreprocessingError and NotTranslated are not handled explicitly and end
     up in the failing branch below *)
  | _ -> failwith "field_of_conll_sentence"
89 | + | |
90 | + | |
(** [field_of_sentence fields tokens sentence] returns a textual summary of
    [sentence]. ENIAM and CONLL results are delegated to
    [field_of_eniam_sentence] / [field_of_conll_sentence]; a raw sentence is
    returned as is; struct, dep and quoted sentences yield only their
    constructor name. For [AltSentence] every alternative is rendered as
    ["<mode>\t<summary>"] and the alternatives are joined with ["\n\t"]. *)
let rec field_of_sentence fields tokens sentence =
  match sentence with
  | RawSentence s -> s
  | StructSentence _ -> "StructSentence"
  | DepSentence _ -> "DepSentence"
  | ENIAMSentence result -> field_of_eniam_sentence fields tokens result
  | CONLLSentence result -> field_of_conll_sentence fields tokens result
  | QuotedSentences _ -> "QuotedSentences"
  | AltSentence l ->
      let render (m, s) =
        (* the nested sentence is processed before the mode name is built *)
        let body = field_of_sentence fields tokens s in
        Visualization.string_of_mode m ^ "\t" ^ body in
      String.concat "\n\t" (Xlist.map l render)
101 | + | |
(** [field_of_paragraph fields tokens paragraph] returns a textual summary
    of [paragraph]. A raw paragraph is returned unchanged after a notice is
    printed to standard output. The sentences of a struct paragraph are
    summarised with [field_of_sentence] and joined with ["\n\t"]. For
    [AltParagraph] only the ENIAM and CONLL alternatives are kept; each is
    rendered as ["<mode>\n\t<summary>"] and they are joined with ["\n"]. *)
let rec field_of_paragraph fields tokens paragraph =
  match paragraph with
  | RawParagraph s ->
      print_endline "no fields detected: only raw paragraph";
      s
  | StructParagraph sentences ->
      let render p = field_of_sentence fields tokens p.psentence in
      String.concat "\n\t" (Xlist.map sentences render)
  | AltParagraph l ->
      let is_parsed = function
        | (ENIAM | CONLL), _ -> true
        | _ -> false in
      let render (m, t) =
        (* the nested paragraph is processed before the mode name is built *)
        let body = field_of_paragraph fields tokens t in
        Visualization.string_of_mode m ^ "\n\t" ^ body in
      String.concat "\n" (Xlist.map (List.filter is_parsed l) render)
110 | + | |
(** [print_fields_rec fields text] prints to standard output the summaries
    of all paragraphs of [text] (joined with blank lines) and, as a side
    effect of [field_of_paragraph], records the values of [fields] in the
    statistics. A raw text only produces a notice. For [AltText] the first
    alternative whose mode is Struct, ENIAM or CONLL is processed; if there
    is none, a notice is printed instead of letting [Not_found] escape. *)
let rec print_fields_rec fields = function
  | RawText _ -> print_endline "no fields detected: only raw text"
  | StructText(paragraphs,tokens) ->
      print_endline (String.concat "\n\n" (Xlist.map paragraphs (field_of_paragraph fields tokens)) ^ "\n")
  | AltText l ->
      (* only the lookup is guarded, so a Not_found raised while processing
         the chosen alternative still propagates to the caller *)
      let chosen =
        try Some (List.find (fun (m,_) -> m = Struct || m = ENIAM || m = CONLL) l)
        with Not_found -> None in
      (match chosen with
       | Some (_, text) -> print_fields_rec fields text
       | None -> print_endline "no fields detected: no Struct, ENIAM or CONLL text")
117 | + | |
(** [print_fields fields text] prints the per-paragraph summaries of [text]
    via [print_fields_rec] and then dumps the statistics accumulated so far
    with [print_field_map]. *)
let print_fields fields text =
  let () = print_fields_rec fields text in
  print_field_map ()
... | ... |
parser/exec.ml
... | ... | @@ -198,10 +198,10 @@ let conll_parse_sentence timeout test_only_flag id paths tokens = |
198 | 198 | let result = {result with lex_time=time3 -. time2} in |
199 | 199 | try |
200 | 200 | (* print_endline "conll_parse_sentence 1"; *) |
201 | - LCGlatexOf.print_references "results/" "references1" references; | |
201 | + (* LCGlatexOf.print_references "results/" "references1" references; *) | |
202 | 202 | let parsed_dep_chart = LCGchart.dep_parse dep_chart references timeout time_fun in (* uwaga: niejawna zmiana imperatywna w references *) |
203 | 203 | (* print_endline "conll_parse_sentence 2"; *) |
204 | - LCGlatexOf.print_references "results/" "references2" references; | |
204 | + (*LCGlatexOf.print_references "results/" "references2" references;*) | |
205 | 205 | let time4 = time_fun () in |
206 | 206 | let result = if test_only_flag then result else {result with parsed_dep_chart=parsed_dep_chart} in |
207 | 207 | let result = {result with parse_time=time4 -. time3} in |
... | ... | @@ -324,21 +324,21 @@ let rec extract_query_text = function |
324 | 324 | | _ -> failwith "extract_query_text" |
325 | 325 | |
326 | 326 | let process_query pre_in pre_out timeout test_only_flag id full_query max_n = |
327 | - print_endline "process_query 0"; | |
327 | + (* print_endline "process_query 0"; *) | |
328 | 328 | let result = {empty_result with input_text=translate_text full_query} in |
329 | 329 | let time1 = time_fun () in |
330 | - print_endline "process_query 1"; | |
330 | + (* print_endline "process_query 1"; *) | |
331 | 331 | Marshal.to_channel pre_out full_query []; |
332 | 332 | flush pre_out; |
333 | - print_endline "process_query 2"; | |
333 | + (* print_endline "process_query 2"; *) | |
334 | 334 | let pre_text,msg,pre_time1 = (Marshal.from_channel pre_in : PreTypes.text * string * float) in |
335 | 335 | let time2 = time_fun () in |
336 | 336 | let result = if test_only_flag then result else {result with pre_text=translate_text pre_text} in |
337 | 337 | let result = {result with pre_time1=pre_time1; pre_time2=time2 -. time1} in |
338 | 338 | if msg <> "" then {result with status=PreprocessingError; msg=msg} else ( |
339 | - print_endline "process_query 3"; | |
339 | + (* print_endline "process_query 3"; *) | |
340 | 340 | let parsed_text = parse_text timeout test_only_flag Struct (translate_text pre_text) in |
341 | - print_endline "process_query 4"; | |
341 | + (* print_endline "process_query 4"; *) | |
342 | 342 | let time3 = time_fun () in |
343 | 343 | let result = if test_only_flag then result else {result with status=Parsed; parsed_text=parsed_text} in |
344 | 344 | let result = {result with parse_time=time3 -. time2} in |
... | ... |
parser/makefile
1 | 1 | OCAMLC=ocamlc |
2 | 2 | OCAMLOPT=ocamlopt |
3 | 3 | OCAMLDEP=ocamldep |
4 | -INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I ../../../Dropbox/lib/latexvis -I ../../installed/latexvis -I ../lib/xt -I ../../../Dropbox/Clarin-pl/podzadania/nkjp/fold_text -I ../podzadania/morfeusz -I ../pre -I ../corpora | |
4 | +INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I ../../../Dropbox/lib/latexvis -I ../../installed/latexvis -I ../lib/xt -I ../../../Dropbox/Clarin-pl/podzadania/nkjp/fold_text -I ../podzadania/morfeusz -I ../pre -I ../corpora -I ../diagnostics | |
5 | 5 | #INCLUDES=-I +xml-light -I +xlib -I ../pre |
6 | 6 | OCAMLFLAGS=$(INCLUDES) -g |
7 | 7 | OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa latexvis.cmxa #nkjp.cmxa |
... | ... | @@ -13,7 +13,7 @@ LCG= LCGtypes.ml LCGstringOf.ml LCGrules.ml LCGrenderer.ml LCGchart.ml LCGlatexO |
13 | 13 | DISAMB= disambSelPref.ml disambLemma.ml |
14 | 14 | SEM= semGraph.ml semTypes.ml semStringOf.ml semLatexOf.ml semMmlOf.ml semMrl.ml |
15 | 15 | #SEM= semGraph.ml semTypes.ml semStringOf.ml semMmlOf.ml semMrl.ml |
16 | -EXEC= execTypes.ml visualization.ml exec.ml | |
16 | +EXEC= execTypes.ml visualization.ml exec.ml ../diagnostics/LCGfields.ml | |
17 | 17 | |
18 | 18 | all: |
19 | 19 | $(OCAMLOPT) -o pipe $(OCAMLOPTFLAGS) $(PRE) $(LCG) $(DISAMB) $(SEM) $(EXEC) pipe.ml |
... | ... |
parser/pipe.ml
... | ... | @@ -187,19 +187,20 @@ let process_id s = |
187 | 187 | let process_conll_corpus filename = |
188 | 188 | let corpus = File.file_in filename (fun file -> CONLL.match_corpus (CONLL.load_corpus file)) in |
189 | 189 | print_endline "process_conll_corpus"; |
190 | - let corpus = [List.hd corpus] in | |
190 | + (* let corpus = [List.hd corpus] in *) | |
191 | 191 | let ic,oc = Unix.open_connection (get_sock_addr Paths.pre_host Paths.pre_port) in |
192 | 192 | Xlist.iter corpus (fun query -> |
193 | 193 | let id = process_id (get_query_id query) in |
194 | 194 | let path = "results/" ^ id ^ "/" in |
195 | 195 | ignore (Sys.command ("mkdir -p " ^ path)); |
196 | 196 | let result = Exec.process_query ic oc 30. false "x" query 10 in |
197 | - Visualization.print_html_text path "input_text" result.input_text; | |
198 | - Visualization.print_html_text path "pre_text" result.pre_text; | |
199 | - Visualization.print_html_text path "parsed_text" result.parsed_text; | |
197 | + (* Visualization.print_html_text path "input_text" result.input_text; | |
198 | + Visualization.print_html_text path "pre_text" result.pre_text; *) | |
199 | + (* Visualization.print_html_text path "parsed_text" result.parsed_text; *) | |
200 | 200 | (* printf "input_text:\n%s\n" (Visualization.string_of_text result.input_text); |
201 | 201 | printf "pre_text:\n%s\n" (Visualization.string_of_text result.pre_text); *) |
202 | 202 | (* Exec.print_result stdout result; *) |
203 | + LCGfields.print_fields ["arole"] result.parsed_text; | |
203 | 204 | (* Visualization.print_paths "results/" "paths" result.paths; *) |
204 | 205 | ()); |
205 | 206 | Marshal.to_channel oc (PreTypes.RawText "") []; |
... | ... |
parser/visualization.ml
... | ... | @@ -78,7 +78,7 @@ let string_of_status = function |
78 | 78 | | ExecTypes.ParseTimeout -> "timeout" |
79 | 79 | | ExecTypes.NotParsed -> "not_parsed" |
80 | 80 | | ExecTypes.ReductionError -> "error_reduction" |
81 | - | ExecTypes.TooManyNodes -> "to_many_nodes" | |
81 | + | ExecTypes.TooManyNodes -> "too_many_nodes" | |
82 | 82 | | ExecTypes.NotReduced -> "not_reduced" |
83 | 83 | | ExecTypes.SemError -> "error_sem" |
84 | 84 | | ExecTypes.NotTranslated -> "not_translated" |
... | ... | @@ -811,15 +811,15 @@ let html_of_conll_sentence path tokens (result : conll_parse_result) = |
811 | 811 | |
812 | 812 | |
813 | 813 | let rec html_of_sentence path tokens = function |
814 | - RawSentence s -> s | |
815 | - | StructSentence(_,paths,last) -> html_of_struct_sentence tokens paths last | |
816 | - | DepSentence(_,paths) -> html_of_dep_sentence tokens paths | |
817 | - | ENIAMSentence result -> html_of_eniam_sentence path tokens result | |
818 | - | CONLLSentence result -> html_of_conll_sentence path tokens result | |
819 | - | QuotedSentences sentences -> | |
814 | + RawSentence s -> print_endline "RawSentence"; s | |
815 | + | StructSentence(_,paths,last) -> print_endline "StructSentence"; html_of_struct_sentence tokens paths last | |
816 | + | DepSentence(_,paths) -> print_endline "DepSentence"; html_of_dep_sentence tokens paths | |
817 | + | ENIAMSentence result -> print_endline "ENIAMSentence"; html_of_eniam_sentence path tokens result | |
818 | + | CONLLSentence result -> print_endline "CONLLSentence"; html_of_conll_sentence path tokens result | |
819 | + | QuotedSentences sentences -> print_endline "QuotedSentences"; | |
820 | 820 | String.concat "<BR>\n" (Xlist.map sentences (fun p -> |
821 | 821 | sprintf "pid=%s pbeg=%d plen=%d pnext=%d<BR>%s" p.pid p.pbeg p.plen p.pnext (html_of_sentence path tokens p.psentence))) |
822 | - | AltSentence l -> | |
822 | + | AltSentence l -> print_endline "AltSentence"; | |
823 | 823 | "<table border=1>" ^ |
824 | 824 | String.concat "\n" (Xlist.map l (fun (mode,sentence) -> |
825 | 825 | sprintf "<tr><td>%s</td><td>%s</td></tr>" (string_of_mode mode) (html_of_sentence path tokens sentence))) ^ |
... | ... | @@ -827,11 +827,11 @@ let rec html_of_sentence path tokens = function |
827 | 827 | (* | _ -> failwith "html_of_sentence: ni" *) |
828 | 828 | |
829 | 829 | let rec html_of_paragraph path tokens = function |
830 | - RawParagraph s -> s | |
831 | - | StructParagraph sentences -> | |
830 | + RawParagraph s -> print_endline "RawParagraph"; s | |
831 | + | StructParagraph sentences -> print_endline "StructParagraph"; | |
832 | 832 | String.concat "<BR>\n" (Xlist.map sentences (fun p -> |
833 | 833 | sprintf "pid=%s pbeg=%d plen=%d pnext=%d<BR>%s" p.pid p.pbeg p.plen p.pnext (html_of_sentence path tokens p.psentence))) |
834 | - | AltParagraph l -> | |
834 | + | AltParagraph l -> print_endline "AltParagraph"; | |
835 | 835 | "<table border=2>" ^ |
836 | 836 | String.concat "\n" (Xlist.map l (fun (mode,paragraph) -> |
837 | 837 | sprintf "<tr><td>%s</td><td>%s</td></tr>" (string_of_mode mode) (html_of_paragraph path tokens paragraph))) ^ |
... | ... |