Commit 78e20b4b74f1f5a06ee32a42436f1fcff34afc05

Authored by Wojciech Jaworski
1 parent 0b6dd720

generowanie wejścia dla Swigry i POLFIE

corpora/CONLL.ml
... ... @@ -33,8 +33,8 @@ let string_of_paths mode tokens paths =
33 33  
(* Renders a sentence as a string for the requested mode.
   - RawSentence: returns the raw text only when mode = Raw, otherwise the
     empty string.
   - DepSentence: delegates to string_of_paths with the shared token table.
   - AltSentence: delegates to alternative_string, passing this function
     (partially applied to mode and tokens) as the renderer.
   - StructSentence and QuotedSentences are not supported and raise Failure.
   In this diff the removed lines match the old constructors, which carried
   an extra leading component; the added lines match the new constructors
   without it. *)
34 34 let rec string_of_sentence mode tokens = function
35 35 RawSentence s -> if mode = Raw then s else ""
36   - | StructSentence (_,tokens, _) -> failwith ("string_of_sentence: StructSentence") (*String.concat "\n" @@ Xlist.map tokens (fun x -> string_of_token mode x)*)
37   - | DepSentence (_, paths) -> string_of_paths mode tokens paths
  36 + | StructSentence (tokens, _) -> failwith ("string_of_sentence: StructSentence") (*String.concat "\n" @@ Xlist.map tokens (fun x -> string_of_token mode x)*)
  37 + | DepSentence (paths) -> string_of_paths mode tokens paths
38 38 | QuotedSentences _ -> failwith ("string_of_sentence: QuotedSentences")
39 39 | AltSentence alts -> alternative_string (string_of_sentence mode tokens) mode alts
40 40  
... ... @@ -111,8 +111,8 @@ let info_map =
111 111 let match_sentence (p_record,tokens) =
112 112 let rec info_token s = match s with
113 113 RawSentence text -> failwith ("match_sentence: " ^ text)
114   - | StructSentence (_, tokens, n) -> failwith ("match_sentence: StructSentence") (*String.concat " " @@ List.map (fun x -> x.orth) tokens*)
115   - | DepSentence (_, paths) -> String.concat " " @@ List.map (fun (id,_,_) -> (ExtArray.get tokens id).orth) (List.tl (Array.to_list paths)), paths
  114 + | StructSentence (tokens, n) -> failwith ("match_sentence: StructSentence") (*String.concat " " @@ List.map (fun x -> x.orth) tokens*)
  115 + | DepSentence (paths) -> String.concat " " @@ List.map (fun (id,_,_) -> (ExtArray.get tokens id).orth) (List.tl (Array.to_list paths)), paths
116 116 | QuotedSentences _ -> failwith ("match_sentence: QuotedSentences")
117 117 | AltSentence alts -> failwith ("match_sentence: AltSentence")
118 118 (*if List.exists (fun (mode, s) -> mode = CONLL) alts
... ... @@ -122,8 +122,8 @@ let match_sentence (p_record,tokens) =
122 122 try
123 123 let id, text = StringMap.find info_map info_token in
124 124 let beg, len = establish_lengths text paths tokens (* -1, -1, p_record.psentence *) in
125   - AltText[Raw,RawText text;CONLL,StructText([StructParagraph[{pid = id; pbeg = beg; plen = len; pnext = beg+len;
126   - psentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence("", paths)]}]],tokens)]
  125 + AltText[Raw,RawText text;CONLL,StructText([StructParagraph[{pid = id; pbeg = beg; plen = len; pnext = beg+len; pfile_prefix="";
  126 + psentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence paths]}]],tokens)]
127 127 (* {s_id = id; s_text = text; s_tokens = sentence.s_tokens} *)
128 128 with _ -> AltText[CONLL,StructText([StructParagraph[p_record]],tokens)]
129 129  
... ... @@ -188,8 +188,8 @@ let info_map =
188 188 let match_sentence (p_record,tokens) =
189 189 let rec info_token s = match s with
190 190 RawSentence text -> failwith ("match_sentence: " ^ text)
191   - | StructSentence (_, tokens, n) -> failwith ("match_sentence: StructSentence") (*String.concat " " @@ List.map (fun x -> x.orth) tokens*)
192   - | DepSentence (_, paths) -> String.concat " " @@ List.map (fun (id,_,_) -> (ExtArray.get tokens id).orth) (List.tl (Array.to_list paths)), paths
  191 + | StructSentence (tokens, n) -> failwith ("match_sentence: StructSentence") (*String.concat " " @@ List.map (fun x -> x.orth) tokens*)
  192 + | DepSentence (paths) -> String.concat " " @@ List.map (fun (id,_,_) -> (ExtArray.get tokens id).orth) (List.tl (Array.to_list paths)), paths
193 193 | QuotedSentences _ -> failwith ("match_sentence: QuotedSentences")
194 194 | AltSentence alts -> failwith ("match_sentence: AltSentence")
195 195 (*if List.exists (fun (mode, s) -> mode = CONLL) alts
... ... @@ -199,8 +199,8 @@ let match_sentence (p_record,tokens) =
199 199 try
200 200 let id, text = StringMap.find info_map info_token in
201 201 let beg, len = establish_lengths text paths tokens (* -1, -1, p_record.psentence *) in
202   - AltText[Raw,RawText text;CONLL,StructText([StructParagraph[{pid = id; pbeg = beg; plen = len; pnext = beg+len;
203   - psentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence("", paths)]}]],tokens)]
  202 + AltText[Raw,RawText text;CONLL,StructText([StructParagraph[{pid = id; pbeg = beg; plen = len; pnext = beg+len; pfile_prefix="";
  203 + psentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence paths]}]],tokens)]
204 204 (* {s_id = id; s_text = text; s_tokens = sentence.s_tokens} *)
205 205 with _ -> AltText[CONLL,StructText([StructParagraph[p_record]],tokens)]
206 206  
... ... @@ -274,7 +274,7 @@ let load_sentence in_channel =
274 274 then raise End_of_file
275 275 else rev_paths, id in
276 276 let rev_paths, id = pom [] "" in
277   - {pid = id; pbeg = -1; plen = -1; pnext = -1; psentence = DepSentence("",Array.of_list ((0,-1,"") :: List.rev rev_paths))}, tokens
  277 + {pid = id; pbeg = -1; plen = -1; pnext = -1; pfile_prefix = ""; psentence = DepSentence(Array.of_list ((0,-1,"") :: List.rev rev_paths))}, tokens
278 278 (* {s_id = id; s_text = ""; s_paths = (List.rev rev_paths)} *)
279 279  
280 280 let load_corpus in_channel =
... ...
parser/exec.ml
... ... @@ -43,7 +43,7 @@ let empty_result = {
43 43 (*structs=SemTypes.Atom "",SemTypes.Label "",SemTypes.Label "",[],""*)}
44 44  
45 45 let empty_eniam_parse_result = {
46   - id="";
  46 + file_prefix="";
47 47 status=Idle;
48 48 msg="";
49 49 lex_time=0.;
... ... @@ -58,7 +58,7 @@ let empty_eniam_parse_result = {
58 58 }
59 59  
60 60 let empty_conll_parse_result = {
61   - id="";
  61 + file_prefix="";
62 62 status=Idle;
63 63 msg="";
64 64 lex_time=0.;
... ... @@ -102,14 +102,16 @@ let translate_mode = function
102 102 | PreTypes.CONLL -> CONLL
103 103 | PreTypes.ENIAM -> ENIAM
104 104 | PreTypes.Mate -> Mate
  105 + | PreTypes.Swigra -> Swigra
  106 + | PreTypes.POLFIE -> POLFIE
105 107  
(* Converts a PreTypes sentence into the same-named constructor of the
   sentence type in scope in this file.
   Raw, Struct and Dep sentences are copied component by component.
   QuotedSentences rebuilds every paragraph record field by field and
   translates the nested psentence recursively.
   AltSentence translates each (mode, sentence) pair, using translate_mode
   for the mode.
   The added lines drop the former leading component of StructSentence and
   DepSentence and copy the new pfile_prefix record field. *)
106 108 let rec translate_sentence = function
107 109 PreTypes.RawSentence s -> RawSentence s
108   - | PreTypes.StructSentence(id,paths,last) -> StructSentence(id,paths,last)
109   - | PreTypes.DepSentence(id,paths) -> DepSentence(id,paths)
  110 + | PreTypes.StructSentence(paths,last) -> StructSentence(paths,last)
  111 + | PreTypes.DepSentence(paths) -> DepSentence(paths)
110 112 | PreTypes.QuotedSentences sentences ->
111 113 QuotedSentences(Xlist.map sentences (fun p ->
112   - {pid=p.PreTypes.pid; pbeg=p.PreTypes.pbeg; plen=p.PreTypes.plen; pnext=p.PreTypes.pnext;
  114 + {pid=p.PreTypes.pid; pbeg=p.PreTypes.pbeg; plen=p.PreTypes.plen; pnext=p.PreTypes.pnext; pfile_prefix=p.PreTypes.pfile_prefix;
113 115 psentence=translate_sentence p.PreTypes.psentence}))
114 116 | PreTypes.AltSentence l -> AltSentence(Xlist.map l (fun (mode,sentence) ->
115 117 translate_mode mode, translate_sentence sentence))
... ... @@ -118,7 +120,7 @@ let rec translate_paragraph = function
118 120 PreTypes.RawParagraph s -> RawParagraph s
119 121 | PreTypes.StructParagraph sentences ->
120 122 StructParagraph(Xlist.map sentences (fun p ->
121   - {pid=p.PreTypes.pid; pbeg=p.PreTypes.pbeg; plen=p.PreTypes.plen; pnext=p.PreTypes.pnext;
  123 + {pid=p.PreTypes.pid; pbeg=p.PreTypes.pbeg; plen=p.PreTypes.plen; pnext=p.PreTypes.pnext; pfile_prefix=p.PreTypes.pfile_prefix;
122 124 psentence=translate_sentence p.PreTypes.psentence}))
123 125 | PreTypes.AltParagraph l -> AltParagraph(Xlist.map l (fun (mode,paragraph) ->
124 126 translate_mode mode, translate_paragraph paragraph))
... ... @@ -130,8 +132,8 @@ let rec translate_text = function
130 132 | PreTypes.AltText l -> AltText(Xlist.map l (fun (mode,text) ->
131 133 translate_mode mode, translate_text text))
132 134  
133   -let eniam_parse_sentence timeout test_only_flag id paths last tokens =
134   - let result = {empty_eniam_parse_result with id=id} in
  135 +let eniam_parse_sentence timeout test_only_flag paths last tokens =
  136 + let result = empty_eniam_parse_result in
135 137 let time2 = time_fun () in
136 138 try
137 139 let chart = LCGlexicon.create (paths,last) tokens in
... ... @@ -187,8 +189,8 @@ let eniam_parse_sentence timeout test_only_flag id paths last tokens =
187 189 let time3 = time_fun () in
188 190 {result with status=LexiconError; msg=Printexc.to_string e; lex_time=time3 -. time2}
189 191  
190   -let conll_parse_sentence timeout test_only_flag id paths tokens =
191   - let result = {empty_conll_parse_result with id=id} in
  192 +let conll_parse_sentence timeout test_only_flag paths tokens =
  193 + let result = empty_conll_parse_result in
192 194 let time2 = time_fun () in
193 195 try
194 196 let dep_chart = LCGlexicon.dep_create paths tokens in
... ... @@ -253,22 +255,33 @@ let conll_parse_sentence timeout test_only_flag id paths tokens =
253 255  
254 256 let mate_in, mate_out = Unix.open_process "java -jar ../dependencyParser/basic/mate-tools/dist/anna-3.5.jar -model ../dependencyParser/basic/mate-tools/examples/160622_Polish_MateParser.mdl -test"
255 257  
  (* Maps each processing mode to a short string tag, with one case per
     constructor of the mode type.
     parse_sentence prepends this tag to the file prefix it receives when it
     fills the file_prefix field of a parse result. *)
  258 + let file_prefix_of_mode = function
  259 + Raw -> "R"
  260 + | Struct -> "St"
  261 + | CONLL -> "C"
  262 + | ENIAM -> "E"
  263 + | Mate -> "M"
  264 + | Swigra -> "S"
  265 + | POLFIE -> "P"
  266 +
(* Extracts the dependency paths from a pair whose first component is a
   PreTypes paragraph record holding a DepSentence; the second component is
   ignored. Any other sentence shape raises Failure with the function name.
   parse_sentence applies it to the result of CONLL.load_sentence.
   The added line matches the DepSentence constructor without its former
   leading component. *)
256 267 let get_paths = function
257   - {PreTypes.psentence=PreTypes.DepSentence(_,paths)},_ -> paths
  268 + {PreTypes.psentence=PreTypes.DepSentence(paths)},_ -> paths
258 269 | _ -> failwith "get_paths"
259 270  
260   -let rec parse_sentence timeout test_only_flag mode tokens = function
  271 +let rec parse_sentence timeout test_only_flag mode file_prefix tokens = function
261 272 RawSentence s -> RawSentence s
262   - | StructSentence(id,paths,last) ->
  273 + | StructSentence(paths,last) ->
263 274 (match mode with
264 275 ENIAM ->
265   - let result = eniam_parse_sentence timeout test_only_flag id paths last tokens in
  276 + let result = eniam_parse_sentence timeout test_only_flag paths last tokens in
  277 + let result = {result with file_prefix = file_prefix_of_mode mode ^ file_prefix} in
266 278 ENIAMSentence result
267 279 | _ -> failwith "parse_sentence")
268   - | DepSentence(id,paths) ->
  280 + | DepSentence(paths) ->
269 281 (match mode with
270 282 CONLL ->
271   - let result = conll_parse_sentence timeout test_only_flag id paths tokens in
  283 + let result = conll_parse_sentence timeout test_only_flag paths tokens in
  284 + let result = {result with file_prefix = file_prefix_of_mode mode ^ file_prefix} in
272 285 CONLLSentence result
273 286 (* let xml = DepTree.conll_to_xml paths in
274 287 let graph = XmlPrinter.graph_of_xml xml in (* FIXME: do poprawy *)
... ... @@ -279,22 +292,23 @@ let rec parse_sentence timeout test_only_flag mode tokens = function
279 292 print_endline "parse_sentence 1";
280 293 let conll = CONLL.string_of_paths PreTypes.Mate tokens paths in
281 294 print_endline "parse_sentence 2";
282   - printf "|%s|\n" conll;
  295 + (* printf "|%s|\n" conll; *)
283 296 Printf.fprintf mate_out "%s\n\n%!" conll;
284 297 print_endline "parse_sentence 3";
285 298 let new_paths = get_paths (CONLL.load_sentence mate_in) in
286 299 print_endline "parse_sentence 4";
287   - let result = conll_parse_sentence timeout test_only_flag id new_paths tokens in
  300 + let result = conll_parse_sentence timeout test_only_flag new_paths tokens in
  301 + let result = {result with file_prefix = file_prefix_of_mode mode ^ file_prefix} in
288 302 CONLLSentence result
289 303 | _ -> failwith "parse_sentence")
290 304 | QuotedSentences sentences ->
291 305 let sentences = Xlist.rev_map sentences (fun p ->
292   - let sentence = parse_sentence timeout test_only_flag mode tokens p.psentence in
  306 + let sentence = parse_sentence timeout test_only_flag mode p.pfile_prefix tokens p.psentence in
293 307 {p with psentence=sentence}) in
294 308 QuotedSentences(List.rev sentences)
295 309 | AltSentence l ->
296 310 let l = Xlist.rev_map l (fun (mode,sentence) ->
297   - mode, parse_sentence timeout test_only_flag mode tokens sentence) in
  311 + mode, parse_sentence timeout test_only_flag mode file_prefix tokens sentence) in
298 312 AltSentence(List.rev l)
299 313 | _ -> failwith "parse_sentence"
300 314  
... ... @@ -302,7 +316,7 @@ let rec parse_paragraph timeout test_only_flag mode tokens = function
302 316 RawParagraph s -> RawParagraph s
303 317 | StructParagraph sentences ->
304 318 let sentences = Xlist.rev_map sentences (fun p ->
305   - let sentence = parse_sentence timeout test_only_flag mode tokens p.psentence in
  319 + let sentence = parse_sentence timeout test_only_flag mode p.pfile_prefix tokens p.psentence in
306 320 {p with psentence=sentence}) in
307 321 StructParagraph(List.rev sentences)
308 322 | AltParagraph l ->
... ...
parser/execTypes.ml
... ... @@ -20,7 +20,7 @@
20 20 type status = Idle | PreprocessingError | LexiconError | ParseError | ParseTimeout | Parsed | TooManyNodes | NotParsed | NotReduced | ReductionError | SemError | NotTranslated
21 21  
22 22 type eniam_parse_result = {
23   - id: string;
  23 + file_prefix: string;
24 24 status: status;
25 25 msg: string;
26 26 lex_time: float;
... ... @@ -35,7 +35,7 @@ type eniam_parse_result = {
35 35 }
36 36  
37 37 type conll_parse_result = {
38   - id: string;
  38 + file_prefix: string;
39 39 status: status;
40 40 msg: string;
41 41 lex_time: float;
... ... @@ -54,13 +54,13 @@ type conll_parse_result = {
54 54 }
55 55  
(* Tags used to label the alternatives of a sentence, paragraph or text.
   The added line extends the set with Swigra and POLFIE, matching the same
   extension of the mode type in pre/preTypes.ml. *)
56 56 type mode =
57   - Raw | Struct | CONLL | ENIAM | Mate
  57 + Raw | Struct | CONLL | ENIAM | Mate | Swigra | POLFIE
58 58  
59 59 type sentence =
60 60 RawSentence of string
61 61 (* | CONLL of conll list *)
62   - | StructSentence of string * (int * int * int) list * int (* file_prefix * (id * lnode * rnode) list * last *)
63   - | DepSentence of string * (int * int * string) array (* file_prefix * (id * super * label) conll_id *)
  62 + | StructSentence of (int * int * int) list * int (* (id * lnode * rnode) list * last *)
  63 + | DepSentence of (int * int * string) array (* (id * super * label) conll_id *)
64 64 | QuotedSentences of paragraph_record list
65 65 (* | NKJP1M of nkjp1m list *)
66 66 (* | Skladnica of skladnica_tree *)
... ... @@ -68,7 +68,7 @@ type sentence =
68 68 | ENIAMSentence of eniam_parse_result
69 69 | CONLLSentence of conll_parse_result
70 70  
71   -and paragraph_record = {pid: string; pbeg: int; plen: int; pnext: int; psentence: sentence} (* beg i len liczone po znakach unicode ( * 100 ???) *)
  71 +and paragraph_record = {pid: string; pbeg: int; plen: int; pnext: int; psentence: sentence; pfile_prefix: string} (* beg i len liczone po znakach unicode ( * 100 ???) *)
72 72  
73 73 and paragraph =
74 74 RawParagraph of string
... ...
parser/pipe.ml
... ... @@ -118,9 +118,9 @@ let lcg_process query =
118 118 let _ = Unix.shutdown_connection ic in
119 119 ()
120 120  
(* Top-level action: when no command-line argument is given it prints a
   short message, otherwise it passes the first argument to lcg_process.
   The removed lines had this action commented out; the added lines enable
   it. *)
121   -(* let _ =
  121 +let _ =
122 122 if Array.length Sys.argv < 2 then print_endline "missing argument" else
123   - lcg_process Sys.argv.(1) *)
  123 + lcg_process Sys.argv.(1)
124 124  
125 125  
126 126 (* FIXME: parser dziwnie się zachowuje dla 'ścieżki anomalia.' 'ścieżki anomalia. GG' itp. - nie parsuje '.' a jak sparsuje to nie chce redukować *)
... ... @@ -210,7 +210,7 @@ let process_conll_corpus filename =
(* Top-level action that selects which CONLL corpus file, if any, is run
   through process_conll_corpus. After this change every call is commented
   out, so the block only evaluates to unit. *)
210 210 let _ =
211 211 (* process_conll_corpus "../../NLP resources/Skladnica-zaleznosciowa-mod_130121.conll"; *)
212 212 (* process_conll_corpus "../../NLP resources/skladnica_zaleznosciowa.conll"; *)
213   - process_conll_corpus "../testy/skladnica-test1.conll";
  213 + (* process_conll_corpus "../testy/skladnica-test1.conll"; *)
214 214 ()
215 215  
216 216 (* TO DO:
... ... @@ -227,7 +227,8 @@ let _ =
227 227 - assign_not_parsed
228 228 - sprawdzenie zerowania globalnych referencji przy parsowaniu korpusu
229 229 - mateParser
230   - 2016.10.19
  230 + 2016.10.22
  231 + - przerobić AltSentence tak by prefix nazw plików był jego elementem, albo wstawić liczbę z prefiksu do paragraph_record
231 232 *)
232 233  
233 234  
... ...
parser/visualization.ml
... ... @@ -640,6 +640,8 @@ let string_of_mode = function
640 640 | CONLL -> "CONLL"
641 641 | ENIAM -> "ENIAM"
642 642 | Mate -> "Mate"
  643 + | Swigra -> "Swigra"
  644 + | POLFIE -> "POLFIE"
643 645  
644 646 (*let rec string_of_sentence = function
645 647 RawSentence s -> sprintf "RawSentence(%s)" s
... ... @@ -742,30 +744,30 @@ let html_of_eniam_sentence path tokens (result : eniam_parse_result) =
742 744 (* | PreprocessingError -> "error_pre: %s\n" result.msg *)
743 745 | LexiconError -> sprintf "error_lex: %s\n" result.msg
744 746 | ParseError ->
745   - create_latex_chart path (result.id ^ "_chart") result.chart;
  747 + create_latex_chart path (result.file_prefix ^ "_chart") result.chart;
746 748 sprintf "error_parse: %s\n" result.msg ^
747   - sprintf "<BR><A HREF=\"%s_chart.pdf\">Chart</A>\n" result.id
  749 + sprintf "<BR><A HREF=\"%s_chart.pdf\">Chart</A>\n" result.file_prefix
748 750 | ParseTimeout ->
749   - create_latex_chart path (result.id ^ "_chart") result.chart;
  751 + create_latex_chart path (result.file_prefix ^ "_chart") result.chart;
750 752 sprintf "timeout: %s\n" result.msg ^
751   - sprintf "<BR><A HREF=\"%s_chart.pdf\">Chart</A>\n" result.id
  753 + sprintf "<BR><A HREF=\"%s_chart.pdf\">Chart</A>\n" result.file_prefix
752 754 | NotParsed ->
753   - create_latex_chart path (result.id ^ "_chart") result.chart;
  755 + create_latex_chart path (result.file_prefix ^ "_chart") result.chart;
754 756 sprintf "not_parsed: paths_size=%d chart_size=%d\n" result.paths_size result.chart_size ^
755   - sprintf "<BR><A HREF=\"%s_chart.pdf\">Chart</A>\n" result.id
  757 + sprintf "<BR><A HREF=\"%s_chart.pdf\">Chart</A>\n" result.file_prefix
756 758 | ReductionError -> sprintf "error_reduction: %s\n" result.msg
757 759 | TooManyNodes -> sprintf "to_many_nodes: paths_size=%d chart_size=%d\n" result.paths_size result.chart_size
758 760 | NotReduced -> sprintf "not_reduced: paths_size=%d chart_size=%d\n" result.paths_size result.chart_size
759 761 | SemError -> sprintf "error_sem: %s dependency_tree_size=%d\n" result.msg result.dependency_tree_size
760 762 (* | NotTranslated -> "not_translated: \n" *)
761 763 | Parsed ->
762   - print_simplified_dependency_tree path (result.id ^ "_simplified_dependency_tree") tokens result.dependency_tree;
763   - print_dependency_tree path (result.id ^ "_dependency_tree") result.dependency_tree;
764   - LCGlatexOf.print_dependency_tree path (result.id ^ "_dependency_tree_references") result.dependency_tree;
  764 + print_simplified_dependency_tree path (result.file_prefix ^ "_simplified_dependency_tree") tokens result.dependency_tree;
  765 + print_dependency_tree path (result.file_prefix ^ "_dependency_tree") result.dependency_tree;
  766 + LCGlatexOf.print_dependency_tree path (result.file_prefix ^ "_dependency_tree_references") result.dependency_tree;
765 767 sprintf "parsed: paths_size=%d chart_size=%d dependency_tree_size=%d\n" result.paths_size result.chart_size result.dependency_tree_size ^
766   - sprintf "<BR><A HREF=\"%s_simplified_dependency_tree.png\">Simplified Dependency Tree</A>\n" result.id ^
767   - sprintf "<BR><A HREF=\"%s_dependency_tree.png\">Dependency Tree</A>\n" result.id ^
768   - sprintf "<BR><A HREF=\"%s_dependency_tree_references.pdf\">Dependency Tree References</A>\n" result.id
  768 + sprintf "<BR><A HREF=\"%s_simplified_dependency_tree.png\">Simplified Dependency Tree</A>\n" result.file_prefix ^
  769 + sprintf "<BR><A HREF=\"%s_dependency_tree.png\">Dependency Tree</A>\n" result.file_prefix ^
  770 + sprintf "<BR><A HREF=\"%s_dependency_tree_references.pdf\">Dependency Tree References</A>\n" result.file_prefix
769 771 | _ -> failwith "html_of_eniam_sentence"
770 772  
771 773 let html_of_conll_sentence path tokens (result : conll_parse_result) =
... ... @@ -774,46 +776,46 @@ let html_of_conll_sentence path tokens (result : conll_parse_result) =
774 776 (* | PreprocessingError -> "error_pre: %s\n" result.msg *)
775 777 | LexiconError -> sprintf "error_lex: %s\n" result.msg
776 778 | ParseError ->
777   - create_latex_dep_chart path (result.id ^ "_dep_chart") result.dep_chart;
778   - create_latex_parsed_dep_chart path (result.id ^ "_parsed_dep_chart") result.parsed_dep_chart;
  779 + create_latex_dep_chart path (result.file_prefix ^ "_dep_chart") result.dep_chart;
  780 + create_latex_parsed_dep_chart path (result.file_prefix ^ "_parsed_dep_chart") result.parsed_dep_chart;
779 781 sprintf "error_parse: %s\n" result.msg ^
780   - sprintf "<BR><A HREF=\"%s_dep_chart.pdf\">Chart</A>\n" result.id ^
781   - sprintf "<BR><A HREF=\"%s_parsed_dep_chart.pdf\">Parsed Chart</A>\n" result.id
  782 + sprintf "<BR><A HREF=\"%s_dep_chart.pdf\">Chart</A>\n" result.file_prefix ^
  783 + sprintf "<BR><A HREF=\"%s_parsed_dep_chart.pdf\">Parsed Chart</A>\n" result.file_prefix
782 784 | ParseTimeout ->
783   - create_latex_dep_chart path (result.id ^ "_dep_chart") result.dep_chart;
784   - create_latex_parsed_dep_chart path (result.id ^ "_parsed_dep_chart") result.parsed_dep_chart;
  785 + create_latex_dep_chart path (result.file_prefix ^ "_dep_chart") result.dep_chart;
  786 + create_latex_parsed_dep_chart path (result.file_prefix ^ "_parsed_dep_chart") result.parsed_dep_chart;
785 787 sprintf "timeout: %s\n" result.msg ^
786   - sprintf "<BR><A HREF=\"%s_dep_chart.pdf\">Chart</A>\n" result.id ^
787   - sprintf "<BR><A HREF=\"%s_parsed_dep_chart.pdf\">Parsed Chart</A>\n" result.id
  788 + sprintf "<BR><A HREF=\"%s_dep_chart.pdf\">Chart</A>\n" result.file_prefix ^
  789 + sprintf "<BR><A HREF=\"%s_parsed_dep_chart.pdf\">Parsed Chart</A>\n" result.file_prefix
788 790 | NotParsed ->
789   - create_latex_dep_chart path (result.id ^ "_dep_chart") result.dep_chart;
790   - create_latex_not_parsed_dep_chart path (result.id ^ "_not_parsed_dep_chart") result.not_parsed_dep_chart;
  791 + create_latex_dep_chart path (result.file_prefix ^ "_dep_chart") result.dep_chart;
  792 + create_latex_not_parsed_dep_chart path (result.file_prefix ^ "_not_parsed_dep_chart") result.not_parsed_dep_chart;
791 793 sprintf "not_parsed\n" ^
792   - sprintf "<BR><A HREF=\"%s_dep_chart.pdf\">Chart</A>\n" result.id ^
793   - sprintf "<BR><A HREF=\"%s_not_parsed_dep_chart.pdf\">Not Parsed Chart</A>\n" result.id
  794 + sprintf "<BR><A HREF=\"%s_dep_chart.pdf\">Chart</A>\n" result.file_prefix ^
  795 + sprintf "<BR><A HREF=\"%s_not_parsed_dep_chart.pdf\">Not Parsed Chart</A>\n" result.file_prefix
794 796 | ReductionError -> sprintf "error_reduction: %s\n" result.msg
795 797 | TooManyNodes -> sprintf "to_many_nodes: paths_size=%d\n" result.paths_size
796 798 | NotReduced ->
797   - LCGlatexOf.print_dependency_tree path (result.id ^ "_dependency_tree_references") result.dependency_tree;
  799 + LCGlatexOf.print_dependency_tree path (result.file_prefix ^ "_dependency_tree_references") result.dependency_tree;
798 800 sprintf "not_reduced: paths_size=%d\n" result.paths_size ^
799   - sprintf "<BR><A HREF=\"%s_dependency_tree_references.pdf\">Dependency Tree References</A>\n" result.id
  801 + sprintf "<BR><A HREF=\"%s_dependency_tree_references.pdf\">Dependency Tree References</A>\n" result.file_prefix
800 802 | SemError -> sprintf "error_sem: %s dependency_tree_size=%d\n" result.msg result.dependency_tree_size
801 803 (* | NotTranslated -> "not_translated: \n" *)
802 804 | Parsed ->
803   - print_simplified_dependency_tree path (result.id ^ "_simplified_dependency_tree") tokens result.dependency_tree;
804   - print_dependency_tree path (result.id ^ "_dependency_tree") result.dependency_tree;
805   - LCGlatexOf.print_dependency_tree path (result.id ^ "_dependency_tree_references") result.dependency_tree;
  805 + print_simplified_dependency_tree path (result.file_prefix ^ "_simplified_dependency_tree") tokens result.dependency_tree;
  806 + print_dependency_tree path (result.file_prefix ^ "_dependency_tree") result.dependency_tree;
  807 + LCGlatexOf.print_dependency_tree path (result.file_prefix ^ "_dependency_tree_references") result.dependency_tree;
806 808 sprintf "parsed: paths_size=%d dependency_tree_size=%d\n" result.paths_size result.dependency_tree_size ^
807   - sprintf "<BR><A HREF=\"%s_simplified_dependency_tree.png\">Simplified Dependency Tree</A>\n" result.id ^
808   - sprintf "<BR><A HREF=\"%s_dependency_tree.png\">Dependency Tree</A>\n" result.id ^
809   - sprintf "<BR><A HREF=\"%s_dependency_tree_references.pdf\">Dependency Tree References</A>\n" result.id
  809 + sprintf "<BR><A HREF=\"%s_simplified_dependency_tree.png\">Simplified Dependency Tree</A>\n" result.file_prefix ^
  810 + sprintf "<BR><A HREF=\"%s_dependency_tree.png\">Dependency Tree</A>\n" result.file_prefix ^
  811 + sprintf "<BR><A HREF=\"%s_dependency_tree_references.pdf\">Dependency Tree References</A>\n" result.file_prefix
810 812 | _ -> failwith "html_of_conll_sentence"
811 813  
812 814  
813 815 let rec html_of_sentence path tokens = function
814 816 RawSentence s -> s
815   - | StructSentence(_,paths,last) -> html_of_struct_sentence tokens paths last
816   - | DepSentence(_,paths) -> html_of_dep_sentence tokens paths
  817 + | StructSentence(paths,last) -> html_of_struct_sentence tokens paths last
  818 + | DepSentence(paths) -> html_of_dep_sentence tokens paths
817 819 | ENIAMSentence result -> html_of_eniam_sentence path tokens result
818 820 | CONLLSentence result -> html_of_conll_sentence path tokens result
819 821 | QuotedSentences sentences ->
... ...
pre/preProcessing.ml
... ... @@ -614,12 +614,13 @@ let parse_text = function
614 614 AltParagraph[Raw,RawParagraph paragraph; Struct,StructParagraph sentences]) in
615 615 AltText[Raw,RawText query; Struct,StructText(List.rev paragraphs, tokens)]
616 616 | AltText[Raw,RawText query;CONLL,StructText([
617   - StructParagraph[{psentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence(_,dep_paths)]} as p]],tokens)] ->
  617 + StructParagraph[{psentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence dep_paths]} as p]],tokens)] ->
618 618 parse_conll tokens dep_paths;
619 619 let paths = parse query in
620 620 let sentences = PreSentences.split_into_sentences query tokens paths in
  621 + let m_dep_paths = Array.map (fun (id,_,_) -> id,-1,"") dep_paths in
621 622 let conll = StructParagraph[{p with psentence = AltSentence[Raw, RawSentence text;
622   - Mate, DepSentence("M",dep_paths); CONLL, DepSentence("C",dep_paths)]}] in
  623 + Mate, DepSentence m_dep_paths; CONLL, DepSentence dep_paths]}] in
623 624 AltText[Raw,RawText query; Struct, StructText([
624 625 AltParagraph[Raw,RawParagraph query; ENIAM, StructParagraph sentences; CONLL, conll]],tokens)]
625 626 | _ -> failwith "parse_text: not implemented"
... ...
pre/preSentences.ml
... ... @@ -147,17 +147,15 @@ let find_tokens_in_chart tokens chart lnode rnode cat =
147 147  
(* Walks a list of paragraph records while threading a counter n, and
   returns the rebuilt records together with the next unused counter value.
   Each record is consed onto the accumulator, so the result is presumably
   in reverse input order (assumes Xlist.fold goes left to right - TODO
   confirm); add_struct_sentence_ids reverses it.
   New behaviour (added lines):
   - a record whose sentence is exactly the two alternatives Raw and
     Struct-tagged QuotedSentences has its nested sentences numbered
     recursively with the same counter and consumes no number itself;
   - every other record gets pfile_prefix set to the decimal counter value
     and advances the counter by one.
   Old behaviour (removed lines): the number was stored inside
   StructSentence with an E tag, and unmatched shapes raised Failure. *)
148 148 let rec add_struct_sentence_ids_rec n sentences =
149 149 Xlist.fold sentences ([],n) (fun (l,n) -> function
150   - {psentence=AltSentence[Raw,s;ENIAM,StructSentence(_,paths,last)]} as p ->
151   - {p with psentence=AltSentence[Raw,s;ENIAM,StructSentence("E" ^ string_of_int n,paths,last)]} :: l, n+1
152   - | {psentence=AltSentence[Raw,s;ENIAM,QuotedSentences sentences]} as p ->
  150 + {psentence=AltSentence[Raw,s;Struct,QuotedSentences sentences]} as p ->
153 151 let sentences, n = add_struct_sentence_ids_rec n sentences in
154   - {p with psentence=AltSentence[Raw,s;ENIAM,QuotedSentences (List.rev sentences)]} :: l, n+1
155   - | _ -> failwith "add_struct_sentence_ids")
  152 + {p with psentence=AltSentence[Raw,s;Struct,QuotedSentences (List.rev sentences)]} :: l, n
  153 + | p -> {p with pfile_prefix=string_of_int n} :: l, n+1)
156 154  
(* Entry point for numbering sentences, starting the counter at 1.
   New behaviour (added lines):
   - a single record holding Struct-tagged quoted sentences is numbered via
     add_struct_sentence_ids_rec;
   - any other single record is returned unchanged, so it keeps whatever
     pfile_prefix it already had;
   - every other list is numbered via add_struct_sentence_ids_rec.
   The helper result is reversed with List.rev before being returned. *)
157 155 let add_struct_sentence_ids sentences =
158 156 match sentences with
159   - [{psentence=AltSentence[Raw,s;ENIAM,StructSentence(_,paths,last)]} as p] ->
160   - [{p with psentence=AltSentence[Raw,s;ENIAM,StructSentence("E",paths,last)]}]
  157 + [{psentence=AltSentence[Raw,_;Struct,QuotedSentences _]}] -> List.rev (fst (add_struct_sentence_ids_rec 1 sentences))
  158 + | [p] -> [p]
161 159 | _ -> List.rev (fst (add_struct_sentence_ids_rec 1 sentences))
162 160  
163 161 let prepare_indexes paths =
... ... @@ -181,13 +179,16 @@ let rec extract_sentences_rec tokens id =
181 179 match t.token with
182 180 Tokens("sentence",ids) ->
183 181 let paths,last = make_paths tokens ids in
184   - [{pid=string_of_int id; pbeg=t.beg; plen=t.len; pnext=t.next;
  182 + [{pid=string_of_int id; pbeg=t.beg; plen=t.len; pnext=t.next; pfile_prefix="";
185 183 psentence=AltSentence[Raw,RawSentence t.orth;
186   - ENIAM,StructSentence("",paths,last)]}]
  184 + ENIAM,StructSentence(paths,last);
  185 + Mate,RawSentence t.orth;
  186 + Swigra,RawSentence t.orth;
  187 + POLFIE,RawSentence t.orth]}]
187 188 | Tokens("quoted_sentences",ids) ->
188   - [{pid=string_of_int id; pbeg=t.beg; plen=t.len; pnext=t.next;
  189 + [{pid=string_of_int id; pbeg=t.beg; plen=t.len; pnext=t.next; pfile_prefix="";
189 190 psentence=AltSentence[Raw,RawSentence t.orth;
190   - ENIAM,QuotedSentences(List.sort par_compare (List.flatten (Xlist.rev_map ids (extract_sentences_rec tokens))))]}]
  191 + Struct,QuotedSentences(List.sort par_compare (List.flatten (Xlist.rev_map ids (extract_sentences_rec tokens))))]}]
191 192 | _ -> []
192 193  
193 194 let extract_sentences tokens chart last =
... ...
pre/preTypes.ml
... ... @@ -117,7 +117,7 @@ let empty_token = {
117 117 lroles="",""; semantics=Normal}
118 118  
(* Tags used to label the alternatives of a sentence, paragraph or text in
   the preprocessing layer. The added line extends the set with Swigra and
   POLFIE; translate_mode in parser/exec.ml maps each of them to the
   same-named constructor on the parser side. *)
119 119 type mode =
120   - Raw | Struct | CONLL | ENIAM | Mate
  120 + Raw | Struct | CONLL | ENIAM | Mate | Swigra | POLFIE
121 121  
122 122 (* warstwy nkjp1m do analizy:
123 123 header
... ... @@ -133,14 +133,14 @@ ann_named
133 133 type sentence =
134 134 RawSentence of string
135 135 (* | CONLL of conll list *)
136   - | StructSentence of string * (int * int * int) list * int (* file_prefix * (id * lnode * rnode) list * last *)
137   - | DepSentence of string * (int * int * string) array (* file_prefix * (id * super * label) conll_id *)
  136 + | StructSentence of (int * int * int) list * int (* (id * lnode * rnode) list * last *)
  137 + | DepSentence of (int * int * string) array (* (id * super * label) conll_id *)
138 138 | QuotedSentences of paragraph_record list
139 139 (* | NKJP1M of nkjp1m list *)
140 140 (* | Skladnica of skladnica_tree *)
141 141 | AltSentence of (mode * sentence) list (* string = etykieta np raw, nkjp, krzaki *)
142 142  
143   -and paragraph_record = {pid: string; pbeg: int; plen: int; pnext: int; psentence: sentence} (* beg i len liczone po znakach unicode ( * 100 ???) *)
  143 +and paragraph_record = {pid: string; pbeg: int; plen: int; pnext: int; psentence: sentence; pfile_prefix: string} (* beg i len liczone po znakach unicode ( * 100 ???) *)
144 144  
145 145 and paragraph =
146 146 RawParagraph of string
... ...