Commit 78e20b4b74f1f5a06ee32a42436f1fcff34afc05
1 parent: 0b6dd720
generowanie wejścia dla Swigry i POLFIE
Showing 8 changed files with 112 additions and 93 deletions
corpora/CONLL.ml
... | ... | @@ -33,8 +33,8 @@ let string_of_paths mode tokens paths = |
33 | 33 | |
34 | 34 | let rec string_of_sentence mode tokens = function |
35 | 35 | RawSentence s -> if mode = Raw then s else "" |
36 | - | StructSentence (_,tokens, _) -> failwith ("string_of_sentence: StructSentence") (*String.concat "\n" @@ Xlist.map tokens (fun x -> string_of_token mode x)*) | |
37 | - | DepSentence (_, paths) -> string_of_paths mode tokens paths | |
36 | + | StructSentence (tokens, _) -> failwith ("string_of_sentence: StructSentence") (*String.concat "\n" @@ Xlist.map tokens (fun x -> string_of_token mode x)*) | |
37 | + | DepSentence (paths) -> string_of_paths mode tokens paths | |
38 | 38 | | QuotedSentences _ -> failwith ("string_of_sentence: QuotedSentences") |
39 | 39 | | AltSentence alts -> alternative_string (string_of_sentence mode tokens) mode alts |
40 | 40 | |
... | ... | @@ -111,8 +111,8 @@ let info_map = |
111 | 111 | let match_sentence (p_record,tokens) = |
112 | 112 | let rec info_token s = match s with |
113 | 113 | RawSentence text -> failwith ("match_sentence: " ^ text) |
114 | - | StructSentence (_, tokens, n) -> failwith ("match_sentence: StructSentence") (*String.concat " " @@ List.map (fun x -> x.orth) tokens*) | |
115 | - | DepSentence (_, paths) -> String.concat " " @@ List.map (fun (id,_,_) -> (ExtArray.get tokens id).orth) (List.tl (Array.to_list paths)), paths | |
114 | + | StructSentence (tokens, n) -> failwith ("match_sentence: StructSentence") (*String.concat " " @@ List.map (fun x -> x.orth) tokens*) | |
115 | + | DepSentence (paths) -> String.concat " " @@ List.map (fun (id,_,_) -> (ExtArray.get tokens id).orth) (List.tl (Array.to_list paths)), paths | |
116 | 116 | | QuotedSentences _ -> failwith ("match_sentence: QuotedSentences") |
117 | 117 | | AltSentence alts -> failwith ("match_sentence: AltSentence") |
118 | 118 | (*if List.exists (fun (mode, s) -> mode = CONLL) alts |
... | ... | @@ -122,8 +122,8 @@ let match_sentence (p_record,tokens) = |
122 | 122 | try |
123 | 123 | let id, text = StringMap.find info_map info_token in |
124 | 124 | let beg, len = establish_lengths text paths tokens (* -1, -1, p_record.psentence *) in |
125 | - AltText[Raw,RawText text;CONLL,StructText([StructParagraph[{pid = id; pbeg = beg; plen = len; pnext = beg+len; | |
126 | - psentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence("", paths)]}]],tokens)] | |
125 | + AltText[Raw,RawText text;CONLL,StructText([StructParagraph[{pid = id; pbeg = beg; plen = len; pnext = beg+len; pfile_prefix=""; | |
126 | + psentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence paths]}]],tokens)] | |
127 | 127 | (* {s_id = id; s_text = text; s_tokens = sentence.s_tokens} *) |
128 | 128 | with _ -> AltText[CONLL,StructText([StructParagraph[p_record]],tokens)] |
129 | 129 | |
... | ... | @@ -188,8 +188,8 @@ let info_map = |
188 | 188 | let match_sentence (p_record,tokens) = |
189 | 189 | let rec info_token s = match s with |
190 | 190 | RawSentence text -> failwith ("match_sentence: " ^ text) |
191 | - | StructSentence (_, tokens, n) -> failwith ("match_sentence: StructSentence") (*String.concat " " @@ List.map (fun x -> x.orth) tokens*) | |
192 | - | DepSentence (_, paths) -> String.concat " " @@ List.map (fun (id,_,_) -> (ExtArray.get tokens id).orth) (List.tl (Array.to_list paths)), paths | |
191 | + | StructSentence (tokens, n) -> failwith ("match_sentence: StructSentence") (*String.concat " " @@ List.map (fun x -> x.orth) tokens*) | |
192 | + | DepSentence (paths) -> String.concat " " @@ List.map (fun (id,_,_) -> (ExtArray.get tokens id).orth) (List.tl (Array.to_list paths)), paths | |
193 | 193 | | QuotedSentences _ -> failwith ("match_sentence: QuotedSentences") |
194 | 194 | | AltSentence alts -> failwith ("match_sentence: AltSentence") |
195 | 195 | (*if List.exists (fun (mode, s) -> mode = CONLL) alts |
... | ... | @@ -199,8 +199,8 @@ let match_sentence (p_record,tokens) = |
199 | 199 | try |
200 | 200 | let id, text = StringMap.find info_map info_token in |
201 | 201 | let beg, len = establish_lengths text paths tokens (* -1, -1, p_record.psentence *) in |
202 | - AltText[Raw,RawText text;CONLL,StructText([StructParagraph[{pid = id; pbeg = beg; plen = len; pnext = beg+len; | |
203 | - psentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence("", paths)]}]],tokens)] | |
202 | + AltText[Raw,RawText text;CONLL,StructText([StructParagraph[{pid = id; pbeg = beg; plen = len; pnext = beg+len; pfile_prefix=""; | |
203 | + psentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence paths]}]],tokens)] | |
204 | 204 | (* {s_id = id; s_text = text; s_tokens = sentence.s_tokens} *) |
205 | 205 | with _ -> AltText[CONLL,StructText([StructParagraph[p_record]],tokens)] |
206 | 206 | |
... | ... | @@ -274,7 +274,7 @@ let load_sentence in_channel = |
274 | 274 | then raise End_of_file |
275 | 275 | else rev_paths, id in |
276 | 276 | let rev_paths, id = pom [] "" in |
277 | - {pid = id; pbeg = -1; plen = -1; pnext = -1; psentence = DepSentence("",Array.of_list ((0,-1,"") :: List.rev rev_paths))}, tokens | |
277 | + {pid = id; pbeg = -1; plen = -1; pnext = -1; pfile_prefix = ""; psentence = DepSentence(Array.of_list ((0,-1,"") :: List.rev rev_paths))}, tokens | |
278 | 278 | (* {s_id = id; s_text = ""; s_paths = (List.rev rev_paths)} *) |
279 | 279 | |
280 | 280 | let load_corpus in_channel = |
... | ... |
parser/exec.ml
... | ... | @@ -43,7 +43,7 @@ let empty_result = { |
43 | 43 | (*structs=SemTypes.Atom "",SemTypes.Label "",SemTypes.Label "",[],""*)} |
44 | 44 | |
45 | 45 | let empty_eniam_parse_result = { |
46 | - id=""; | |
46 | + file_prefix=""; | |
47 | 47 | status=Idle; |
48 | 48 | msg=""; |
49 | 49 | lex_time=0.; |
... | ... | @@ -58,7 +58,7 @@ let empty_eniam_parse_result = { |
58 | 58 | } |
59 | 59 | |
60 | 60 | let empty_conll_parse_result = { |
61 | - id=""; | |
61 | + file_prefix=""; | |
62 | 62 | status=Idle; |
63 | 63 | msg=""; |
64 | 64 | lex_time=0.; |
... | ... | @@ -102,14 +102,16 @@ let translate_mode = function |
102 | 102 | | PreTypes.CONLL -> CONLL |
103 | 103 | | PreTypes.ENIAM -> ENIAM |
104 | 104 | | PreTypes.Mate -> Mate |
105 | + | PreTypes.Swigra -> Swigra | |
106 | + | PreTypes.POLFIE -> POLFIE | |
105 | 107 | |
106 | 108 | let rec translate_sentence = function |
107 | 109 | PreTypes.RawSentence s -> RawSentence s |
108 | - | PreTypes.StructSentence(id,paths,last) -> StructSentence(id,paths,last) | |
109 | - | PreTypes.DepSentence(id,paths) -> DepSentence(id,paths) | |
110 | + | PreTypes.StructSentence(paths,last) -> StructSentence(paths,last) | |
111 | + | PreTypes.DepSentence(paths) -> DepSentence(paths) | |
110 | 112 | | PreTypes.QuotedSentences sentences -> |
111 | 113 | QuotedSentences(Xlist.map sentences (fun p -> |
112 | - {pid=p.PreTypes.pid; pbeg=p.PreTypes.pbeg; plen=p.PreTypes.plen; pnext=p.PreTypes.pnext; | |
114 | + {pid=p.PreTypes.pid; pbeg=p.PreTypes.pbeg; plen=p.PreTypes.plen; pnext=p.PreTypes.pnext; pfile_prefix=p.PreTypes.pfile_prefix; | |
113 | 115 | psentence=translate_sentence p.PreTypes.psentence})) |
114 | 116 | | PreTypes.AltSentence l -> AltSentence(Xlist.map l (fun (mode,sentence) -> |
115 | 117 | translate_mode mode, translate_sentence sentence)) |
... | ... | @@ -118,7 +120,7 @@ let rec translate_paragraph = function |
118 | 120 | PreTypes.RawParagraph s -> RawParagraph s |
119 | 121 | | PreTypes.StructParagraph sentences -> |
120 | 122 | StructParagraph(Xlist.map sentences (fun p -> |
121 | - {pid=p.PreTypes.pid; pbeg=p.PreTypes.pbeg; plen=p.PreTypes.plen; pnext=p.PreTypes.pnext; | |
123 | + {pid=p.PreTypes.pid; pbeg=p.PreTypes.pbeg; plen=p.PreTypes.plen; pnext=p.PreTypes.pnext; pfile_prefix=p.PreTypes.pfile_prefix; | |
122 | 124 | psentence=translate_sentence p.PreTypes.psentence})) |
123 | 125 | | PreTypes.AltParagraph l -> AltParagraph(Xlist.map l (fun (mode,paragraph) -> |
124 | 126 | translate_mode mode, translate_paragraph paragraph)) |
... | ... | @@ -130,8 +132,8 @@ let rec translate_text = function |
130 | 132 | | PreTypes.AltText l -> AltText(Xlist.map l (fun (mode,text) -> |
131 | 133 | translate_mode mode, translate_text text)) |
132 | 134 | |
133 | -let eniam_parse_sentence timeout test_only_flag id paths last tokens = | |
134 | - let result = {empty_eniam_parse_result with id=id} in | |
135 | +let eniam_parse_sentence timeout test_only_flag paths last tokens = | |
136 | + let result = empty_eniam_parse_result in | |
135 | 137 | let time2 = time_fun () in |
136 | 138 | try |
137 | 139 | let chart = LCGlexicon.create (paths,last) tokens in |
... | ... | @@ -187,8 +189,8 @@ let eniam_parse_sentence timeout test_only_flag id paths last tokens = |
187 | 189 | let time3 = time_fun () in |
188 | 190 | {result with status=LexiconError; msg=Printexc.to_string e; lex_time=time3 -. time2} |
189 | 191 | |
190 | -let conll_parse_sentence timeout test_only_flag id paths tokens = | |
191 | - let result = {empty_conll_parse_result with id=id} in | |
192 | +let conll_parse_sentence timeout test_only_flag paths tokens = | |
193 | + let result = empty_conll_parse_result in | |
192 | 194 | let time2 = time_fun () in |
193 | 195 | try |
194 | 196 | let dep_chart = LCGlexicon.dep_create paths tokens in |
... | ... | @@ -253,22 +255,33 @@ let conll_parse_sentence timeout test_only_flag id paths tokens = |
253 | 255 | |
254 | 256 | let mate_in, mate_out = Unix.open_process "java -jar ../dependencyParser/basic/mate-tools/dist/anna-3.5.jar -model ../dependencyParser/basic/mate-tools/examples/160622_Polish_MateParser.mdl -test" |
255 | 257 | |
258 | +let file_prefix_of_mode = function | |
259 | + Raw -> "R" | |
260 | + | Struct -> "St" | |
261 | + | CONLL -> "C" | |
262 | + | ENIAM -> "E" | |
263 | + | Mate -> "M" | |
264 | + | Swigra -> "S" | |
265 | + | POLFIE -> "P" | |
266 | + | |
256 | 267 | let get_paths = function |
257 | - {PreTypes.psentence=PreTypes.DepSentence(_,paths)},_ -> paths | |
268 | + {PreTypes.psentence=PreTypes.DepSentence(paths)},_ -> paths | |
258 | 269 | | _ -> failwith "get_paths" |
259 | 270 | |
260 | -let rec parse_sentence timeout test_only_flag mode tokens = function | |
271 | +let rec parse_sentence timeout test_only_flag mode file_prefix tokens = function | |
261 | 272 | RawSentence s -> RawSentence s |
262 | - | StructSentence(id,paths,last) -> | |
273 | + | StructSentence(paths,last) -> | |
263 | 274 | (match mode with |
264 | 275 | ENIAM -> |
265 | - let result = eniam_parse_sentence timeout test_only_flag id paths last tokens in | |
276 | + let result = eniam_parse_sentence timeout test_only_flag paths last tokens in | |
277 | + let result = {result with file_prefix = file_prefix_of_mode mode ^ file_prefix} in | |
266 | 278 | ENIAMSentence result |
267 | 279 | | _ -> failwith "parse_sentence") |
268 | - | DepSentence(id,paths) -> | |
280 | + | DepSentence(paths) -> | |
269 | 281 | (match mode with |
270 | 282 | CONLL -> |
271 | - let result = conll_parse_sentence timeout test_only_flag id paths tokens in | |
283 | + let result = conll_parse_sentence timeout test_only_flag paths tokens in | |
284 | + let result = {result with file_prefix = file_prefix_of_mode mode ^ file_prefix} in | |
272 | 285 | CONLLSentence result |
273 | 286 | (* let xml = DepTree.conll_to_xml paths in |
274 | 287 | let graph = XmlPrinter.graph_of_xml xml in (* FIXME: do poprawy *) |
... | ... | @@ -279,22 +292,23 @@ let rec parse_sentence timeout test_only_flag mode tokens = function |
279 | 292 | print_endline "parse_sentence 1"; |
280 | 293 | let conll = CONLL.string_of_paths PreTypes.Mate tokens paths in |
281 | 294 | print_endline "parse_sentence 2"; |
282 | - printf "|%s|\n" conll; | |
295 | + (* printf "|%s|\n" conll; *) | |
283 | 296 | Printf.fprintf mate_out "%s\n\n%!" conll; |
284 | 297 | print_endline "parse_sentence 3"; |
285 | 298 | let new_paths = get_paths (CONLL.load_sentence mate_in) in |
286 | 299 | print_endline "parse_sentence 4"; |
287 | - let result = conll_parse_sentence timeout test_only_flag id new_paths tokens in | |
300 | + let result = conll_parse_sentence timeout test_only_flag new_paths tokens in | |
301 | + let result = {result with file_prefix = file_prefix_of_mode mode ^ file_prefix} in | |
288 | 302 | CONLLSentence result |
289 | 303 | | _ -> failwith "parse_sentence") |
290 | 304 | | QuotedSentences sentences -> |
291 | 305 | let sentences = Xlist.rev_map sentences (fun p -> |
292 | - let sentence = parse_sentence timeout test_only_flag mode tokens p.psentence in | |
306 | + let sentence = parse_sentence timeout test_only_flag mode p.pfile_prefix tokens p.psentence in | |
293 | 307 | {p with psentence=sentence}) in |
294 | 308 | QuotedSentences(List.rev sentences) |
295 | 309 | | AltSentence l -> |
296 | 310 | let l = Xlist.rev_map l (fun (mode,sentence) -> |
297 | - mode, parse_sentence timeout test_only_flag mode tokens sentence) in | |
311 | + mode, parse_sentence timeout test_only_flag mode file_prefix tokens sentence) in | |
298 | 312 | AltSentence(List.rev l) |
299 | 313 | | _ -> failwith "parse_sentence" |
300 | 314 | |
... | ... | @@ -302,7 +316,7 @@ let rec parse_paragraph timeout test_only_flag mode tokens = function |
302 | 316 | RawParagraph s -> RawParagraph s |
303 | 317 | | StructParagraph sentences -> |
304 | 318 | let sentences = Xlist.rev_map sentences (fun p -> |
305 | - let sentence = parse_sentence timeout test_only_flag mode tokens p.psentence in | |
319 | + let sentence = parse_sentence timeout test_only_flag mode p.pfile_prefix tokens p.psentence in | |
306 | 320 | {p with psentence=sentence}) in |
307 | 321 | StructParagraph(List.rev sentences) |
308 | 322 | | AltParagraph l -> |
... | ... |
parser/execTypes.ml
... | ... | @@ -20,7 +20,7 @@ |
20 | 20 | type status = Idle | PreprocessingError | LexiconError | ParseError | ParseTimeout | Parsed | TooManyNodes | NotParsed | NotReduced | ReductionError | SemError | NotTranslated |
21 | 21 | |
22 | 22 | type eniam_parse_result = { |
23 | - id: string; | |
23 | + file_prefix: string; | |
24 | 24 | status: status; |
25 | 25 | msg: string; |
26 | 26 | lex_time: float; |
... | ... | @@ -35,7 +35,7 @@ type eniam_parse_result = { |
35 | 35 | } |
36 | 36 | |
37 | 37 | type conll_parse_result = { |
38 | - id: string; | |
38 | + file_prefix: string; | |
39 | 39 | status: status; |
40 | 40 | msg: string; |
41 | 41 | lex_time: float; |
... | ... | @@ -54,13 +54,13 @@ type conll_parse_result = { |
54 | 54 | } |
55 | 55 | |
56 | 56 | type mode = |
57 | - Raw | Struct | CONLL | ENIAM | Mate | |
57 | + Raw | Struct | CONLL | ENIAM | Mate | Swigra | POLFIE | |
58 | 58 | |
59 | 59 | type sentence = |
60 | 60 | RawSentence of string |
61 | 61 | (* | CONLL of conll list *) |
62 | - | StructSentence of string * (int * int * int) list * int (* file_prefix * (id * lnode * rnode) list * last *) | |
63 | - | DepSentence of string * (int * int * string) array (* file_prefix * (id * super * label) conll_id *) | |
62 | + | StructSentence of (int * int * int) list * int (* (id * lnode * rnode) list * last *) | |
63 | + | DepSentence of (int * int * string) array (* (id * super * label) conll_id *) | |
64 | 64 | | QuotedSentences of paragraph_record list |
65 | 65 | (* | NKJP1M of nkjp1m list *) |
66 | 66 | (* | Skladnica of skladnica_tree *) |
... | ... | @@ -68,7 +68,7 @@ type sentence = |
68 | 68 | | ENIAMSentence of eniam_parse_result |
69 | 69 | | CONLLSentence of conll_parse_result |
70 | 70 | |
71 | -and paragraph_record = {pid: string; pbeg: int; plen: int; pnext: int; psentence: sentence} (* beg i len liczone po znakach unicode ( * 100 ???) *) | |
71 | +and paragraph_record = {pid: string; pbeg: int; plen: int; pnext: int; psentence: sentence; pfile_prefix: string} (* beg i len liczone po znakach unicode ( * 100 ???) *) | |
72 | 72 | |
73 | 73 | and paragraph = |
74 | 74 | RawParagraph of string |
... | ... |
parser/pipe.ml
... | ... | @@ -118,9 +118,9 @@ let lcg_process query = |
118 | 118 | let _ = Unix.shutdown_connection ic in |
119 | 119 | () |
120 | 120 | |
121 | -(* let _ = | |
121 | +let _ = | |
122 | 122 | if Array.length Sys.argv < 2 then print_endline "missing argument" else |
123 | - lcg_process Sys.argv.(1) *) | |
123 | + lcg_process Sys.argv.(1) | |
124 | 124 | |
125 | 125 | |
126 | 126 | (* FIXME: parser dziwnie się zachowuje dla 'ścieżki anomalia.' 'ścieżki anomalia. GG' itp. - nie parsuje '.' a jak sparsuje to nie chce redukować *) |
... | ... | @@ -210,7 +210,7 @@ let process_conll_corpus filename = |
210 | 210 | let _ = |
211 | 211 | (* process_conll_corpus "../../NLP resources/Skladnica-zaleznosciowa-mod_130121.conll"; *) |
212 | 212 | (* process_conll_corpus "../../NLP resources/skladnica_zaleznosciowa.conll"; *) |
213 | - process_conll_corpus "../testy/skladnica-test1.conll"; | |
213 | + (* process_conll_corpus "../testy/skladnica-test1.conll"; *) | |
214 | 214 | () |
215 | 215 | |
216 | 216 | (* TO DO: |
... | ... | @@ -227,7 +227,8 @@ let _ = |
227 | 227 | - assign_not_parsed |
228 | 228 | - sprawdzenie zerowania globalnych referencji przy parsowaniu korpusu |
229 | 229 | - mateParser |
230 | - 2016.10.19 | |
230 | + 2016.10.22 | |
231 | + - przerobić AltSentence tak by prefix nazw plików był jego elementem, albo wstawić liczbę z prefiksu do paragraph_record | |
231 | 232 | *) |
232 | 233 | |
233 | 234 | |
... | ... |
parser/visualization.ml
... | ... | @@ -640,6 +640,8 @@ let string_of_mode = function |
640 | 640 | | CONLL -> "CONLL" |
641 | 641 | | ENIAM -> "ENIAM" |
642 | 642 | | Mate -> "Mate" |
643 | + | Swigra -> "Swigra" | |
644 | + | POLFIE -> "POLFIE" | |
643 | 645 | |
644 | 646 | (*let rec string_of_sentence = function |
645 | 647 | RawSentence s -> sprintf "RawSentence(%s)" s |
... | ... | @@ -742,30 +744,30 @@ let html_of_eniam_sentence path tokens (result : eniam_parse_result) = |
742 | 744 | (* | PreprocessingError -> "error_pre: %s\n" result.msg *) |
743 | 745 | | LexiconError -> sprintf "error_lex: %s\n" result.msg |
744 | 746 | | ParseError -> |
745 | - create_latex_chart path (result.id ^ "_chart") result.chart; | |
747 | + create_latex_chart path (result.file_prefix ^ "_chart") result.chart; | |
746 | 748 | sprintf "error_parse: %s\n" result.msg ^ |
747 | - sprintf "<BR><A HREF=\"%s_chart.pdf\">Chart</A>\n" result.id | |
749 | + sprintf "<BR><A HREF=\"%s_chart.pdf\">Chart</A>\n" result.file_prefix | |
748 | 750 | | ParseTimeout -> |
749 | - create_latex_chart path (result.id ^ "_chart") result.chart; | |
751 | + create_latex_chart path (result.file_prefix ^ "_chart") result.chart; | |
750 | 752 | sprintf "timeout: %s\n" result.msg ^ |
751 | - sprintf "<BR><A HREF=\"%s_chart.pdf\">Chart</A>\n" result.id | |
753 | + sprintf "<BR><A HREF=\"%s_chart.pdf\">Chart</A>\n" result.file_prefix | |
752 | 754 | | NotParsed -> |
753 | - create_latex_chart path (result.id ^ "_chart") result.chart; | |
755 | + create_latex_chart path (result.file_prefix ^ "_chart") result.chart; | |
754 | 756 | sprintf "not_parsed: paths_size=%d chart_size=%d\n" result.paths_size result.chart_size ^ |
755 | - sprintf "<BR><A HREF=\"%s_chart.pdf\">Chart</A>\n" result.id | |
757 | + sprintf "<BR><A HREF=\"%s_chart.pdf\">Chart</A>\n" result.file_prefix | |
756 | 758 | | ReductionError -> sprintf "error_reduction: %s\n" result.msg |
757 | 759 | | TooManyNodes -> sprintf "to_many_nodes: paths_size=%d chart_size=%d\n" result.paths_size result.chart_size |
758 | 760 | | NotReduced -> sprintf "not_reduced: paths_size=%d chart_size=%d\n" result.paths_size result.chart_size |
759 | 761 | | SemError -> sprintf "error_sem: %s dependency_tree_size=%d\n" result.msg result.dependency_tree_size |
760 | 762 | (* | NotTranslated -> "not_translated: \n" *) |
761 | 763 | | Parsed -> |
762 | - print_simplified_dependency_tree path (result.id ^ "_simplified_dependency_tree") tokens result.dependency_tree; | |
763 | - print_dependency_tree path (result.id ^ "_dependency_tree") result.dependency_tree; | |
764 | - LCGlatexOf.print_dependency_tree path (result.id ^ "_dependency_tree_references") result.dependency_tree; | |
764 | + print_simplified_dependency_tree path (result.file_prefix ^ "_simplified_dependency_tree") tokens result.dependency_tree; | |
765 | + print_dependency_tree path (result.file_prefix ^ "_dependency_tree") result.dependency_tree; | |
766 | + LCGlatexOf.print_dependency_tree path (result.file_prefix ^ "_dependency_tree_references") result.dependency_tree; | |
765 | 767 | sprintf "parsed: paths_size=%d chart_size=%d dependency_tree_size=%d\n" result.paths_size result.chart_size result.dependency_tree_size ^ |
766 | - sprintf "<BR><A HREF=\"%s_simplified_dependency_tree.png\">Simplified Dependency Tree</A>\n" result.id ^ | |
767 | - sprintf "<BR><A HREF=\"%s_dependency_tree.png\">Dependency Tree</A>\n" result.id ^ | |
768 | - sprintf "<BR><A HREF=\"%s_dependency_tree_references.pdf\">Dependency Tree References</A>\n" result.id | |
768 | + sprintf "<BR><A HREF=\"%s_simplified_dependency_tree.png\">Simplified Dependency Tree</A>\n" result.file_prefix ^ | |
769 | + sprintf "<BR><A HREF=\"%s_dependency_tree.png\">Dependency Tree</A>\n" result.file_prefix ^ | |
770 | + sprintf "<BR><A HREF=\"%s_dependency_tree_references.pdf\">Dependency Tree References</A>\n" result.file_prefix | |
769 | 771 | | _ -> failwith "html_of_eniam_sentence" |
770 | 772 | |
771 | 773 | let html_of_conll_sentence path tokens (result : conll_parse_result) = |
... | ... | @@ -774,46 +776,46 @@ let html_of_conll_sentence path tokens (result : conll_parse_result) = |
774 | 776 | (* | PreprocessingError -> "error_pre: %s\n" result.msg *) |
775 | 777 | | LexiconError -> sprintf "error_lex: %s\n" result.msg |
776 | 778 | | ParseError -> |
777 | - create_latex_dep_chart path (result.id ^ "_dep_chart") result.dep_chart; | |
778 | - create_latex_parsed_dep_chart path (result.id ^ "_parsed_dep_chart") result.parsed_dep_chart; | |
779 | + create_latex_dep_chart path (result.file_prefix ^ "_dep_chart") result.dep_chart; | |
780 | + create_latex_parsed_dep_chart path (result.file_prefix ^ "_parsed_dep_chart") result.parsed_dep_chart; | |
779 | 781 | sprintf "error_parse: %s\n" result.msg ^ |
780 | - sprintf "<BR><A HREF=\"%s_dep_chart.pdf\">Chart</A>\n" result.id ^ | |
781 | - sprintf "<BR><A HREF=\"%s_parsed_dep_chart.pdf\">Parsed Chart</A>\n" result.id | |
782 | + sprintf "<BR><A HREF=\"%s_dep_chart.pdf\">Chart</A>\n" result.file_prefix ^ | |
783 | + sprintf "<BR><A HREF=\"%s_parsed_dep_chart.pdf\">Parsed Chart</A>\n" result.file_prefix | |
782 | 784 | | ParseTimeout -> |
783 | - create_latex_dep_chart path (result.id ^ "_dep_chart") result.dep_chart; | |
784 | - create_latex_parsed_dep_chart path (result.id ^ "_parsed_dep_chart") result.parsed_dep_chart; | |
785 | + create_latex_dep_chart path (result.file_prefix ^ "_dep_chart") result.dep_chart; | |
786 | + create_latex_parsed_dep_chart path (result.file_prefix ^ "_parsed_dep_chart") result.parsed_dep_chart; | |
785 | 787 | sprintf "timeout: %s\n" result.msg ^ |
786 | - sprintf "<BR><A HREF=\"%s_dep_chart.pdf\">Chart</A>\n" result.id ^ | |
787 | - sprintf "<BR><A HREF=\"%s_parsed_dep_chart.pdf\">Parsed Chart</A>\n" result.id | |
788 | + sprintf "<BR><A HREF=\"%s_dep_chart.pdf\">Chart</A>\n" result.file_prefix ^ | |
789 | + sprintf "<BR><A HREF=\"%s_parsed_dep_chart.pdf\">Parsed Chart</A>\n" result.file_prefix | |
788 | 790 | | NotParsed -> |
789 | - create_latex_dep_chart path (result.id ^ "_dep_chart") result.dep_chart; | |
790 | - create_latex_not_parsed_dep_chart path (result.id ^ "_not_parsed_dep_chart") result.not_parsed_dep_chart; | |
791 | + create_latex_dep_chart path (result.file_prefix ^ "_dep_chart") result.dep_chart; | |
792 | + create_latex_not_parsed_dep_chart path (result.file_prefix ^ "_not_parsed_dep_chart") result.not_parsed_dep_chart; | |
791 | 793 | sprintf "not_parsed\n" ^ |
792 | - sprintf "<BR><A HREF=\"%s_dep_chart.pdf\">Chart</A>\n" result.id ^ | |
793 | - sprintf "<BR><A HREF=\"%s_not_parsed_dep_chart.pdf\">Not Parsed Chart</A>\n" result.id | |
794 | + sprintf "<BR><A HREF=\"%s_dep_chart.pdf\">Chart</A>\n" result.file_prefix ^ | |
795 | + sprintf "<BR><A HREF=\"%s_not_parsed_dep_chart.pdf\">Not Parsed Chart</A>\n" result.file_prefix | |
794 | 796 | | ReductionError -> sprintf "error_reduction: %s\n" result.msg |
795 | 797 | | TooManyNodes -> sprintf "to_many_nodes: paths_size=%d\n" result.paths_size |
796 | 798 | | NotReduced -> |
797 | - LCGlatexOf.print_dependency_tree path (result.id ^ "_dependency_tree_references") result.dependency_tree; | |
799 | + LCGlatexOf.print_dependency_tree path (result.file_prefix ^ "_dependency_tree_references") result.dependency_tree; | |
798 | 800 | sprintf "not_reduced: paths_size=%d\n" result.paths_size ^ |
799 | - sprintf "<BR><A HREF=\"%s_dependency_tree_references.pdf\">Dependency Tree References</A>\n" result.id | |
801 | + sprintf "<BR><A HREF=\"%s_dependency_tree_references.pdf\">Dependency Tree References</A>\n" result.file_prefix | |
800 | 802 | | SemError -> sprintf "error_sem: %s dependency_tree_size=%d\n" result.msg result.dependency_tree_size |
801 | 803 | (* | NotTranslated -> "not_translated: \n" *) |
802 | 804 | | Parsed -> |
803 | - print_simplified_dependency_tree path (result.id ^ "_simplified_dependency_tree") tokens result.dependency_tree; | |
804 | - print_dependency_tree path (result.id ^ "_dependency_tree") result.dependency_tree; | |
805 | - LCGlatexOf.print_dependency_tree path (result.id ^ "_dependency_tree_references") result.dependency_tree; | |
805 | + print_simplified_dependency_tree path (result.file_prefix ^ "_simplified_dependency_tree") tokens result.dependency_tree; | |
806 | + print_dependency_tree path (result.file_prefix ^ "_dependency_tree") result.dependency_tree; | |
807 | + LCGlatexOf.print_dependency_tree path (result.file_prefix ^ "_dependency_tree_references") result.dependency_tree; | |
806 | 808 | sprintf "parsed: paths_size=%d dependency_tree_size=%d\n" result.paths_size result.dependency_tree_size ^ |
807 | - sprintf "<BR><A HREF=\"%s_simplified_dependency_tree.png\">Simplified Dependency Tree</A>\n" result.id ^ | |
808 | - sprintf "<BR><A HREF=\"%s_dependency_tree.png\">Dependency Tree</A>\n" result.id ^ | |
809 | - sprintf "<BR><A HREF=\"%s_dependency_tree_references.pdf\">Dependency Tree References</A>\n" result.id | |
809 | + sprintf "<BR><A HREF=\"%s_simplified_dependency_tree.png\">Simplified Dependency Tree</A>\n" result.file_prefix ^ | |
810 | + sprintf "<BR><A HREF=\"%s_dependency_tree.png\">Dependency Tree</A>\n" result.file_prefix ^ | |
811 | + sprintf "<BR><A HREF=\"%s_dependency_tree_references.pdf\">Dependency Tree References</A>\n" result.file_prefix | |
810 | 812 | | _ -> failwith "html_of_conll_sentence" |
811 | 813 | |
812 | 814 | |
813 | 815 | let rec html_of_sentence path tokens = function |
814 | 816 | RawSentence s -> s |
815 | - | StructSentence(_,paths,last) -> html_of_struct_sentence tokens paths last | |
816 | - | DepSentence(_,paths) -> html_of_dep_sentence tokens paths | |
817 | + | StructSentence(paths,last) -> html_of_struct_sentence tokens paths last | |
818 | + | DepSentence(paths) -> html_of_dep_sentence tokens paths | |
817 | 819 | | ENIAMSentence result -> html_of_eniam_sentence path tokens result |
818 | 820 | | CONLLSentence result -> html_of_conll_sentence path tokens result |
819 | 821 | | QuotedSentences sentences -> |
... | ... |
pre/preProcessing.ml
... | ... | @@ -614,12 +614,13 @@ let parse_text = function |
614 | 614 | AltParagraph[Raw,RawParagraph paragraph; Struct,StructParagraph sentences]) in |
615 | 615 | AltText[Raw,RawText query; Struct,StructText(List.rev paragraphs, tokens)] |
616 | 616 | | AltText[Raw,RawText query;CONLL,StructText([ |
617 | - StructParagraph[{psentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence(_,dep_paths)]} as p]],tokens)] -> | |
617 | + StructParagraph[{psentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence dep_paths]} as p]],tokens)] -> | |
618 | 618 | parse_conll tokens dep_paths; |
619 | 619 | let paths = parse query in |
620 | 620 | let sentences = PreSentences.split_into_sentences query tokens paths in |
621 | + let m_dep_paths = Array.map (fun (id,_,_) -> id,-1,"") dep_paths in | |
621 | 622 | let conll = StructParagraph[{p with psentence = AltSentence[Raw, RawSentence text; |
622 | - Mate, DepSentence("M",dep_paths); CONLL, DepSentence("C",dep_paths)]}] in | |
623 | + Mate, DepSentence m_dep_paths; CONLL, DepSentence dep_paths]}] in | |
623 | 624 | AltText[Raw,RawText query; Struct, StructText([ |
624 | 625 | AltParagraph[Raw,RawParagraph query; ENIAM, StructParagraph sentences; CONLL, conll]],tokens)] |
625 | 626 | | _ -> failwith "parse_text: not implemented" |
... | ... |
pre/preSentences.ml
... | ... | @@ -147,17 +147,15 @@ let find_tokens_in_chart tokens chart lnode rnode cat = |
147 | 147 | |
148 | 148 | let rec add_struct_sentence_ids_rec n sentences = |
149 | 149 | Xlist.fold sentences ([],n) (fun (l,n) -> function |
150 | - {psentence=AltSentence[Raw,s;ENIAM,StructSentence(_,paths,last)]} as p -> | |
151 | - {p with psentence=AltSentence[Raw,s;ENIAM,StructSentence("E" ^ string_of_int n,paths,last)]} :: l, n+1 | |
152 | - | {psentence=AltSentence[Raw,s;ENIAM,QuotedSentences sentences]} as p -> | |
150 | + {psentence=AltSentence[Raw,s;Struct,QuotedSentences sentences]} as p -> | |
153 | 151 | let sentences, n = add_struct_sentence_ids_rec n sentences in |
154 | - {p with psentence=AltSentence[Raw,s;ENIAM,QuotedSentences (List.rev sentences)]} :: l, n+1 | |
155 | - | _ -> failwith "add_struct_sentence_ids") | |
152 | + {p with psentence=AltSentence[Raw,s;Struct,QuotedSentences (List.rev sentences)]} :: l, n | |
153 | + | p -> {p with pfile_prefix=string_of_int n} :: l, n+1) | |
156 | 154 | |
157 | 155 | let add_struct_sentence_ids sentences = |
158 | 156 | match sentences with |
159 | - [{psentence=AltSentence[Raw,s;ENIAM,StructSentence(_,paths,last)]} as p] -> | |
160 | - [{p with psentence=AltSentence[Raw,s;ENIAM,StructSentence("E",paths,last)]}] | |
157 | + [{psentence=AltSentence[Raw,_;Struct,QuotedSentences _]}] -> List.rev (fst (add_struct_sentence_ids_rec 1 sentences)) | |
158 | + | [p] -> [p] | |
161 | 159 | | _ -> List.rev (fst (add_struct_sentence_ids_rec 1 sentences)) |
162 | 160 | |
163 | 161 | let prepare_indexes paths = |
... | ... | @@ -181,13 +179,16 @@ let rec extract_sentences_rec tokens id = |
181 | 179 | match t.token with |
182 | 180 | Tokens("sentence",ids) -> |
183 | 181 | let paths,last = make_paths tokens ids in |
184 | - [{pid=string_of_int id; pbeg=t.beg; plen=t.len; pnext=t.next; | |
182 | + [{pid=string_of_int id; pbeg=t.beg; plen=t.len; pnext=t.next; pfile_prefix=""; | |
185 | 183 | psentence=AltSentence[Raw,RawSentence t.orth; |
186 | - ENIAM,StructSentence("",paths,last)]}] | |
184 | + ENIAM,StructSentence(paths,last); | |
185 | + Mate,RawSentence t.orth; | |
186 | + Swigra,RawSentence t.orth; | |
187 | + POLFIE,RawSentence t.orth]}] | |
187 | 188 | | Tokens("quoted_sentences",ids) -> |
188 | - [{pid=string_of_int id; pbeg=t.beg; plen=t.len; pnext=t.next; | |
189 | + [{pid=string_of_int id; pbeg=t.beg; plen=t.len; pnext=t.next; pfile_prefix=""; | |
189 | 190 | psentence=AltSentence[Raw,RawSentence t.orth; |
190 | - ENIAM,QuotedSentences(List.sort par_compare (List.flatten (Xlist.rev_map ids (extract_sentences_rec tokens))))]}] | |
191 | + Struct,QuotedSentences(List.sort par_compare (List.flatten (Xlist.rev_map ids (extract_sentences_rec tokens))))]}] | |
191 | 192 | | _ -> [] |
192 | 193 | |
193 | 194 | let extract_sentences tokens chart last = |
... | ... |
pre/preTypes.ml
... | ... | @@ -117,7 +117,7 @@ let empty_token = { |
117 | 117 | lroles="",""; semantics=Normal} |
118 | 118 | |
119 | 119 | type mode = |
120 | - Raw | Struct | CONLL | ENIAM | Mate | |
120 | + Raw | Struct | CONLL | ENIAM | Mate | Swigra | POLFIE | |
121 | 121 | |
122 | 122 | (* warstwy nkjp1m do analizy: |
123 | 123 | header |
... | ... | @@ -133,14 +133,14 @@ ann_named |
133 | 133 | type sentence = |
134 | 134 | RawSentence of string |
135 | 135 | (* | CONLL of conll list *) |
136 | - | StructSentence of string * (int * int * int) list * int (* file_prefix * (id * lnode * rnode) list * last *) | |
137 | - | DepSentence of string * (int * int * string) array (* file_prefix * (id * super * label) conll_id *) | |
136 | + | StructSentence of (int * int * int) list * int (* (id * lnode * rnode) list * last *) | |
137 | + | DepSentence of (int * int * string) array (* (id * super * label) conll_id *) | |
138 | 138 | | QuotedSentences of paragraph_record list |
139 | 139 | (* | NKJP1M of nkjp1m list *) |
140 | 140 | (* | Skladnica of skladnica_tree *) |
141 | 141 | | AltSentence of (mode * sentence) list (* string = etykieta np raw, nkjp, krzaki *) |
142 | 142 | |
143 | -and paragraph_record = {pid: string; pbeg: int; plen: int; pnext: int; psentence: sentence} (* beg i len liczone po znakach unicode ( * 100 ???) *) | |
143 | +and paragraph_record = {pid: string; pbeg: int; plen: int; pnext: int; psentence: sentence; pfile_prefix: string} (* beg i len liczone po znakach unicode ( * 100 ???) *) | |
144 | 144 | |
145 | 145 | and paragraph = |
146 | 146 | RawParagraph of string |
... | ... |