Commit 78e20b4b74f1f5a06ee32a42436f1fcff34afc05

Authored by Wojciech Jaworski
1 parent 0b6dd720

generowanie wejścia dla Swigry i POLFIE

corpora/CONLL.ml
@@ -33,8 +33,8 @@ let string_of_paths mode tokens paths =
33 33
34 let rec string_of_sentence mode tokens = function 34 let rec string_of_sentence mode tokens = function
35 RawSentence s -> if mode = Raw then s else "" 35 RawSentence s -> if mode = Raw then s else ""
36 - | StructSentence (_,tokens, _) -> failwith ("string_of_sentence: StructSentence") (*String.concat "\n" @@ Xlist.map tokens (fun x -> string_of_token mode x)*)  
37 - | DepSentence (_, paths) -> string_of_paths mode tokens paths 36 + | StructSentence (tokens, _) -> failwith ("string_of_sentence: StructSentence") (*String.concat "\n" @@ Xlist.map tokens (fun x -> string_of_token mode x)*)
  37 + | DepSentence (paths) -> string_of_paths mode tokens paths
38 | QuotedSentences _ -> failwith ("string_of_sentence: QuotedSentences") 38 | QuotedSentences _ -> failwith ("string_of_sentence: QuotedSentences")
39 | AltSentence alts -> alternative_string (string_of_sentence mode tokens) mode alts 39 | AltSentence alts -> alternative_string (string_of_sentence mode tokens) mode alts
40 40
@@ -111,8 +111,8 @@ let info_map =
111 let match_sentence (p_record,tokens) = 111 let match_sentence (p_record,tokens) =
112 let rec info_token s = match s with 112 let rec info_token s = match s with
113 RawSentence text -> failwith ("match_sentence: " ^ text) 113 RawSentence text -> failwith ("match_sentence: " ^ text)
114 - | StructSentence (_, tokens, n) -> failwith ("match_sentence: StructSentence") (*String.concat " " @@ List.map (fun x -> x.orth) tokens*)  
115 - | DepSentence (_, paths) -> String.concat " " @@ List.map (fun (id,_,_) -> (ExtArray.get tokens id).orth) (List.tl (Array.to_list paths)), paths 114 + | StructSentence (tokens, n) -> failwith ("match_sentence: StructSentence") (*String.concat " " @@ List.map (fun x -> x.orth) tokens*)
  115 + | DepSentence (paths) -> String.concat " " @@ List.map (fun (id,_,_) -> (ExtArray.get tokens id).orth) (List.tl (Array.to_list paths)), paths
116 | QuotedSentences _ -> failwith ("match_sentence: QuotedSentences") 116 | QuotedSentences _ -> failwith ("match_sentence: QuotedSentences")
117 | AltSentence alts -> failwith ("match_sentence: AltSentence") 117 | AltSentence alts -> failwith ("match_sentence: AltSentence")
118 (*if List.exists (fun (mode, s) -> mode = CONLL) alts 118 (*if List.exists (fun (mode, s) -> mode = CONLL) alts
@@ -122,8 +122,8 @@ let match_sentence (p_record,tokens) =
122 try 122 try
123 let id, text = StringMap.find info_map info_token in 123 let id, text = StringMap.find info_map info_token in
124 let beg, len = establish_lengths text paths tokens (* -1, -1, p_record.psentence *) in 124 let beg, len = establish_lengths text paths tokens (* -1, -1, p_record.psentence *) in
125 - AltText[Raw,RawText text;CONLL,StructText([StructParagraph[{pid = id; pbeg = beg; plen = len; pnext = beg+len;  
126 - psentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence("", paths)]}]],tokens)] 125 + AltText[Raw,RawText text;CONLL,StructText([StructParagraph[{pid = id; pbeg = beg; plen = len; pnext = beg+len; pfile_prefix="";
  126 + psentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence paths]}]],tokens)]
127 (* {s_id = id; s_text = text; s_tokens = sentence.s_tokens} *) 127 (* {s_id = id; s_text = text; s_tokens = sentence.s_tokens} *)
128 with _ -> AltText[CONLL,StructText([StructParagraph[p_record]],tokens)] 128 with _ -> AltText[CONLL,StructText([StructParagraph[p_record]],tokens)]
129 129
@@ -188,8 +188,8 @@ let info_map =
188 let match_sentence (p_record,tokens) = 188 let match_sentence (p_record,tokens) =
189 let rec info_token s = match s with 189 let rec info_token s = match s with
190 RawSentence text -> failwith ("match_sentence: " ^ text) 190 RawSentence text -> failwith ("match_sentence: " ^ text)
191 - | StructSentence (_, tokens, n) -> failwith ("match_sentence: StructSentence") (*String.concat " " @@ List.map (fun x -> x.orth) tokens*)  
192 - | DepSentence (_, paths) -> String.concat " " @@ List.map (fun (id,_,_) -> (ExtArray.get tokens id).orth) (List.tl (Array.to_list paths)), paths 191 + | StructSentence (tokens, n) -> failwith ("match_sentence: StructSentence") (*String.concat " " @@ List.map (fun x -> x.orth) tokens*)
  192 + | DepSentence (paths) -> String.concat " " @@ List.map (fun (id,_,_) -> (ExtArray.get tokens id).orth) (List.tl (Array.to_list paths)), paths
193 | QuotedSentences _ -> failwith ("match_sentence: QuotedSentences") 193 | QuotedSentences _ -> failwith ("match_sentence: QuotedSentences")
194 | AltSentence alts -> failwith ("match_sentence: AltSentence") 194 | AltSentence alts -> failwith ("match_sentence: AltSentence")
195 (*if List.exists (fun (mode, s) -> mode = CONLL) alts 195 (*if List.exists (fun (mode, s) -> mode = CONLL) alts
@@ -199,8 +199,8 @@ let match_sentence (p_record,tokens) =
199 try 199 try
200 let id, text = StringMap.find info_map info_token in 200 let id, text = StringMap.find info_map info_token in
201 let beg, len = establish_lengths text paths tokens (* -1, -1, p_record.psentence *) in 201 let beg, len = establish_lengths text paths tokens (* -1, -1, p_record.psentence *) in
202 - AltText[Raw,RawText text;CONLL,StructText([StructParagraph[{pid = id; pbeg = beg; plen = len; pnext = beg+len;  
203 - psentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence("", paths)]}]],tokens)] 202 + AltText[Raw,RawText text;CONLL,StructText([StructParagraph[{pid = id; pbeg = beg; plen = len; pnext = beg+len; pfile_prefix="";
  203 + psentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence paths]}]],tokens)]
204 (* {s_id = id; s_text = text; s_tokens = sentence.s_tokens} *) 204 (* {s_id = id; s_text = text; s_tokens = sentence.s_tokens} *)
205 with _ -> AltText[CONLL,StructText([StructParagraph[p_record]],tokens)] 205 with _ -> AltText[CONLL,StructText([StructParagraph[p_record]],tokens)]
206 206
@@ -274,7 +274,7 @@ let load_sentence in_channel =
274 then raise End_of_file 274 then raise End_of_file
275 else rev_paths, id in 275 else rev_paths, id in
276 let rev_paths, id = pom [] "" in 276 let rev_paths, id = pom [] "" in
277 - {pid = id; pbeg = -1; plen = -1; pnext = -1; psentence = DepSentence("",Array.of_list ((0,-1,"") :: List.rev rev_paths))}, tokens 277 + {pid = id; pbeg = -1; plen = -1; pnext = -1; pfile_prefix = ""; psentence = DepSentence(Array.of_list ((0,-1,"") :: List.rev rev_paths))}, tokens
278 (* {s_id = id; s_text = ""; s_paths = (List.rev rev_paths)} *) 278 (* {s_id = id; s_text = ""; s_paths = (List.rev rev_paths)} *)
279 279
280 let load_corpus in_channel = 280 let load_corpus in_channel =
parser/exec.ml
@@ -43,7 +43,7 @@ let empty_result = {
43 (*structs=SemTypes.Atom "",SemTypes.Label "",SemTypes.Label "",[],""*)} 43 (*structs=SemTypes.Atom "",SemTypes.Label "",SemTypes.Label "",[],""*)}
44 44
45 let empty_eniam_parse_result = { 45 let empty_eniam_parse_result = {
46 - id=""; 46 + file_prefix="";
47 status=Idle; 47 status=Idle;
48 msg=""; 48 msg="";
49 lex_time=0.; 49 lex_time=0.;
@@ -58,7 +58,7 @@ let empty_eniam_parse_result = {
58 } 58 }
59 59
60 let empty_conll_parse_result = { 60 let empty_conll_parse_result = {
61 - id=""; 61 + file_prefix="";
62 status=Idle; 62 status=Idle;
63 msg=""; 63 msg="";
64 lex_time=0.; 64 lex_time=0.;
@@ -102,14 +102,16 @@ let translate_mode = function
102 | PreTypes.CONLL -> CONLL 102 | PreTypes.CONLL -> CONLL
103 | PreTypes.ENIAM -> ENIAM 103 | PreTypes.ENIAM -> ENIAM
104 | PreTypes.Mate -> Mate 104 | PreTypes.Mate -> Mate
  105 + | PreTypes.Swigra -> Swigra
  106 + | PreTypes.POLFIE -> POLFIE
105 107
106 let rec translate_sentence = function 108 let rec translate_sentence = function
107 PreTypes.RawSentence s -> RawSentence s 109 PreTypes.RawSentence s -> RawSentence s
108 - | PreTypes.StructSentence(id,paths,last) -> StructSentence(id,paths,last)  
109 - | PreTypes.DepSentence(id,paths) -> DepSentence(id,paths) 110 + | PreTypes.StructSentence(paths,last) -> StructSentence(paths,last)
  111 + | PreTypes.DepSentence(paths) -> DepSentence(paths)
110 | PreTypes.QuotedSentences sentences -> 112 | PreTypes.QuotedSentences sentences ->
111 QuotedSentences(Xlist.map sentences (fun p -> 113 QuotedSentences(Xlist.map sentences (fun p ->
112 - {pid=p.PreTypes.pid; pbeg=p.PreTypes.pbeg; plen=p.PreTypes.plen; pnext=p.PreTypes.pnext; 114 + {pid=p.PreTypes.pid; pbeg=p.PreTypes.pbeg; plen=p.PreTypes.plen; pnext=p.PreTypes.pnext; pfile_prefix=p.PreTypes.pfile_prefix;
113 psentence=translate_sentence p.PreTypes.psentence})) 115 psentence=translate_sentence p.PreTypes.psentence}))
114 | PreTypes.AltSentence l -> AltSentence(Xlist.map l (fun (mode,sentence) -> 116 | PreTypes.AltSentence l -> AltSentence(Xlist.map l (fun (mode,sentence) ->
115 translate_mode mode, translate_sentence sentence)) 117 translate_mode mode, translate_sentence sentence))
@@ -118,7 +120,7 @@ let rec translate_paragraph = function
118 PreTypes.RawParagraph s -> RawParagraph s 120 PreTypes.RawParagraph s -> RawParagraph s
119 | PreTypes.StructParagraph sentences -> 121 | PreTypes.StructParagraph sentences ->
120 StructParagraph(Xlist.map sentences (fun p -> 122 StructParagraph(Xlist.map sentences (fun p ->
121 - {pid=p.PreTypes.pid; pbeg=p.PreTypes.pbeg; plen=p.PreTypes.plen; pnext=p.PreTypes.pnext; 123 + {pid=p.PreTypes.pid; pbeg=p.PreTypes.pbeg; plen=p.PreTypes.plen; pnext=p.PreTypes.pnext; pfile_prefix=p.PreTypes.pfile_prefix;
122 psentence=translate_sentence p.PreTypes.psentence})) 124 psentence=translate_sentence p.PreTypes.psentence}))
123 | PreTypes.AltParagraph l -> AltParagraph(Xlist.map l (fun (mode,paragraph) -> 125 | PreTypes.AltParagraph l -> AltParagraph(Xlist.map l (fun (mode,paragraph) ->
124 translate_mode mode, translate_paragraph paragraph)) 126 translate_mode mode, translate_paragraph paragraph))
@@ -130,8 +132,8 @@ let rec translate_text = function
130 | PreTypes.AltText l -> AltText(Xlist.map l (fun (mode,text) -> 132 | PreTypes.AltText l -> AltText(Xlist.map l (fun (mode,text) ->
131 translate_mode mode, translate_text text)) 133 translate_mode mode, translate_text text))
132 134
133 -let eniam_parse_sentence timeout test_only_flag id paths last tokens =  
134 - let result = {empty_eniam_parse_result with id=id} in 135 +let eniam_parse_sentence timeout test_only_flag paths last tokens =
  136 + let result = empty_eniam_parse_result in
135 let time2 = time_fun () in 137 let time2 = time_fun () in
136 try 138 try
137 let chart = LCGlexicon.create (paths,last) tokens in 139 let chart = LCGlexicon.create (paths,last) tokens in
@@ -187,8 +189,8 @@ let eniam_parse_sentence timeout test_only_flag id paths last tokens =
187 let time3 = time_fun () in 189 let time3 = time_fun () in
188 {result with status=LexiconError; msg=Printexc.to_string e; lex_time=time3 -. time2} 190 {result with status=LexiconError; msg=Printexc.to_string e; lex_time=time3 -. time2}
189 191
190 -let conll_parse_sentence timeout test_only_flag id paths tokens =  
191 - let result = {empty_conll_parse_result with id=id} in 192 +let conll_parse_sentence timeout test_only_flag paths tokens =
  193 + let result = empty_conll_parse_result in
192 let time2 = time_fun () in 194 let time2 = time_fun () in
193 try 195 try
194 let dep_chart = LCGlexicon.dep_create paths tokens in 196 let dep_chart = LCGlexicon.dep_create paths tokens in
@@ -253,22 +255,33 @@ let conll_parse_sentence timeout test_only_flag id paths tokens =
253 255
254 let mate_in, mate_out = Unix.open_process "java -jar ../dependencyParser/basic/mate-tools/dist/anna-3.5.jar -model ../dependencyParser/basic/mate-tools/examples/160622_Polish_MateParser.mdl -test" 256 let mate_in, mate_out = Unix.open_process "java -jar ../dependencyParser/basic/mate-tools/dist/anna-3.5.jar -model ../dependencyParser/basic/mate-tools/examples/160622_Polish_MateParser.mdl -test"
255 257
  258 +let file_prefix_of_mode = function
  259 + Raw -> "R"
  260 + | Struct -> "St"
  261 + | CONLL -> "C"
  262 + | ENIAM -> "E"
  263 + | Mate -> "M"
  264 + | Swigra -> "S"
  265 + | POLFIE -> "P"
  266 +
256 let get_paths = function 267 let get_paths = function
257 - {PreTypes.psentence=PreTypes.DepSentence(_,paths)},_ -> paths 268 + {PreTypes.psentence=PreTypes.DepSentence(paths)},_ -> paths
258 | _ -> failwith "get_paths" 269 | _ -> failwith "get_paths"
259 270
260 -let rec parse_sentence timeout test_only_flag mode tokens = function 271 +let rec parse_sentence timeout test_only_flag mode file_prefix tokens = function
261 RawSentence s -> RawSentence s 272 RawSentence s -> RawSentence s
262 - | StructSentence(id,paths,last) -> 273 + | StructSentence(paths,last) ->
263 (match mode with 274 (match mode with
264 ENIAM -> 275 ENIAM ->
265 - let result = eniam_parse_sentence timeout test_only_flag id paths last tokens in 276 + let result = eniam_parse_sentence timeout test_only_flag paths last tokens in
  277 + let result = {result with file_prefix = file_prefix_of_mode mode ^ file_prefix} in
266 ENIAMSentence result 278 ENIAMSentence result
267 | _ -> failwith "parse_sentence") 279 | _ -> failwith "parse_sentence")
268 - | DepSentence(id,paths) -> 280 + | DepSentence(paths) ->
269 (match mode with 281 (match mode with
270 CONLL -> 282 CONLL ->
271 - let result = conll_parse_sentence timeout test_only_flag id paths tokens in 283 + let result = conll_parse_sentence timeout test_only_flag paths tokens in
  284 + let result = {result with file_prefix = file_prefix_of_mode mode ^ file_prefix} in
272 CONLLSentence result 285 CONLLSentence result
273 (* let xml = DepTree.conll_to_xml paths in 286 (* let xml = DepTree.conll_to_xml paths in
274 let graph = XmlPrinter.graph_of_xml xml in (* FIXME: do poprawy *) 287 let graph = XmlPrinter.graph_of_xml xml in (* FIXME: do poprawy *)
@@ -279,22 +292,23 @@ let rec parse_sentence timeout test_only_flag mode tokens = function
279 print_endline "parse_sentence 1"; 292 print_endline "parse_sentence 1";
280 let conll = CONLL.string_of_paths PreTypes.Mate tokens paths in 293 let conll = CONLL.string_of_paths PreTypes.Mate tokens paths in
281 print_endline "parse_sentence 2"; 294 print_endline "parse_sentence 2";
282 - printf "|%s|\n" conll; 295 + (* printf "|%s|\n" conll; *)
283 Printf.fprintf mate_out "%s\n\n%!" conll; 296 Printf.fprintf mate_out "%s\n\n%!" conll;
284 print_endline "parse_sentence 3"; 297 print_endline "parse_sentence 3";
285 let new_paths = get_paths (CONLL.load_sentence mate_in) in 298 let new_paths = get_paths (CONLL.load_sentence mate_in) in
286 print_endline "parse_sentence 4"; 299 print_endline "parse_sentence 4";
287 - let result = conll_parse_sentence timeout test_only_flag id new_paths tokens in 300 + let result = conll_parse_sentence timeout test_only_flag new_paths tokens in
  301 + let result = {result with file_prefix = file_prefix_of_mode mode ^ file_prefix} in
288 CONLLSentence result 302 CONLLSentence result
289 | _ -> failwith "parse_sentence") 303 | _ -> failwith "parse_sentence")
290 | QuotedSentences sentences -> 304 | QuotedSentences sentences ->
291 let sentences = Xlist.rev_map sentences (fun p -> 305 let sentences = Xlist.rev_map sentences (fun p ->
292 - let sentence = parse_sentence timeout test_only_flag mode tokens p.psentence in 306 + let sentence = parse_sentence timeout test_only_flag mode p.pfile_prefix tokens p.psentence in
293 {p with psentence=sentence}) in 307 {p with psentence=sentence}) in
294 QuotedSentences(List.rev sentences) 308 QuotedSentences(List.rev sentences)
295 | AltSentence l -> 309 | AltSentence l ->
296 let l = Xlist.rev_map l (fun (mode,sentence) -> 310 let l = Xlist.rev_map l (fun (mode,sentence) ->
297 - mode, parse_sentence timeout test_only_flag mode tokens sentence) in 311 + mode, parse_sentence timeout test_only_flag mode file_prefix tokens sentence) in
298 AltSentence(List.rev l) 312 AltSentence(List.rev l)
299 | _ -> failwith "parse_sentence" 313 | _ -> failwith "parse_sentence"
300 314
@@ -302,7 +316,7 @@ let rec parse_paragraph timeout test_only_flag mode tokens = function
302 RawParagraph s -> RawParagraph s 316 RawParagraph s -> RawParagraph s
303 | StructParagraph sentences -> 317 | StructParagraph sentences ->
304 let sentences = Xlist.rev_map sentences (fun p -> 318 let sentences = Xlist.rev_map sentences (fun p ->
305 - let sentence = parse_sentence timeout test_only_flag mode tokens p.psentence in 319 + let sentence = parse_sentence timeout test_only_flag mode p.pfile_prefix tokens p.psentence in
306 {p with psentence=sentence}) in 320 {p with psentence=sentence}) in
307 StructParagraph(List.rev sentences) 321 StructParagraph(List.rev sentences)
308 | AltParagraph l -> 322 | AltParagraph l ->
parser/execTypes.ml
@@ -20,7 +20,7 @@
20 type status = Idle | PreprocessingError | LexiconError | ParseError | ParseTimeout | Parsed | TooManyNodes | NotParsed | NotReduced | ReductionError | SemError | NotTranslated 20 type status = Idle | PreprocessingError | LexiconError | ParseError | ParseTimeout | Parsed | TooManyNodes | NotParsed | NotReduced | ReductionError | SemError | NotTranslated
21 21
22 type eniam_parse_result = { 22 type eniam_parse_result = {
23 - id: string; 23 + file_prefix: string;
24 status: status; 24 status: status;
25 msg: string; 25 msg: string;
26 lex_time: float; 26 lex_time: float;
@@ -35,7 +35,7 @@ type eniam_parse_result = {
35 } 35 }
36 36
37 type conll_parse_result = { 37 type conll_parse_result = {
38 - id: string; 38 + file_prefix: string;
39 status: status; 39 status: status;
40 msg: string; 40 msg: string;
41 lex_time: float; 41 lex_time: float;
@@ -54,13 +54,13 @@ type conll_parse_result = {
54 } 54 }
55 55
56 type mode = 56 type mode =
57 - Raw | Struct | CONLL | ENIAM | Mate 57 + Raw | Struct | CONLL | ENIAM | Mate | Swigra | POLFIE
58 58
59 type sentence = 59 type sentence =
60 RawSentence of string 60 RawSentence of string
61 (* | CONLL of conll list *) 61 (* | CONLL of conll list *)
62 - | StructSentence of string * (int * int * int) list * int (* file_prefix * (id * lnode * rnode) list * last *)  
63 - | DepSentence of string * (int * int * string) array (* file_prefix * (id * super * label) conll_id *) 62 + | StructSentence of (int * int * int) list * int (* (id * lnode * rnode) list * last *)
  63 + | DepSentence of (int * int * string) array (* (id * super * label) conll_id *)
64 | QuotedSentences of paragraph_record list 64 | QuotedSentences of paragraph_record list
65 (* | NKJP1M of nkjp1m list *) 65 (* | NKJP1M of nkjp1m list *)
66 (* | Skladnica of skladnica_tree *) 66 (* | Skladnica of skladnica_tree *)
@@ -68,7 +68,7 @@ type sentence =
68 | ENIAMSentence of eniam_parse_result 68 | ENIAMSentence of eniam_parse_result
69 | CONLLSentence of conll_parse_result 69 | CONLLSentence of conll_parse_result
70 70
71 -and paragraph_record = {pid: string; pbeg: int; plen: int; pnext: int; psentence: sentence} (* beg i len liczone po znakach unicode ( * 100 ???) *) 71 +and paragraph_record = {pid: string; pbeg: int; plen: int; pnext: int; psentence: sentence; pfile_prefix: string} (* beg i len liczone po znakach unicode ( * 100 ???) *)
72 72
73 and paragraph = 73 and paragraph =
74 RawParagraph of string 74 RawParagraph of string
parser/pipe.ml
@@ -118,9 +118,9 @@ let lcg_process query =
118 let _ = Unix.shutdown_connection ic in 118 let _ = Unix.shutdown_connection ic in
119 () 119 ()
120 120
121 -(* let _ = 121 +let _ =
122 if Array.length Sys.argv < 2 then print_endline "missing argument" else 122 if Array.length Sys.argv < 2 then print_endline "missing argument" else
123 - lcg_process Sys.argv.(1) *) 123 + lcg_process Sys.argv.(1)
124 124
125 125
126 (* FIXME: parser dziwnie się zachowuje dla 'ścieżki anomalia.' 'ścieżki anomalia. GG' itp. - nie parsuje '.' a jak sparsuje to nie chce redukować *) 126 (* FIXME: parser dziwnie się zachowuje dla 'ścieżki anomalia.' 'ścieżki anomalia. GG' itp. - nie parsuje '.' a jak sparsuje to nie chce redukować *)
@@ -210,7 +210,7 @@ let process_conll_corpus filename =
210 let _ = 210 let _ =
211 (* process_conll_corpus "../../NLP resources/Skladnica-zaleznosciowa-mod_130121.conll"; *) 211 (* process_conll_corpus "../../NLP resources/Skladnica-zaleznosciowa-mod_130121.conll"; *)
212 (* process_conll_corpus "../../NLP resources/skladnica_zaleznosciowa.conll"; *) 212 (* process_conll_corpus "../../NLP resources/skladnica_zaleznosciowa.conll"; *)
213 - process_conll_corpus "../testy/skladnica-test1.conll"; 213 + (* process_conll_corpus "../testy/skladnica-test1.conll"; *)
214 () 214 ()
215 215
216 (* TO DO: 216 (* TO DO:
@@ -227,7 +227,8 @@ let _ =
227 - assign_not_parsed 227 - assign_not_parsed
228 - sprawdzenie zerowania globalnych referencji przy parsowaniu korpusu 228 - sprawdzenie zerowania globalnych referencji przy parsowaniu korpusu
229 - mateParser 229 - mateParser
230 - 2016.10.19 230 + 2016.10.22
  231 + - przerobić AltSentence tak by prefix nazw plików był jego elementem, albo wstawić liczbę z prefiksu do paragraph_record
231 *) 232 *)
232 233
233 234
parser/visualization.ml
@@ -640,6 +640,8 @@ let string_of_mode = function
640 | CONLL -> "CONLL" 640 | CONLL -> "CONLL"
641 | ENIAM -> "ENIAM" 641 | ENIAM -> "ENIAM"
642 | Mate -> "Mate" 642 | Mate -> "Mate"
  643 + | Swigra -> "Swigra"
  644 + | POLFIE -> "POLFIE"
643 645
644 (*let rec string_of_sentence = function 646 (*let rec string_of_sentence = function
645 RawSentence s -> sprintf "RawSentence(%s)" s 647 RawSentence s -> sprintf "RawSentence(%s)" s
@@ -742,30 +744,30 @@ let html_of_eniam_sentence path tokens (result : eniam_parse_result) =
742 (* | PreprocessingError -> "error_pre: %s\n" result.msg *) 744 (* | PreprocessingError -> "error_pre: %s\n" result.msg *)
743 | LexiconError -> sprintf "error_lex: %s\n" result.msg 745 | LexiconError -> sprintf "error_lex: %s\n" result.msg
744 | ParseError -> 746 | ParseError ->
745 - create_latex_chart path (result.id ^ "_chart") result.chart; 747 + create_latex_chart path (result.file_prefix ^ "_chart") result.chart;
746 sprintf "error_parse: %s\n" result.msg ^ 748 sprintf "error_parse: %s\n" result.msg ^
747 - sprintf "<BR><A HREF=\"%s_chart.pdf\">Chart</A>\n" result.id 749 + sprintf "<BR><A HREF=\"%s_chart.pdf\">Chart</A>\n" result.file_prefix
748 | ParseTimeout -> 750 | ParseTimeout ->
749 - create_latex_chart path (result.id ^ "_chart") result.chart; 751 + create_latex_chart path (result.file_prefix ^ "_chart") result.chart;
750 sprintf "timeout: %s\n" result.msg ^ 752 sprintf "timeout: %s\n" result.msg ^
751 - sprintf "<BR><A HREF=\"%s_chart.pdf\">Chart</A>\n" result.id 753 + sprintf "<BR><A HREF=\"%s_chart.pdf\">Chart</A>\n" result.file_prefix
752 | NotParsed -> 754 | NotParsed ->
753 - create_latex_chart path (result.id ^ "_chart") result.chart; 755 + create_latex_chart path (result.file_prefix ^ "_chart") result.chart;
754 sprintf "not_parsed: paths_size=%d chart_size=%d\n" result.paths_size result.chart_size ^ 756 sprintf "not_parsed: paths_size=%d chart_size=%d\n" result.paths_size result.chart_size ^
755 - sprintf "<BR><A HREF=\"%s_chart.pdf\">Chart</A>\n" result.id 757 + sprintf "<BR><A HREF=\"%s_chart.pdf\">Chart</A>\n" result.file_prefix
756 | ReductionError -> sprintf "error_reduction: %s\n" result.msg 758 | ReductionError -> sprintf "error_reduction: %s\n" result.msg
757 | TooManyNodes -> sprintf "to_many_nodes: paths_size=%d chart_size=%d\n" result.paths_size result.chart_size 759 | TooManyNodes -> sprintf "to_many_nodes: paths_size=%d chart_size=%d\n" result.paths_size result.chart_size
758 | NotReduced -> sprintf "not_reduced: paths_size=%d chart_size=%d\n" result.paths_size result.chart_size 760 | NotReduced -> sprintf "not_reduced: paths_size=%d chart_size=%d\n" result.paths_size result.chart_size
759 | SemError -> sprintf "error_sem: %s dependency_tree_size=%d\n" result.msg result.dependency_tree_size 761 | SemError -> sprintf "error_sem: %s dependency_tree_size=%d\n" result.msg result.dependency_tree_size
760 (* | NotTranslated -> "not_translated: \n" *) 762 (* | NotTranslated -> "not_translated: \n" *)
761 | Parsed -> 763 | Parsed ->
762 - print_simplified_dependency_tree path (result.id ^ "_simplified_dependency_tree") tokens result.dependency_tree;  
763 - print_dependency_tree path (result.id ^ "_dependency_tree") result.dependency_tree;  
764 - LCGlatexOf.print_dependency_tree path (result.id ^ "_dependency_tree_references") result.dependency_tree; 764 + print_simplified_dependency_tree path (result.file_prefix ^ "_simplified_dependency_tree") tokens result.dependency_tree;
  765 + print_dependency_tree path (result.file_prefix ^ "_dependency_tree") result.dependency_tree;
  766 + LCGlatexOf.print_dependency_tree path (result.file_prefix ^ "_dependency_tree_references") result.dependency_tree;
765 sprintf "parsed: paths_size=%d chart_size=%d dependency_tree_size=%d\n" result.paths_size result.chart_size result.dependency_tree_size ^ 767 sprintf "parsed: paths_size=%d chart_size=%d dependency_tree_size=%d\n" result.paths_size result.chart_size result.dependency_tree_size ^
766 - sprintf "<BR><A HREF=\"%s_simplified_dependency_tree.png\">Simplified Dependency Tree</A>\n" result.id ^  
767 - sprintf "<BR><A HREF=\"%s_dependency_tree.png\">Dependency Tree</A>\n" result.id ^  
768 - sprintf "<BR><A HREF=\"%s_dependency_tree_references.pdf\">Dependency Tree References</A>\n" result.id 768 + sprintf "<BR><A HREF=\"%s_simplified_dependency_tree.png\">Simplified Dependency Tree</A>\n" result.file_prefix ^
  769 + sprintf "<BR><A HREF=\"%s_dependency_tree.png\">Dependency Tree</A>\n" result.file_prefix ^
  770 + sprintf "<BR><A HREF=\"%s_dependency_tree_references.pdf\">Dependency Tree References</A>\n" result.file_prefix
769 | _ -> failwith "html_of_eniam_sentence" 771 | _ -> failwith "html_of_eniam_sentence"
770 772
771 let html_of_conll_sentence path tokens (result : conll_parse_result) = 773 let html_of_conll_sentence path tokens (result : conll_parse_result) =
@@ -774,46 +776,46 @@ let html_of_conll_sentence path tokens (result : conll_parse_result) =
774 (* | PreprocessingError -> "error_pre: %s\n" result.msg *) 776 (* | PreprocessingError -> "error_pre: %s\n" result.msg *)
775 | LexiconError -> sprintf "error_lex: %s\n" result.msg 777 | LexiconError -> sprintf "error_lex: %s\n" result.msg
776 | ParseError -> 778 | ParseError ->
777 - create_latex_dep_chart path (result.id ^ "_dep_chart") result.dep_chart;  
778 - create_latex_parsed_dep_chart path (result.id ^ "_parsed_dep_chart") result.parsed_dep_chart; 779 + create_latex_dep_chart path (result.file_prefix ^ "_dep_chart") result.dep_chart;
  780 + create_latex_parsed_dep_chart path (result.file_prefix ^ "_parsed_dep_chart") result.parsed_dep_chart;
779 sprintf "error_parse: %s\n" result.msg ^ 781 sprintf "error_parse: %s\n" result.msg ^
780 - sprintf "<BR><A HREF=\"%s_dep_chart.pdf\">Chart</A>\n" result.id ^  
781 - sprintf "<BR><A HREF=\"%s_parsed_dep_chart.pdf\">Parsed Chart</A>\n" result.id 782 + sprintf "<BR><A HREF=\"%s_dep_chart.pdf\">Chart</A>\n" result.file_prefix ^
  783 + sprintf "<BR><A HREF=\"%s_parsed_dep_chart.pdf\">Parsed Chart</A>\n" result.file_prefix
782 | ParseTimeout -> 784 | ParseTimeout ->
783 - create_latex_dep_chart path (result.id ^ "_dep_chart") result.dep_chart;  
784 - create_latex_parsed_dep_chart path (result.id ^ "_parsed_dep_chart") result.parsed_dep_chart; 785 + create_latex_dep_chart path (result.file_prefix ^ "_dep_chart") result.dep_chart;
  786 + create_latex_parsed_dep_chart path (result.file_prefix ^ "_parsed_dep_chart") result.parsed_dep_chart;
785 sprintf "timeout: %s\n" result.msg ^ 787 sprintf "timeout: %s\n" result.msg ^
786 - sprintf "<BR><A HREF=\"%s_dep_chart.pdf\">Chart</A>\n" result.id ^  
787 - sprintf "<BR><A HREF=\"%s_parsed_dep_chart.pdf\">Parsed Chart</A>\n" result.id 788 + sprintf "<BR><A HREF=\"%s_dep_chart.pdf\">Chart</A>\n" result.file_prefix ^
  789 + sprintf "<BR><A HREF=\"%s_parsed_dep_chart.pdf\">Parsed Chart</A>\n" result.file_prefix
788 | NotParsed -> 790 | NotParsed ->
789 - create_latex_dep_chart path (result.id ^ "_dep_chart") result.dep_chart;  
790 - create_latex_not_parsed_dep_chart path (result.id ^ "_not_parsed_dep_chart") result.not_parsed_dep_chart; 791 + create_latex_dep_chart path (result.file_prefix ^ "_dep_chart") result.dep_chart;
  792 + create_latex_not_parsed_dep_chart path (result.file_prefix ^ "_not_parsed_dep_chart") result.not_parsed_dep_chart;
791 sprintf "not_parsed\n" ^ 793 sprintf "not_parsed\n" ^
792 - sprintf "<BR><A HREF=\"%s_dep_chart.pdf\">Chart</A>\n" result.id ^  
793 - sprintf "<BR><A HREF=\"%s_not_parsed_dep_chart.pdf\">Not Parsed Chart</A>\n" result.id 794 + sprintf "<BR><A HREF=\"%s_dep_chart.pdf\">Chart</A>\n" result.file_prefix ^
  795 + sprintf "<BR><A HREF=\"%s_not_parsed_dep_chart.pdf\">Not Parsed Chart</A>\n" result.file_prefix
794 | ReductionError -> sprintf "error_reduction: %s\n" result.msg 796 | ReductionError -> sprintf "error_reduction: %s\n" result.msg
795 | TooManyNodes -> sprintf "to_many_nodes: paths_size=%d\n" result.paths_size 797 | TooManyNodes -> sprintf "to_many_nodes: paths_size=%d\n" result.paths_size
796 | NotReduced -> 798 | NotReduced ->
797 - LCGlatexOf.print_dependency_tree path (result.id ^ "_dependency_tree_references") result.dependency_tree; 799 + LCGlatexOf.print_dependency_tree path (result.file_prefix ^ "_dependency_tree_references") result.dependency_tree;
798 sprintf "not_reduced: paths_size=%d\n" result.paths_size ^ 800 sprintf "not_reduced: paths_size=%d\n" result.paths_size ^
799 - sprintf "<BR><A HREF=\"%s_dependency_tree_references.pdf\">Dependency Tree References</A>\n" result.id 801 + sprintf "<BR><A HREF=\"%s_dependency_tree_references.pdf\">Dependency Tree References</A>\n" result.file_prefix
800 | SemError -> sprintf "error_sem: %s dependency_tree_size=%d\n" result.msg result.dependency_tree_size 802 | SemError -> sprintf "error_sem: %s dependency_tree_size=%d\n" result.msg result.dependency_tree_size
801 (* | NotTranslated -> "not_translated: \n" *) 803 (* | NotTranslated -> "not_translated: \n" *)
802 | Parsed -> 804 | Parsed ->
803 - print_simplified_dependency_tree path (result.id ^ "_simplified_dependency_tree") tokens result.dependency_tree;  
804 - print_dependency_tree path (result.id ^ "_dependency_tree") result.dependency_tree;  
805 - LCGlatexOf.print_dependency_tree path (result.id ^ "_dependency_tree_references") result.dependency_tree; 805 + print_simplified_dependency_tree path (result.file_prefix ^ "_simplified_dependency_tree") tokens result.dependency_tree;
  806 + print_dependency_tree path (result.file_prefix ^ "_dependency_tree") result.dependency_tree;
  807 + LCGlatexOf.print_dependency_tree path (result.file_prefix ^ "_dependency_tree_references") result.dependency_tree;
806 sprintf "parsed: paths_size=%d dependency_tree_size=%d\n" result.paths_size result.dependency_tree_size ^ 808 sprintf "parsed: paths_size=%d dependency_tree_size=%d\n" result.paths_size result.dependency_tree_size ^
807 - sprintf "<BR><A HREF=\"%s_simplified_dependency_tree.png\">Simplified Dependency Tree</A>\n" result.id ^  
808 - sprintf "<BR><A HREF=\"%s_dependency_tree.png\">Dependency Tree</A>\n" result.id ^  
809 - sprintf "<BR><A HREF=\"%s_dependency_tree_references.pdf\">Dependency Tree References</A>\n" result.id 809 + sprintf "<BR><A HREF=\"%s_simplified_dependency_tree.png\">Simplified Dependency Tree</A>\n" result.file_prefix ^
  810 + sprintf "<BR><A HREF=\"%s_dependency_tree.png\">Dependency Tree</A>\n" result.file_prefix ^
  811 + sprintf "<BR><A HREF=\"%s_dependency_tree_references.pdf\">Dependency Tree References</A>\n" result.file_prefix
810 | _ -> failwith "html_of_conll_sentence" 812 | _ -> failwith "html_of_conll_sentence"
811 813
812 814
813 let rec html_of_sentence path tokens = function 815 let rec html_of_sentence path tokens = function
814 RawSentence s -> s 816 RawSentence s -> s
815 - | StructSentence(_,paths,last) -> html_of_struct_sentence tokens paths last  
816 - | DepSentence(_,paths) -> html_of_dep_sentence tokens paths 817 + | StructSentence(paths,last) -> html_of_struct_sentence tokens paths last
  818 + | DepSentence(paths) -> html_of_dep_sentence tokens paths
817 | ENIAMSentence result -> html_of_eniam_sentence path tokens result 819 | ENIAMSentence result -> html_of_eniam_sentence path tokens result
818 | CONLLSentence result -> html_of_conll_sentence path tokens result 820 | CONLLSentence result -> html_of_conll_sentence path tokens result
819 | QuotedSentences sentences -> 821 | QuotedSentences sentences ->
pre/preProcessing.ml
@@ -614,12 +614,13 @@ let parse_text = function @@ -614,12 +614,13 @@ let parse_text = function
614 AltParagraph[Raw,RawParagraph paragraph; Struct,StructParagraph sentences]) in 614 AltParagraph[Raw,RawParagraph paragraph; Struct,StructParagraph sentences]) in
615 AltText[Raw,RawText query; Struct,StructText(List.rev paragraphs, tokens)] 615 AltText[Raw,RawText query; Struct,StructText(List.rev paragraphs, tokens)]
616 | AltText[Raw,RawText query;CONLL,StructText([ 616 | AltText[Raw,RawText query;CONLL,StructText([
617 - StructParagraph[{psentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence(_,dep_paths)]} as p]],tokens)] -> 617 + StructParagraph[{psentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence dep_paths]} as p]],tokens)] ->
618 parse_conll tokens dep_paths; 618 parse_conll tokens dep_paths;
619 let paths = parse query in 619 let paths = parse query in
620 let sentences = PreSentences.split_into_sentences query tokens paths in 620 let sentences = PreSentences.split_into_sentences query tokens paths in
  621 + let m_dep_paths = Array.map (fun (id,_,_) -> id,-1,"") dep_paths in
621 let conll = StructParagraph[{p with psentence = AltSentence[Raw, RawSentence text; 622 let conll = StructParagraph[{p with psentence = AltSentence[Raw, RawSentence text;
622 - Mate, DepSentence("M",dep_paths); CONLL, DepSentence("C",dep_paths)]}] in 623 + Mate, DepSentence m_dep_paths; CONLL, DepSentence dep_paths]}] in
623 AltText[Raw,RawText query; Struct, StructText([ 624 AltText[Raw,RawText query; Struct, StructText([
624 AltParagraph[Raw,RawParagraph query; ENIAM, StructParagraph sentences; CONLL, conll]],tokens)] 625 AltParagraph[Raw,RawParagraph query; ENIAM, StructParagraph sentences; CONLL, conll]],tokens)]
625 | _ -> failwith "parse_text: not implemented" 626 | _ -> failwith "parse_text: not implemented"
pre/preSentences.ml
@@ -147,17 +147,15 @@ let find_tokens_in_chart tokens chart lnode rnode cat = @@ -147,17 +147,15 @@ let find_tokens_in_chart tokens chart lnode rnode cat =
147 147
148 let rec add_struct_sentence_ids_rec n sentences = 148 let rec add_struct_sentence_ids_rec n sentences =
149 Xlist.fold sentences ([],n) (fun (l,n) -> function 149 Xlist.fold sentences ([],n) (fun (l,n) -> function
150 - {psentence=AltSentence[Raw,s;ENIAM,StructSentence(_,paths,last)]} as p ->  
151 - {p with psentence=AltSentence[Raw,s;ENIAM,StructSentence("E" ^ string_of_int n,paths,last)]} :: l, n+1  
152 - | {psentence=AltSentence[Raw,s;ENIAM,QuotedSentences sentences]} as p -> 150 + {psentence=AltSentence[Raw,s;Struct,QuotedSentences sentences]} as p ->
153 let sentences, n = add_struct_sentence_ids_rec n sentences in 151 let sentences, n = add_struct_sentence_ids_rec n sentences in
154 - {p with psentence=AltSentence[Raw,s;ENIAM,QuotedSentences (List.rev sentences)]} :: l, n+1  
155 - | _ -> failwith "add_struct_sentence_ids") 152 + {p with psentence=AltSentence[Raw,s;Struct,QuotedSentences (List.rev sentences)]} :: l, n
  153 + | p -> {p with pfile_prefix=string_of_int n} :: l, n+1)
156 154
157 let add_struct_sentence_ids sentences = 155 let add_struct_sentence_ids sentences =
158 match sentences with 156 match sentences with
159 - [{psentence=AltSentence[Raw,s;ENIAM,StructSentence(_,paths,last)]} as p] ->  
160 - [{p with psentence=AltSentence[Raw,s;ENIAM,StructSentence("E",paths,last)]}] 157 + [{psentence=AltSentence[Raw,_;Struct,QuotedSentences _]}] -> List.rev (fst (add_struct_sentence_ids_rec 1 sentences))
  158 + | [p] -> [p]
161 | _ -> List.rev (fst (add_struct_sentence_ids_rec 1 sentences)) 159 | _ -> List.rev (fst (add_struct_sentence_ids_rec 1 sentences))
162 160
163 let prepare_indexes paths = 161 let prepare_indexes paths =
@@ -181,13 +179,16 @@ let rec extract_sentences_rec tokens id = @@ -181,13 +179,16 @@ let rec extract_sentences_rec tokens id =
181 match t.token with 179 match t.token with
182 Tokens("sentence",ids) -> 180 Tokens("sentence",ids) ->
183 let paths,last = make_paths tokens ids in 181 let paths,last = make_paths tokens ids in
184 - [{pid=string_of_int id; pbeg=t.beg; plen=t.len; pnext=t.next; 182 + [{pid=string_of_int id; pbeg=t.beg; plen=t.len; pnext=t.next; pfile_prefix="";
185 psentence=AltSentence[Raw,RawSentence t.orth; 183 psentence=AltSentence[Raw,RawSentence t.orth;
186 - ENIAM,StructSentence("",paths,last)]}] 184 + ENIAM,StructSentence(paths,last);
  185 + Mate,RawSentence t.orth;
  186 + Swigra,RawSentence t.orth;
  187 + POLFIE,RawSentence t.orth]}]
187 | Tokens("quoted_sentences",ids) -> 188 | Tokens("quoted_sentences",ids) ->
188 - [{pid=string_of_int id; pbeg=t.beg; plen=t.len; pnext=t.next; 189 + [{pid=string_of_int id; pbeg=t.beg; plen=t.len; pnext=t.next; pfile_prefix="";
189 psentence=AltSentence[Raw,RawSentence t.orth; 190 psentence=AltSentence[Raw,RawSentence t.orth;
190 - ENIAM,QuotedSentences(List.sort par_compare (List.flatten (Xlist.rev_map ids (extract_sentences_rec tokens))))]}] 191 + Struct,QuotedSentences(List.sort par_compare (List.flatten (Xlist.rev_map ids (extract_sentences_rec tokens))))]}]
191 | _ -> [] 192 | _ -> []
192 193
193 let extract_sentences tokens chart last = 194 let extract_sentences tokens chart last =
pre/preTypes.ml
@@ -117,7 +117,7 @@ let empty_token = { @@ -117,7 +117,7 @@ let empty_token = {
117 lroles="",""; semantics=Normal} 117 lroles="",""; semantics=Normal}
118 118
119 type mode = 119 type mode =
120 - Raw | Struct | CONLL | ENIAM | Mate 120 + Raw | Struct | CONLL | ENIAM | Mate | Swigra | POLFIE
121 121
122 (* warstwy nkjp1m do analizy: 122 (* warstwy nkjp1m do analizy:
123 header 123 header
@@ -133,14 +133,14 @@ ann_named @@ -133,14 +133,14 @@ ann_named
133 type sentence = 133 type sentence =
134 RawSentence of string 134 RawSentence of string
135 (* | CONLL of conll list *) 135 (* | CONLL of conll list *)
136 - | StructSentence of string * (int * int * int) list * int (* file_prefix * (id * lnode * rnode) list * last *)  
137 - | DepSentence of string * (int * int * string) array (* file_prefix * (id * super * label) conll_id *) 136 + | StructSentence of (int * int * int) list * int (* (id * lnode * rnode) list * last *)
  137 + | DepSentence of (int * int * string) array (* (id * super * label) conll_id *)
138 | QuotedSentences of paragraph_record list 138 | QuotedSentences of paragraph_record list
139 (* | NKJP1M of nkjp1m list *) 139 (* | NKJP1M of nkjp1m list *)
140 (* | Skladnica of skladnica_tree *) 140 (* | Skladnica of skladnica_tree *)
141 | AltSentence of (mode * sentence) list (* string = etykieta np raw, nkjp, krzaki *) 141 | AltSentence of (mode * sentence) list (* string = etykieta np raw, nkjp, krzaki *)
142 142
143 -and paragraph_record = {pid: string; pbeg: int; plen: int; pnext: int; psentence: sentence} (* beg i len liczone po znakach unicode ( * 100 ???) *) 143 +and paragraph_record = {pid: string; pbeg: int; plen: int; pnext: int; psentence: sentence; pfile_prefix: string} (* beg i len liczone po znakach unicode ( * 100 ???) *)
144 144
145 and paragraph = 145 and paragraph =
146 RawParagraph of string 146 RawParagraph of string