Commit 78e20b4b74f1f5a06ee32a42436f1fcff34afc05
1 parent: 0b6dd720
generowanie wejścia dla Swigry i POLFIE
Showing 8 changed files with 112 additions and 93 deletions
corpora/CONLL.ml
@@ -33,8 +33,8 @@ let string_of_paths mode tokens paths = | @@ -33,8 +33,8 @@ let string_of_paths mode tokens paths = | ||
33 | 33 | ||
34 | let rec string_of_sentence mode tokens = function | 34 | let rec string_of_sentence mode tokens = function |
35 | RawSentence s -> if mode = Raw then s else "" | 35 | RawSentence s -> if mode = Raw then s else "" |
36 | - | StructSentence (_,tokens, _) -> failwith ("string_of_sentence: StructSentence") (*String.concat "\n" @@ Xlist.map tokens (fun x -> string_of_token mode x)*) | ||
37 | - | DepSentence (_, paths) -> string_of_paths mode tokens paths | 36 | + | StructSentence (tokens, _) -> failwith ("string_of_sentence: StructSentence") (*String.concat "\n" @@ Xlist.map tokens (fun x -> string_of_token mode x)*) |
37 | + | DepSentence (paths) -> string_of_paths mode tokens paths | ||
38 | | QuotedSentences _ -> failwith ("string_of_sentence: QuotedSentences") | 38 | | QuotedSentences _ -> failwith ("string_of_sentence: QuotedSentences") |
39 | | AltSentence alts -> alternative_string (string_of_sentence mode tokens) mode alts | 39 | | AltSentence alts -> alternative_string (string_of_sentence mode tokens) mode alts |
40 | 40 | ||
@@ -111,8 +111,8 @@ let info_map = | @@ -111,8 +111,8 @@ let info_map = | ||
111 | let match_sentence (p_record,tokens) = | 111 | let match_sentence (p_record,tokens) = |
112 | let rec info_token s = match s with | 112 | let rec info_token s = match s with |
113 | RawSentence text -> failwith ("match_sentence: " ^ text) | 113 | RawSentence text -> failwith ("match_sentence: " ^ text) |
114 | - | StructSentence (_, tokens, n) -> failwith ("match_sentence: StructSentence") (*String.concat " " @@ List.map (fun x -> x.orth) tokens*) | ||
115 | - | DepSentence (_, paths) -> String.concat " " @@ List.map (fun (id,_,_) -> (ExtArray.get tokens id).orth) (List.tl (Array.to_list paths)), paths | 114 | + | StructSentence (tokens, n) -> failwith ("match_sentence: StructSentence") (*String.concat " " @@ List.map (fun x -> x.orth) tokens*) |
115 | + | DepSentence (paths) -> String.concat " " @@ List.map (fun (id,_,_) -> (ExtArray.get tokens id).orth) (List.tl (Array.to_list paths)), paths | ||
116 | | QuotedSentences _ -> failwith ("match_sentence: QuotedSentences") | 116 | | QuotedSentences _ -> failwith ("match_sentence: QuotedSentences") |
117 | | AltSentence alts -> failwith ("match_sentence: AltSentence") | 117 | | AltSentence alts -> failwith ("match_sentence: AltSentence") |
118 | (*if List.exists (fun (mode, s) -> mode = CONLL) alts | 118 | (*if List.exists (fun (mode, s) -> mode = CONLL) alts |
@@ -122,8 +122,8 @@ let match_sentence (p_record,tokens) = | @@ -122,8 +122,8 @@ let match_sentence (p_record,tokens) = | ||
122 | try | 122 | try |
123 | let id, text = StringMap.find info_map info_token in | 123 | let id, text = StringMap.find info_map info_token in |
124 | let beg, len = establish_lengths text paths tokens (* -1, -1, p_record.psentence *) in | 124 | let beg, len = establish_lengths text paths tokens (* -1, -1, p_record.psentence *) in |
125 | - AltText[Raw,RawText text;CONLL,StructText([StructParagraph[{pid = id; pbeg = beg; plen = len; pnext = beg+len; | ||
126 | - psentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence("", paths)]}]],tokens)] | 125 | + AltText[Raw,RawText text;CONLL,StructText([StructParagraph[{pid = id; pbeg = beg; plen = len; pnext = beg+len; pfile_prefix=""; |
126 | + psentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence paths]}]],tokens)] | ||
127 | (* {s_id = id; s_text = text; s_tokens = sentence.s_tokens} *) | 127 | (* {s_id = id; s_text = text; s_tokens = sentence.s_tokens} *) |
128 | with _ -> AltText[CONLL,StructText([StructParagraph[p_record]],tokens)] | 128 | with _ -> AltText[CONLL,StructText([StructParagraph[p_record]],tokens)] |
129 | 129 | ||
@@ -188,8 +188,8 @@ let info_map = | @@ -188,8 +188,8 @@ let info_map = | ||
188 | let match_sentence (p_record,tokens) = | 188 | let match_sentence (p_record,tokens) = |
189 | let rec info_token s = match s with | 189 | let rec info_token s = match s with |
190 | RawSentence text -> failwith ("match_sentence: " ^ text) | 190 | RawSentence text -> failwith ("match_sentence: " ^ text) |
191 | - | StructSentence (_, tokens, n) -> failwith ("match_sentence: StructSentence") (*String.concat " " @@ List.map (fun x -> x.orth) tokens*) | ||
192 | - | DepSentence (_, paths) -> String.concat " " @@ List.map (fun (id,_,_) -> (ExtArray.get tokens id).orth) (List.tl (Array.to_list paths)), paths | 191 | + | StructSentence (tokens, n) -> failwith ("match_sentence: StructSentence") (*String.concat " " @@ List.map (fun x -> x.orth) tokens*) |
192 | + | DepSentence (paths) -> String.concat " " @@ List.map (fun (id,_,_) -> (ExtArray.get tokens id).orth) (List.tl (Array.to_list paths)), paths | ||
193 | | QuotedSentences _ -> failwith ("match_sentence: QuotedSentences") | 193 | | QuotedSentences _ -> failwith ("match_sentence: QuotedSentences") |
194 | | AltSentence alts -> failwith ("match_sentence: AltSentence") | 194 | | AltSentence alts -> failwith ("match_sentence: AltSentence") |
195 | (*if List.exists (fun (mode, s) -> mode = CONLL) alts | 195 | (*if List.exists (fun (mode, s) -> mode = CONLL) alts |
@@ -199,8 +199,8 @@ let match_sentence (p_record,tokens) = | @@ -199,8 +199,8 @@ let match_sentence (p_record,tokens) = | ||
199 | try | 199 | try |
200 | let id, text = StringMap.find info_map info_token in | 200 | let id, text = StringMap.find info_map info_token in |
201 | let beg, len = establish_lengths text paths tokens (* -1, -1, p_record.psentence *) in | 201 | let beg, len = establish_lengths text paths tokens (* -1, -1, p_record.psentence *) in |
202 | - AltText[Raw,RawText text;CONLL,StructText([StructParagraph[{pid = id; pbeg = beg; plen = len; pnext = beg+len; | ||
203 | - psentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence("", paths)]}]],tokens)] | 202 | + AltText[Raw,RawText text;CONLL,StructText([StructParagraph[{pid = id; pbeg = beg; plen = len; pnext = beg+len; pfile_prefix=""; |
203 | + psentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence paths]}]],tokens)] | ||
204 | (* {s_id = id; s_text = text; s_tokens = sentence.s_tokens} *) | 204 | (* {s_id = id; s_text = text; s_tokens = sentence.s_tokens} *) |
205 | with _ -> AltText[CONLL,StructText([StructParagraph[p_record]],tokens)] | 205 | with _ -> AltText[CONLL,StructText([StructParagraph[p_record]],tokens)] |
206 | 206 | ||
@@ -274,7 +274,7 @@ let load_sentence in_channel = | @@ -274,7 +274,7 @@ let load_sentence in_channel = | ||
274 | then raise End_of_file | 274 | then raise End_of_file |
275 | else rev_paths, id in | 275 | else rev_paths, id in |
276 | let rev_paths, id = pom [] "" in | 276 | let rev_paths, id = pom [] "" in |
277 | - {pid = id; pbeg = -1; plen = -1; pnext = -1; psentence = DepSentence("",Array.of_list ((0,-1,"") :: List.rev rev_paths))}, tokens | 277 | + {pid = id; pbeg = -1; plen = -1; pnext = -1; pfile_prefix = ""; psentence = DepSentence(Array.of_list ((0,-1,"") :: List.rev rev_paths))}, tokens |
278 | (* {s_id = id; s_text = ""; s_paths = (List.rev rev_paths)} *) | 278 | (* {s_id = id; s_text = ""; s_paths = (List.rev rev_paths)} *) |
279 | 279 | ||
280 | let load_corpus in_channel = | 280 | let load_corpus in_channel = |
parser/exec.ml
@@ -43,7 +43,7 @@ let empty_result = { | @@ -43,7 +43,7 @@ let empty_result = { | ||
43 | (*structs=SemTypes.Atom "",SemTypes.Label "",SemTypes.Label "",[],""*)} | 43 | (*structs=SemTypes.Atom "",SemTypes.Label "",SemTypes.Label "",[],""*)} |
44 | 44 | ||
45 | let empty_eniam_parse_result = { | 45 | let empty_eniam_parse_result = { |
46 | - id=""; | 46 | + file_prefix=""; |
47 | status=Idle; | 47 | status=Idle; |
48 | msg=""; | 48 | msg=""; |
49 | lex_time=0.; | 49 | lex_time=0.; |
@@ -58,7 +58,7 @@ let empty_eniam_parse_result = { | @@ -58,7 +58,7 @@ let empty_eniam_parse_result = { | ||
58 | } | 58 | } |
59 | 59 | ||
60 | let empty_conll_parse_result = { | 60 | let empty_conll_parse_result = { |
61 | - id=""; | 61 | + file_prefix=""; |
62 | status=Idle; | 62 | status=Idle; |
63 | msg=""; | 63 | msg=""; |
64 | lex_time=0.; | 64 | lex_time=0.; |
@@ -102,14 +102,16 @@ let translate_mode = function | @@ -102,14 +102,16 @@ let translate_mode = function | ||
102 | | PreTypes.CONLL -> CONLL | 102 | | PreTypes.CONLL -> CONLL |
103 | | PreTypes.ENIAM -> ENIAM | 103 | | PreTypes.ENIAM -> ENIAM |
104 | | PreTypes.Mate -> Mate | 104 | | PreTypes.Mate -> Mate |
105 | + | PreTypes.Swigra -> Swigra | ||
106 | + | PreTypes.POLFIE -> POLFIE | ||
105 | 107 | ||
106 | let rec translate_sentence = function | 108 | let rec translate_sentence = function |
107 | PreTypes.RawSentence s -> RawSentence s | 109 | PreTypes.RawSentence s -> RawSentence s |
108 | - | PreTypes.StructSentence(id,paths,last) -> StructSentence(id,paths,last) | ||
109 | - | PreTypes.DepSentence(id,paths) -> DepSentence(id,paths) | 110 | + | PreTypes.StructSentence(paths,last) -> StructSentence(paths,last) |
111 | + | PreTypes.DepSentence(paths) -> DepSentence(paths) | ||
110 | | PreTypes.QuotedSentences sentences -> | 112 | | PreTypes.QuotedSentences sentences -> |
111 | QuotedSentences(Xlist.map sentences (fun p -> | 113 | QuotedSentences(Xlist.map sentences (fun p -> |
112 | - {pid=p.PreTypes.pid; pbeg=p.PreTypes.pbeg; plen=p.PreTypes.plen; pnext=p.PreTypes.pnext; | 114 | + {pid=p.PreTypes.pid; pbeg=p.PreTypes.pbeg; plen=p.PreTypes.plen; pnext=p.PreTypes.pnext; pfile_prefix=p.PreTypes.pfile_prefix; |
113 | psentence=translate_sentence p.PreTypes.psentence})) | 115 | psentence=translate_sentence p.PreTypes.psentence})) |
114 | | PreTypes.AltSentence l -> AltSentence(Xlist.map l (fun (mode,sentence) -> | 116 | | PreTypes.AltSentence l -> AltSentence(Xlist.map l (fun (mode,sentence) -> |
115 | translate_mode mode, translate_sentence sentence)) | 117 | translate_mode mode, translate_sentence sentence)) |
@@ -118,7 +120,7 @@ let rec translate_paragraph = function | @@ -118,7 +120,7 @@ let rec translate_paragraph = function | ||
118 | PreTypes.RawParagraph s -> RawParagraph s | 120 | PreTypes.RawParagraph s -> RawParagraph s |
119 | | PreTypes.StructParagraph sentences -> | 121 | | PreTypes.StructParagraph sentences -> |
120 | StructParagraph(Xlist.map sentences (fun p -> | 122 | StructParagraph(Xlist.map sentences (fun p -> |
121 | - {pid=p.PreTypes.pid; pbeg=p.PreTypes.pbeg; plen=p.PreTypes.plen; pnext=p.PreTypes.pnext; | 123 | + {pid=p.PreTypes.pid; pbeg=p.PreTypes.pbeg; plen=p.PreTypes.plen; pnext=p.PreTypes.pnext; pfile_prefix=p.PreTypes.pfile_prefix; |
122 | psentence=translate_sentence p.PreTypes.psentence})) | 124 | psentence=translate_sentence p.PreTypes.psentence})) |
123 | | PreTypes.AltParagraph l -> AltParagraph(Xlist.map l (fun (mode,paragraph) -> | 125 | | PreTypes.AltParagraph l -> AltParagraph(Xlist.map l (fun (mode,paragraph) -> |
124 | translate_mode mode, translate_paragraph paragraph)) | 126 | translate_mode mode, translate_paragraph paragraph)) |
@@ -130,8 +132,8 @@ let rec translate_text = function | @@ -130,8 +132,8 @@ let rec translate_text = function | ||
130 | | PreTypes.AltText l -> AltText(Xlist.map l (fun (mode,text) -> | 132 | | PreTypes.AltText l -> AltText(Xlist.map l (fun (mode,text) -> |
131 | translate_mode mode, translate_text text)) | 133 | translate_mode mode, translate_text text)) |
132 | 134 | ||
133 | -let eniam_parse_sentence timeout test_only_flag id paths last tokens = | ||
134 | - let result = {empty_eniam_parse_result with id=id} in | 135 | +let eniam_parse_sentence timeout test_only_flag paths last tokens = |
136 | + let result = empty_eniam_parse_result in | ||
135 | let time2 = time_fun () in | 137 | let time2 = time_fun () in |
136 | try | 138 | try |
137 | let chart = LCGlexicon.create (paths,last) tokens in | 139 | let chart = LCGlexicon.create (paths,last) tokens in |
@@ -187,8 +189,8 @@ let eniam_parse_sentence timeout test_only_flag id paths last tokens = | @@ -187,8 +189,8 @@ let eniam_parse_sentence timeout test_only_flag id paths last tokens = | ||
187 | let time3 = time_fun () in | 189 | let time3 = time_fun () in |
188 | {result with status=LexiconError; msg=Printexc.to_string e; lex_time=time3 -. time2} | 190 | {result with status=LexiconError; msg=Printexc.to_string e; lex_time=time3 -. time2} |
189 | 191 | ||
190 | -let conll_parse_sentence timeout test_only_flag id paths tokens = | ||
191 | - let result = {empty_conll_parse_result with id=id} in | 192 | +let conll_parse_sentence timeout test_only_flag paths tokens = |
193 | + let result = empty_conll_parse_result in | ||
192 | let time2 = time_fun () in | 194 | let time2 = time_fun () in |
193 | try | 195 | try |
194 | let dep_chart = LCGlexicon.dep_create paths tokens in | 196 | let dep_chart = LCGlexicon.dep_create paths tokens in |
@@ -253,22 +255,33 @@ let conll_parse_sentence timeout test_only_flag id paths tokens = | @@ -253,22 +255,33 @@ let conll_parse_sentence timeout test_only_flag id paths tokens = | ||
253 | 255 | ||
254 | let mate_in, mate_out = Unix.open_process "java -jar ../dependencyParser/basic/mate-tools/dist/anna-3.5.jar -model ../dependencyParser/basic/mate-tools/examples/160622_Polish_MateParser.mdl -test" | 256 | let mate_in, mate_out = Unix.open_process "java -jar ../dependencyParser/basic/mate-tools/dist/anna-3.5.jar -model ../dependencyParser/basic/mate-tools/examples/160622_Polish_MateParser.mdl -test" |
255 | 257 | ||
258 | +let file_prefix_of_mode = function | ||
259 | + Raw -> "R" | ||
260 | + | Struct -> "St" | ||
261 | + | CONLL -> "C" | ||
262 | + | ENIAM -> "E" | ||
263 | + | Mate -> "M" | ||
264 | + | Swigra -> "S" | ||
265 | + | POLFIE -> "P" | ||
266 | + | ||
256 | let get_paths = function | 267 | let get_paths = function |
257 | - {PreTypes.psentence=PreTypes.DepSentence(_,paths)},_ -> paths | 268 | + {PreTypes.psentence=PreTypes.DepSentence(paths)},_ -> paths |
258 | | _ -> failwith "get_paths" | 269 | | _ -> failwith "get_paths" |
259 | 270 | ||
260 | -let rec parse_sentence timeout test_only_flag mode tokens = function | 271 | +let rec parse_sentence timeout test_only_flag mode file_prefix tokens = function |
261 | RawSentence s -> RawSentence s | 272 | RawSentence s -> RawSentence s |
262 | - | StructSentence(id,paths,last) -> | 273 | + | StructSentence(paths,last) -> |
263 | (match mode with | 274 | (match mode with |
264 | ENIAM -> | 275 | ENIAM -> |
265 | - let result = eniam_parse_sentence timeout test_only_flag id paths last tokens in | 276 | + let result = eniam_parse_sentence timeout test_only_flag paths last tokens in |
277 | + let result = {result with file_prefix = file_prefix_of_mode mode ^ file_prefix} in | ||
266 | ENIAMSentence result | 278 | ENIAMSentence result |
267 | | _ -> failwith "parse_sentence") | 279 | | _ -> failwith "parse_sentence") |
268 | - | DepSentence(id,paths) -> | 280 | + | DepSentence(paths) -> |
269 | (match mode with | 281 | (match mode with |
270 | CONLL -> | 282 | CONLL -> |
271 | - let result = conll_parse_sentence timeout test_only_flag id paths tokens in | 283 | + let result = conll_parse_sentence timeout test_only_flag paths tokens in |
284 | + let result = {result with file_prefix = file_prefix_of_mode mode ^ file_prefix} in | ||
272 | CONLLSentence result | 285 | CONLLSentence result |
273 | (* let xml = DepTree.conll_to_xml paths in | 286 | (* let xml = DepTree.conll_to_xml paths in |
274 | let graph = XmlPrinter.graph_of_xml xml in (* FIXME: do poprawy *) | 287 | let graph = XmlPrinter.graph_of_xml xml in (* FIXME: do poprawy *) |
@@ -279,22 +292,23 @@ let rec parse_sentence timeout test_only_flag mode tokens = function | @@ -279,22 +292,23 @@ let rec parse_sentence timeout test_only_flag mode tokens = function | ||
279 | print_endline "parse_sentence 1"; | 292 | print_endline "parse_sentence 1"; |
280 | let conll = CONLL.string_of_paths PreTypes.Mate tokens paths in | 293 | let conll = CONLL.string_of_paths PreTypes.Mate tokens paths in |
281 | print_endline "parse_sentence 2"; | 294 | print_endline "parse_sentence 2"; |
282 | - printf "|%s|\n" conll; | 295 | + (* printf "|%s|\n" conll; *) |
283 | Printf.fprintf mate_out "%s\n\n%!" conll; | 296 | Printf.fprintf mate_out "%s\n\n%!" conll; |
284 | print_endline "parse_sentence 3"; | 297 | print_endline "parse_sentence 3"; |
285 | let new_paths = get_paths (CONLL.load_sentence mate_in) in | 298 | let new_paths = get_paths (CONLL.load_sentence mate_in) in |
286 | print_endline "parse_sentence 4"; | 299 | print_endline "parse_sentence 4"; |
287 | - let result = conll_parse_sentence timeout test_only_flag id new_paths tokens in | 300 | + let result = conll_parse_sentence timeout test_only_flag new_paths tokens in |
301 | + let result = {result with file_prefix = file_prefix_of_mode mode ^ file_prefix} in | ||
288 | CONLLSentence result | 302 | CONLLSentence result |
289 | | _ -> failwith "parse_sentence") | 303 | | _ -> failwith "parse_sentence") |
290 | | QuotedSentences sentences -> | 304 | | QuotedSentences sentences -> |
291 | let sentences = Xlist.rev_map sentences (fun p -> | 305 | let sentences = Xlist.rev_map sentences (fun p -> |
292 | - let sentence = parse_sentence timeout test_only_flag mode tokens p.psentence in | 306 | + let sentence = parse_sentence timeout test_only_flag mode p.pfile_prefix tokens p.psentence in |
293 | {p with psentence=sentence}) in | 307 | {p with psentence=sentence}) in |
294 | QuotedSentences(List.rev sentences) | 308 | QuotedSentences(List.rev sentences) |
295 | | AltSentence l -> | 309 | | AltSentence l -> |
296 | let l = Xlist.rev_map l (fun (mode,sentence) -> | 310 | let l = Xlist.rev_map l (fun (mode,sentence) -> |
297 | - mode, parse_sentence timeout test_only_flag mode tokens sentence) in | 311 | + mode, parse_sentence timeout test_only_flag mode file_prefix tokens sentence) in |
298 | AltSentence(List.rev l) | 312 | AltSentence(List.rev l) |
299 | | _ -> failwith "parse_sentence" | 313 | | _ -> failwith "parse_sentence" |
300 | 314 | ||
@@ -302,7 +316,7 @@ let rec parse_paragraph timeout test_only_flag mode tokens = function | @@ -302,7 +316,7 @@ let rec parse_paragraph timeout test_only_flag mode tokens = function | ||
302 | RawParagraph s -> RawParagraph s | 316 | RawParagraph s -> RawParagraph s |
303 | | StructParagraph sentences -> | 317 | | StructParagraph sentences -> |
304 | let sentences = Xlist.rev_map sentences (fun p -> | 318 | let sentences = Xlist.rev_map sentences (fun p -> |
305 | - let sentence = parse_sentence timeout test_only_flag mode tokens p.psentence in | 319 | + let sentence = parse_sentence timeout test_only_flag mode p.pfile_prefix tokens p.psentence in |
306 | {p with psentence=sentence}) in | 320 | {p with psentence=sentence}) in |
307 | StructParagraph(List.rev sentences) | 321 | StructParagraph(List.rev sentences) |
308 | | AltParagraph l -> | 322 | | AltParagraph l -> |
parser/execTypes.ml
@@ -20,7 +20,7 @@ | @@ -20,7 +20,7 @@ | ||
20 | type status = Idle | PreprocessingError | LexiconError | ParseError | ParseTimeout | Parsed | TooManyNodes | NotParsed | NotReduced | ReductionError | SemError | NotTranslated | 20 | type status = Idle | PreprocessingError | LexiconError | ParseError | ParseTimeout | Parsed | TooManyNodes | NotParsed | NotReduced | ReductionError | SemError | NotTranslated |
21 | 21 | ||
22 | type eniam_parse_result = { | 22 | type eniam_parse_result = { |
23 | - id: string; | 23 | + file_prefix: string; |
24 | status: status; | 24 | status: status; |
25 | msg: string; | 25 | msg: string; |
26 | lex_time: float; | 26 | lex_time: float; |
@@ -35,7 +35,7 @@ type eniam_parse_result = { | @@ -35,7 +35,7 @@ type eniam_parse_result = { | ||
35 | } | 35 | } |
36 | 36 | ||
37 | type conll_parse_result = { | 37 | type conll_parse_result = { |
38 | - id: string; | 38 | + file_prefix: string; |
39 | status: status; | 39 | status: status; |
40 | msg: string; | 40 | msg: string; |
41 | lex_time: float; | 41 | lex_time: float; |
@@ -54,13 +54,13 @@ type conll_parse_result = { | @@ -54,13 +54,13 @@ type conll_parse_result = { | ||
54 | } | 54 | } |
55 | 55 | ||
56 | type mode = | 56 | type mode = |
57 | - Raw | Struct | CONLL | ENIAM | Mate | 57 | + Raw | Struct | CONLL | ENIAM | Mate | Swigra | POLFIE |
58 | 58 | ||
59 | type sentence = | 59 | type sentence = |
60 | RawSentence of string | 60 | RawSentence of string |
61 | (* | CONLL of conll list *) | 61 | (* | CONLL of conll list *) |
62 | - | StructSentence of string * (int * int * int) list * int (* file_prefix * (id * lnode * rnode) list * last *) | ||
63 | - | DepSentence of string * (int * int * string) array (* file_prefix * (id * super * label) conll_id *) | 62 | + | StructSentence of (int * int * int) list * int (* (id * lnode * rnode) list * last *) |
63 | + | DepSentence of (int * int * string) array (* (id * super * label) conll_id *) | ||
64 | | QuotedSentences of paragraph_record list | 64 | | QuotedSentences of paragraph_record list |
65 | (* | NKJP1M of nkjp1m list *) | 65 | (* | NKJP1M of nkjp1m list *) |
66 | (* | Skladnica of skladnica_tree *) | 66 | (* | Skladnica of skladnica_tree *) |
@@ -68,7 +68,7 @@ type sentence = | @@ -68,7 +68,7 @@ type sentence = | ||
68 | | ENIAMSentence of eniam_parse_result | 68 | | ENIAMSentence of eniam_parse_result |
69 | | CONLLSentence of conll_parse_result | 69 | | CONLLSentence of conll_parse_result |
70 | 70 | ||
71 | -and paragraph_record = {pid: string; pbeg: int; plen: int; pnext: int; psentence: sentence} (* beg i len liczone po znakach unicode ( * 100 ???) *) | 71 | +and paragraph_record = {pid: string; pbeg: int; plen: int; pnext: int; psentence: sentence; pfile_prefix: string} (* beg i len liczone po znakach unicode ( * 100 ???) *) |
72 | 72 | ||
73 | and paragraph = | 73 | and paragraph = |
74 | RawParagraph of string | 74 | RawParagraph of string |
parser/pipe.ml
@@ -118,9 +118,9 @@ let lcg_process query = | @@ -118,9 +118,9 @@ let lcg_process query = | ||
118 | let _ = Unix.shutdown_connection ic in | 118 | let _ = Unix.shutdown_connection ic in |
119 | () | 119 | () |
120 | 120 | ||
121 | -(* let _ = | 121 | +let _ = |
122 | if Array.length Sys.argv < 2 then print_endline "missing argument" else | 122 | if Array.length Sys.argv < 2 then print_endline "missing argument" else |
123 | - lcg_process Sys.argv.(1) *) | 123 | + lcg_process Sys.argv.(1) |
124 | 124 | ||
125 | 125 | ||
126 | (* FIXME: parser dziwnie się zachowuje dla 'ścieżki anomalia.' 'ścieżki anomalia. GG' itp. - nie parsuje '.' a jak sparsuje to nie chce redukować *) | 126 | (* FIXME: parser dziwnie się zachowuje dla 'ścieżki anomalia.' 'ścieżki anomalia. GG' itp. - nie parsuje '.' a jak sparsuje to nie chce redukować *) |
@@ -210,7 +210,7 @@ let process_conll_corpus filename = | @@ -210,7 +210,7 @@ let process_conll_corpus filename = | ||
210 | let _ = | 210 | let _ = |
211 | (* process_conll_corpus "../../NLP resources/Skladnica-zaleznosciowa-mod_130121.conll"; *) | 211 | (* process_conll_corpus "../../NLP resources/Skladnica-zaleznosciowa-mod_130121.conll"; *) |
212 | (* process_conll_corpus "../../NLP resources/skladnica_zaleznosciowa.conll"; *) | 212 | (* process_conll_corpus "../../NLP resources/skladnica_zaleznosciowa.conll"; *) |
213 | - process_conll_corpus "../testy/skladnica-test1.conll"; | 213 | + (* process_conll_corpus "../testy/skladnica-test1.conll"; *) |
214 | () | 214 | () |
215 | 215 | ||
216 | (* TO DO: | 216 | (* TO DO: |
@@ -227,7 +227,8 @@ let _ = | @@ -227,7 +227,8 @@ let _ = | ||
227 | - assign_not_parsed | 227 | - assign_not_parsed |
228 | - sprawdzenie zerowania globalnych referencji przy parsowaniu korpusu | 228 | - sprawdzenie zerowania globalnych referencji przy parsowaniu korpusu |
229 | - mateParser | 229 | - mateParser |
230 | - 2016.10.19 | 230 | + 2016.10.22 |
231 | + - przerobić AltSentence tak by prefix nazw plików był jego elementem, albo wstawić liczbę z prefiksu do paragraph_record | ||
231 | *) | 232 | *) |
232 | 233 | ||
233 | 234 |
parser/visualization.ml
@@ -640,6 +640,8 @@ let string_of_mode = function | @@ -640,6 +640,8 @@ let string_of_mode = function | ||
640 | | CONLL -> "CONLL" | 640 | | CONLL -> "CONLL" |
641 | | ENIAM -> "ENIAM" | 641 | | ENIAM -> "ENIAM" |
642 | | Mate -> "Mate" | 642 | | Mate -> "Mate" |
643 | + | Swigra -> "Swigra" | ||
644 | + | POLFIE -> "POLFIE" | ||
643 | 645 | ||
644 | (*let rec string_of_sentence = function | 646 | (*let rec string_of_sentence = function |
645 | RawSentence s -> sprintf "RawSentence(%s)" s | 647 | RawSentence s -> sprintf "RawSentence(%s)" s |
@@ -742,30 +744,30 @@ let html_of_eniam_sentence path tokens (result : eniam_parse_result) = | @@ -742,30 +744,30 @@ let html_of_eniam_sentence path tokens (result : eniam_parse_result) = | ||
742 | (* | PreprocessingError -> "error_pre: %s\n" result.msg *) | 744 | (* | PreprocessingError -> "error_pre: %s\n" result.msg *) |
743 | | LexiconError -> sprintf "error_lex: %s\n" result.msg | 745 | | LexiconError -> sprintf "error_lex: %s\n" result.msg |
744 | | ParseError -> | 746 | | ParseError -> |
745 | - create_latex_chart path (result.id ^ "_chart") result.chart; | 747 | + create_latex_chart path (result.file_prefix ^ "_chart") result.chart; |
746 | sprintf "error_parse: %s\n" result.msg ^ | 748 | sprintf "error_parse: %s\n" result.msg ^ |
747 | - sprintf "<BR><A HREF=\"%s_chart.pdf\">Chart</A>\n" result.id | 749 | + sprintf "<BR><A HREF=\"%s_chart.pdf\">Chart</A>\n" result.file_prefix |
748 | | ParseTimeout -> | 750 | | ParseTimeout -> |
749 | - create_latex_chart path (result.id ^ "_chart") result.chart; | 751 | + create_latex_chart path (result.file_prefix ^ "_chart") result.chart; |
750 | sprintf "timeout: %s\n" result.msg ^ | 752 | sprintf "timeout: %s\n" result.msg ^ |
751 | - sprintf "<BR><A HREF=\"%s_chart.pdf\">Chart</A>\n" result.id | 753 | + sprintf "<BR><A HREF=\"%s_chart.pdf\">Chart</A>\n" result.file_prefix |
752 | | NotParsed -> | 754 | | NotParsed -> |
753 | - create_latex_chart path (result.id ^ "_chart") result.chart; | 755 | + create_latex_chart path (result.file_prefix ^ "_chart") result.chart; |
754 | sprintf "not_parsed: paths_size=%d chart_size=%d\n" result.paths_size result.chart_size ^ | 756 | sprintf "not_parsed: paths_size=%d chart_size=%d\n" result.paths_size result.chart_size ^ |
755 | - sprintf "<BR><A HREF=\"%s_chart.pdf\">Chart</A>\n" result.id | 757 | + sprintf "<BR><A HREF=\"%s_chart.pdf\">Chart</A>\n" result.file_prefix |
756 | | ReductionError -> sprintf "error_reduction: %s\n" result.msg | 758 | | ReductionError -> sprintf "error_reduction: %s\n" result.msg |
757 | | TooManyNodes -> sprintf "to_many_nodes: paths_size=%d chart_size=%d\n" result.paths_size result.chart_size | 759 | | TooManyNodes -> sprintf "to_many_nodes: paths_size=%d chart_size=%d\n" result.paths_size result.chart_size |
758 | | NotReduced -> sprintf "not_reduced: paths_size=%d chart_size=%d\n" result.paths_size result.chart_size | 760 | | NotReduced -> sprintf "not_reduced: paths_size=%d chart_size=%d\n" result.paths_size result.chart_size |
759 | | SemError -> sprintf "error_sem: %s dependency_tree_size=%d\n" result.msg result.dependency_tree_size | 761 | | SemError -> sprintf "error_sem: %s dependency_tree_size=%d\n" result.msg result.dependency_tree_size |
760 | (* | NotTranslated -> "not_translated: \n" *) | 762 | (* | NotTranslated -> "not_translated: \n" *) |
761 | | Parsed -> | 763 | | Parsed -> |
762 | - print_simplified_dependency_tree path (result.id ^ "_simplified_dependency_tree") tokens result.dependency_tree; | ||
763 | - print_dependency_tree path (result.id ^ "_dependency_tree") result.dependency_tree; | ||
764 | - LCGlatexOf.print_dependency_tree path (result.id ^ "_dependency_tree_references") result.dependency_tree; | 764 | + print_simplified_dependency_tree path (result.file_prefix ^ "_simplified_dependency_tree") tokens result.dependency_tree; |
765 | + print_dependency_tree path (result.file_prefix ^ "_dependency_tree") result.dependency_tree; | ||
766 | + LCGlatexOf.print_dependency_tree path (result.file_prefix ^ "_dependency_tree_references") result.dependency_tree; | ||
765 | sprintf "parsed: paths_size=%d chart_size=%d dependency_tree_size=%d\n" result.paths_size result.chart_size result.dependency_tree_size ^ | 767 | sprintf "parsed: paths_size=%d chart_size=%d dependency_tree_size=%d\n" result.paths_size result.chart_size result.dependency_tree_size ^ |
766 | - sprintf "<BR><A HREF=\"%s_simplified_dependency_tree.png\">Simplified Dependency Tree</A>\n" result.id ^ | ||
767 | - sprintf "<BR><A HREF=\"%s_dependency_tree.png\">Dependency Tree</A>\n" result.id ^ | ||
768 | - sprintf "<BR><A HREF=\"%s_dependency_tree_references.pdf\">Dependency Tree References</A>\n" result.id | 768 | + sprintf "<BR><A HREF=\"%s_simplified_dependency_tree.png\">Simplified Dependency Tree</A>\n" result.file_prefix ^ |
769 | + sprintf "<BR><A HREF=\"%s_dependency_tree.png\">Dependency Tree</A>\n" result.file_prefix ^ | ||
770 | + sprintf "<BR><A HREF=\"%s_dependency_tree_references.pdf\">Dependency Tree References</A>\n" result.file_prefix | ||
769 | | _ -> failwith "html_of_eniam_sentence" | 771 | | _ -> failwith "html_of_eniam_sentence" |
770 | 772 | ||
771 | let html_of_conll_sentence path tokens (result : conll_parse_result) = | 773 | let html_of_conll_sentence path tokens (result : conll_parse_result) = |
@@ -774,46 +776,46 @@ let html_of_conll_sentence path tokens (result : conll_parse_result) = | @@ -774,46 +776,46 @@ let html_of_conll_sentence path tokens (result : conll_parse_result) = | ||
774 | (* | PreprocessingError -> "error_pre: %s\n" result.msg *) | 776 | (* | PreprocessingError -> "error_pre: %s\n" result.msg *) |
775 | | LexiconError -> sprintf "error_lex: %s\n" result.msg | 777 | | LexiconError -> sprintf "error_lex: %s\n" result.msg |
776 | | ParseError -> | 778 | | ParseError -> |
777 | - create_latex_dep_chart path (result.id ^ "_dep_chart") result.dep_chart; | ||
778 | - create_latex_parsed_dep_chart path (result.id ^ "_parsed_dep_chart") result.parsed_dep_chart; | 779 | + create_latex_dep_chart path (result.file_prefix ^ "_dep_chart") result.dep_chart; |
780 | + create_latex_parsed_dep_chart path (result.file_prefix ^ "_parsed_dep_chart") result.parsed_dep_chart; | ||
779 | sprintf "error_parse: %s\n" result.msg ^ | 781 | sprintf "error_parse: %s\n" result.msg ^ |
780 | - sprintf "<BR><A HREF=\"%s_dep_chart.pdf\">Chart</A>\n" result.id ^ | ||
781 | - sprintf "<BR><A HREF=\"%s_parsed_dep_chart.pdf\">Parsed Chart</A>\n" result.id | 782 | + sprintf "<BR><A HREF=\"%s_dep_chart.pdf\">Chart</A>\n" result.file_prefix ^ |
783 | + sprintf "<BR><A HREF=\"%s_parsed_dep_chart.pdf\">Parsed Chart</A>\n" result.file_prefix | ||
782 | | ParseTimeout -> | 784 | | ParseTimeout -> |
783 | - create_latex_dep_chart path (result.id ^ "_dep_chart") result.dep_chart; | ||
784 | - create_latex_parsed_dep_chart path (result.id ^ "_parsed_dep_chart") result.parsed_dep_chart; | 785 | + create_latex_dep_chart path (result.file_prefix ^ "_dep_chart") result.dep_chart; |
786 | + create_latex_parsed_dep_chart path (result.file_prefix ^ "_parsed_dep_chart") result.parsed_dep_chart; | ||
785 | sprintf "timeout: %s\n" result.msg ^ | 787 | sprintf "timeout: %s\n" result.msg ^ |
786 | - sprintf "<BR><A HREF=\"%s_dep_chart.pdf\">Chart</A>\n" result.id ^ | ||
787 | - sprintf "<BR><A HREF=\"%s_parsed_dep_chart.pdf\">Parsed Chart</A>\n" result.id | 788 | + sprintf "<BR><A HREF=\"%s_dep_chart.pdf\">Chart</A>\n" result.file_prefix ^ |
789 | + sprintf "<BR><A HREF=\"%s_parsed_dep_chart.pdf\">Parsed Chart</A>\n" result.file_prefix | ||
788 | | NotParsed -> | 790 | | NotParsed -> |
789 | - create_latex_dep_chart path (result.id ^ "_dep_chart") result.dep_chart; | ||
790 | - create_latex_not_parsed_dep_chart path (result.id ^ "_not_parsed_dep_chart") result.not_parsed_dep_chart; | 791 | + create_latex_dep_chart path (result.file_prefix ^ "_dep_chart") result.dep_chart; |
792 | + create_latex_not_parsed_dep_chart path (result.file_prefix ^ "_not_parsed_dep_chart") result.not_parsed_dep_chart; | ||
791 | sprintf "not_parsed\n" ^ | 793 | sprintf "not_parsed\n" ^ |
792 | - sprintf "<BR><A HREF=\"%s_dep_chart.pdf\">Chart</A>\n" result.id ^ | ||
793 | - sprintf "<BR><A HREF=\"%s_not_parsed_dep_chart.pdf\">Not Parsed Chart</A>\n" result.id | 794 | + sprintf "<BR><A HREF=\"%s_dep_chart.pdf\">Chart</A>\n" result.file_prefix ^ |
795 | + sprintf "<BR><A HREF=\"%s_not_parsed_dep_chart.pdf\">Not Parsed Chart</A>\n" result.file_prefix | ||
794 | | ReductionError -> sprintf "error_reduction: %s\n" result.msg | 796 | | ReductionError -> sprintf "error_reduction: %s\n" result.msg |
795 | | TooManyNodes -> sprintf "to_many_nodes: paths_size=%d\n" result.paths_size | 797 | | TooManyNodes -> sprintf "to_many_nodes: paths_size=%d\n" result.paths_size |
796 | | NotReduced -> | 798 | | NotReduced -> |
797 | - LCGlatexOf.print_dependency_tree path (result.id ^ "_dependency_tree_references") result.dependency_tree; | 799 | + LCGlatexOf.print_dependency_tree path (result.file_prefix ^ "_dependency_tree_references") result.dependency_tree; |
798 | sprintf "not_reduced: paths_size=%d\n" result.paths_size ^ | 800 | sprintf "not_reduced: paths_size=%d\n" result.paths_size ^ |
799 | - sprintf "<BR><A HREF=\"%s_dependency_tree_references.pdf\">Dependency Tree References</A>\n" result.id | 801 | + sprintf "<BR><A HREF=\"%s_dependency_tree_references.pdf\">Dependency Tree References</A>\n" result.file_prefix |
800 | | SemError -> sprintf "error_sem: %s dependency_tree_size=%d\n" result.msg result.dependency_tree_size | 802 | | SemError -> sprintf "error_sem: %s dependency_tree_size=%d\n" result.msg result.dependency_tree_size |
801 | (* | NotTranslated -> "not_translated: \n" *) | 803 | (* | NotTranslated -> "not_translated: \n" *) |
802 | | Parsed -> | 804 | | Parsed -> |
803 | - print_simplified_dependency_tree path (result.id ^ "_simplified_dependency_tree") tokens result.dependency_tree; | ||
804 | - print_dependency_tree path (result.id ^ "_dependency_tree") result.dependency_tree; | ||
805 | - LCGlatexOf.print_dependency_tree path (result.id ^ "_dependency_tree_references") result.dependency_tree; | 805 | + print_simplified_dependency_tree path (result.file_prefix ^ "_simplified_dependency_tree") tokens result.dependency_tree; |
806 | + print_dependency_tree path (result.file_prefix ^ "_dependency_tree") result.dependency_tree; | ||
807 | + LCGlatexOf.print_dependency_tree path (result.file_prefix ^ "_dependency_tree_references") result.dependency_tree; | ||
806 | sprintf "parsed: paths_size=%d dependency_tree_size=%d\n" result.paths_size result.dependency_tree_size ^ | 808 | sprintf "parsed: paths_size=%d dependency_tree_size=%d\n" result.paths_size result.dependency_tree_size ^ |
807 | - sprintf "<BR><A HREF=\"%s_simplified_dependency_tree.png\">Simplified Dependency Tree</A>\n" result.id ^ | ||
808 | - sprintf "<BR><A HREF=\"%s_dependency_tree.png\">Dependency Tree</A>\n" result.id ^ | ||
809 | - sprintf "<BR><A HREF=\"%s_dependency_tree_references.pdf\">Dependency Tree References</A>\n" result.id | 809 | + sprintf "<BR><A HREF=\"%s_simplified_dependency_tree.png\">Simplified Dependency Tree</A>\n" result.file_prefix ^ |
810 | + sprintf "<BR><A HREF=\"%s_dependency_tree.png\">Dependency Tree</A>\n" result.file_prefix ^ | ||
811 | + sprintf "<BR><A HREF=\"%s_dependency_tree_references.pdf\">Dependency Tree References</A>\n" result.file_prefix | ||
810 | | _ -> failwith "html_of_conll_sentence" | 812 | | _ -> failwith "html_of_conll_sentence" |
811 | 813 | ||
812 | 814 | ||
813 | let rec html_of_sentence path tokens = function | 815 | let rec html_of_sentence path tokens = function |
814 | RawSentence s -> s | 816 | RawSentence s -> s |
815 | - | StructSentence(_,paths,last) -> html_of_struct_sentence tokens paths last | ||
816 | - | DepSentence(_,paths) -> html_of_dep_sentence tokens paths | 817 | + | StructSentence(paths,last) -> html_of_struct_sentence tokens paths last |
818 | + | DepSentence(paths) -> html_of_dep_sentence tokens paths | ||
817 | | ENIAMSentence result -> html_of_eniam_sentence path tokens result | 819 | | ENIAMSentence result -> html_of_eniam_sentence path tokens result |
818 | | CONLLSentence result -> html_of_conll_sentence path tokens result | 820 | | CONLLSentence result -> html_of_conll_sentence path tokens result |
819 | | QuotedSentences sentences -> | 821 | | QuotedSentences sentences -> |
pre/preProcessing.ml
@@ -614,12 +614,13 @@ let parse_text = function | @@ -614,12 +614,13 @@ let parse_text = function | ||
614 | AltParagraph[Raw,RawParagraph paragraph; Struct,StructParagraph sentences]) in | 614 | AltParagraph[Raw,RawParagraph paragraph; Struct,StructParagraph sentences]) in |
615 | AltText[Raw,RawText query; Struct,StructText(List.rev paragraphs, tokens)] | 615 | AltText[Raw,RawText query; Struct,StructText(List.rev paragraphs, tokens)] |
616 | | AltText[Raw,RawText query;CONLL,StructText([ | 616 | | AltText[Raw,RawText query;CONLL,StructText([ |
617 | - StructParagraph[{psentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence(_,dep_paths)]} as p]],tokens)] -> | 617 | + StructParagraph[{psentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence dep_paths]} as p]],tokens)] -> |
618 | parse_conll tokens dep_paths; | 618 | parse_conll tokens dep_paths; |
619 | let paths = parse query in | 619 | let paths = parse query in |
620 | let sentences = PreSentences.split_into_sentences query tokens paths in | 620 | let sentences = PreSentences.split_into_sentences query tokens paths in |
621 | + let m_dep_paths = Array.map (fun (id,_,_) -> id,-1,"") dep_paths in | ||
621 | let conll = StructParagraph[{p with psentence = AltSentence[Raw, RawSentence text; | 622 | let conll = StructParagraph[{p with psentence = AltSentence[Raw, RawSentence text; |
622 | - Mate, DepSentence("M",dep_paths); CONLL, DepSentence("C",dep_paths)]}] in | 623 | + Mate, DepSentence m_dep_paths; CONLL, DepSentence dep_paths]}] in |
623 | AltText[Raw,RawText query; Struct, StructText([ | 624 | AltText[Raw,RawText query; Struct, StructText([ |
624 | AltParagraph[Raw,RawParagraph query; ENIAM, StructParagraph sentences; CONLL, conll]],tokens)] | 625 | AltParagraph[Raw,RawParagraph query; ENIAM, StructParagraph sentences; CONLL, conll]],tokens)] |
625 | | _ -> failwith "parse_text: not implemented" | 626 | | _ -> failwith "parse_text: not implemented" |
pre/preSentences.ml
@@ -147,17 +147,15 @@ let find_tokens_in_chart tokens chart lnode rnode cat = | @@ -147,17 +147,15 @@ let find_tokens_in_chart tokens chart lnode rnode cat = | ||
147 | 147 | ||
148 | let rec add_struct_sentence_ids_rec n sentences = | 148 | let rec add_struct_sentence_ids_rec n sentences = |
149 | Xlist.fold sentences ([],n) (fun (l,n) -> function | 149 | Xlist.fold sentences ([],n) (fun (l,n) -> function |
150 | - {psentence=AltSentence[Raw,s;ENIAM,StructSentence(_,paths,last)]} as p -> | ||
151 | - {p with psentence=AltSentence[Raw,s;ENIAM,StructSentence("E" ^ string_of_int n,paths,last)]} :: l, n+1 | ||
152 | - | {psentence=AltSentence[Raw,s;ENIAM,QuotedSentences sentences]} as p -> | 150 | + {psentence=AltSentence[Raw,s;Struct,QuotedSentences sentences]} as p -> |
153 | let sentences, n = add_struct_sentence_ids_rec n sentences in | 151 | let sentences, n = add_struct_sentence_ids_rec n sentences in |
154 | - {p with psentence=AltSentence[Raw,s;ENIAM,QuotedSentences (List.rev sentences)]} :: l, n+1 | ||
155 | - | _ -> failwith "add_struct_sentence_ids") | 152 | + {p with psentence=AltSentence[Raw,s;Struct,QuotedSentences (List.rev sentences)]} :: l, n |
153 | + | p -> {p with pfile_prefix=string_of_int n} :: l, n+1) | ||
156 | 154 | ||
157 | let add_struct_sentence_ids sentences = | 155 | let add_struct_sentence_ids sentences = |
158 | match sentences with | 156 | match sentences with |
159 | - [{psentence=AltSentence[Raw,s;ENIAM,StructSentence(_,paths,last)]} as p] -> | ||
160 | - [{p with psentence=AltSentence[Raw,s;ENIAM,StructSentence("E",paths,last)]}] | 157 | + [{psentence=AltSentence[Raw,_;Struct,QuotedSentences _]}] -> List.rev (fst (add_struct_sentence_ids_rec 1 sentences)) |
158 | + | [p] -> [p] | ||
161 | | _ -> List.rev (fst (add_struct_sentence_ids_rec 1 sentences)) | 159 | | _ -> List.rev (fst (add_struct_sentence_ids_rec 1 sentences)) |
162 | 160 | ||
163 | let prepare_indexes paths = | 161 | let prepare_indexes paths = |
@@ -181,13 +179,16 @@ let rec extract_sentences_rec tokens id = | @@ -181,13 +179,16 @@ let rec extract_sentences_rec tokens id = | ||
181 | match t.token with | 179 | match t.token with |
182 | Tokens("sentence",ids) -> | 180 | Tokens("sentence",ids) -> |
183 | let paths,last = make_paths tokens ids in | 181 | let paths,last = make_paths tokens ids in |
184 | - [{pid=string_of_int id; pbeg=t.beg; plen=t.len; pnext=t.next; | 182 | + [{pid=string_of_int id; pbeg=t.beg; plen=t.len; pnext=t.next; pfile_prefix=""; |
185 | psentence=AltSentence[Raw,RawSentence t.orth; | 183 | psentence=AltSentence[Raw,RawSentence t.orth; |
186 | - ENIAM,StructSentence("",paths,last)]}] | 184 | + ENIAM,StructSentence(paths,last); |
185 | + Mate,RawSentence t.orth; | ||
186 | + Swigra,RawSentence t.orth; | ||
187 | + POLFIE,RawSentence t.orth]}] | ||
187 | | Tokens("quoted_sentences",ids) -> | 188 | | Tokens("quoted_sentences",ids) -> |
188 | - [{pid=string_of_int id; pbeg=t.beg; plen=t.len; pnext=t.next; | 189 | + [{pid=string_of_int id; pbeg=t.beg; plen=t.len; pnext=t.next; pfile_prefix=""; |
189 | psentence=AltSentence[Raw,RawSentence t.orth; | 190 | psentence=AltSentence[Raw,RawSentence t.orth; |
190 | - ENIAM,QuotedSentences(List.sort par_compare (List.flatten (Xlist.rev_map ids (extract_sentences_rec tokens))))]}] | 191 | + Struct,QuotedSentences(List.sort par_compare (List.flatten (Xlist.rev_map ids (extract_sentences_rec tokens))))]}] |
191 | | _ -> [] | 192 | | _ -> [] |
192 | 193 | ||
193 | let extract_sentences tokens chart last = | 194 | let extract_sentences tokens chart last = |
pre/preTypes.ml
@@ -117,7 +117,7 @@ let empty_token = { | @@ -117,7 +117,7 @@ let empty_token = { | ||
117 | lroles="",""; semantics=Normal} | 117 | lroles="",""; semantics=Normal} |
118 | 118 | ||
119 | type mode = | 119 | type mode = |
120 | - Raw | Struct | CONLL | ENIAM | Mate | 120 | + Raw | Struct | CONLL | ENIAM | Mate | Swigra | POLFIE |
121 | 121 | ||
122 | (* warstwy nkjp1m do analizy: | 122 | (* warstwy nkjp1m do analizy: |
123 | header | 123 | header |
@@ -133,14 +133,14 @@ ann_named | @@ -133,14 +133,14 @@ ann_named | ||
133 | type sentence = | 133 | type sentence = |
134 | RawSentence of string | 134 | RawSentence of string |
135 | (* | CONLL of conll list *) | 135 | (* | CONLL of conll list *) |
136 | - | StructSentence of string * (int * int * int) list * int (* file_prefix * (id * lnode * rnode) list * last *) | ||
137 | - | DepSentence of string * (int * int * string) array (* file_prefix * (id * super * label) conll_id *) | 136 | + | StructSentence of (int * int * int) list * int (* (id * lnode * rnode) list * last *) |
137 | + | DepSentence of (int * int * string) array (* (id * super * label) conll_id *) | ||
138 | | QuotedSentences of paragraph_record list | 138 | | QuotedSentences of paragraph_record list |
139 | (* | NKJP1M of nkjp1m list *) | 139 | (* | NKJP1M of nkjp1m list *) |
140 | (* | Skladnica of skladnica_tree *) | 140 | (* | Skladnica of skladnica_tree *) |
141 | | AltSentence of (mode * sentence) list (* string = etykieta np raw, nkjp, krzaki *) | 141 | | AltSentence of (mode * sentence) list (* string = etykieta np raw, nkjp, krzaki *) |
142 | 142 | ||
143 | -and paragraph_record = {pid: string; pbeg: int; plen: int; pnext: int; psentence: sentence} (* beg i len liczone po znakach unicode ( * 100 ???) *) | 143 | +and paragraph_record = {pid: string; pbeg: int; plen: int; pnext: int; psentence: sentence; pfile_prefix: string} (* beg i len liczone po znakach unicode ( * 100 ???) *) |
144 | 144 | ||
145 | and paragraph = | 145 | and paragraph = |
146 | RawParagraph of string | 146 | RawParagraph of string |