From 78e20b4b74f1f5a06ee32a42436f1fcff34afc05 Mon Sep 17 00:00:00 2001 From: Wojciech Jaworski <wjaworski@mimuw.edu.pl> Date: Sat, 22 Oct 2016 12:32:42 +0200 Subject: [PATCH] generowanie wejścia dla Swigry i POLFIE --- corpora/CONLL.ml | 22 +++++++++++----------- parser/exec.ml | 56 +++++++++++++++++++++++++++++++++++--------------------- parser/execTypes.ml | 12 ++++++------ parser/pipe.ml | 9 +++++---- parser/visualization.ml | 70 ++++++++++++++++++++++++++++++++++++---------------------------------- pre/preProcessing.ml | 5 +++-- pre/preSentences.ml | 23 ++++++++++++----------- pre/preTypes.ml | 8 ++++---- 8 files changed, 112 insertions(+), 93 deletions(-) diff --git a/corpora/CONLL.ml b/corpora/CONLL.ml index c4661f0..953ec6e 100644 --- a/corpora/CONLL.ml +++ b/corpora/CONLL.ml @@ -33,8 +33,8 @@ let string_of_paths mode tokens paths = let rec string_of_sentence mode tokens = function RawSentence s -> if mode = Raw then s else "" - | StructSentence (_,tokens, _) -> failwith ("string_of_sentence: StructSentence") (*String.concat "\n" @@ Xlist.map tokens (fun x -> string_of_token mode x)*) - | DepSentence (_, paths) -> string_of_paths mode tokens paths + | StructSentence (tokens, _) -> failwith ("string_of_sentence: StructSentence") (*String.concat "\n" @@ Xlist.map tokens (fun x -> string_of_token mode x)*) + | DepSentence (paths) -> string_of_paths mode tokens paths | QuotedSentences _ -> failwith ("string_of_sentence: QuotedSentences") | AltSentence alts -> alternative_string (string_of_sentence mode tokens) mode alts @@ -111,8 +111,8 @@ let info_map = let match_sentence (p_record,tokens) = let rec info_token s = match s with RawSentence text -> failwith ("match_sentence: " ^ text) - | StructSentence (_, tokens, n) -> failwith ("match_sentence: StructSentence") (*String.concat " " @@ List.map (fun x -> x.orth) tokens*) - | DepSentence (_, paths) -> String.concat " " @@ List.map (fun (id,_,_) -> (ExtArray.get tokens id).orth) (List.tl (Array.to_list paths)), paths + | StructSentence (tokens, n) -> failwith ("match_sentence: StructSentence") (*String.concat " " @@ List.map (fun x -> x.orth) tokens*) + | DepSentence (paths) -> String.concat " " @@ List.map (fun (id,_,_) -> (ExtArray.get tokens id).orth) (List.tl (Array.to_list paths)), paths | QuotedSentences _ -> failwith ("match_sentence: QuotedSentences") | AltSentence alts -> failwith ("match_sentence: AltSentence") (*if List.exists (fun (mode, s) -> mode = CONLL) alts @@ -122,8 +122,8 @@ let match_sentence (p_record,tokens) = try let id, text = StringMap.find info_map info_token in let beg, len = establish_lengths text paths tokens (* -1, -1, p_record.psentence *) in - AltText[Raw,RawText text;CONLL,StructText([StructParagraph[{pid = id; pbeg = beg; plen = len; pnext = beg+len; - psentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence("", paths)]}]],tokens)] + AltText[Raw,RawText text;CONLL,StructText([StructParagraph[{pid = id; pbeg = beg; plen = len; pnext = beg+len; pfile_prefix=""; + psentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence paths]}]],tokens)] (* {s_id = id; s_text = text; s_tokens = sentence.s_tokens} *) with _ -> AltText[CONLL,StructText([StructParagraph[p_record]],tokens)] @@ -188,8 +188,8 @@ let info_map = let match_sentence (p_record,tokens) = let rec info_token s = match s with RawSentence text -> failwith ("match_sentence: " ^ text) - | StructSentence (_, tokens, n) -> failwith ("match_sentence: StructSentence") (*String.concat " " @@ List.map (fun x -> x.orth) tokens*) - | DepSentence (_, paths) -> String.concat " " @@ List.map (fun (id,_,_) -> (ExtArray.get tokens id).orth) (List.tl (Array.to_list paths)), paths + | StructSentence (tokens, n) -> failwith ("match_sentence: StructSentence") (*String.concat " " @@ List.map (fun x -> x.orth) tokens*) + | DepSentence (paths) -> String.concat " " @@ List.map (fun (id,_,_) -> (ExtArray.get tokens id).orth) (List.tl (Array.to_list paths)), paths | QuotedSentences _ -> failwith ("match_sentence: QuotedSentences") | AltSentence alts -> failwith ("match_sentence: AltSentence") (*if List.exists (fun (mode, s) -> mode = CONLL) alts @@ -199,8 +199,8 @@ let match_sentence (p_record,tokens) = try let id, text = StringMap.find info_map info_token in let beg, len = establish_lengths text paths tokens (* -1, -1, p_record.psentence *) in - AltText[Raw,RawText text;CONLL,StructText([StructParagraph[{pid = id; pbeg = beg; plen = len; pnext = beg+len; - psentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence("", paths)]}]],tokens)] + AltText[Raw,RawText text;CONLL,StructText([StructParagraph[{pid = id; pbeg = beg; plen = len; pnext = beg+len; pfile_prefix=""; + psentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence paths]}]],tokens)] (* {s_id = id; s_text = text; s_tokens = sentence.s_tokens} *) with _ -> AltText[CONLL,StructText([StructParagraph[p_record]],tokens)] @@ -274,7 +274,7 @@ let load_sentence in_channel = then raise End_of_file else rev_paths, id in let rev_paths, id = pom [] "" in - {pid = id; pbeg = -1; plen = -1; pnext = -1; psentence = DepSentence("",Array.of_list ((0,-1,"") :: List.rev rev_paths))}, tokens + {pid = id; pbeg = -1; plen = -1; pnext = -1; pfile_prefix = ""; psentence = DepSentence(Array.of_list ((0,-1,"") :: List.rev rev_paths))}, tokens (* {s_id = id; s_text = ""; s_paths = (List.rev rev_paths)} *) let load_corpus in_channel = diff --git a/parser/exec.ml b/parser/exec.ml index 0370366..1645c49 100644 --- a/parser/exec.ml +++ b/parser/exec.ml @@ -43,7 +43,7 @@ let empty_result = { (*structs=SemTypes.Atom "",SemTypes.Label "",SemTypes.Label "",[],""*)} let empty_eniam_parse_result = { - id=""; + file_prefix=""; status=Idle; msg=""; lex_time=0.; @@ -58,7 +58,7 @@ let empty_eniam_parse_result = { } let empty_conll_parse_result = { - id=""; + file_prefix=""; status=Idle; msg=""; lex_time=0.; @@ -102,14 +102,16 @@ let translate_mode = function | PreTypes.CONLL -> CONLL | PreTypes.ENIAM -> ENIAM | PreTypes.Mate -> Mate + | PreTypes.Swigra -> Swigra + | PreTypes.POLFIE -> POLFIE let rec translate_sentence = function PreTypes.RawSentence s -> RawSentence s - | PreTypes.StructSentence(id,paths,last) -> StructSentence(id,paths,last) - | PreTypes.DepSentence(id,paths) -> DepSentence(id,paths) + | PreTypes.StructSentence(paths,last) -> StructSentence(paths,last) + | PreTypes.DepSentence(paths) -> DepSentence(paths) | PreTypes.QuotedSentences sentences -> QuotedSentences(Xlist.map sentences (fun p -> - {pid=p.PreTypes.pid; pbeg=p.PreTypes.pbeg; plen=p.PreTypes.plen; pnext=p.PreTypes.pnext; + {pid=p.PreTypes.pid; pbeg=p.PreTypes.pbeg; plen=p.PreTypes.plen; pnext=p.PreTypes.pnext; pfile_prefix=p.PreTypes.pfile_prefix; psentence=translate_sentence p.PreTypes.psentence})) | PreTypes.AltSentence l -> AltSentence(Xlist.map l (fun (mode,sentence) -> translate_mode mode, translate_sentence sentence)) @@ -118,7 +120,7 @@ let rec translate_paragraph = function PreTypes.RawParagraph s -> RawParagraph s | PreTypes.StructParagraph sentences -> StructParagraph(Xlist.map sentences (fun p -> - {pid=p.PreTypes.pid; pbeg=p.PreTypes.pbeg; plen=p.PreTypes.plen; pnext=p.PreTypes.pnext; + {pid=p.PreTypes.pid; pbeg=p.PreTypes.pbeg; plen=p.PreTypes.plen; pnext=p.PreTypes.pnext; pfile_prefix=p.PreTypes.pfile_prefix; psentence=translate_sentence p.PreTypes.psentence})) | PreTypes.AltParagraph l -> AltParagraph(Xlist.map l (fun (mode,paragraph) -> translate_mode mode, translate_paragraph paragraph)) @@ -130,8 +132,8 @@ let rec translate_text = function | PreTypes.AltText l -> AltText(Xlist.map l (fun (mode,text) -> translate_mode mode, translate_text text)) -let eniam_parse_sentence timeout test_only_flag id paths last tokens = - let result = {empty_eniam_parse_result with id=id} in +let eniam_parse_sentence timeout test_only_flag paths last tokens = + let result = empty_eniam_parse_result in let time2 = time_fun () in try let chart = LCGlexicon.create (paths,last) tokens in @@ -187,8 +189,8 @@ let eniam_parse_sentence timeout test_only_flag id paths last tokens = let time3 = time_fun () in {result with status=LexiconError; msg=Printexc.to_string e; lex_time=time3 -. time2} -let conll_parse_sentence timeout test_only_flag id paths tokens = - let result = {empty_conll_parse_result with id=id} in +let conll_parse_sentence timeout test_only_flag paths tokens = + let result = empty_conll_parse_result in let time2 = time_fun () in try let dep_chart = LCGlexicon.dep_create paths tokens in @@ -253,22 +255,33 @@ let conll_parse_sentence timeout test_only_flag id paths tokens = let mate_in, mate_out = Unix.open_process "java -jar ../dependencyParser/basic/mate-tools/dist/anna-3.5.jar -model ../dependencyParser/basic/mate-tools/examples/160622_Polish_MateParser.mdl -test" +let file_prefix_of_mode = function + Raw -> "R" + | Struct -> "St" + | CONLL -> "C" + | ENIAM -> "E" + | Mate -> "M" + | Swigra -> "S" + | POLFIE -> "P" + let get_paths = function - {PreTypes.psentence=PreTypes.DepSentence(_,paths)},_ -> paths + {PreTypes.psentence=PreTypes.DepSentence(paths)},_ -> paths | _ -> failwith "get_paths" -let rec parse_sentence timeout test_only_flag mode tokens = function +let rec parse_sentence timeout test_only_flag mode file_prefix tokens = function RawSentence s -> RawSentence s - | StructSentence(id,paths,last) -> + | StructSentence(paths,last) -> (match mode with ENIAM -> - let result = eniam_parse_sentence timeout test_only_flag id paths last tokens in + let result = eniam_parse_sentence timeout test_only_flag paths last tokens in + let result = {result with file_prefix = file_prefix_of_mode mode ^ file_prefix} in ENIAMSentence result | _ -> failwith "parse_sentence") - | DepSentence(id,paths) -> + | DepSentence(paths) -> (match mode with CONLL -> - let result = conll_parse_sentence timeout test_only_flag id paths tokens in + let result = conll_parse_sentence timeout test_only_flag paths tokens in + let result = {result with file_prefix = file_prefix_of_mode mode ^ file_prefix} in CONLLSentence result (* let xml = DepTree.conll_to_xml paths in let graph = XmlPrinter.graph_of_xml xml in (* FIXME: do poprawy *) @@ -279,22 +292,23 @@ let rec parse_sentence timeout test_only_flag mode tokens = function print_endline "parse_sentence 1"; let conll = CONLL.string_of_paths PreTypes.Mate tokens paths in print_endline "parse_sentence 2"; - printf "|%s|\n" conll; + (* printf "|%s|\n" conll; *) Printf.fprintf mate_out "%s\n\n%!" conll; print_endline "parse_sentence 3"; let new_paths = get_paths (CONLL.load_sentence mate_in) in print_endline "parse_sentence 4"; - let result = conll_parse_sentence timeout test_only_flag id new_paths tokens in + let result = conll_parse_sentence timeout test_only_flag new_paths tokens in + let result = {result with file_prefix = file_prefix_of_mode mode ^ file_prefix} in CONLLSentence result | _ -> failwith "parse_sentence") | QuotedSentences sentences -> let sentences = Xlist.rev_map sentences (fun p -> - let sentence = parse_sentence timeout test_only_flag mode tokens p.psentence in + let sentence = parse_sentence timeout test_only_flag mode p.pfile_prefix tokens p.psentence in {p with psentence=sentence}) in QuotedSentences(List.rev sentences) | AltSentence l -> let l = Xlist.rev_map l (fun (mode,sentence) -> - mode, parse_sentence timeout test_only_flag mode tokens sentence) in + mode, parse_sentence timeout test_only_flag mode file_prefix tokens sentence) in AltSentence(List.rev l) | _ -> failwith "parse_sentence" @@ -302,7 +316,7 @@ let rec parse_paragraph timeout test_only_flag mode tokens = function RawParagraph s -> RawParagraph s | StructParagraph sentences -> let sentences = Xlist.rev_map sentences (fun p -> - let sentence = parse_sentence timeout test_only_flag mode tokens p.psentence in + let sentence = parse_sentence timeout test_only_flag mode p.pfile_prefix tokens p.psentence in {p with psentence=sentence}) in StructParagraph(List.rev sentences) | AltParagraph l -> diff --git a/parser/execTypes.ml b/parser/execTypes.ml index f6841fc..ab8c868 100644 --- a/parser/execTypes.ml +++ b/parser/execTypes.ml @@ -20,7 +20,7 @@ type status = Idle | PreprocessingError | LexiconError | ParseError | ParseTimeout | Parsed | TooManyNodes | NotParsed | NotReduced | ReductionError | SemError | NotTranslated type eniam_parse_result = { - id: string; + file_prefix: string; status: status; msg: string; lex_time: float; @@ -35,7 +35,7 @@ type eniam_parse_result = { } type conll_parse_result = { - id: string; + file_prefix: string; status: status; msg: string; lex_time: float; @@ -54,13 +54,13 @@ type conll_parse_result = { } type mode = - Raw | Struct | CONLL | ENIAM | Mate + Raw | Struct | CONLL | ENIAM | Mate | Swigra | POLFIE type sentence = RawSentence of string (* | CONLL of conll list *) - | StructSentence of string * (int * int * int) list * int (* file_prefix * (id * lnode * rnode) list * last *) - | DepSentence of string * (int * int * string) array (* file_prefix * (id * super * label) conll_id *) + | StructSentence of (int * int * int) list * int (* (id * lnode * rnode) list * last *) + | DepSentence of (int * int * string) array (* (id * super * label) conll_id *) | QuotedSentences of paragraph_record list (* | NKJP1M of nkjp1m list *) (* | Skladnica of skladnica_tree *) @@ -68,7 +68,7 @@ type sentence = | ENIAMSentence of eniam_parse_result | CONLLSentence of conll_parse_result -and paragraph_record = {pid: string; pbeg: int; plen: int; pnext: int; psentence: sentence} (* beg i len liczone po znakach unicode ( * 100 ???) *) +and paragraph_record = {pid: string; pbeg: int; plen: int; pnext: int; psentence: sentence; pfile_prefix: string} (* beg i len liczone po znakach unicode ( * 100 ???) *) and paragraph = RawParagraph of string diff --git a/parser/pipe.ml b/parser/pipe.ml index 3df2897..8cb246c 100644 --- a/parser/pipe.ml +++ b/parser/pipe.ml @@ -118,9 +118,9 @@ let lcg_process query = let _ = Unix.shutdown_connection ic in () -(* let _ = +let _ = if Array.length Sys.argv < 2 then print_endline "missing argument" else - lcg_process Sys.argv.(1) *) + lcg_process Sys.argv.(1) (* FIXME: parser dziwnie się zachowuje dla 'ścieżki anomalia.' 'ścieżki anomalia. GG' itp. - nie parsuje '.' a jak sparsuje to nie chce redukować *) @@ -210,7 +210,7 @@ let process_conll_corpus filename = let _ = (* process_conll_corpus "../../NLP resources/Skladnica-zaleznosciowa-mod_130121.conll"; *) (* process_conll_corpus "../../NLP resources/skladnica_zaleznosciowa.conll"; *) - process_conll_corpus "../testy/skladnica-test1.conll"; + (* process_conll_corpus "../testy/skladnica-test1.conll"; *) () (* TO DO: @@ -227,7 +227,8 @@ let _ = - assign_not_parsed - sprawdzenie zerowania globalnych referencji przy parsowaniu korpusu - mateParser - 2016.10.19 + 2016.10.22 + - przerobić AltSentence tak by prefix nazw plików był jego elementem, albo wstawić liczbę z prefiksu do paragraph_record *) diff --git a/parser/visualization.ml b/parser/visualization.ml index 6d8c1be..9a21b45 100644 --- a/parser/visualization.ml +++ b/parser/visualization.ml @@ -640,6 +640,8 @@ let string_of_mode = function | CONLL -> "CONLL" | ENIAM -> "ENIAM" | Mate -> "Mate" + | Swigra -> "Swigra" + | POLFIE -> "POLFIE" (*let rec string_of_sentence = function RawSentence s -> sprintf "RawSentence(%s)" s @@ -742,30 +744,30 @@ let html_of_eniam_sentence path tokens (result : eniam_parse_result) = (* | PreprocessingError -> "error_pre: %s\n" result.msg *) | LexiconError -> sprintf "error_lex: %s\n" result.msg | ParseError -> - create_latex_chart path (result.id ^ "_chart") result.chart; + create_latex_chart path (result.file_prefix ^ "_chart") result.chart; sprintf "error_parse: %s\n" result.msg ^ - sprintf "<BR><A HREF=\"%s_chart.pdf\">Chart</A>\n" result.id + sprintf "<BR><A HREF=\"%s_chart.pdf\">Chart</A>\n" result.file_prefix | ParseTimeout -> - create_latex_chart path (result.id ^ "_chart") result.chart; + create_latex_chart path (result.file_prefix ^ "_chart") result.chart; sprintf "timeout: %s\n" result.msg ^ - sprintf "<BR><A HREF=\"%s_chart.pdf\">Chart</A>\n" result.id + sprintf "<BR><A HREF=\"%s_chart.pdf\">Chart</A>\n" result.file_prefix | NotParsed -> - create_latex_chart path (result.id ^ "_chart") result.chart; + create_latex_chart path (result.file_prefix ^ "_chart") result.chart; sprintf "not_parsed: paths_size=%d chart_size=%d\n" result.paths_size result.chart_size ^ - sprintf "<BR><A HREF=\"%s_chart.pdf\">Chart</A>\n" result.id + sprintf "<BR><A HREF=\"%s_chart.pdf\">Chart</A>\n" result.file_prefix | ReductionError -> sprintf "error_reduction: %s\n" result.msg | TooManyNodes -> sprintf "to_many_nodes: paths_size=%d chart_size=%d\n" result.paths_size result.chart_size | NotReduced -> sprintf "not_reduced: paths_size=%d chart_size=%d\n" result.paths_size result.chart_size | SemError -> sprintf "error_sem: %s dependency_tree_size=%d\n" result.msg result.dependency_tree_size (* | NotTranslated -> "not_translated: \n" *) | Parsed -> - print_simplified_dependency_tree path (result.id ^ "_simplified_dependency_tree") tokens result.dependency_tree; - print_dependency_tree path (result.id ^ "_dependency_tree") result.dependency_tree; - LCGlatexOf.print_dependency_tree path (result.id ^ "_dependency_tree_references") result.dependency_tree; + print_simplified_dependency_tree path (result.file_prefix ^ "_simplified_dependency_tree") tokens result.dependency_tree; + print_dependency_tree path (result.file_prefix ^ "_dependency_tree") result.dependency_tree; + LCGlatexOf.print_dependency_tree path (result.file_prefix ^ "_dependency_tree_references") result.dependency_tree; sprintf "parsed: paths_size=%d chart_size=%d dependency_tree_size=%d\n" result.paths_size result.chart_size result.dependency_tree_size ^ - sprintf "<BR><A HREF=\"%s_simplified_dependency_tree.png\">Simplified Dependency Tree</A>\n" result.id ^ - sprintf "<BR><A HREF=\"%s_dependency_tree.png\">Dependency Tree</A>\n" result.id ^ - sprintf "<BR><A HREF=\"%s_dependency_tree_references.pdf\">Dependency Tree References</A>\n" result.id + sprintf "<BR><A HREF=\"%s_simplified_dependency_tree.png\">Simplified Dependency Tree</A>\n" result.file_prefix ^ + sprintf "<BR><A HREF=\"%s_dependency_tree.png\">Dependency Tree</A>\n" result.file_prefix ^ + sprintf "<BR><A HREF=\"%s_dependency_tree_references.pdf\">Dependency Tree References</A>\n" result.file_prefix | _ -> failwith "html_of_eniam_sentence" let html_of_conll_sentence path tokens (result : conll_parse_result) = @@ -774,46 +776,46 @@ let html_of_conll_sentence path tokens (result : conll_parse_result) = (* | PreprocessingError -> "error_pre: %s\n" result.msg *) | LexiconError -> sprintf "error_lex: %s\n" result.msg | ParseError -> - create_latex_dep_chart path (result.id ^ "_dep_chart") result.dep_chart; - create_latex_parsed_dep_chart path (result.id ^ "_parsed_dep_chart") result.parsed_dep_chart; + create_latex_dep_chart path (result.file_prefix ^ "_dep_chart") result.dep_chart; + create_latex_parsed_dep_chart path (result.file_prefix ^ "_parsed_dep_chart") result.parsed_dep_chart; sprintf "error_parse: %s\n" result.msg ^ - sprintf "<BR><A HREF=\"%s_dep_chart.pdf\">Chart</A>\n" result.id ^ - sprintf "<BR><A HREF=\"%s_parsed_dep_chart.pdf\">Parsed Chart</A>\n" result.id + sprintf "<BR><A HREF=\"%s_dep_chart.pdf\">Chart</A>\n" result.file_prefix ^ + sprintf "<BR><A HREF=\"%s_parsed_dep_chart.pdf\">Parsed Chart</A>\n" result.file_prefix | ParseTimeout -> - create_latex_dep_chart path (result.id ^ "_dep_chart") result.dep_chart; - create_latex_parsed_dep_chart path (result.id ^ "_parsed_dep_chart") result.parsed_dep_chart; + create_latex_dep_chart path (result.file_prefix ^ "_dep_chart") result.dep_chart; + create_latex_parsed_dep_chart path (result.file_prefix ^ "_parsed_dep_chart") result.parsed_dep_chart; sprintf "timeout: %s\n" result.msg ^ - sprintf "<BR><A HREF=\"%s_dep_chart.pdf\">Chart</A>\n" result.id ^ - sprintf "<BR><A HREF=\"%s_parsed_dep_chart.pdf\">Parsed Chart</A>\n" result.id + sprintf "<BR><A HREF=\"%s_dep_chart.pdf\">Chart</A>\n" result.file_prefix ^ + sprintf "<BR><A HREF=\"%s_parsed_dep_chart.pdf\">Parsed Chart</A>\n" result.file_prefix | NotParsed -> - create_latex_dep_chart path (result.id ^ "_dep_chart") result.dep_chart; - create_latex_not_parsed_dep_chart path (result.id ^ "_not_parsed_dep_chart") result.not_parsed_dep_chart; + create_latex_dep_chart path (result.file_prefix ^ "_dep_chart") result.dep_chart; + create_latex_not_parsed_dep_chart path (result.file_prefix ^ "_not_parsed_dep_chart") result.not_parsed_dep_chart; sprintf "not_parsed\n" ^ - sprintf "<BR><A HREF=\"%s_dep_chart.pdf\">Chart</A>\n" result.id ^ - sprintf "<BR><A HREF=\"%s_not_parsed_dep_chart.pdf\">Not Parsed Chart</A>\n" result.id + sprintf "<BR><A HREF=\"%s_dep_chart.pdf\">Chart</A>\n" result.file_prefix ^ + sprintf "<BR><A HREF=\"%s_not_parsed_dep_chart.pdf\">Not Parsed Chart</A>\n" result.file_prefix | ReductionError -> sprintf "error_reduction: %s\n" result.msg | TooManyNodes -> sprintf "to_many_nodes: paths_size=%d\n" result.paths_size | NotReduced -> - LCGlatexOf.print_dependency_tree path (result.id ^ "_dependency_tree_references") result.dependency_tree; + LCGlatexOf.print_dependency_tree path (result.file_prefix ^ "_dependency_tree_references") result.dependency_tree; sprintf "not_reduced: paths_size=%d\n" result.paths_size ^ - sprintf "<BR><A HREF=\"%s_dependency_tree_references.pdf\">Dependency Tree References</A>\n" result.id + sprintf "<BR><A HREF=\"%s_dependency_tree_references.pdf\">Dependency Tree References</A>\n" result.file_prefix | SemError -> sprintf "error_sem: %s dependency_tree_size=%d\n" result.msg result.dependency_tree_size (* | NotTranslated -> "not_translated: \n" *) | Parsed -> - print_simplified_dependency_tree path (result.id ^ "_simplified_dependency_tree") tokens result.dependency_tree; - print_dependency_tree path (result.id ^ "_dependency_tree") result.dependency_tree; - LCGlatexOf.print_dependency_tree path (result.id ^ "_dependency_tree_references") result.dependency_tree; + print_simplified_dependency_tree path (result.file_prefix ^ "_simplified_dependency_tree") tokens result.dependency_tree; + print_dependency_tree path (result.file_prefix ^ "_dependency_tree") result.dependency_tree; + LCGlatexOf.print_dependency_tree path (result.file_prefix ^ "_dependency_tree_references") result.dependency_tree; sprintf "parsed: paths_size=%d dependency_tree_size=%d\n" result.paths_size result.dependency_tree_size ^ - sprintf "<BR><A HREF=\"%s_simplified_dependency_tree.png\">Simplified Dependency Tree</A>\n" result.id ^ - sprintf "<BR><A HREF=\"%s_dependency_tree.png\">Dependency Tree</A>\n" result.id ^ - sprintf "<BR><A HREF=\"%s_dependency_tree_references.pdf\">Dependency Tree References</A>\n" result.id + sprintf "<BR><A HREF=\"%s_simplified_dependency_tree.png\">Simplified Dependency Tree</A>\n" result.file_prefix ^ + sprintf "<BR><A HREF=\"%s_dependency_tree.png\">Dependency Tree</A>\n" result.file_prefix ^ + sprintf "<BR><A HREF=\"%s_dependency_tree_references.pdf\">Dependency Tree References</A>\n" result.file_prefix | _ -> failwith "html_of_conll_sentence" let rec html_of_sentence path tokens = function RawSentence s -> s - | StructSentence(_,paths,last) -> html_of_struct_sentence tokens paths last - | DepSentence(_,paths) -> html_of_dep_sentence tokens paths + | StructSentence(paths,last) -> html_of_struct_sentence tokens paths last + | DepSentence(paths) -> html_of_dep_sentence tokens paths | ENIAMSentence result -> html_of_eniam_sentence path tokens result | CONLLSentence result -> html_of_conll_sentence path tokens result | QuotedSentences sentences -> diff --git a/pre/preProcessing.ml b/pre/preProcessing.ml index 3e97784..db2f025 100644 --- a/pre/preProcessing.ml +++ b/pre/preProcessing.ml @@ -614,12 +614,13 @@ let parse_text = function AltParagraph[Raw,RawParagraph paragraph; Struct,StructParagraph sentences]) in AltText[Raw,RawText query; Struct,StructText(List.rev paragraphs, tokens)] | AltText[Raw,RawText query;CONLL,StructText([ - StructParagraph[{psentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence(_,dep_paths)]} as p]],tokens)] -> + StructParagraph[{psentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence dep_paths]} as p]],tokens)] -> parse_conll tokens dep_paths; let paths = parse query in let sentences = PreSentences.split_into_sentences query tokens paths in + let m_dep_paths = Array.map (fun (id,_,_) -> id,-1,"") dep_paths in let conll = StructParagraph[{p with psentence = AltSentence[Raw, RawSentence text; - Mate, DepSentence("M",dep_paths); CONLL, DepSentence("C",dep_paths)]}] in + Mate, DepSentence m_dep_paths; CONLL, DepSentence dep_paths]}] in AltText[Raw,RawText query; Struct, StructText([ AltParagraph[Raw,RawParagraph query; ENIAM, StructParagraph sentences; CONLL, conll]],tokens)] | _ -> failwith "parse_text: not implemented" diff --git a/pre/preSentences.ml b/pre/preSentences.ml index 2745985..abe5073 100644 --- a/pre/preSentences.ml +++ b/pre/preSentences.ml @@ -147,17 +147,15 @@ let find_tokens_in_chart tokens chart lnode rnode cat = let rec add_struct_sentence_ids_rec n sentences = Xlist.fold sentences ([],n) (fun (l,n) -> function - {psentence=AltSentence[Raw,s;ENIAM,StructSentence(_,paths,last)]} as p -> - {p with psentence=AltSentence[Raw,s;ENIAM,StructSentence("E" ^ string_of_int n,paths,last)]} :: l, n+1 - | {psentence=AltSentence[Raw,s;ENIAM,QuotedSentences sentences]} as p -> + {psentence=AltSentence[Raw,s;Struct,QuotedSentences sentences]} as p -> let sentences, n = add_struct_sentence_ids_rec n sentences in - {p with psentence=AltSentence[Raw,s;ENIAM,QuotedSentences (List.rev sentences)]} :: l, n+1 - | _ -> failwith "add_struct_sentence_ids") + {p with psentence=AltSentence[Raw,s;Struct,QuotedSentences (List.rev sentences)]} :: l, n + | p -> {p with pfile_prefix=string_of_int n} :: l, n+1) let add_struct_sentence_ids sentences = match sentences with - [{psentence=AltSentence[Raw,s;ENIAM,StructSentence(_,paths,last)]} as p] -> - [{p with psentence=AltSentence[Raw,s;ENIAM,StructSentence("E",paths,last)]}] + [{psentence=AltSentence[Raw,_;Struct,QuotedSentences _]}] -> List.rev (fst (add_struct_sentence_ids_rec 1 sentences)) + | [p] -> [p] | _ -> List.rev (fst (add_struct_sentence_ids_rec 1 sentences)) let prepare_indexes paths = @@ -181,13 +179,16 @@ let rec extract_sentences_rec tokens id = match t.token with Tokens("sentence",ids) -> let paths,last = make_paths tokens ids in - [{pid=string_of_int id; pbeg=t.beg; plen=t.len; pnext=t.next; + [{pid=string_of_int id; pbeg=t.beg; plen=t.len; pnext=t.next; pfile_prefix=""; psentence=AltSentence[Raw,RawSentence t.orth; - ENIAM,StructSentence("",paths,last)]}] + ENIAM,StructSentence(paths,last); + Mate,RawSentence t.orth; + Swigra,RawSentence t.orth; + POLFIE,RawSentence t.orth]}] | Tokens("quoted_sentences",ids) -> - [{pid=string_of_int id; pbeg=t.beg; plen=t.len; pnext=t.next; + [{pid=string_of_int id; pbeg=t.beg; plen=t.len; pnext=t.next; pfile_prefix=""; psentence=AltSentence[Raw,RawSentence t.orth; - ENIAM,QuotedSentences(List.sort par_compare (List.flatten (Xlist.rev_map ids (extract_sentences_rec tokens))))]}] + Struct,QuotedSentences(List.sort par_compare (List.flatten (Xlist.rev_map ids (extract_sentences_rec tokens))))]}] | _ -> [] let extract_sentences tokens chart last = diff --git a/pre/preTypes.ml b/pre/preTypes.ml index e3d120a..f63ae2a 100644 --- a/pre/preTypes.ml +++ b/pre/preTypes.ml @@ -117,7 +117,7 @@ let empty_token = { lroles="",""; semantics=Normal} type mode = - Raw | Struct | CONLL | ENIAM | Mate + Raw | Struct | CONLL | ENIAM | Mate | Swigra | POLFIE (* warstwy nkjp1m do analizy: header @@ -133,14 +133,14 @@ ann_named type sentence = RawSentence of string (* | CONLL of conll list *) - | StructSentence of string * (int * int * int) list * int (* file_prefix * (id * lnode * rnode) list * last *) - | DepSentence of string * (int * int * string) array (* file_prefix * (id * super * label) conll_id *) + | StructSentence of (int * int * int) list * int (* (id * lnode * rnode) list * last *) + | DepSentence of (int * int * string) array (* (id * super * label) conll_id *) | QuotedSentences of paragraph_record list (* | NKJP1M of nkjp1m list *) (* | Skladnica of skladnica_tree *) | AltSentence of (mode * sentence) list (* string = etykieta np raw, nkjp, krzaki *) -and paragraph_record = {pid: string; pbeg: int; plen: int; pnext: int; psentence: sentence} (* beg i len liczone po znakach unicode ( * 100 ???) *) +and paragraph_record = {pid: string; pbeg: int; plen: int; pnext: int; psentence: sentence; pfile_prefix: string} (* beg i len liczone po znakach unicode ( * 100 ???) *) and paragraph = RawParagraph of string -- libgit2 0.22.2