diff --git a/LCGlexicon/resources/lexicon-pl.dic b/LCGlexicon/resources/lexicon-pl.dic index 4e517a2..3f8307b 100644 --- a/LCGlexicon/resources/lexicon-pl.dic +++ b/LCGlexicon/resources/lexicon-pl.dic @@ -7,7 +7,7 @@ month-lex month-interval year-interval roman roman-interval hour-minute-interval hour-interval obj-id match-result url email day-month day year date hour hour-minute - się nie by s <root> or or2 <colon> <speaker> <speaker-end> <squery> + się nie by s <root> <conll_root> or or2 <colon> <speaker> <speaker-end> <squery> @WEIGHTS symbol_weight=1 @@ -272,6 +272,8 @@ pos=unk: np*number*case*gender*person; # [LCGrenderer.make_frame false tokens lex_sems [] schema_list ["<conll_root>"] d batrs] # | lemma,c,l -> failwith ("process_interp: " ^ lemma ^ ":" ^ c ^ ":" ^ (String.concat ":" (Xlist.map l (String.concat ".")))) in +lemma=<conll_root>,pos=interp: <conll_root>/(ip*T*T*T+cp*int*T+np*sg*voc*T*T+interj); + pos=sinterj: BRACKET interj; lemma=</sentence>,pos=interp: BRACKET s\?(ip*T*T*T+cp*int*T+np*sg*voc*T*T+interj); diff --git a/LCGparser/ENIAM_LCGrules.ml b/LCGparser/ENIAM_LCGrules.ml index d3b1113..25d02f7 100644 --- a/LCGparser/ENIAM_LCGrules.ml +++ b/LCGparser/ENIAM_LCGrules.ml @@ -446,8 +446,8 @@ let backward_cross_composition references args functs = let rules = [ backward_application; forward_application; - backward_cross_composition; - forward_cross_composition; + (* backward_cross_composition; *) + (* forward_cross_composition; *) ] let rec flatten_functor2 l seml = function diff --git a/corpora/CONLL.ml b/corpora/CONLL.ml index 03d565d..98fcbee 100644 --- a/corpora/CONLL.ml +++ b/corpora/CONLL.ml @@ -3,133 +3,55 @@ open ENIAMsubsyntaxTypes open ENIAMtokenizerTypes let alternative_string f mode alts = if List.exists (fun (m,_) -> mode = m) alts - then f (snd @@ List.find (fun (m,_) -> m = mode) alts) - else f (snd @@ List.find (fun (m,_) -> m = Struct) alts) - -let string_of_token mode token conll_id super label = - let decompose_lemma = function - | Lemma(a,b,c) -> a,b,if c = [[]] - then "_" - else String.concat "][" @@ Xlist.map c (fun x -> - String.concat "|" @@ Xlist.map x ( fun y -> - String.concat "." 
y)) - | t -> failwith ("string_of_token: not Lemma") in - match mode with - | Raw -> token.orth - | Struct -> failwith ("function string_of_token for mode Struct is not defined") - | CONLL -> let lemma,cat,interp = decompose_lemma token.token in - String.concat "\t" [string_of_int conll_id; - token.orth; lemma; cat; cat; interp; "_"; "_"; - string_of_int token.beg; string_of_int token.len] - | Mate -> let lemma,cat,interp = decompose_lemma token.token in - String.concat "\t" [string_of_int conll_id; - token.orth; lemma; lemma; cat; cat; interp; interp; "_"; "_"; "_"; "_"; "_"; "_"] - | _ -> failwith "string_of_token: ni" - -let string_of_paths mode tokens paths = - let l = Int.fold 1 (Array.length paths - 1) [] (fun l conll_id -> - let id,super,label = paths.(conll_id) in - (string_of_token mode (ExtArray.get tokens id) conll_id super label) :: l) in - String.concat "\n" (List.rev l) ^ "\n\n" - -let rec string_of_sentence mode tokens = function - RawSentence s -> if mode = Raw then s else "" - | StructSentence (tokens, _) -> failwith ("string_of_sentence: StructSentence") (*String.concat "\n" @@ Xlist.map tokens (fun x -> string_of_token mode x)*) - | DepSentence (paths) -> string_of_paths mode tokens paths - | QuotedSentences _ -> failwith ("string_of_sentence: QuotedSentences") - | AltSentence alts -> alternative_string (string_of_sentence mode tokens) mode alts - -let string_of_p_record mode tokens p_record = - (if p_record.id = "" then "" else p_record.id ^ "\n") ^ - string_of_sentence mode tokens p_record.sentence - -(*let rec string_of_paragraph mode tokens = function - RawParagraph s -> if mode = Raw then s else "" - | StructParagraph p_records -> String.concat "\n\n" @@ Xlist.map p_records (string_of_p_record mode tokens) - | AltParagraph alts -> alternative_string (string_of_paragraph mode) mode alts - -let rec string_of_text mode tokens = function - RawText s -> if mode = Raw then s else "" - | StructText (paragraphs,_) -> String.concat "\n\n" @@ Xlist.map paragraphs (string_of_paragraph mode tokens) - | AltText alts -> alternative_string (string_of_text mode) mode alts*) - - -(******************) -(*** -let establish_next tokens paths = - let n = ExtArray.size tokens in - Int.iter 1 (n - 2) (fun i -> - let f = ExtArray.get tokens i in - let s = ExtArray.get tokens (i+1) in - ExtArray.set tokens i {f with next = s.beg}); - let last = ExtArray.get tokens (n-1) in - ExtArray.set tokens (n-1) {last with next = last.beg + last.len} - - - (*let rec pom res = function - h :: t -> let next = if res = [] - then h.beg+h.len - else (List.hd res).beg in - pom ({h with next = next} :: res) t - | [] -> res in - pom [] rev_tokens*) - -let rec establish_for_token i text tokens = function - (id,_,_) :: t as l-> - let h = ExtArray.get tokens id in - if Xstring.check_prefix " " text - then establish_for_token (i+100) (Xstring.cut_prefix " " text) tokens l - else if Xstring.check_prefix h.orth text - then - let n = (List.length @@ Xunicode.utf8_chars_of_utf8_string h.orth) * 100 in - let n_h = {h with beg = i ; len = n} in - ExtArray.set tokens id n_h; - establish_for_token (i+n) (Xstring.cut_prefix h.orth text) tokens t - else failwith ("establish_for_token :" ^ h.orth ^ " " ^ text) - | [] -> 100, i - -let rec establish_lengths text paths tokens = - let pbeg, plen = establish_for_token 100 text tokens (List.tl (Array.to_list paths)) in - establish_next tokens paths; - pbeg, plen-100 - -(******************) - -exception ErrorInfoFile of string - -let info_file = "../corpora/info_sentences.txt" - 
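(* Annotation (not part of the patch): string_of_token above serializes one
   token per line.  A minimal standalone sketch of the ten tab-separated
   fields produced by its CONLL branch, using only the standard library and
   invented sample values, is:

     let sample_conll_row () =
       let conll_id = 1 and orth = "domu" and lemma = "dom" and cat = "subst" in
       let interp = "sg.gen.m3" and beg = 100 and len = 400 in
       String.concat "\t"
         [string_of_int conll_id; orth; lemma; cat; cat; interp; "_"; "_";
          string_of_int beg; string_of_int len]

   which evaluates to "1\tdomu\tdom\tsubst\tsubst\tsg.gen.m3\t_\t_\t100\t400".
   The Mate branch emits fourteen columns instead, by doubling lemma, cat and
   interp and padding with six "_" fields. *)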
-let info = Xstring.split "\n\n" @@ File.load_file_gen info_file - -let add_to_map map info_str = - match Xstring.split "\n" info_str with - [id; text; info_token] -> StringMap.add map info_token (id, text) - | _ -> raise (ErrorInfoFile info_str) - -let info_map = - Xlist.fold info StringMap.empty add_to_map - -let match_sentence (p_record,tokens) = - let rec info_token s = match s with - RawSentence text -> failwith ("match_sentence: " ^ text) - | StructSentence (tokens, n) -> failwith ("match_sentence: StructSentence") (*String.concat " " @@ List.map (fun x -> x.orth) tokens*) - | DepSentence (paths) -> String.concat " " @@ List.map (fun (id,_,_) -> (ExtArray.get tokens id).orth) (List.tl (Array.to_list paths)), paths - | QuotedSentences _ -> failwith ("match_sentence: QuotedSentences") - | AltSentence alts -> failwith ("match_sentence: AltSentence") - (*if List.exists (fun (mode, s) -> mode = CONLL) alts - then info_token (snd (List.find (fun (mode, s) -> mode = CONLL) alts)) - else failwith ("match_sentence: no CONLL mode in AltSentence")*) in - let info_token, paths = info_token p_record.psentence in - try - let id, text = StringMap.find info_map info_token in - let beg, len = establish_lengths text paths tokens (* -1, -1, p_record.psentence *) in - AltText[Raw,RawText text;CONLL,StructText([StructParagraph[{pid = id; pbeg = beg; plen = len; pnext = beg+len; pfile_prefix=""; - psentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence paths]}]],tokens)] -(* {s_id = id; s_text = text; s_tokens = sentence.s_tokens} *) - with _ -> AltText[CONLL,StructText([StructParagraph[p_record]],tokens)] - -let match_corpus corpus = - Xlist.map corpus match_sentence***) + then f (snd @@ List.find (fun (m,_) -> m = mode) alts) + else f (snd @@ List.find (fun (m,_) -> m = Struct) alts) + +let string_of_token mode token conll_id super label = + let decompose_lemma = function + | Lemma(a,b,c) -> a,b,if c = [[]] + then "_" + else String.concat "][" @@ Xlist.map c (fun x -> + String.concat "|" @@ Xlist.map x ( fun y -> + String.concat "." 
y)) + | t -> failwith ("string_of_token: not Lemma") in + match mode with + | Raw -> token.orth + | Struct -> failwith ("function string_of_token for mode Struct is not defined") + | CONLL -> let lemma,cat,interp = decompose_lemma token.token in + String.concat "\t" [string_of_int conll_id; + token.orth; lemma; cat; cat; interp; "_"; "_"; + string_of_int token.beg; string_of_int token.len] + | Mate -> let lemma,cat,interp = decompose_lemma token.token in + String.concat "\t" [string_of_int conll_id; + token.orth; lemma; lemma; cat; cat; interp; interp; "_"; "_"; "_"; "_"; "_"; "_"] + | _ -> failwith "string_of_token: ni" + +let string_of_paths mode tokens paths = + let l = Int.fold 1 (Array.length paths - 1) [] (fun l conll_id -> + let id,super,label = paths.(conll_id) in + (string_of_token mode (ExtArray.get tokens id) conll_id super label) :: l) in + String.concat "\n" (List.rev l) ^ "\n\n" + +let rec string_of_sentence mode tokens = function + RawSentence s -> if mode = Raw then s else "" + | StructSentence (tokens, _) -> failwith ("string_of_sentence: StructSentence") (*String.concat "\n" @@ Xlist.map tokens (fun x -> string_of_token mode x)*) + | DepSentence (paths) -> string_of_paths mode tokens paths + | QuotedSentences _ -> failwith ("string_of_sentence: QuotedSentences") + | AltSentence alts -> alternative_string (string_of_sentence mode tokens) mode alts + +let string_of_p_record mode tokens p_record = + (if p_record.id = "" then "" else p_record.id ^ "\n") ^ + string_of_sentence mode tokens p_record.sentence + +(*let rec string_of_paragraph mode tokens = function + RawParagraph s -> if mode = Raw then s else "" + | StructParagraph p_records -> String.concat "\n\n" @@ Xlist.map p_records (string_of_p_record mode tokens) + | AltParagraph alts -> alternative_string (string_of_paragraph mode) mode alts + +let rec string_of_text mode tokens = function + RawText s -> if mode = Raw then s else "" + | StructText (paragraphs,_) -> String.concat "\n\n" @@ Xlist.map paragraphs (string_of_paragraph mode tokens) + | AltText alts -> alternative_string (string_of_text mode) mode alts*) (******************) @@ -207,15 +129,6 @@ let establish_next tokens paths = let last = ExtArray.get tokens (n-1) in ExtArray.set tokens (n-1) {last with next = last.beg + last.len} - - (*let rec pom res = function - h :: t -> let next = if res = [] - then h.beg+h.len - else (List.hd res).beg in - pom ({h with next = next} :: res) t - | [] -> res in - pom [] rev_tokens*) - let rec establish_for_token i text tokens = function (id,_,_) :: t as l-> let h = ExtArray.get tokens id in @@ -245,15 +158,15 @@ exception ErrorInfoFile of string let info_file = "../corpora/info_sentences2.txt" -let info = Xstring.split "\n\n" @@ File.load_file_gen info_file +let info () = Xstring.split "\n\n" @@ File.load_file_gen info_file let add_to_map map info_str = match Xstring.split "\n" info_str with [id; text; info_token] -> StringMap.add map info_token (id, text) | _ -> raise (ErrorInfoFile info_str) -let info_map = - Xlist.fold (List.tl info) StringMap.empty add_to_map +let info_map () = + Xlist.fold (List.tl (info ())) StringMap.empty add_to_map let match_sentence (p_record,tokens) = let rec info_token s = match s with @@ -268,7 +181,7 @@ let match_sentence (p_record,tokens) = let info_token, paths = info_token p_record.sentence in (* try *) let id, text = try - StringMap.find info_map info_token + StringMap.find (info_map ()) info_token with | _ -> p_record.id, get_text tokens in let beg, len = establish_lengths text paths 
tokens (* -1, -1, p_record.psentence *) in @@ -282,7 +195,7 @@ let match_corpus corpus = [] -> [] | a::l -> try let r = f a in r :: pom f l - with e -> (*print_endline (Printexc.to_string e);*) pom f l in + with e -> pom f l in pom match_sentence corpus (******************) @@ -304,7 +217,6 @@ let load_token in_channel = else [Xlist.map (Xstring.split_delim "|" interp) (fun tag -> [tag])] in {empty_token_env with orth = orth; token = Lemma(lemma,cat,interp);}, int_of_string id, int_of_super super, label in let line = input_line in_channel in - (* print_endline ("load_token: " ^ line); *) if line = "" then raise Empty_line else if line.[0] = '#' @@ -329,30 +241,19 @@ let load_token in_channel = let label = Xstring.cut_sufix "_" label_err in n_token id orth lemma cat interp super label) | _ -> failwith ("load_token: " ^ line) -(* {c_id = List.nth pom 1; - c_lemma = List.nth pom 2; - c_cat = List.nth pom 3; - c_interp = (let interp = List.nth pom 5 in - if interp = "_" - then [] - else Str.split (Str.regexp "|") interp); - c_super = -1; c_label = ""; c_beg = -1; c_len = -1} *) let load_sentence in_channel = let tokens = ExtArray.make 100 empty_token_env in let _ = ExtArray.add tokens {empty_token_env with token = Interp "<conll_root>"} in let rec pom rev_paths id = - (* print_endline "pom 1"; *) try - (* print_endline "pom 2"; *) let token, conll_id, super, label = load_token in_channel in let id_a = ExtArray.add tokens token in if id_a <> conll_id then failwith "load_sentence: different ids" else - (* print_endline "pom 3"; *) pom ((id_a,super,label) :: rev_paths) id - with Id_line new_id -> (*print_endline "pom 4";*)pom rev_paths new_id - | Empty_line -> (*print_endline "pom 5";*)rev_paths, id - | End_of_file -> (*print_endline "pom 6";*)if rev_paths = [] + with Id_line new_id -> pom rev_paths new_id + | Empty_line -> rev_paths, id + | End_of_file -> if rev_paths = [] then raise End_of_file else rev_paths, id in let rev_paths, id = pom [] "" in @@ -366,4 +267,4 @@ let load_corpus in_channel = pom ((conll_sentence, tokens) :: res) with End_of_file -> res | e -> prerr_endline (Printexc.to_string e); res in - (* match_corpus @@ *) List.rev @@ pom [] + List.rev @@ pom [] diff --git a/corpora/CONLL_adapter.ml b/corpora/CONLL_adapter.ml index e07a335..7805584 100644 --- a/corpora/CONLL_adapter.ml +++ b/corpora/CONLL_adapter.ml @@ -1,47 +1,371 @@ +open Xstd +open ENIAMsubsyntaxTypes +open ENIAMtokenizerTypes -let convert_dep_tree id first_try paths tokens lex_sems = - let do_if cond f paths = if cond then f paths tokens else paths in +let if_lemma lemmas = function + Lemma(l,_,_) -> List.exists (fun x -> x = l) lemmas + | _ -> false + +let if_cat cats = function + Lemma(_,cat,_) -> List.exists (fun x -> x = cat) cats + | _ -> false + +let if_interps interps token = + let interp = match token with + Lemma(_,_,i) -> i + | _ -> [[[]]] in + let if_interp nr value = + List.exists (fun x -> + try + List.exists (fun y -> + y = value) (List.nth x nr) + with _ -> false + ) interp in + Xlist.fold interps true (fun acc (nr,value) -> acc && (if_interp nr value)) + +let correct_coordination1 paths tokens = + let paths_ls = List.mapi (fun i (id,super,label) -> + (i,id,super,label)) (Array.to_list paths) in + + let l = [("subst:nom",0),(["fin";"praet"],0); + ("subst:acc",0),(["inf"],0); + ("ppron3:nom",0),(["fin";"praet"],0); + ("ppron3:acc",0),(["fin";"praet"],0); + ("adv",0),(["fin";"praet"],0); + ("adv",0),(["inf"],0); + ("adv",0),(["adj"],0); + ("prep",0),(["fin";"praet"],0); + ("prep",0),(["inf"],0); + 
("prep",0),(["ppas"],0); + ("prep",0),(["subst"],0); + ("prep:gen",0),(["subst:gen"],0); + ("adj:nom",0),(["fin";"praet"],0); + ("adj:nom",0),(["subst:nom"],0); + ("adj:gen",0),(["subst:gen"],0); + ("adj:dat",0),(["subst:dat"],0); + ("adj:acc",0),(["subst:acc"],0); + ("adj:inst",0),(["subst:inst"],0); + ("adj:loc",0),(["subst:loc"],0); + ("subst:gen",0),(["subst:nom"],0); + (* ("subst:gen",0),(["subst:gen"],0); *) + ("subst:gen",0),(["subst:dat"],0); + ("subst:gen",0),(["subst:acc"],0); + ("subst:gen",0),(["subst:inst"],0); + ("subst:gen",0),(["subst:loc"],0); + ("ppron3:gen",0),(["subst:nom"],0); + ("ppron3:gen",0),(["subst:dat"],0); + ("ppron3:gen",0),(["subst:acc"],0); + ("ppron3:gen",0),(["subst:inst"],0); + ("ppron3:gen",0),(["subst:loc"],0); + ("qub",0),(["fin";"praet"],0); + ("qub",0),(["subst"],0); + ("qub",0),(["adj"],0); + ("pact",0),(["subst"],0); + ("ppas",0),(["subst"],0) + ] in + + let find_dependents sons = + + let is (i,id,super,label) pattern = match Xstring.split ":" pattern with + ["prep";case] -> if_cat ["prep"] (ExtArray.get tokens id).token && + if_interps [0,case] (ExtArray.get tokens id).token + | [cat;case] -> if_cat [cat] (ExtArray.get tokens id).token && + if_interps [1,case] (ExtArray.get tokens id).token + | [cat] -> if_cat [cat] (ExtArray.get tokens id).token + | _ -> failwith "is (in correct_coordination1)" in + + let incr_representative acc son = Xlist.map acc (fun ((one,a),(rest,b)) -> + if is son one + then (one,a + 1), (rest,b) + else if List.exists (is son) rest + then (one,a), (rest,b + 1) + else (one,a), (rest,b)) in + + let get_from sons pattern = List.find (fun x -> is x pattern) sons in + + let l = Xlist.fold sons l incr_representative in + let results = List.filter (fun ((_,a),(_,b)) -> a = 1 && b > 1) l in + Xlist.map results (fun result -> + get_from sons @@ fst @@ fst result, + List.filter (fun son -> + List.exists (fun one -> is son one) (fst (snd result))) sons) in + + let establish_neighbour super ((i_d,id_d,super_d,label_d),sons) = + let not_between (i_s,_,_,_) = + (super < i_d && super < i_s) || + (super > i_d && super > i_s) in + let (i_n,id_n,super_n,label_n) = List.find (fun son -> + not_between son) sons in + paths.(i_d) <- (id_d, i_n, label_d) in + + let examine_coords (i,id,super,label) sons = + try + let dependents = find_dependents sons in + Xlist.iter dependents (establish_neighbour super) + with + | _ -> () in + + Array.iteri (fun i (id,super,label) -> + if if_cat ["conj"] (ExtArray.get tokens id).token + then (let sons = List.filter (fun (_,_,super,_) -> super = i) paths_ls in + if (List.length sons > 2) + then examine_coords (i,id,super,label) sons)) paths; + paths + +let correct_coordination2 paths tokens = + let paths_c = Array.copy paths in + let paths_ls () = List.mapi (fun i (id,super,label) -> + (i,id,super,label)) (Array.to_list paths_c) in + + (* let ps a sons = + print_endline a; + List.iter (fun (i,_,_,_) -> print_endline (ExtArray.get tokens i).orth) sons; + print_endline "" in *) + + let rec correct_rec (i,id,super,label) sons = + let left_s, right_s = List.partition (fun (a,b,c,d) -> a < i) sons in + (* ps "left:" (List.rev left_s); + ps "right:" right_s; *) + find_father i (List.rev left_s); + find_father i right_s + + and find_father i0 = function + [(i,id,super,label)] -> paths_c.(i) <- (id,i0,label) + | (a,b,c,d) :: (i,id,super,label) :: t -> + paths_c.(i) <- (id,i0,label); + if not (if_cat ["conj"] (ExtArray.get tokens i).token || + (ExtArray.get tokens i).orth = ",") + then failwith "find_father"; + 
correct_rec (i,id,super,label) (if a < i + then (a,b,c,d) :: t + else List.rev @@ (a,b,c,d) :: t) + | _ -> failwith "find_father" in + + let check_previous_for_interp i = + if i >= 0 && (ExtArray.get tokens i).orth = "," && + not (List.exists (fun (_,super,_) -> super = i) (Array.to_list paths_c)) + then paths_c.(i) <- (0,-1,"") in + + Array.iteri (fun i (id,super,label) -> + if if_cat ["conj"] (ExtArray.get tokens i).token || + (ExtArray.get tokens i).orth = "," + then + (check_previous_for_interp (i-1); + let sons = List.filter (fun (_,_,super,_) -> super = i) (paths_ls ()) in + if (List.length sons > 2) + then correct_rec (i,id,super,label) sons)) paths_c; + paths_c + +let praet_qub_aglt paths tokens = + Array.iteri (fun i (id,super,label) -> + if super >= 0 then + (let id_s, super_s, label_s = paths.(super) in + if if_cat ["aglt"] (ExtArray.get tokens id).token && + (ExtArray.get tokens id_s).orth = "by" + then let id_gf,super_gf,label_gf = paths.(super_s) in + if if_cat ["praet"] (ExtArray.get tokens id_gf).token + then paths.(i) <- (id,super_s,label))) paths; + paths + +let replace_tokens paths tokens = +(* for i = 0 to ExtArray.size tokens - 1 do + print_endline (string_of_int i ^ ": "^ (ExtArray.get tokens i).orth) +done; *) + let find_token orth = Int.fold 0 (ExtArray.size tokens - 1) 0 (fun acc i -> + if (ExtArray.get tokens i).orth = orth then i else acc) in + + let multidot i id0 super0 label0 = + let id1, super1, label1 = paths.(super0) in + if super1 >= 0 then + let id2, super2, label2 = paths.(super1) in + if (ExtArray.get tokens id1).orth = "." && + (ExtArray.get tokens id2).orth = "." + then + (paths.(super1) <- (find_token "..." ,super2, label2); + paths.(super0) <- (0,-1,""); + paths.(i) <- (0,-1,"")) in + + let brev i id super label = + let if_the_last_dot () = + let (id_dot, s_dot, l_dot) = List.find (fun (i2,s,l) -> + s = i && ((ExtArray.get tokens i2).orth = "." || (ExtArray.get tokens i2).orth = "...")) (Array.to_list paths) in + Array.fold_left (fun acc (i2,s,l) -> + acc && (ExtArray.get tokens i2).beg <= (ExtArray.get tokens id_dot).beg) true paths in + + let dot = if if_interps [0,"npun"] (ExtArray.get tokens id).token || if_the_last_dot () + then "" + else "." in + + let n_orth = (ExtArray.get tokens id).orth ^ dot in + paths.(i) <- (find_token n_orth,super,label) in + + Array.iteri (fun i (id,super,label) -> + if (ExtArray.get tokens id).orth = "." + then multidot i id super label; + if if_cat ["brev"] (ExtArray.get tokens id).token + then brev i id super label) + paths; + paths + +let replace_hyphens paths tokens = + let ref_paths = ref paths in + let find_token token = Int.fold 0 (ExtArray.size tokens - 1) 0 (fun acc i -> + if (ExtArray.get tokens i).token = token then i else acc) in + let find_specific_token token beg next = Int.fold 0 (ExtArray.size tokens - 1) 0 (fun acc i -> + if (ExtArray.get tokens i).token = token && + beg <= (ExtArray.get tokens i).beg && + (ExtArray.get tokens i).next <= next + then i else acc) in + + let correct_last sons_of_zero = (* TODO: synowie zamiast syna *) + let i1,s1,l1 = !ref_paths.(Array.length !ref_paths - 1) in + if (ExtArray.get tokens i1).orth = "." 
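(* Annotation (not part of the patch): the two branches below either reuse
   the final slot of !ref_paths (when the sentence already ends with a full
   stop) or append a fresh entry for the Interp "</sentence>" token; in both
   cases that terminal entry is attached under node 1, and every former
   dependent of the artificial root 0 is then re-attached to it. *)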
+ then + !ref_paths.(Array.length !ref_paths - 1) <- (find_token (Interp "</sentence>"),1,l1) + else + (ref_paths := Array.append !ref_paths [| (find_token (Interp "</sentence>"),1,"-") |]; + !ref_paths.(Array.length !ref_paths - 2) <- (i1,Array.length !ref_paths - 1,l1)); + Xlist.iter sons_of_zero (fun son_of_zero -> + let i2,s2,l2 = !ref_paths.(son_of_zero) in + !ref_paths.(son_of_zero) <- (i2,Array.length !ref_paths - 1,l2)) in + + let one_hyphen sons_of_zero = + let i2,s2,l2 = !ref_paths.(1) in + Xlist.iter sons_of_zero (fun son_of_zero -> + let i1,s1,l1 = !ref_paths.(son_of_zero) in + !ref_paths.(son_of_zero) <- (i1,1,l1)); + !ref_paths.(1) <- (find_token (Interp "<or-sentence>"),0,l2); + correct_last sons_of_zero in + + let two_hyphens first second son parent = + let i1,s1,l1 = !ref_paths.(first) in + let i2,s2,l2 = !ref_paths.(second) in + let beg, next = (ExtArray.get tokens i2).beg, (ExtArray.get tokens i2).next in + let i3,s3,l3 = !ref_paths.(son) in + let i4,s4,l4 = !ref_paths.(parent) in + ref_paths := Array.append !ref_paths [| (find_token (Interp "</sentence>"),first,"-") |]; + !ref_paths.(first) <- (find_token (Interp "<or-sentence>"),0,l1); + !ref_paths.(second) <- (find_specific_token (Interp "</or-sentence>") beg next,first,l2); + !ref_paths.(son) <- (i3,second,l3); + !ref_paths.(parent) <- (i4,first,l4) in + + let rec is_dep_correct a b out zero res i (id,super,label) = (* out = how many words in (a,b) have parent outside [a,b]*) + (* print_endline ((string_of_int a) ^ " " ^ (string_of_int b) ^ " " ^ (string_of_int out) ^ " " ^ (string_of_int zero) ^ " " ^ (string_of_int i)); *) + if out > 1 || zero > 1 || (* zero = how many words (not interps) have parent 0 *) + (a < i && i < b && super < a && label <> "interp") || + (a < super && super < b && (i < a || b < i)) + then false, res + else + if i+1 = Array.length !ref_paths + then out = 1 && zero = 1, res + else + if a < i && i < b && b < super + then is_dep_correct a b (out+1) zero (i,super) (i+1) !ref_paths.(i+1) + else + if super = 0 && not (if_cat ["interp"] (ExtArray.get tokens id).token) + then is_dep_correct a b out (zero+1) res (i+1) !ref_paths.(i+1) + else is_dep_correct a b out zero res (i+1) !ref_paths.(i+1) in + + let hyphens = snd @@ Array.fold_left (fun (i,acc) (id,super,label) -> + if (ExtArray.get tokens id).orth = "-" + then i+1, i :: acc + else i+1, acc) (0,[]) !ref_paths in + + let sons_of_zero = snd @@ Array.fold_left (fun (i,acc) (id,super,label) -> + if super = 0 && not (if_cat ["interp"] (ExtArray.get tokens id).token) + then i+1, i :: acc + else i+1, acc) (0,[]) !ref_paths in + + (if List.length sons_of_zero = 1 + then + if List.length hyphens = 1 && hyphens = [1] + then one_hyphen sons_of_zero + else + if List.length hyphens = 2 + then let a, b = List.nth hyphens 1, List.nth hyphens 0 in + let is_good, (son,parent) = is_dep_correct a b 0 0 (0,0) 1 !ref_paths.(1) in + if a = 1 && is_good + then two_hyphens a b son parent); + !ref_paths + +let correct_interp_with_father_0 paths tokens = + Array.iteri (fun i (id,super,label) -> + if (super = 0 || + (ExtArray.get tokens id).token = Interp "<or-sentence>" || + (ExtArray.get tokens id).token = Interp "</or-sentence>") && (ExtArray.get tokens id).orth = "," + then Array.iteri (fun i1 (id1,super1,label1) -> + if super1 = i + then paths.(i1) <- (id1,0,label1)) paths) paths; + paths + +let remove_interps interp paths tokens = + let paths_ls = Array.to_list paths in + Array.iteri (fun i (id,super,label) -> + if (ExtArray.get tokens id).orth = interp && + not 
(List.exists (fun (_,super,_) -> super = i) paths_ls) + then paths.(i) <- (0,-1,"")) paths; + paths + +let correct_passive_voice paths tokens = + Array.iteri (fun i (id,super,label) -> + if super >= 0 then + (let id_s, super_s, label_s = paths.(super) in + if (if_cat ["praet"] (ExtArray.get tokens id).token && + if_cat ["ppas"] (ExtArray.get tokens id_s).token) + then (paths.(i) <- (id,super_s,label); + paths.(super) <- (id_s,i,label_s); + Array.iteri (fun i_p (id_p,super_p,label_p) -> + if super_p = super + then paths.(i_p) <- (id_p,i,label_p)) paths))) paths; + paths + +let swap_dep paths tokens = + let change_dep i (id,super,label) = + let id_S, super_S, label_S = paths.(super) in + paths.(i) <- (id,super_S,label); + paths.(super) <- (id_S, id, label_S) in + let rec correct_dep i (id,super,label) = + let adv_relators = ["kto";"co";"ile";"czyj";"jaki";"który"; + "jak";"skąd";"dokąd";"gdzie";"którędy";"kiedy";"odkąd";"dlaczego";"czemu";"gdy"] in + if (if_cat ["comp"] (ExtArray.get tokens id).token && + if_cat ["fin"; "praet"; "winien"; "pred"; "imps"; "ppas"] (ExtArray.get tokens super).token) || + (if_cat ["conj"] (ExtArray.get tokens id).token && + if_cat ["fin"; "praet"; "winien"; "pred"; "imps"; "ppas"] (ExtArray.get tokens super).token && + not (List.exists (fun (_,super,_) -> super = i) (Array.to_list paths))) || + (if_cat ["ppron3"] (ExtArray.get tokens id).token && + if_interps [5,"praep"] (ExtArray.get tokens id).token) || + (if_lemma adv_relators (ExtArray.get tokens id).token && + if_cat ["fin"; "praet"; "winien"; "pred"; "imps"; "ppas"; "subst"] (ExtArray.get tokens super).token) + then + change_dep i (id,super,label); + if (if_lemma adv_relators (ExtArray.get tokens id).token && + if_cat ["subst"; "pred"] (ExtArray.get tokens super).token) + then correct_dep i paths.(i) in + Array.iteri correct_dep paths; paths + + (* + correct_coordination1 -> sąsiad słowem najbliższym po prawej, jeśli pomiędzy nim a mną spójnik, to najbliższym po lewej + nieobsługiwana na razie koordynacja strony biernej - zarówno czasowniki posiłkowe, jak i imiesłowy + nieobsługiwana na razie koordynacja podrzędników spójników podrzędnych *) + +let convert_dep_tree id first_try paths tokens = let paths = Array.copy paths in - let paths = do_if first_try TreeChange.replace_tokens paths in - let paths = do_if first_try (TreeChange.remove_interps ".") paths in - let paths = do_if first_try TreeChange.replace_hyphens paths in - let paths = do_if first_try TreeChange.correct_coordination1 paths in - let paths = do_if first_try TreeChange.correct_interp_with_father_0 paths in - let paths = do_if first_try TreeChange.correct_coordination2 paths in - let paths = do_if first_try (TreeChange.remove_interps ",") paths in - let paths = do_if first_try TreeChange.correct_passive_voice paths in - let paths = do_if first_try TreeChange.praet_qub_aglt paths in - let paths = do_if (not first_try) TreeChange.swap_dep paths in - File.file_out ("results/" ^ id ^ "/pre_text_modified_" ^ (string_of_bool first_try) ^ ".html") (fun file -> - fprintf file "%s\n" Visualization.html_header; - fprintf file "%s\n" (Visualization.html_of_dep_sentence tokens paths); - fprintf file "%s\n" Visualization.html_trailer); - (* let paths = do_if first_try TreeChange.replace_tokens paths in - let paths = do_if first_try TreeChange.replace_hyphens paths in - let paths = do_if first_try (TreeChange.remove_interps ".") paths in - let paths = do_if (not first_try) TreeChange.swap_dep paths in - let paths = do_if first_try 
TreeChange.correct_coordination1 paths in - let paths = try - do_if first_try TreeChange.correct_coordination2 paths - with - | _ -> ( - File.file_out ("results/" ^ id ^ "/pre_text_modified_" ^ (string_of_bool first_try) ^ ".html") (fun file -> - fprintf file "%s\n" Visualization.html_header; - fprintf file "%s\n" (Visualization.html_of_dep_sentence tokens paths); - fprintf file "%s\n" Visualization.html_trailer); - do_if first_try TreeChange.correct_interp_with_father_0 paths; - do_if first_try (TreeChange.remove_interps ",") paths; - File.file_out ("results/" ^ id ^ "/pre_text_modified_" ^ (string_of_bool first_try) ^ "2.html") (fun file -> - fprintf file "%s\n" Visualization.html_header; - fprintf file "%s\n" (Visualization.html_of_dep_sentence tokens paths); - fprintf file "%s\n" Visualization.html_trailer); - do_if first_try TreeChange.correct_coordination2 paths) in - let paths = do_if first_try TreeChange.praet_qub_aglt paths in - let paths = do_if first_try TreeChange.correct_interp_with_father_0 paths in - let paths = do_if first_try (TreeChange.remove_interps ",") paths in - let paths = do_if first_try (TreeChange.remove_interps "-") paths in - let paths = do_if first_try TreeChange.correct_passive_voice paths in - File.file_out ("results/" ^ id ^ "/pre_text_modified_" ^ (string_of_bool first_try) ^ ".html") (fun file -> - fprintf file "%s\n" Visualization.html_header; - fprintf file "%s\n" (Visualization.html_of_dep_sentence tokens paths); - fprintf file "%s\n" Visualization.html_trailer); *) + let paths = + if first_try + then + let pom = replace_tokens paths tokens in + let pom = (remove_interps ".") pom tokens in + let pom = replace_hyphens pom tokens in + let pom = correct_coordination1 pom tokens in + let pom = correct_interp_with_father_0 pom tokens in + let pom = correct_coordination2 pom tokens in + let pom = remove_interps "," pom tokens in + let pom = correct_passive_voice pom tokens in + praet_qub_aglt pom tokens + else + swap_dep paths tokens in + (* File.file_out ("results/" ^ id ^ "/pre_text_modified_" ^ (string_of_bool first_try) ^ ".html") (fun file -> + Printf.fprintf file "%s\n" Visualization.html_header; + Printf.fprintf file "%s\n" (Visualization.html_of_dep_sentence tokens paths); + Printf.fprintf file "%s\n" Visualization.html_trailer); *) + paths diff --git a/diagnostics/LCGfields.ml b/corpora/LCGfields.ml index ed76cab..f395bb7 100644 --- a/diagnostics/LCGfields.ml +++ b/corpora/LCGfields.ml @@ -83,7 +83,7 @@ let field_of_dependency_tree str_node fields dep_tree = Array.fold_left (fun acc x -> acc ^ (field_of_linear_term str_node field x) ^ "\n\t\t" ) "" dep_tree)) -let field_of_eniam_sentence fields tokens (result : eniam_parse_result) = +let field_of_eniam_sentence fields (result : eniam_parse_result) = match result.status with Idle -> "Idle" (* | PreprocessingError -> "PreprocessingError" *) @@ -99,7 +99,7 @@ let field_of_eniam_sentence fields tokens (result : eniam_parse_result) = | Parsed -> ignore ("Parsed\n\t\t" ^ (field_of_dependency_tree eniam fields result.dependency_tree)); "Parsed\n" | _ -> failwith "field_of_eniam_sentence" -let field_of_conll_sentence fields tokens (result : conll_parse_result) = +let field_of_conll_sentence fields (result : conll_parse_result) = stat_map := StatMap.add !stat_map result.status; match result.status with Idle -> "Idle" @@ -117,33 +117,36 @@ let field_of_conll_sentence fields tokens (result : conll_parse_result) = | _ -> failwith "field_of_conll_sentence" -let rec field_of_sentence fields tokens = function +let 
rec field_of_sentence fields = function RawSentence s -> s | StructSentence _ -> "StructSentence" | DepSentence _ -> "DepSentence" - | ENIAMSentence result -> field_of_eniam_sentence fields tokens result - | CONLLSentence result -> field_of_conll_sentence fields tokens result + | ENIAMSentence result -> field_of_eniam_sentence fields result + | CONLLSentence result -> field_of_conll_sentence fields result | QuotedSentences sentences -> "QuotedSentences" | AltSentence l -> String.concat "\n\t" (Xlist.map l (fun (m, s) -> - Visualization.string_of_mode m ^ "\t" ^ (field_of_sentence fields tokens s))) + Visualization.string_of_mode m ^ "\t" ^ (field_of_sentence fields s))) | _ -> failwith "field_of_sentence: ni" -let rec field_of_paragraph fields tokens = function +let rec field_of_paragraph fields = function RawParagraph s -> print_endline "no fields detected: only raw paragraph"; s | StructParagraph sentences -> - String.concat "\n\t" (Xlist.map sentences (fun p -> field_of_sentence fields tokens p.psentence)) + String.concat "\n\t" (Xlist.map sentences (fun p -> field_of_sentence fields p.psentence)) | AltParagraph l -> String.concat "\n" (Xlist.map (List.filter (fun (m,t) -> (*m = ENIAM ||*) m = CONLL) l) (fun (m,t) -> - Visualization.string_of_mode m ^ "\n\t" ^ (field_of_paragraph fields tokens t))) - (* field_of_paragraph fields tokens (snd @@ List.find (fun (mode,text) -> mode = ENIAM || mode = CONLL) l) *) + Visualization.string_of_mode m ^ "\n\t" ^ (field_of_paragraph fields t))) + (* field_of_paragraph fields (snd @@ List.find (fun (mode,text) -> mode = ENIAM || mode = CONLL) l) *) let rec print_fields_rec fields = function - RawText s -> print_endline "no fields detected: only raw text"; -| StructText(paragraphs,tokens) -> - print_endline (String.concat "\n\n" (Xlist.map paragraphs (field_of_paragraph fields tokens)) ^ "\n") + RawText s -> s + (* print_endline "no fields detected: only raw text"; *) +| StructText(paragraphs) -> + String.concat "\n\n" (Xlist.map paragraphs (field_of_paragraph fields)) ^ "\n" | AltText l -> - print_fields_rec fields (snd @@ List.find (fun (m,t) -> m = Struct (*|| m = ENIAM*) || m = CONLL) l) + String.concat "\n" (Xlist.map (List.filter (fun (m,t) -> m = Struct || m = CONLL) l) (fun (m,t) -> + Visualization.string_of_mode m ^ "\n\t" ^ (print_fields_rec fields t))) + (* print_fields_rec fields (snd @@ List.find (fun (m,t) -> m = Struct (*|| m = ENIAM*) || m = CONLL) l) *) let print_fields fields text = - print_fields_rec fields text + print_endline @@ print_fields_rec fields text (* ; print_field_map () *) diff --git a/corpora/makefile b/corpora/makefile index 0dbbb0b..06617e4 100755 --- a/corpora/makefile +++ b/corpora/makefile @@ -16,9 +16,9 @@ lib: freq_test: $(OCAMLOPT) -o freq_test $(OCAMLOPTFLAGS) $(MODS) freq_test.ml -test: CONLL.ml test_conll2.ml +test: CONLL.ml CONLL_adapter.ml test_conll2.ml mkdir -p results - $(OCAMLOPT) -o test $(OCAMLOPTFLAGS) CONLL.ml test_conll2.ml + $(OCAMLOPT) -o test $(OCAMLOPTFLAGS) $^ .SUFFIXES: .mll .mly .ml .mli .cmo .cmi .cmx diff --git a/corpora/test_conll.ml b/corpora/test_conll.ml index 3fb7e87..352b93b 100644 --- a/corpora/test_conll.ml +++ b/corpora/test_conll.ml @@ -1,3 +1,8 @@ +open Xstd +open ENIAMsubsyntaxTypes +open ENIAMtokenizerTypes +open LCGtypes +open ExecTypes let empty_result = { input_text=RawText ""; @@ -146,7 +151,7 @@ let eniam_parse_sentence timeout test_only_flag paths last tokens lex_sems = let rec conll_parse_sentence timeout test_only_flag id first_try paths tokens lex_sems = let result 
= empty_conll_parse_result in let time2 = time_fun () in - let paths = CONLL_adapter.convert_dep_tree id first_try paths tokens lex_sems + let paths = CONLL_adapter.convert_dep_tree id first_try paths tokens lex_sems in try let dep_chart = LCGlexicon.dep_create paths tokens lex_sems in let dep_chart,references = LCGchart.dep_lazify dep_chart in @@ -193,7 +198,7 @@ let rec conll_parse_sentence timeout test_only_flag id first_try paths tokens le let time5 = time_fun () in {result with status=ReductionError; msg=Printexc.to_string e; reduction_time=time5 -. time4} else if first_try - then conll_parse_sentence timeout test_only_flag id false paths tokens + then conll_parse_sentence timeout test_only_flag id false paths tokens lex_sems else {result with status=NotParsed} with Timeout t -> @@ -201,7 +206,7 @@ let rec conll_parse_sentence timeout test_only_flag id first_try paths tokens le {result with status=ParseTimeout; msg=Printf.sprintf "%f" t; parse_time=time4 -. time3} | NotDepParsed(id_ndp,left,l,right) -> if first_try - then conll_parse_sentence timeout test_only_flag id false paths tokens + then conll_parse_sentence timeout test_only_flag id false paths tokens lex_sems else let time4 = time_fun () in {result with status=NotParsed; not_parsed_dep_chart=(id_ndp,left,l,right); parse_time=time4 -. time3} | e -> @@ -210,7 +215,7 @@ let rec conll_parse_sentence timeout test_only_flag id first_try paths tokens le with e -> (*print_endline (Printexc.to_string e);*) let time3 = time_fun () in if first_try - then conll_parse_sentence timeout test_only_flag id false paths tokens + then conll_parse_sentence timeout test_only_flag id false paths tokens lex_sems else {result with status=LexiconError; msg=Printexc.to_string e; lex_time=time3 -. time2} @@ -243,11 +248,7 @@ let get_paths old_paths = function paths | _ -> failwith "get_paths" -<<<<<<< HEAD -let rec parse_sentence timeout test_only_flag mode file_prefix tokens lex_sems = function -======= -let rec parse_sentence timeout test_only_flag mode id file_prefix tokens = function ->>>>>>> dep_trees +let rec parse_sentence timeout test_only_flag mode id file_prefix tokens lex_sems = function RawSentence s -> (match mode with Swigra -> @@ -259,23 +260,15 @@ let rec parse_sentence timeout test_only_flag mode id file_prefix tokens = funct | StructSentence(paths,last) -> (match mode with ENIAM -> -<<<<<<< HEAD let result = eniam_parse_sentence timeout test_only_flag paths last tokens lex_sems in -======= - let result = empty_eniam_parse_result in - (* let result = print_endline "eniam_parse_sentence"; eniam_parse_sentence timeout test_only_flag paths last tokens in *) ->>>>>>> dep_trees + (* let result = empty_eniam_parse_result in *) let result = {result with file_prefix = file_prefix_of_mode mode ^ file_prefix} in ENIAMSentence result | _ -> failwith "parse_sentence") | DepSentence(paths) -> (match mode with CONLL -> -<<<<<<< HEAD - let result = conll_parse_sentence timeout test_only_flag paths tokens lex_sems in -======= - let result = (*print_endline "conll_parse_sentence";*) conll_parse_sentence timeout test_only_flag id true paths tokens in ->>>>>>> dep_trees + let result = conll_parse_sentence timeout test_only_flag id true paths tokens lex_sems in let result = {result with file_prefix = file_prefix_of_mode mode ^ file_prefix; paths = paths} in @@ -289,19 +282,15 @@ let rec parse_sentence timeout test_only_flag mode id file_prefix tokens = funct if not Paths.config.Paths.mate_parser_enabled then DepSentence paths else ( print_endline 
"parse_sentence 1"; (* print_endline (Visualization.html_of_dep_sentence tokens paths); *) - let conll = ENIAM_CONLL.string_of_paths ENIAMsubsyntaxTypes.Mate tokens paths in + let conll = CONLL.string_of_paths ENIAMsubsyntaxTypes.Mate tokens paths in print_endline "parse_sentence 2"; (* printf "|%s|\n" conll; *) Printf.fprintf mate_out "%s%!" conll; print_endline "parse_sentence 3"; - let new_paths = get_paths paths (ENIAM_CONLL.load_sentence mate_in) in + let new_paths = get_paths paths (CONLL.load_sentence mate_in) in print_endline "parse_sentence 4"; (* print_endline (Visualization.html_of_dep_sentence tokens new_paths); *) -<<<<<<< HEAD - let result = conll_parse_sentence timeout test_only_flag new_paths tokens lex_sems in -======= - let result = conll_parse_sentence timeout test_only_flag id true new_paths tokens in ->>>>>>> dep_trees + let result = conll_parse_sentence timeout test_only_flag id true new_paths tokens lex_sems in let result = {result with file_prefix = file_prefix_of_mode mode ^ file_prefix; paths=new_paths} in @@ -309,66 +298,94 @@ let rec parse_sentence timeout test_only_flag mode id file_prefix tokens = funct | _ -> failwith "parse_sentence") | QuotedSentences sentences -> let sentences = Xlist.rev_map sentences (fun p -> -<<<<<<< HEAD - let sentence = parse_sentence timeout test_only_flag mode p.pfile_prefix tokens lex_sems p.psentence in -======= - let sentence = parse_sentence timeout test_only_flag mode id p.pfile_prefix tokens p.psentence in ->>>>>>> dep_trees + let sentence = parse_sentence timeout test_only_flag mode id p.pfile_prefix tokens lex_sems p.psentence in {p with psentence=sentence}) in QuotedSentences(List.rev sentences) | AltSentence l -> let l = Xlist.rev_map l (fun (mode,sentence) -> -<<<<<<< HEAD - mode, parse_sentence timeout test_only_flag mode file_prefix tokens lex_sems sentence) in + mode, parse_sentence timeout test_only_flag mode id file_prefix tokens lex_sems sentence) in AltSentence(List.rev l) | _ -> failwith "parse_sentence" -let rec parse_paragraph timeout test_only_flag mode tokens lex_sems = function +let rec parse_paragraph timeout test_only_flag mode id tokens lex_sems = function RawParagraph s -> RawParagraph s | StructParagraph sentences -> let sentences = Xlist.rev_map sentences (fun p -> - let sentence = parse_sentence timeout test_only_flag mode p.pfile_prefix tokens lex_sems p.psentence in -======= - mode, parse_sentence timeout test_only_flag mode id file_prefix tokens sentence) in - AltSentence(List.rev l) - | _ -> failwith "parse_sentence" + let sentence = parse_sentence timeout test_only_flag mode id p.pfile_prefix tokens lex_sems p.psentence in + {p with psentence=sentence}) in + StructParagraph(List.rev sentences) + | AltParagraph l -> + let l = Xlist.rev_map l (fun (mode,paragraph) -> + mode, parse_paragraph timeout test_only_flag mode id tokens lex_sems paragraph) in + AltParagraph(List.rev l) + +let rec parse_text timeout test_only_flag mode id tokens lex_sems = function + RawText s -> RawText s + | StructText paragraphs -> + let paragraphs = Xlist.rev_map paragraphs (fun paragraph -> + parse_paragraph timeout test_only_flag mode id tokens lex_sems paragraph) in + StructText(List.rev paragraphs) + | AltText l -> AltText(Xlist.map l (fun (mode,text) -> + mode, parse_text timeout test_only_flag mode id tokens lex_sems text)) + +let select_mode = function + (Raw,_),_ -> failwith "select_mode" + | _,(Raw,_) -> failwith "select_mode" + | (Struct,_),_ -> failwith "select_mode" + | _,(Struct,_) -> failwith "select_mode" 
+ | (CONLL,s),_ -> CONLL,s + | _,(CONLL,s) -> CONLL,s + | (ENIAM,s),_ -> ENIAM,s + | _,(ENIAM,s) -> ENIAM,s + | (Swigra,s),_ -> Swigra,s + | _,(Swigra,s) -> Swigra,s + | (Mate,s),_ -> Mate,s + | _,(Mate,s) -> Mate,s + | _ -> failwith "select_mode: ni" -let rec parse_paragraph timeout test_only_flag mode id tokens = function +let rec select_sentences_sentence = function + RawSentence s -> failwith "select_sentences_sentence" + | StructSentence(paths,last) -> failwith "select_sentences_sentence" + | DepSentence paths -> failwith "select_sentences_sentence" + | QuotedSentences sentences -> + let sentences = Xlist.rev_map sentences (fun p -> + let sentence,_ = select_sentences_sentence p.psentence in + {p with psentence=sentence}) in + QuotedSentences(List.rev sentences), Parsed + | AltSentence l -> + let raw,selected = Xlist.fold l ([],[]) (fun (raw,selected) (mode,sentence) -> + if mode = Raw then (mode,sentence) :: raw, selected else + let sentence,status = select_sentences_sentence sentence in + if status <> Parsed && status <> NotTranslated then raw,selected else + match selected with + [] -> raw,[mode,sentence] + | [mode2,sentence2] -> raw,[select_mode ((mode,sentence),(mode2,sentence2))] + | _ -> failwith "select_sentences_sentence") in + AltSentence(raw @ selected), Parsed + | ENIAMSentence result -> ENIAMSentence result, result.status + | CONLLSentence result -> CONLLSentence result, result.status + | SemSentence result -> SemSentence result, result.status + +let rec select_sentences_paragraph = function RawParagraph s -> RawParagraph s | StructParagraph sentences -> let sentences = Xlist.rev_map sentences (fun p -> - let sentence = parse_sentence timeout test_only_flag mode id p.pfile_prefix tokens p.psentence in ->>>>>>> dep_trees + let sentence,_ = select_sentences_sentence p.psentence in {p with psentence=sentence}) in StructParagraph(List.rev sentences) | AltParagraph l -> let l = Xlist.rev_map l (fun (mode,paragraph) -> -<<<<<<< HEAD - mode, parse_paragraph timeout test_only_flag mode tokens lex_sems paragraph) in - AltParagraph(List.rev l) - -let rec parse_text timeout test_only_flag mode tokens lex_sems = function -======= - mode, parse_paragraph timeout test_only_flag mode id tokens paragraph) in + mode, select_sentences_paragraph paragraph) in AltParagraph(List.rev l) -let rec parse_text timeout test_only_flag mode id = function ->>>>>>> dep_trees +let rec select_sentences_text = function RawText s -> RawText s | StructText paragraphs -> let paragraphs = Xlist.rev_map paragraphs (fun paragraph -> -<<<<<<< HEAD - parse_paragraph timeout test_only_flag mode tokens lex_sems paragraph) in + select_sentences_paragraph paragraph) in StructText(List.rev paragraphs) | AltText l -> AltText(Xlist.map l (fun (mode,text) -> - mode, parse_text timeout test_only_flag mode tokens lex_sems text)) -======= - parse_paragraph timeout test_only_flag mode id tokens paragraph) in - StructText(List.rev paragraphs, tokens) - | AltText l -> AltText(Xlist.map l (fun (mode,text) -> - mode, parse_text timeout test_only_flag mode id text)) ->>>>>>> dep_trees - + mode, select_sentences_text text)) let rec extract_query_text = function RawText s -> s @@ -392,11 +409,7 @@ let process_query pre_in pre_out timeout test_only_flag id full_query max_n = let result = {result with pre_time1=pre_time1; pre_time2=time2 -. 
time1} in if msg <> "" then {result with status=PreprocessingError; msg=msg} else ( (* print_endline "process_query 3"; *) -<<<<<<< HEAD - let parsed_text = parse_text timeout test_only_flag Struct tokens lex_sems (translate_text pre_text) in -======= - let parsed_text = parse_text timeout test_only_flag Struct id (translate_text pre_text) in ->>>>>>> dep_trees + let parsed_text = parse_text timeout test_only_flag Struct id tokens lex_sems (translate_text pre_text) in (* print_endline "process_query 4"; *) let time3 = time_fun () in let result = if test_only_flag then result else {result with status=Parsed; parsed_text=parsed_text} in @@ -421,23 +434,50 @@ let process_query pre_in pre_out timeout test_only_flag id full_query max_n = let result = {result with semantic_time=time4 -. time3} in result) +let get_sock_addr host_name port = + let he = Unix.gethostbyname host_name in + let addr = he.Unix.h_addr_list in + Unix.ADDR_INET(addr.(0),port) + +let id_counter = ref 0 + +let get_id () = + incr id_counter; + "ID_" ^ (string_of_int !id_counter) + +let get_query_id = function + ENIAMsubsyntaxTypes.AltText[_;ENIAMsubsyntaxTypes.CONLL,ENIAMsubsyntaxTypes.StructText([ENIAMsubsyntaxTypes.StructParagraph[p]])] -> if p.ENIAMsubsyntaxTypes.pid = "" then get_id () else p.ENIAMsubsyntaxTypes.pid + | ENIAMsubsyntaxTypes.AltText[ENIAMsubsyntaxTypes.CONLL,ENIAMsubsyntaxTypes.StructText([ENIAMsubsyntaxTypes.StructParagraph[p]])] -> if p.ENIAMsubsyntaxTypes.pid = "" then get_id () else p.ENIAMsubsyntaxTypes.pid + | _ -> failwith "get_query_id" + +let process_id s = + if Xstring.check_prefix "ID_" s then s else + let a,b,c = match Xstring.split_delim "/" s with + [a;b;c] -> a,b,c + | _ -> failwith ("process_id: " ^ s) in + if Xstring.check_prefix "NKJP_1M_" a && Xstring.check_prefix "morph_" b && Xstring.check_sufix "-p" b && + Xstring.check_prefix "morph_" c && Xstring.check_sufix "-s" c then + Xstring.cut_prefix "NKJP_1M_" a ^ "." ^ Xstring.cut_sufix "-s" (Xstring.cut_prefix "morph_" c) + else failwith ("process_id: " ^ s) let process_conll_corpus filename = + print_endline "process_conll_corpus: START"; let corpus = File.file_in filename (fun file -> CONLL.match_corpus (CONLL.load_corpus file)) in - print_endline "process_conll_corpus"; - let corpus = [List.hd corpus] in + print_endline "process_conll_corpus: DONE"; + (* let corpus = [List.hd corpus] in *) let ic,oc = Unix.open_connection (get_sock_addr Paths.pre_host Paths.pre_port) in - Xlist.iter corpus (fun query -> + print_endline "connection_opened"; + Xlist.iter corpus (fun (query,tokens) -> let id = process_id (get_query_id query) in let path = "results/" ^ id ^ "/" in ignore (Sys.command ("mkdir -p " ^ path)); - let result = process_query ic oc 30. false "x" query 10 in - Visualization.print_html_text path "input_text" result.input_text; - Visualization.print_html_text path "pre_text" result.pre_text; - Visualization.print_html_text path "parsed_text" result.parsed_text; - Visualization.print_html_text path "selected_sent_text" result.selected_sent_text; - Visualization.print_html_text path "semantic_text" result.semantic_text; - Visualization.print_html_text path "selected_semantic_text" result.selected_semantic_text; + let result = process_query ic oc 30. 
false "x" (query,tokens) 10 in + (* Visualization.print_html_text path "input_text" result.input_text tokens; + Visualization.print_html_text path "pre_text" result.pre_text tokens; + Visualization.print_html_text path "parsed_text" result.parsed_text tokens; + Visualization.print_html_text path "selected_sent_text" result.selected_sent_text tokens; + Visualization.print_html_text path "semantic_text" result.semantic_text tokens; + Visualization.print_html_text path "selected_semantic_text" result.selected_semantic_text tokens; *) (* printf "input_text:\n%s\n" (Visualization.string_of_text result.input_text); printf "pre_text:\n%s\n" (Visualization.string_of_text result.pre_text); *) (* Exec.print_result stdout result; *) @@ -445,13 +485,15 @@ let process_conll_corpus filename = (* CompTrees.compare_results result.parsed_text; *) (* Visualization.print_paths "results/" "paths" result.paths; *) ()); - Marshal.to_channel oc (PreTypes.RawText "",ExtArray.make 1 ENIAMtokenizerTypes.empty_token) []; + Marshal.to_channel oc (ENIAMsubsyntaxTypes.RawText "",ExtArray.make 1 ENIAMtokenizerTypes.empty_token) []; flush oc; let _ = Unix.shutdown_connection ic in () let _ = + LCGfields.reset(); (* process_conll_corpus "../../NLP resources/Skladnica-zaleznosciowa-mod_130121.conll"; *) - (* process_conll_corpus "../../NLP resources/skladnica_zaleznosciowa.conll"; *) - process_conll_corpus "../testy/skladnica-test1.conll"; + process_conll_corpus "../../NLP resources/skladnica_zaleznosciowa.conll"; + (* process_conll_corpus "../testy/skladnica-test1.conll"; *) + LCGfields.print_results(); () diff --git a/corpora/test_conll2.ml b/corpora/test_conll2.ml index e3d41a9..8d3d909 100644 --- a/corpora/test_conll2.ml +++ b/corpora/test_conll2.ml @@ -116,7 +116,7 @@ let test_example path id tokens lex_sems paths last = let test_dep_example path id tokens lex_sems paths = try ENIAM_LCGreductions.reset_variant_label (); - (* let paths = CONLL_adapter.convert_dep_tree id first_try paths tokens lex_sems in *) + let paths = CONLL_adapter.convert_dep_tree id (*first_try*) true paths tokens in ENIAMsubsyntaxHTMLof.print_dep_sentence path (id^"1_paths") tokens paths; let chart = create_dep_chart tokens lex_sems paths in ENIAM_LCGlatexOf.print_dep_chart path (id^"1_chart") "a1" chart; @@ -150,7 +150,7 @@ let test_dep_example path id tokens lex_sems paths = let rec parse_sentence name id tokens lex_sems = function RawSentence s -> id | StructSentence(paths,last) -> - test_example ("results/" ^ name^"/") (string_of_int id ^ "_") tokens lex_sems paths last; + (* test_example ("results/" ^ name^"/") (string_of_int id ^ "_") tokens lex_sems paths last; *) id + 1 | DepSentence(paths) -> test_dep_example ("results/" ^ name ^ "/") (string_of_int id ^ "_") tokens lex_sems paths; diff --git a/diagnostics/treeChange.ml b/diagnostics/treeChange.ml deleted file mode 100644 index 1c7ae71..0000000 --- a/diagnostics/treeChange.ml +++ /dev/null @@ -1,348 +0,0 @@ -open Xstd -open PreTypes - -let if_lemma lemmas = function - Lemma(l,_,_) -> List.exists (fun x -> x = l) lemmas - | _ -> false - -let if_cat cats = function - Lemma(_,cat,_) -> List.exists (fun x -> x = cat) cats - | _ -> false - -let if_interps interps token = - let interp = match token with - Lemma(_,_,i) -> i - | _ -> [[[]]] in - let if_interp nr value = - List.exists (fun x -> - try - List.exists (fun y -> - y = value) (List.nth x nr) - with _ -> false - ) interp in - Xlist.fold interps true (fun acc (nr,value) -> acc && (if_interp nr value)) - -let correct_coordination1 paths 
tokens = - let paths_ls = List.mapi (fun i (id,super,label) -> - (i,id,super,label)) (Array.to_list paths) in - - let l = [("subst:nom",0),(["fin";"praet"],0); - ("subst:acc",0),(["inf"],0); - ("ppron3:nom",0),(["fin";"praet"],0); - ("ppron3:acc",0),(["fin";"praet"],0); - ("adv",0),(["fin";"praet"],0); - ("adv",0),(["inf"],0); - ("adv",0),(["adj"],0); - ("prep",0),(["fin";"praet"],0); - ("prep",0),(["inf"],0); - ("prep",0),(["ppas"],0); - ("prep",0),(["subst"],0); - ("prep:gen",0),(["subst:gen"],0); - ("adj:nom",0),(["fin";"praet"],0); - ("adj:nom",0),(["subst:nom"],0); - ("adj:gen",0),(["subst:gen"],0); - ("adj:dat",0),(["subst:dat"],0); - ("adj:acc",0),(["subst:acc"],0); - ("adj:inst",0),(["subst:inst"],0); - ("adj:loc",0),(["subst:loc"],0); - ("subst:gen",0),(["subst:nom"],0); - (* ("subst:gen",0),(["subst:gen"],0); *) - ("subst:gen",0),(["subst:dat"],0); - ("subst:gen",0),(["subst:acc"],0); - ("subst:gen",0),(["subst:inst"],0); - ("subst:gen",0),(["subst:loc"],0); - ("ppron3:gen",0),(["subst:nom"],0); - ("ppron3:gen",0),(["subst:dat"],0); - ("ppron3:gen",0),(["subst:acc"],0); - ("ppron3:gen",0),(["subst:inst"],0); - ("ppron3:gen",0),(["subst:loc"],0); - ("qub",0),(["fin";"praet"],0); - ("qub",0),(["subst"],0); - ("qub",0),(["adj"],0); - ("pact",0),(["subst"],0); - ("ppas",0),(["subst"],0) - ] in - - let find_dependents sons = - - let is (i,id,super,label) pattern = match Xstring.split ":" pattern with - ["prep";case] -> if_cat ["prep"] (ExtArray.get tokens id).token && - if_interps [0,case] (ExtArray.get tokens id).token - | [cat;case] -> if_cat [cat] (ExtArray.get tokens id).token && - if_interps [1,case] (ExtArray.get tokens id).token - | [cat] -> if_cat [cat] (ExtArray.get tokens id).token - | _ -> failwith "is (in correct_coordination1)" in - - let incr_representative acc son = Xlist.map acc (fun ((one,a),(rest,b)) -> - if is son one - then (one,a + 1), (rest,b) - else if List.exists (is son) rest - then (one,a), (rest,b + 1) - else (one,a), (rest,b)) in - - let get_from sons pattern = List.find (fun x -> is x pattern) sons in - - let l = Xlist.fold sons l incr_representative in - let results = List.filter (fun ((_,a),(_,b)) -> a = 1 && b > 1) l in - Xlist.map results (fun result -> - get_from sons @@ fst @@ fst result, - List.filter (fun son -> - List.exists (fun one -> is son one) (fst (snd result))) sons) in - - let establish_neighbour super ((i_d,id_d,super_d,label_d),sons) = - let not_between (i_s,_,_,_) = - (super < i_d && super < i_s) || - (super > i_d && super > i_s) in - let (i_n,id_n,super_n,label_n) = List.find (fun son -> - not_between son) sons in - paths.(i_d) <- (id_d, i_n, label_d) in - - let examine_coords (i,id,super,label) sons = - try - let dependents = find_dependents sons in - Xlist.iter dependents (establish_neighbour super) - with - | _ -> () in - - Array.iteri (fun i (id,super,label) -> - if if_cat ["conj"] (ExtArray.get tokens id).token - then (let sons = List.filter (fun (_,_,super,_) -> super = i) paths_ls in - if (List.length sons > 2) - then examine_coords (i,id,super,label) sons)) paths; - paths - -let correct_coordination2 paths tokens = - let paths_c = Array.copy paths in - let paths_ls () = List.mapi (fun i (id,super,label) -> - (i,id,super,label)) (Array.to_list paths_c) in - - (* let ps a sons = - print_endline a; - List.iter (fun (i,_,_,_) -> print_endline (ExtArray.get tokens i).orth) sons; - print_endline "" in *) - - let rec correct_rec (i,id,super,label) sons = - let left_s, right_s = List.partition (fun (a,b,c,d) -> a < i) sons in - (* ps 
"left:" (List.rev left_s); - ps "right:" right_s; *) - find_father i (List.rev left_s); - find_father i right_s - - and find_father i0 = function - [(i,id,super,label)] -> paths_c.(i) <- (id,i0,label) - | (a,b,c,d) :: (i,id,super,label) :: t -> - paths_c.(i) <- (id,i0,label); - if not (if_cat ["conj"] (ExtArray.get tokens i).token || - (ExtArray.get tokens i).orth = ",") - then failwith "find_father"; - correct_rec (i,id,super,label) (if a < i - then (a,b,c,d) :: t - else List.rev @@ (a,b,c,d) :: t) - | _ -> failwith "find_father" in - - let check_previous_for_interp i = - if i >= 0 && (ExtArray.get tokens i).orth = "," && - not (List.exists (fun (_,super,_) -> super = i) (Array.to_list paths_c)) - then paths_c.(i) <- (0,-1,"") in - - Array.iteri (fun i (id,super,label) -> - if if_cat ["conj"] (ExtArray.get tokens i).token || - (ExtArray.get tokens i).orth = "," - then - (check_previous_for_interp (i-1); - let sons = List.filter (fun (_,_,super,_) -> super = i) (paths_ls ()) in - if (List.length sons > 2) - then correct_rec (i,id,super,label) sons)) paths_c; - paths_c - -let praet_qub_aglt paths tokens = - Array.iteri (fun i (id,super,label) -> - if super >= 0 then - (let id_s, super_s, label_s = paths.(super) in - if if_cat ["aglt"] (ExtArray.get tokens id).token && - (ExtArray.get tokens id_s).orth = "by" - then let id_gf,super_gf,label_gf = paths.(super_s) in - if if_cat ["praet"] (ExtArray.get tokens id_gf).token - then paths.(i) <- (id,super_s,label))) paths; - paths - -let replace_tokens paths tokens = -(* for i = 0 to ExtArray.size tokens - 1 do - print_endline (string_of_int i ^ ": "^ (ExtArray.get tokens i).orth) -done; *) - let find_token orth = Int.fold 0 (ExtArray.size tokens - 1) 0 (fun acc i -> - if (ExtArray.get tokens i).orth = orth then i else acc) in - - let multidot i id0 super0 label0 = - let id1, super1, label1 = paths.(super0) in - if super1 >= 0 then - let id2, super2, label2 = paths.(super1) in - if (ExtArray.get tokens id1).orth = "." && - (ExtArray.get tokens id2).orth = "." - then - (paths.(super1) <- (find_token "..." ,super2, label2); - paths.(super0) <- (0,-1,""); - paths.(i) <- (0,-1,"")) in - - let brev i id super label = - let if_the_last_dot () = - let (id_dot, s_dot, l_dot) = List.find (fun (i2,s,l) -> - s = i && ((ExtArray.get tokens i2).orth = "." || (ExtArray.get tokens i2).orth = "...")) (Array.to_list paths) in - Array.fold_left (fun acc (i2,s,l) -> - acc && (ExtArray.get tokens i2).beg <= (ExtArray.get tokens id_dot).beg) true paths in - - let dot = if if_interps [0,"npun"] (ExtArray.get tokens id).token || if_the_last_dot () - then "" - else "." in - - let n_orth = (ExtArray.get tokens id).orth ^ dot in - paths.(i) <- (find_token n_orth,super,label) in - - Array.iteri (fun i (id,super,label) -> - if (ExtArray.get tokens id).orth = "." 
- then multidot i id super label; - if if_cat ["brev"] (ExtArray.get tokens id).token - then brev i id super label) - paths; - paths - -let replace_hyphens paths tokens = - let ref_paths = ref paths in - let find_token token = Int.fold 0 (ExtArray.size tokens - 1) 0 (fun acc i -> - if (ExtArray.get tokens i).token = token then i else acc) in - let find_specific_token token beg next = Int.fold 0 (ExtArray.size tokens - 1) 0 (fun acc i -> - if (ExtArray.get tokens i).token = token && - beg <= (ExtArray.get tokens i).beg && - (ExtArray.get tokens i).next <= next - then i else acc) in - - let correct_last son_of_zero = - let i1,s1,l1 = !ref_paths.(Array.length !ref_paths - 1) in - let i2,s2,l2 = !ref_paths.(son_of_zero) in - if (ExtArray.get tokens i1).orth = "." - then - (!ref_paths.(Array.length !ref_paths - 1) <- (find_token (Interp "</sentence>"),1,l1); - !ref_paths.(son_of_zero) <- (i2,Array.length !ref_paths - 1,l2)) - else - (ref_paths := Array.append !ref_paths [| (find_token (Interp "</sentence>"),1,"-") |]; - !ref_paths.(Array.length !ref_paths - 2) <- (i1,Array.length !ref_paths - 1,l1); - !ref_paths.(son_of_zero) <- (i2,Array.length !ref_paths - 1,l2)) in - - let one_hyphen sons_of_zero = - let i2,s2,l2 = !ref_paths.(1) in - Xlist.iter sons_of_zero (fun son_of_zero -> - let i1,s1,l1 = !ref_paths.(son_of_zero) in - !ref_paths.(son_of_zero) <- (i1,1,l1)); - !ref_paths.(1) <- (find_token (Interp "<or-sentence>"),0,l2); - correct_last son_of_zero in - - let two_hyphens first second son parent = - let i1,s1,l1 = !ref_paths.(first) in - let i2,s2,l2 = !ref_paths.(second) in - let beg, next = (ExtArray.get tokens i2).beg, (ExtArray.get tokens i2).next in - let i3,s3,l3 = !ref_paths.(son) in - let i4,s4,l4 = !ref_paths.(parent) in - ref_paths := Array.append !ref_paths [| (find_token (Interp "</sentence>"),first,"-") |]; - !ref_paths.(first) <- (find_token (Interp "<or-sentence>"),0,l1); - !ref_paths.(second) <- (find_specific_token (Interp "</or-sentence>") beg next,first,l2); - !ref_paths.(son) <- (i3,second,l3); - !ref_paths.(parent) <- (i4,first,l4) in - - let rec is_dep_correct a b out zero res i (id,super,label) = (* out = how many words in (a,b) have parent outside [a,b]*) - (* print_endline ((string_of_int a) ^ " " ^ (string_of_int b) ^ " " ^ (string_of_int out) ^ " " ^ (string_of_int zero) ^ " " ^ (string_of_int i)); *) - if out > 1 || zero > 1 || (* zero = how many words (not interps) have parent 0 *) - (a < i && i < b && super < a && label <> "interp") || - (a < super && super < b && (i < a || b < i)) - then false, res - else - if i+1 = Array.length !ref_paths - then out = 1 && zero = 1, res - else - if a < i && i < b && b < super - then is_dep_correct a b (out+1) zero (i,super) (i+1) !ref_paths.(i+1) - else - if super = 0 && not (if_cat ["interp"] (ExtArray.get tokens id).token) - then is_dep_correct a b out (zero+1) res (i+1) !ref_paths.(i+1) - else is_dep_correct a b out zero res (i+1) !ref_paths.(i+1) in - - let hyphens = snd @@ Array.fold_left (fun (i,acc) (id,super,label) -> - if (ExtArray.get tokens id).orth = "-" - then i+1, i :: acc - else i+1, acc) (0,[]) !ref_paths in - - let sons_of_zero = snd @@ Array.fold_left (fun (i,acc) (id,super,label) -> - if super = 0 && not (if_cat ["interp"] (ExtArray.get tokens id).token) - then i+1, i :: acc - else i+1, acc) (0,[]) !ref_paths in - - (if List.length sons_of_zero = 1 - then - if List.length hyphens = 1 && hyphens = [1] - then one_hyphen sons_of_zero - else - if List.length hyphens = 2 - then let a, b = List.nth hyphens 1, 
List.nth hyphens 0 in
-        let is_good, (son,parent) = is_dep_correct a b 0 0 (0,0) 1 !ref_paths.(1) in
-        if a = 1 && is_good
-        then two_hyphens a b son parent);
-  !ref_paths
-
-let correct_interp_with_father_0 paths tokens =
-  Array.iteri (fun i (id,super,label) ->
-    if (super = 0 ||
-      (ExtArray.get tokens id).token = Interp "<or-sentence>" ||
-      (ExtArray.get tokens id).token = Interp "</or-sentence>") && (ExtArray.get tokens id).orth = ","
-    then Array.iteri (fun i1 (id1,super1,label1) ->
-      if super1 = i
-      then paths.(i1) <- (id1,0,label1)) paths) paths;
-  paths
-
-let remove_interps interp paths tokens =
-  let paths_ls = Array.to_list paths in
-  Array.iteri (fun i (id,super,label) ->
-    if (ExtArray.get tokens id).orth = interp &&
-      not (List.exists (fun (_,super,_) -> super = i) paths_ls)
-    then paths.(i) <- (0,-1,"")) paths;
-  paths
-
-let correct_passive_voice paths tokens =
-  Array.iteri (fun i (id,super,label) ->
-    if super >= 0 then
-      (let id_s, super_s, label_s = paths.(super) in
-      if (if_cat ["praet"] (ExtArray.get tokens id).token &&
-        if_cat ["ppas"] (ExtArray.get tokens id_s).token)
-      then (paths.(i) <- (id,super_s,label);
-        paths.(super) <- (id_s,i,label_s);
-        Array.iteri (fun i_p (id_p,super_p,label_p) ->
-          if super_p = super
-          then paths.(i_p) <- (id_p,i,label_p)) paths))) paths;
-  paths
-
-let swap_dep paths tokens =
-  let change_dep i (id,super,label) =
-    let id_S, super_S, label_S = paths.(super) in
-    paths.(i) <- (id,super_S,label);
-    paths.(super) <- (id_S, id, label_S) in
-  let rec correct_dep i (id,super,label) =
-    let adv_relators = ["kto";"co";"ile";"czyj";"jaki";"który";
-      "jak";"skąd";"dokąd";"gdzie";"którędy";"kiedy";"odkąd";"dlaczego";"czemu";"gdy"] in
-    if (if_cat ["comp"] (ExtArray.get tokens id).token &&
-      if_cat ["fin"; "praet"; "winien"; "pred"; "imps"; "ppas"] (ExtArray.get tokens super).token) ||
-      (if_cat ["conj"] (ExtArray.get tokens id).token &&
-      if_cat ["fin"; "praet"; "winien"; "pred"; "imps"; "ppas"] (ExtArray.get tokens super).token &&
-      not (List.exists (fun (_,super,_) -> super = i) (Array.to_list paths))) ||
-      (if_cat ["ppron3"] (ExtArray.get tokens id).token &&
-      if_interps [5,"praep"] (ExtArray.get tokens id).token) ||
-      (if_lemma adv_relators (ExtArray.get tokens id).token &&
-      if_cat ["fin"; "praet"; "winien"; "pred"; "imps"; "ppas"; "subst"] (ExtArray.get tokens super).token)
-    then
-      change_dep i (id,super,label);
-    if (if_lemma adv_relators (ExtArray.get tokens id).token &&
-      if_cat ["subst"; "pred"] (ExtArray.get tokens super).token)
-    then correct_dep i paths.(i) in
-  Array.iteri correct_dep paths; paths
-
-  (*
-  correct_coordination1 -> the neighbour is the nearest word to the right; if a conjunction stands between it and me, then the nearest word to the left
-  passive-voice coordination is not handled for now - both auxiliary verbs and participles
-  coordination of the dependents of subordinating conjunctions is not handled for now *)
diff --git a/parser/visualization.ml b/parser/visualization.ml
index f776899..84d4324 100644
--- a/parser/visualization.ml
+++ b/parser/visualization.ml
@@ -916,7 +916,7 @@ let rec html_of_text path tokens = function
         sprintf "<tr><td>%s</td><td>%s</td></tr>" (string_of_mode mode) (html_of_text path tokens text))) ^
   "</table>"
-let print_html_text path name text tokens lex_sems =
+let print_html_text path name text tokens (*lex_sems*) =
   File.file_out (path ^ name ^ ".html") (fun file ->
     fprintf file "%s\n" html_header;
     fprintf file "%s<BR>\n" (html_of_text path tokens text);
diff --git a/pre/makefile b/pre/makefile
index 92301b5..a629f05
100755 --- a/pre/makefile +++ b/pre/makefile @@ -3,7 +3,7 @@ OCAMLOPT=ocamlopt OCAMLDEP=ocamldep INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I +eniam OCAMLFLAGS=$(INCLUDES) -g -OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa eniam-plWordnet.cmxa eniam-walenty.cmxa eniam-integration.cmxa eniam-lexSemantics.cmxa +OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa eniam-plWordnet.cmxa eniam-lcg-parser.cmxa eniam-lcg-lexicon.cmxa eniam-integration.cmxa eniam-lexSemantics.cmxa INSTALLDIR=`ocamlc -where` WAL= paths.ml diff --git a/pre/preProcessing.ml b/pre/preProcessing.ml index 47ca980..2eec195 100644 --- a/pre/preProcessing.ml +++ b/pre/preProcessing.ml @@ -121,9 +121,9 @@ let parse_text = function let lex_sems = ENIAMlexSemantics.assign tokens text in text,tokens,lex_sems | AltText[Raw,RawText query;CONLL,StructText[ - StructParagraph[{psentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence dep_paths]} as p]]],tokens -> + StructParagraph[{sentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence dep_paths]} as p]]],tokens -> let m_dep_paths = Array.map (fun (id,_,_) -> id,-1,"") dep_paths in - let conll = StructParagraph[{p with psentence = AltSentence([Raw, RawSentence text; CONLL, DepSentence dep_paths] + let conll = StructParagraph[{p with sentence = AltSentence([Raw, RawSentence text; CONLL, DepSentence dep_paths] @ if Paths.config.Paths.mate_parser_enabled then [Mate, DepSentence m_dep_paths] else [])}] in let paths = ENIAMsubsyntax.parse query in let sentences = ENIAMsentences.split_into_sentences "" query tokens paths in @@ -135,7 +135,7 @@ let parse_text = function let rec main_loop in_chan out_chan = (* print_endline "main_loop 1"; *) - let query = (Marshal.from_channel in_chan : text * ENIAMtokenizerTypes.token_record ExtArray.t) in + let query = (Marshal.from_channel in_chan : text * ENIAMtokenizerTypes.token_env ExtArray.t) in (* print_endline "main_loop 2"; *) if fst query = RawText "" then () else ( (try @@ -154,7 +154,7 @@ let rec main_loop in_chan out_chan = (* print_endline "main_loop 7"; *) Marshal.to_channel out_chan ( RawText "", - ExtArray.make 1 ENIAMtokenizerTypes.empty_token, + ExtArray.make 1 ENIAMtokenizerTypes.empty_token_env, ExtArray.make 1 ENIAMlexSemanticsTypes.empty_lex_sem, Printexc.to_string e, 0.) [])); diff --git a/testy/skladnica-test2.conll b/testy/skladnica-test2.conll index faf79b7..3205517 100644 --- a/testy/skladnica-test2.conll +++ b/testy/skladnica-test2.conll @@ -11,7 +11,7 @@ 5 szanse szansa subst subst pl|acc|f 4 obj_th _ _ 6 ? ? interp interp _ 4 punct _ _ -# trees/NKJP_1M_1202900095/morph_3-p/morph_3.46-s.xml.tree +# trees/NKJP_1M_1202900095/morph_3-p/morph_3.46-s.xml.trees 1 - - interp interp 0 _ _ _ 2 Słoń słoń subst subst sg|nom|m2 4 _ _ _ 3 - - interp interp 0 _ _ _ @@ -19,7 +19,7 @@ 5 Pinio Pinio subst subst sg|nom|m1 4 _ _ _ 6 . . interp interp 0 _ _ _ -# trees/NKJP_1M_2002000114/morph_2-p/morph_2.72-s.xml.tree +# trees/NKJP_1M_2002000114/morph_2-p/morph_2.72-s.xml.trees 1 - - interp interp 0 _ _ _ 2 Nie nie qub qub 3 _ _ _ 3 mogę móc fin fin sg|pri|imperf 7 _ _ _ @@ -29,7 +29,7 @@ 7 zachrypiał zachrypieć praet praet sg|m1|perf 0 _ _ _ 8 . . 
interp interp 0 _ _ _ -# trees/NKJP_1M_2002000028/morph_5-p/morph_5.40-s.xml.tree +# trees/NKJP_1M_2002000028/morph_5-p/morph_5.40-s.xml.trees 1 - - interp interp 0 _ _ _ 2 Właśnie właśnie qub qub 4 _ _ _ 3 to to subst subst sg|acc|n 4 _ _ _ @@ -39,7 +39,7 @@ 7 twardo twardo adv adv pos 6 _ _ _ 8 . . interp interp 0 _ _ _ -# trees/NKJP_1M_1202000001/morph_3-p/morph_3.9-s.xml.tree +# trees/NKJP_1M_1202000001/morph_3-p/morph_3.9-s.xml.trees 1 CKM CKM subst subst sg|nom|n 0 _ _ _ 2 : interp 0 _ _ _ 3 Jak jak adv adv pos 5 _ _ _ @@ -50,7 +50,7 @@ 8 patrzeć patrzeć inf inf imperf 5 _ _ _ 9 ? ? interp interp 0 _ _ _ -# trees/NKJP_1M_2001000023/morph_1-p/morph_1.61-s.xml.tree +# trees/NKJP_1M_2001000023/morph_1-p/morph_1.61-s.xml.trees 1 Pochylił pochylić praet praet sg|m1|perf 0 _ _ _ 2 em być aglt aglt sg|pri|imperf|wok 1 _ _ _ 3 się się qub qub 1 _ _ _