From e8d2c65cd63bb36a671eb2bbd0effd7a39eec931 Mon Sep 17 00:00:00 2001 From: Wojciech Jaworski <wjaworski@mimuw.edu.pl> Date: Mon, 16 Oct 2017 12:08:47 +0200 Subject: [PATCH] Analiza różnic w lematyzacji --- NKJP2/validateMorphology.ml | 390 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------------------------------- NKJP2/validateTokenizer.ml | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---- morphology/doc/model2.pdf | Bin 355865 -> 0 bytes morphology/resources/alt_supplement.tab | 1 + 4 files changed, 421 insertions(+), 50 deletions(-) diff --git a/NKJP2/validateMorphology.ml b/NKJP2/validateMorphology.ml index 6667c5e..ba3a71c 100644 --- a/NKJP2/validateMorphology.ml +++ b/NKJP2/validateMorphology.ml @@ -26,10 +26,10 @@ let rec has_brev = function | _ :: l -> has_brev l | [] -> false -let rec get_ntoken = function - (Disamb(nlemma,ncat,ninterp) : attr) :: _ -> nlemma,ncat,ninterp - | _ :: l -> get_ntoken l - | [] -> raise Not_found +let rec get_brev = function + BrevLemma s :: _ -> s + | _ :: l -> get_brev l + | [] -> failwith "get_brev" let rec add_ntoken stats = function Token t -> @@ -91,7 +91,7 @@ let lemmatize_string s = | Token ({token=FirstCap _} as t) -> t :: l | Token ({token=AllCap _} as t) -> t :: l | Token ({token=CapLetter _} as t) -> t :: l - | Token ({token=RomanDig _}) -> (*print_endline ("lemmatize_string: " ^ s);*) (*t ::*) l + | Token ({token=RomanDig _} as t) -> (*print_endline ("lemmatize_string: " ^ s);*) t :: l | Token ({token=Dig _} as t) -> (*print_endline ("lemmatize_string: " ^ s);*) t :: l | Token ({token=Proper _} as t) -> t :: l | Seq[Token {token=AllSmall _};Token {token=Lemma _}] -> l @@ -125,34 +125,52 @@ let lemmatize_string s = let get_cat_interp = function "subst","subst",[n;c;["m1"]],[_;_;["m1"]] -> "subst",[n;c;["m1"]] + | "subst","subst",[n;c;["m1"]],[_;_;["m1"];col] -> "subst",[n;c;["m1"];col] | "subst","subst",[n;c;["m2"]],[_;_;["m2"]] -> "subst",[n;c;["m2"]] | "subst","subst",[n;c;["m3"]],[_;_;["m3"]] -> "subst",[n;c;["m3"]] - | "subst","subst",[n;c;["n1";"n2"]],[_;_;["n1"]] -> "subst",[n;c;["n1"]] - | "subst","subst",[n;c;["n1";"n2"]],[_;_;["n2"]] -> "subst",[n;c;["n2"]] + | "subst","subst",[n;c;["n"]],[_;_;["n"];col] -> "subst",[n;c;["n"];col] | "subst","subst",[n;c;["f"]],[_;_;["f"]] -> "subst",[n;c;["f"]] - | "subst","subst",[n;c;["n1";"n2";"p2";"p3"]],[_;_;["n1"]] -> "subst",[n;c;["n1"]] - | "subst","subst",[n;c;["n1";"n2";"p2";"p3"]],[_;_;["n2"]] -> "subst",[n;c;["n2"]] - | "subst","subst",[n;c;["n1";"n2";"p2";"p3"]],[_;_;["p2"]] -> "subst",[n;c;["p2"]] - | "subst","subst",[n;c;["n1";"n2";"p2";"p3"]],[_;_;["p3"]] -> "subst",[n;c;["p3"]] - | "subst","subst",[n;c;["m1";"p1"]],[_;_;["m1"]] -> "subst",[n;c;["m1"]] - | "subst","subst",[n;c;["m1";"p1"]],[_;_;["p1"]] -> "subst",[n;c;["p1"]] + | "subst","subst",[n;c;g],[_;_;_] -> "subst",[n;c;g] + | "subst","subst",[n;c;g],[_;_;_;_] -> "subst",[n;c;g] + | "subst","adj",[n;c;g],_ -> "subst",[n;c;g] | "depr","subst",[["pl"];["nom"];["m2"]],[["sg"];["nom"];["m1"]] -> "depr",[["pl"];["nom"];["m2"]] + | "depr","subst",[["pl"];["acc"];["m2"]],[["sg"];["nom"];["m1"]] -> "depr",[["pl"];["acc"];["m2"]] | "ppron3","ppron3",ninterp,[["sg"];["nom"];["m1";"m2";"m3"];["ter"];_;_] -> "ppron3",ninterp - | "ppron12","ppron12",ninterp,[_;["nom"];_;_] -> "ppron3",ninterp - | "numcol","num",ninterp,_ -> "num",ninterp (* FIXME: wiele wpisów przejdzie *) - | "num","num",ninterp,_ -> "num",ninterp (* FIXME: wiele wpisów przejdzie *) + | "ppron12","ppron12",ninterp,[_;["nom"];_;_] -> "ppron12",ninterp + | "numcol","num",ninterp,_ -> "num",ninterp + | "num","num",ninterp,_ -> "num",ninterp (* na tym etapie nie da się skorygować błędów *) + (* | "num","num",[["pl"];c;g;["rec"]],[["sg";"pl"];["nom";"gen";"acc"];["m1";"m2";"m3";"f";"n"];["rec"]] -> "num",[["pl"];c;g;["rec"]] + | "num","num",[["pl"];c;["m2"];["rec"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"f";"n"];["rec"];col] -> "num",[["pl"];c;["m2"];["rec"]] + | "num","num",[["pl"];c;["m3"];["rec"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"f";"n"];["rec"];col] -> "num",[["pl"];c;["m3"];["rec"]] + | "num","num",[["pl"];c;["f"];["rec"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"f";"n"];["rec"];col] -> "num",[["pl"];c;["f"];["rec"]] + | "num","num",[["pl"];c;["n"];["rec"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"f";"n"];["rec"];col] -> "num",[["pl"];c;["n"];["rec"];col] + | "num","num",[["pl"];c;["m1"];["congr"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"f";"n"];["congr"];col] -> "num",[["pl"];c;["m1"];["congr"]] + | "num","num",[["pl"];c;["m2"];["congr"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"f";"n"];["congr"];col] -> "num",[["pl"];c;["m2"];["congr"]] + | "num","num",[["pl"];c;["m3"];["congr"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"f";"n"];["congr"];col] -> "num",[["pl"];c;["m3"];["congr"]] + | "num","num",[["pl"];c;["f"];["congr"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"f";"n"];["congr"];col] -> "num",[["pl"];c;["f"];["congr"]] + | "num","num",[["pl"];c;["n"];["congr"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"f";"n"];["congr"];col] -> "num",[["pl"];c;["n"];["congr"];col] + | "num","num",[["pl"];c;["m2"];["congr"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"n"];["congr"];col] -> "num",[["pl"];c;["m2"];["congr"]] + | "num","num",[["pl"];c;["m3"];["congr"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"n"];["congr"];col] -> "num",[["pl"];c;["m3"];["congr"]] + | "num","num",[["pl"];c;["n"];["congr"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"n"];["congr"];col] -> "num",[["pl"];c;["n"];["congr"];col] *) | "siebie","siebie",[[c]],[["acc";"gen"]] -> "siebie",[[c]] | "adj","adj",ninterp,[["sg"];["nom";"voc"];["m1";"m2";"m3"];["pos"]] -> "adj",ninterp + | "adj","adj",ninterp,[["sg";"pl"];["nom";"gen";"dat";"acc";"inst";"loc";"voc"];["m1";"m2";"m3";"f";"n"];["pos"]] -> "adj",ninterp | "adja","adj",ninterp,[["sg"];["nom";"voc"];["m1";"m2";"m3"];["pos"]] -> "adja",ninterp | "adjc","adj",ninterp,[["sg"];["nom";"voc"];["m1";"m2";"m3"];["pos"]] -> "adjc",ninterp | "adjp","adj",ninterp,[["sg"];["nom";"voc"];["m1";"m2";"m3"];["pos"]] -> "adjp",ninterp + | "adj","adj",ninterp,[["sg"];["nom"];["m1";"m2";"m3"];["pos"]] -> "adj",ninterp | "adv","adv",[[g]],[["pos"]] -> "adv",[[g]] - | "adv","adv",ninterp,interp -> if ninterp = interp then "adv",ninterp else raise Not_found + | "adv","adv",[],[["pos"]] -> "adv",[["pos"]] + | "adv",_,ninterp,_ -> "adv",ninterp + | "comp","comp",ninterp,interp -> if ninterp = interp then "comp",ninterp else raise Not_found + | "conj","conj",ninterp,interp -> if ninterp = interp then "conj",ninterp else raise Not_found + | "conj",_,ninterp,_ -> "conj",ninterp + | "prep","prep",[c1;w],[c2;_] -> if c1 = c2 then "prep",[c1;w] else raise Not_found | "prep","prep",ninterp,interp -> if ninterp = interp then "prep",ninterp else raise Not_found | "qub","qub",ninterp,interp -> if ninterp = interp then "qub",ninterp else raise Not_found - | "conj","conj",ninterp,interp -> if ninterp = interp then "conj",ninterp else raise Not_found - | "comp","comp",ninterp,interp -> if ninterp = interp then "comp",ninterp else raise Not_found + | "qub",_,ninterp,_ -> "qub",ninterp | "interj","interj",ninterp,interp -> if ninterp = interp then "interj",ninterp else raise Not_found + | "interj",_,ninterp,_ -> "interj",ninterp | "burk","burk",ninterp,interp -> if ninterp = interp then "burk",ninterp else raise Not_found | "pred","pred",ninterp,interp -> if ninterp = interp then "pred",ninterp else raise Not_found | "fin","inf",[n;p;["imperf"]],[["imperf";"perf"]] -> "fin",[n;p;["imperf"]] @@ -163,6 +181,8 @@ let get_cat_interp = function | "impt","inf",[n;p;["imperf"]],[["imperf"]] -> "impt",[n;p;["imperf"]] | "impt","inf",[n;p;["perf"]],[["imperf";"perf"]] -> "impt",[n;p;["perf"]] | "impt","inf",[n;p;["perf"]],[["perf"]] -> "impt",[n;p;["perf"]] + | "bedzie","inf",[n;p;["imperf"]],[["imperf"]] -> "bedzie",[n;p;["imperf"]] + | "aglt","inf",[n;p;["imperf"];w],[["imperf"]] -> "aglt",[n;p;["imperf"];w] | "inf","inf",[["imperf"]],[["imperf";"perf"]] -> "inf",[["imperf"]] | "inf","inf",[["imperf"]],[["imperf"]] -> "inf",[["imperf"]] | "inf","inf",[["perf"]],[["imperf";"perf"]] -> "inf",[["perf"]] @@ -175,7 +195,7 @@ let get_cat_interp = function | "praet","inf",[n;g;["imperf"];a],[["imperf"]] -> "praet",[n;g;["imperf"];a] | "praet","inf",[n;g;["perf"];a],[["imperf";"perf"]] -> "praet",[n;g;["perf"];a] | "praet","inf",[n;g;["perf"];a],[["perf"]] -> "praet",[n;g;["perf"];a] - | "winien","inf",[n;g;["imperf"]],[["imperf"]] -> "winien",[n;g;["imperf"]] + | "winien","winien",[n;g;["imperf"]],[_;_;["imperf"]] -> "winien",[n;g;["imperf"]] | "ppas","inf",[n;c;g;["imperf"];a],[["imperf";"perf"]] -> "ppas",[n;c;g;["imperf"];a] | "ppas","inf",[n;c;g;["imperf"];a],[["imperf"]] -> "ppas",[n;c;g;["imperf"];a] | "ppas","inf",[n;c;g;["perf"];a],[["imperf";"perf"]] -> "ppas",[n;c;g;["perf"];a] @@ -202,6 +222,16 @@ let get_cat_interp = function | "imps","inf",[["perf"]],[["perf"]] -> "imps",[["perf"]] | _ -> raise Not_found +let get_lemma_cat_interp = function + nlemma,lemma,"adj","ppas",[n;c;g;["pos"]],[["sg"];["nom";"voc"];["m1";"m2";"m3"];a;aff] -> lemma,"ppas",[n;c;g;a;aff] + | nlemma,lemma,"adja","adja",[],[] -> lemma,"adja",[] + | nlemma,lemma,"subst","subst",[["pl"];c;g],[["pl"];["nom";"voc"];_] -> lemma,"subst",[["pl"];c;g] + (* | "5","5","adj","dig",ninterp,[] -> "piąty","adj",ninterp + | "6","6","adj","dig",ninterp,[] -> "szósty","adj",ninterp *) + (* | "adj","ppas",ninterp,interp -> print_endline (ENIAMtokens.string_of_interps [ninterp] ^ " " ^ ENIAMtokens.string_of_interps [interp]); raise Not_found *) + | _ -> raise Not_found + + let correct_nlemma = function "letnia " -> "letnia" | "10minutowy" -> "minutowy" @@ -233,7 +263,7 @@ let correct_nlemma = function | "16-latek" -> raise Not_found | s -> s -let process_ntoken stats nlemma ncat ninterp = +let process_ntoken stats q nlemma ncat ninterp = try let nlemma = correct_nlemma nlemma in let nl = lemmatize_string nlemma in @@ -241,44 +271,291 @@ let process_ntoken stats nlemma ncat ninterp = {token=Lemma(lemma,cat,interp)} -> Xlist.fold interp nl (fun nl interp -> try - let cat,interp = get_cat_interp (ncat,cat,ninterp,interp) in - if lemma = nlemma then (Lemma(lemma,cat,[interp])) :: nl else nl + if lemma = nlemma then + let cat,interp = get_cat_interp (ncat,cat,ninterp,interp) in + (Lemma(lemma,cat,[interp])) :: nl else + let lemma,cat,interp = get_lemma_cat_interp (nlemma,lemma,ncat,cat,ninterp,interp) in + (Lemma(lemma,cat,[interp])) :: nl + with Not_found -> nl) + | {token=Dig(_,"dig")} -> nl (* FIXME: todo *) + (* (try + let lemma,cat,interp = get_lemma_cat_interp (nlemma,lemma,ncat,cat,ninterp,interp) in + (Lemma(lemma,cat,[interp])) :: nl + with Not_found -> nl) *) + | {token=RomanDig(_,"roman")} -> + if ncat = "adj" then (Lemma(nlemma,ncat,[ninterp])) :: nl else nl + | {token=Proper(lemma,cat,interp,_)} -> (*print_endline ("P " ^ nlemma);*) nl (* FIXME: todo *) + | _ -> nl) in + if nl2 = [] then StringQMap.add_val stats (ncat ^ " " ^ ENIAMtokens.string_of_token (Lemma(nlemma,ncat,[ninterp])) ^ ": " ^ String.concat " " (Xlist.map nl (fun t -> ENIAMtokens.string_of_token t.token))) q + else StringQMap.add_val stats "lemmatized" q + with Not_found -> StringQMap.add_val stats "incorrect" q + +let process_ntoken2 stats q name id_div orth beg paragraph nlemma ncat ninterp = + try + let nlemma = correct_nlemma nlemma in + let nl = lemmatize_string nlemma in + let nl2 = Xlist.fold nl [] (fun nl -> function + {token=Lemma(lemma,cat,interp)} -> + Xlist.fold interp nl (fun nl interp -> + try + if lemma = nlemma then + let cat,interp = get_cat_interp (ncat,cat,ninterp,interp) in + (Lemma(lemma,cat,[interp])) :: nl else + let lemma,cat,interp = get_lemma_cat_interp (nlemma,lemma,ncat,cat,ninterp,interp) in + (Lemma(lemma,cat,[interp])) :: nl with Not_found -> nl) | {token=Dig _} -> nl (* FIXME: todo *) + | {token=RomanDig(_,"roman")} -> + if ncat = "adj" then (Lemma(nlemma,ncat,[ninterp])) :: nl else nl | {token=Proper(lemma,cat,interp,_)} -> nl (* FIXME: todo *) | _ -> nl) in - if nl2 = [] then StringQMap.add stats (ncat ^ " " ^ ENIAMtokens.string_of_token (Lemma(nlemma,ncat,[ninterp])) ^ ": " ^ String.concat " " (Xlist.map nl (fun t -> ENIAMtokens.string_of_token t.token))) - else StringQMap.add stats "lemmatized" - with Not_found -> StringQMap.add stats "incorrect" + if nl2 = [] then + StringQMap.add_val stats (ncat ^ " " ^ ENIAMtokens.string_of_token (Lemma(nlemma,ncat,[ninterp])) ^ ": " ^ String.concat " " (Xlist.map nl (fun t -> ENIAMtokens.string_of_token t.token)) + ^ "\n" ^ name ^ " " ^ string_of_int id_div ^ " " ^ string_of_int beg ^ " " ^ orth ^ "\n" ^ paragraph) q + else StringQMap.add_val stats "lemmatized" q + with Not_found -> StringQMap.add_val stats "incorrect" q + +let validate_ntoken stats q (nlemma,ncat,ninterp) = + process_ntoken stats q nlemma ncat ninterp -let validate_ntoken stats (nlemma,ncat,ninterp) = - process_ntoken stats nlemma ncat ninterp +let rec validate_ntoken_token name id_div paragraph stats = function + Token t -> + (try + let nlemma,ncat,ninterp = get_ntoken t.attrs in + process_ntoken2 stats 1 name id_div t.orth t.beg paragraph nlemma ncat ninterp + (* print_endline (nlemma ^ "\t" ^ ncat ^ "\t" ^ ENIAMtokens.string_of_interps [ninterp]); + Printf.printf "%s\t%d\t%s\t%d\n" name id_div t.orth t.beg; + print_endline paragraph; + stats *) + with Not_found -> stats) + | Seq l -> Xlist.fold l stats (validate_ntoken_token name id_div paragraph) + | Variant l -> Xlist.fold l stats (validate_ntoken_token name id_div paragraph) -let match_lemmatize stats t = - if has_brev t.attrs then StringQMap.add stats "brev" else +let validate_ntoken_entry stats name typ channel entries = + prerr_endline name; + Xlist.fold entries stats (fun stats (id_div,has_ne,paragraphs) -> + Xlist.fold paragraphs stats (fun stats (paragraph,sentences) -> + let paragraph,tokens = annotate name sentences in + Xlist.fold tokens stats (validate_ntoken_token name id_div paragraph))) + +let rec subset_list = function + [],[] -> true + | [x] :: l1, y :: l2 -> if Xlist.mem y x then subset_list (l1,l2) else false + | _ -> false + +let match_cat_interp = function + | "subst","subst",[nn;nc;ng],[n;c;g;col] -> if subset_list ([nn;nc;ng],[n;c;g]) then "subst",[nn;nc;ng;col] else raise Not_found +(* | "numcol","num",ninterp,_ -> "num",ninterp*) + | "num","num",[nn;nc;["n"];na],[n;c;g;a;col] -> if subset_list ([nn;nc;["n"];na],[n;c;g;a]) then "num",[nn;nc;["n"];na;col] else raise Not_found + | "num","num",[nn;nc;ng;na],[n;c;g;a;col] -> if subset_list ([nn;nc;ng;na],[n;c;g;a]) then "num",[nn;nc;ng;na] else raise Not_found + | "adv","adv",[],[["pos"]] -> "adv",[["pos"]] + | _ -> raise Not_found + +let match_cat_interp_substgender = function + "subst","subst",[nn;nc;ng],[n;c;_] -> if subset_list ([nn;nc],[n;c]) then "subst",[nn;nc;ng] else raise Not_found + | "subst","subst",[nn;nc;ng],[n;c;_;_] -> if subset_list ([nn;nc],[n;c]) then "subst",[nn;nc;ng] else raise Not_found + | _ -> raise Not_found + +exception HasBrev +exception NoNtoken +exception LemmaNotMatched of string * string * string list list * token_env list +exception MultipleLemmaMatched of string * string * string list list * token list + +let rec sort_uniq_rec rev = function + [] -> rev + | x :: y :: l -> if x = y then sort_uniq_rec rev (y :: l) else sort_uniq_rec (x :: rev) (y :: l) + | [x] -> x :: rev + +let sort_uniq l = + match sort_uniq_rec [] (Xlist.sort l compare) with + [Lemma(lemma1,"subst",[[n1;c1;["n"];["ncol"]]]);Lemma(lemma2,"subst",[[n2;c2;["n"];["col"]]])] as l -> + if lemma1 = lemma2 && n1 = n2 && c1 = c2 then [Lemma(lemma1,"subst",[[n1;c1;["n"];["ncol";"col"]]])] else l + | [Lemma("kląsknięcie","subst",[[["pl"];c1;["n"];["pt"]]]);Lemma("kląsknięcie","subst",[[["pl"];c2;["n"];["ncol"]]])] as l -> + if c1 = c2 then [Lemma("kląsknięcie","subst",[[["pl"];c1;["n"];["pt"]]])] else l + | [Lemma("wybrażenie","subst",[[["pl"];c1;["n"];["pt"]]]);Lemma("wybrażenie","subst",[[["pl"];c2;["n"];["ncol"]]])] as l -> + if c1 = c2 then [Lemma("wybrażenie","subst",[[["pl"];c1;["n"];["pt"]]])] else l + | [Lemma(lemma1,"subst",[[["pl"];c1;["n"];["pt"]]]);Lemma(lemma2,"subst",[[["pl"];c2;["n"];["ncol"]]])] as l -> + (* print_endline lemma1; *) + if lemma1 = lemma2 && c1 = c2 then [Lemma(lemma1,"subst",[[["pl"];c1;["n"];["pt"]]])] else l + | l -> (*print_endline (String.concat " " (Xlist.map l (fun t -> ENIAMtokens.string_of_token t)));*) l + +type t = TokenMatched | TokenLowercase | TokenBrev | TokenSubstGender | TokenDeviated + +let match_lemmatize_simple t nlemma ncat ninterp = + let l1 = ENIAMpaths.lemmatize_token t in + let l2 = Xlist.fold l1 [] (fun l -> function + {token=Lemma(lemma,cat,interp)} -> + Xlist.fold interp l (fun l interp -> + try + if lemma = nlemma && cat = ncat && subset_list (ninterp,interp) then (Lemma(nlemma,ncat,[ninterp])) :: l else + if lemma = nlemma then + let cat,interp = match_cat_interp (ncat,cat,ninterp,interp) in + (Lemma(lemma,cat,[interp])) :: l else l + with Not_found -> l) + | {token=Dig _} -> l (* FIXME: todo *) + | {token=RomanDig(_,"roman")} -> + if ncat = "adj" then (Lemma(nlemma,ncat,[ninterp])) :: l else l + | {token=Proper(lemma,cat,interp,_)} -> l (* FIXME: todo *) + | _ -> l) in + match sort_uniq l2 with + [] -> raise (LemmaNotMatched(nlemma,ncat,ninterp,l1)) + | [t] -> t, TokenMatched + | _ -> raise (MultipleLemmaMatched(nlemma,ncat,ninterp,l2)) + +let match_lemmatize_lowercase t nlemma ncat ninterp = + let t = match t.token with + | FirstCap(s,lower,cl,ll) -> {t with token=AllSmall lower} + | CapLetter(s,lower) -> {t with token=SmallLetter lower} + | AllCap(_,a,b) -> {t with token=FirstCap(a,b,"","")} (* FIXME: to powinno być zdezambiguowane *) + | _ -> t in let l = ENIAMpaths.lemmatize_token t in - try - let nlemma,ncat,ninterp = get_ntoken t.attrs in + let l2 = Xlist.fold l [] (fun l -> function + {token=Lemma(lemma,cat,interp)} -> + Xlist.fold interp l (fun l interp -> + try + if lemma = nlemma && cat = ncat && subset_list (ninterp,interp) then (Lemma(nlemma,ncat,[ninterp])) :: l else + if lemma = nlemma then + let cat,interp = match_cat_interp (ncat,cat,ninterp,interp) in + (Lemma(lemma,cat,[interp])) :: l else l + with Not_found -> l) + | {token=Dig _} -> l (* FIXME: todo *) + | {token=RomanDig(_,"roman")} -> + if ncat = "adj" then (Lemma(nlemma,ncat,[ninterp])) :: l else l + | {token=Proper(lemma,cat,interp,_)} -> l (* FIXME: todo *) + | _ -> l) in + match sort_uniq l2 with + [] -> raise (LemmaNotMatched(nlemma,ncat,ninterp,l)) + | [t] -> t, TokenLowercase + | _ -> raise (MultipleLemmaMatched(nlemma,ncat,ninterp,l2)) + +let match_lemmatize_substgender t nlemma ncat ninterp = + let l1 = ENIAMpaths.lemmatize_token t in + let l2 = Xlist.fold l1 [] (fun l -> function + {token=Lemma(lemma,cat,interp)} -> + Xlist.fold interp l (fun l interp -> + try + if lemma = nlemma then + let cat,interp = match_cat_interp_substgender (ncat,cat,ninterp,interp) in + (Lemma(lemma,cat,[interp])) :: l else l + with Not_found -> l) + | {token=Dig _} -> l (* FIXME: todo *) + | {token=RomanDig(_,"roman")} -> + if ncat = "adj" then (Lemma(nlemma,ncat,[ninterp])) :: l else l + | {token=Proper(lemma,cat,interp,_)} -> l (* FIXME: todo *) + | _ -> l) in + match sort_uniq l2 with + [] -> raise (LemmaNotMatched(nlemma,ncat,ninterp,l1)) + | [t] -> t, TokenSubstGender + | _ -> raise (MultipleLemmaMatched(nlemma,ncat,ninterp,l2)) + +let match_lemmatize_deviated t nlemma ncat ninterp = + let l1 = ENIAMpaths.lemmatize_token t in + let nlemma = try correct_nlemma nlemma with Not_found -> raise (LemmaNotMatched(nlemma,ncat,ninterp,l1)) in + let nl = lemmatize_string nlemma in + let nl2 = Xlist.fold nl [] (fun nl -> function + {token=Lemma(lemma,cat,interp)} -> + Xlist.fold interp nl (fun nl interp -> + try + let lemma,cat,interp = get_lemma_cat_interp (nlemma,lemma,ncat,cat,ninterp,interp) in + (lemma,cat,interp) :: nl + with Not_found -> nl) + | _ -> nl) in + let l2 = Xlist.fold nl2 [] (fun l (nlemma,ncat,ninterp) -> + Xlist.fold l1 l (fun l -> function + {token=Lemma(lemma,cat,interp)} -> + Xlist.fold interp l (fun l interp -> + try + if lemma = nlemma && cat = ncat && subset_list (ninterp,interp) then (Lemma(nlemma,ncat,[ninterp])) :: l else + if lemma = nlemma then + let cat,interp = match_cat_interp (ncat,cat,ninterp,interp) in + (Lemma(lemma,cat,[interp])) :: l else l + with Not_found -> l) + | _ -> l)) in + match sort_uniq l2 with + [] -> raise (LemmaNotMatched(nlemma,ncat,ninterp,l1)) + | [t] -> t, TokenDeviated + | _ -> raise (MultipleLemmaMatched(nlemma,ncat,ninterp,l2)) + +let rec match_lemmatize_rec t nlemma ncat ninterp f0 = function + f :: l -> + (try f t nlemma ncat ninterp + with LemmaNotMatched _ -> match_lemmatize_rec t nlemma ncat ninterp f0 l) + | [] -> f0 t nlemma ncat ninterp + +let match_lemmatize (*stats q name id_div paragraph*) t = + if has_brev t.attrs then raise HasBrev (*StringQMap.add_val stats "brev" q*) +(* let nlemma = get_brev t.attrs in + (let l = ENIAMpaths.lemmatize_token t in + let l2 = Xlist.fold l [] (fun l -> function + {token=Lemma(lemma,cat,interp)} -> + Xlist.fold interp l (fun l interp -> + try + if lemma = nlemma then (Lemma(nlemma,cat,[interp])) :: l else l + with Not_found -> l) + (* | {token=Dig _} -> l (* FIXME: todo *) + | {token=RomanDig(_,"roman")} -> + if ncat = "adj" then (Lemma(nlemma,ncat,[ninterp])) :: l else l + | {token=Proper(lemma,cat,interp,_)} -> l (* FIXME: todo *) *) + | _ -> l) in + match sort_uniq l2 with + [] -> raise (LemmaNotMatched(nlemma,"BREV",[],l)) + | [t] -> t, TokenBrev + | _ -> raise (MultipleLemmaMatched(nlemma,"BREV",[],l2)))*) + else + let nlemma,ncat,ninterp = try get_ntoken t.attrs with Not_found -> raise NoNtoken in + match_lemmatize_rec t nlemma ncat ninterp match_lemmatize_simple + [match_lemmatize_simple; match_lemmatize_lowercase; match_lemmatize_substgender; match_lemmatize_deviated] + (* let ninterp = if ncat = "adv" && ninterp = [] then [["pos"]] else ninterp in *) +(* let l1 = ENIAMpaths.lemmatize_token t in + let l2 = Xlist.fold l1 [] (fun l -> function + {token=Lemma(lemma,cat,interp)} -> + Xlist.fold interp l (fun l interp -> + try + if lemma = nlemma && cat = ncat && subset_list (ninterp,interp) then (Lemma(nlemma,ncat,[ninterp])) :: l else + if lemma = nlemma then + let cat,interp = match_cat_interp (ncat,cat,ninterp,interp) in + (Lemma(lemma,cat,[interp])) :: l else l + with Not_found -> l) + | {token=Dig _} -> l (* FIXME: todo *) + | {token=RomanDig(_,"roman")} -> + if ncat = "adj" then (Lemma(nlemma,ncat,[ninterp])) :: l else l + | {token=Proper(lemma,cat,interp,_)} -> l (* FIXME: todo *) + | _ -> l) in + match sort_uniq l2 with + [] -> (*raise (LemmaNotMatched(nlemma,ncat,ninterp,l))*) +lowercase + | [t] -> t, TokenMatched + | _ -> raise (MultipleLemmaMatched(nlemma,ncat,ninterp,l2))*) + +(* try let nlemma = correct_nlemma nlemma in let nl = lemmatize_string nlemma in let nl2 = Xlist.fold nl [] (fun nl -> function {token=Lemma(lemma,cat,interp)} -> Xlist.fold interp nl (fun nl interp -> try - let cat,interp = get_cat_interp (ncat,cat,ninterp,interp) in - if lemma = nlemma then (Lemma(lemma,cat,[interp])) :: nl else nl + if lemma = nlemma then + let cat,interp = get_cat_interp (ncat,cat,ninterp,interp) in + (Lemma(lemma,cat,[interp])) :: nl else + let lemma,cat,interp = get_lemma_cat_interp (nlemma,lemma,ncat,cat,ninterp,interp) in + (Lemma(lemma,cat,[interp])) :: nl with Not_found -> nl) | {token=Dig _} -> nl (* FIXME: todo *) + | {token=RomanDig(_,"roman")} -> + if ncat = "adj" then (Lemma(nlemma,ncat,[ninterp])) :: nl else nl | {token=Proper(lemma,cat,interp,_)} -> nl (* FIXME: todo *) | _ -> nl) in - if nl2 = [] then StringQMap.add stats (ENIAMtokens.string_of_token (Lemma(nlemma,ncat,[ninterp])) ^ ": " ^ String.concat " " (Xlist.map nl (fun t -> ENIAMtokens.string_of_token t.token))) + if nl2 = [] then + StringQMap.add_val stats (ncat ^ " " ^ ENIAMtokens.string_of_token (Lemma(nlemma,ncat,[ninterp])) ^ ": " ^ String.concat " " (Xlist.map nl (fun t -> ENIAMtokens.string_of_token t.token)) + ^ "\n" ^ name ^ " " ^ string_of_int id_div ^ " " ^ string_of_int t.beg ^ " " ^ t.orth ^ "\n" ^ paragraph) q + (* if nl2 = [] then StringQMap.add_val stats (ENIAMtokens.string_of_token (Lemma(nlemma,ncat,[ninterp])) ^ ": " ^ String.concat " " (Xlist.map nl (fun t -> ENIAMtokens.string_of_token t.token))) q *) (* let l2 = Xlist.fold l [] (fun l2 t2 -> match t2.token with Lemma(lemma,cat,interp) -> if lemma = nlemma (*|| lemma = lowercase nlemma t.token*) then t2 :: l2 else l2 (* | Proper(lemma,cat,interp,_) -> if lemma = nlemma || lemma = lowercase nlemma t.token then t2 :: l2 else l2 *) | _ -> l2) in if l2 = [] then StringQMap.add stats ("no lemma: " ^ t.orth ^ " " ^ nlemma) else *) - else StringQMap.add stats "lemmatized" + else StringQMap.add_val stats "lemmatized" q (* let l3 = Xlist.fold l2 [] (fun l3 t -> match t.token with Lemma(lemma2,cat2,interp2) -> if cat = cat2 then t :: l3 else l3 @@ -308,12 +585,30 @@ let match_lemmatize stats t = | [{token=Lemma _};{token=SmallLetter _}] -> stats | [{token=Lemma _};{token=FirstCap _}] -> stats | l -> StringQMap.add stats ("multiple interp: " ^ t.orth ^ " " ^ lemma ^ " " ^ cat ^ "\n" ^ String.concat "\n" (Xlist.map l ENIAMtokens.string_of_token_env))*) - with Not_found -> StringQMap.add stats "no ntoken" (*("no ntoken for: " ^ t.orth ^ " " ^ ENIAMtokens.string_of_token t.token)*) + with Not_found -> StringQMap.add_val stats "no ntoken/incorrect" q + (* with Not_found -> StringQMap.add_val stats "no ntoken" q (*("no ntoken for: " ^ t.orth ^ " " ^ ENIAMtokens.string_of_token t.token)*) *)*) -let rec validate_token stats = function - Token t -> match_lemmatize stats t - | Seq l -> Xlist.fold l stats validate_token - | Variant l -> Xlist.fold l stats validate_token +let rec validate_token name id_div paragraph stats = function + Token t -> + (* if t.orth = "POWIŚLE" then Printf.printf "%s %d %s\n%s\n" name id_div paragraph (ENIAMtokens.string_of_token_env t); *) + (try let _,f = match_lemmatize (*stats 1 name id_div paragraph*) t in + match f with + TokenMatched -> StringQMap.add stats "validated" + | TokenLowercase -> StringQMap.add stats "validated as lowercase" + | TokenBrev -> StringQMap.add stats "validated abbreviation" + | TokenSubstGender -> StringQMap.add stats "validated substgender" + | TokenDeviated -> StringQMap.add stats "validated deviated" + with + HasBrev -> StringQMap.add stats ("has brev: " ^ t.orth (*^ " " ^ lemma ^ " " ^ cat ^ "\n"*)) + (* | NoNtoken -> StringQMap.add stats ("no ntoken: " ^ t.orth (*^ " " ^ lemma ^ " " ^ cat ^ "\n"*)) *) + | NoNtoken -> StringQMap.add stats "no ntoken" + | LemmaNotMatched(nlemma,ncat,ninterp,l) -> + (* StringQMap.add stats (Printf.sprintf "lemma not matched: %s %s : %s \n%s" t.orth (ENIAMtokens.string_of_token (Lemma(nlemma,ncat,[ninterp]))) (String.concat " " (Xlist.map l (fun t -> ENIAMtokens.string_of_token t.token))) paragraph) *) + StringQMap.add stats (Printf.sprintf "%s %s %s %d %s\n#%s\n#%s" ncat t.orth name id_div (ENIAMtokens.string_of_token (Lemma(nlemma,ncat,[ninterp]))) + (String.concat " " (Xlist.map l (fun t -> ENIAMtokens.string_of_token t.token))) paragraph) + | MultipleLemmaMatched(nlemma,ncat,ninterp,l) -> StringQMap.add stats (Printf.sprintf "multiple lemma matched: %s %s : %s" t.orth (ENIAMtokens.string_of_token (Lemma(nlemma,ncat,[ninterp]))) (String.concat " " (Xlist.map l (fun t -> ENIAMtokens.string_of_token t))))) + | Seq l -> Xlist.fold l stats (validate_token name id_div paragraph) + | Variant l -> Xlist.fold l stats (validate_token name id_div paragraph) let validate_morphology stats name typ channel entries = prerr_endline name; @@ -323,7 +618,7 @@ let validate_morphology stats name typ channel entries = (* print_endline paragraph; *) (*let s = "W Specjalnym Ośrodku Szkolno-Wychowawczym" in if String.length paragraph >= String.length s && String.sub paragraph 0 (String.length s) = s then*) - Xlist.fold tokens stats validate_token + Xlist.fold tokens stats (validate_token name id_div paragraph) (*else stats*))) let ntokens_filename = "results/ntokens.tab" @@ -333,7 +628,7 @@ let parse_ninterp s = let fold_ntokens ntokens_filename s f = File.fold_tab ntokens_filename s (fun s -> function - [_;nlemma;ncat;ninterp] -> f s (nlemma,ncat,parse_ninterp ninterp) + [q;nlemma;ncat;ninterp] -> f s (int_of_string q) (nlemma,ncat,parse_ninterp ninterp) | l -> failwith ("fold_ntokens: " ^ String.concat "\t" l)) let selection = StringSet.of_list [(*"Rzeczpospolita";"200-4-000014";"040-2-000007";"120-2-900126";"120-2-910000001";"120-2-910000002";"120-4-900005"; @@ -371,12 +666,15 @@ let _ = create_ntoken_list stats name typ channel entries) in *) (* let stats = ENIAM_NKJP.fold ENIAM_NKJP.nkjp_path StringQMap.empty (fun stats (name,typ,channel,entries) -> create_ntoken_list stats name typ channel entries) in *) - let stats = fold_ntokens ntokens_filename StringQMap.empty validate_ntoken in + (* let stats = fold_ntokens ntokens_filename StringQMap.empty validate_ntoken in *) + (* let stats = ENIAM_NKJP.fold_selected ENIAM_NKJP.nkjp_path selection [] [] StringQMap.empty (fun stats (name,typ,channel,entries) -> + validate_ntoken_entry stats name typ channel entries) in *) (* let stats = ENIAM_NKJP.fold_selected ENIAM_NKJP.nkjp_path selection [] [] StringQMap.empty (fun stats (name,typ,channel,entries) -> validate_morphology stats name typ channel entries) in *) - (* let stats = ENIAM_NKJP.fold ENIAM_NKJP.nkjp_path StringQMap.empty (fun stats (name,typ,channel,entries) -> - validate_morphology stats name typ channel entries) in *) + let stats = ENIAM_NKJP.fold ENIAM_NKJP.nkjp_path StringQMap.empty (fun stats (name,typ,channel,entries) -> + validate_morphology stats name typ channel entries) in let stats = StringQMap.fold stats [] (fun stats k v -> (v,k) :: stats) in Xlist.iter (Xlist.sort stats compare) (fun (v,k) -> Printf.printf "%d\t%s\n" v k); + flush stdout; ignore(Sys.command "mpg123 \"../../Inne/gong/gong_00m_30s.mp3\""); () diff --git a/NKJP2/validateTokenizer.ml b/NKJP2/validateTokenizer.ml index 538bf78..cf33f58 100644 --- a/NKJP2/validateTokenizer.ml +++ b/NKJP2/validateTokenizer.ml @@ -609,7 +609,7 @@ let rec annotate_paragraph name paragraph l = function with Not_found -> (try let m,ets,l = annotate_apply_rules (et :: ets) l rules in m :: annotate_paragraph name paragraph l ets - with Not_found -> failwith "annotate_paragraph 1"))) + with Not_found -> (*print_endline ("annotate_paragraph 1: " ^ (string_of_vtoken et));*)failwith "annotate_paragraph 1"))) | [] -> if l = [] then [] else failwith "annotate_paragraph 2" let validate_segmentation stats name typ channel entries = @@ -713,12 +713,15 @@ let transform_nkjp_interp cat interp1 = | "prep" | "adv" | "fin" | "inf" | "imps" | "pcon" | "bedzie" | "impt" | "siebie" | "aglt" | "pant" | "brev" | "qub" -> interp | _ -> print_endline ("transform_nkjp_interp: " ^ cat ^ " " ^ String.concat ":" interp1); interp +let transform_nkjp_interp_simple cat interp1 = + Xlist.map interp1 (fun s -> [s]) + let merge_token = function t,[] -> Token t | t,[{ncat="brev"} as n] -> set_sent n.nsent {t with attrs=BrevLemma n.nlemma :: t.attrs} | t,[n] -> if n.nlemma = "+/-" then set_sent n.nsent t else - if is_lemmatizable t.token then set_sent n.nsent {t with attrs=Disamb(n.nlemma,n.ncat,transform_nkjp_interp n.ncat n.ninterp) :: t.attrs} + if is_lemmatizable t.token then set_sent n.nsent {t with attrs=Disamb(n.nlemma,n.ncat,transform_nkjp_interp_simple n.ncat n.ninterp) :: t.attrs} else set_sent n.nsent t | _ -> failwith "merge_token" @@ -732,7 +735,7 @@ let merge_letni l seq = match List.rev seq with last :: l -> let attrs = if n.nsent=SentEnd || n.nsent=SentBegEnd then (SentEnd : attr) :: last.attrs else last.attrs in - Seq(Xlist.rev_map ({last with attrs=Disamb(lemma,n.ncat,transform_nkjp_interp n.ncat n.ninterp) :: attrs} :: l) (fun t -> Token t)) + Seq(Xlist.rev_map ({last with attrs=Disamb(lemma,n.ncat,transform_nkjp_interp_simple n.ncat n.ninterp) :: attrs} :: l) (fun t -> Token t)) | _ -> failwith "merge_letni" let blabla_orths = StringSet.of_list ["8.12"; "9.11"; "1.1"; "1.2"] @@ -751,7 +754,7 @@ let merge_paragraph name = function | AR("brev",variants,l) -> Variant(Xlist.rev_map variants (fun ets -> Seq(Xlist.map (set_sent_list ets l) (fun t -> Token t)))) | AR("both-correct",variants,l) -> Variant(Xlist.rev_map variants (fun ets -> Seq(Xlist.map (set_sent_list ets l) (fun t -> Token t)))) | AR("eniam-correct",variants,l) -> Variant(Xlist.rev_map variants (fun ets -> Seq(Xlist.map (set_sent_list ets l) (fun t -> Token t)))) - | AR("nkjp-correct",variants,l) -> Seq(Xlist.map l (fun n -> set_sent n.nsent {empty_token_env with orth=n.north; token=Lemma(n.nlemma,n.ncat,[transform_nkjp_interp n.ncat n.ninterp])})) (* FIXME: ustalenie beg len next *) + | AR("nkjp-correct",variants,l) -> Seq(Xlist.map l (fun n -> set_sent n.nsent {empty_token_env with orth=n.north; token=Lemma(n.nlemma,n.ncat,[transform_nkjp_interp_simple n.ncat n.ninterp])})) (* FIXME: ustalenie beg len next *) | t -> failwith ("merge_paragraph: " ^ string_of_atoken t) let test_annotate name typ channel entries = @@ -783,6 +786,60 @@ let test_annotate name typ channel entries = (* print_endline (String.concat "\n" (Xlist.map m string_of_atoken))); *) ())) +type cap = Capital | Small | Sign + +let classify_cap s = + match Xunicode.classified_chars_of_utf8_string s with + Xunicode.Capital _ :: _ -> Capital + | Xunicode.ForeignCapital _ :: _ -> Capital + | Xunicode.Small _ :: _ -> Small + | Xunicode.ForeignSmall _ :: _ -> Small + | _ -> Sign + +let rec get_ntoken = function + (Disamb(nlemma,ncat,ninterp) : attr) :: _ -> nlemma,ncat,ninterp + | _ :: l -> get_ntoken l + | [] -> raise Not_found + +let rec disambiguate_capitalics = function + Token t -> + (try + let nlemma,ncat,ninterp = get_ntoken t.attrs in + let c = match t.token, classify_cap nlemma with + ENIAMtokenizerTypes.SmallLetter _, Small -> true + | ENIAMtokenizerTypes.CapLetter _, Capital -> true + | ENIAMtokenizerTypes.AllSmall _ , Small-> true + | ENIAMtokenizerTypes.AllCap _, Capital -> true + (* | ENIAMtokenizerTypes.AllCap _, Small -> true *) + | ENIAMtokenizerTypes.FirstCap _, Capital -> true + | ENIAMtokenizerTypes.SomeCap _, Capital -> true + | ENIAMtokenizerTypes.SomeCap _, Small -> true + | ENIAMtokenizerTypes.RomanDig _, Capital -> true + | ENIAMtokenizerTypes.Interp _, _ -> true + | ENIAMtokenizerTypes.Symbol _, _ -> true + | ENIAMtokenizerTypes.Dig _, _ -> true + | ENIAMtokenizerTypes.Other _, _ -> true + | ENIAMtokenizerTypes.Lemma _, _ -> true + | ENIAMtokenizerTypes.Proper _, _ -> true + | ENIAMtokenizerTypes.Compound _, _ -> true + | ENIAMtokenizerTypes.Tokens _, _ -> true + | _ -> false in + Token t, c + (* let nc = classify_cap nlemma in + let no = classify_cap t.orth in + if no = nc then Token t,true else Token t,false *) + with Not_found -> Token t,true) + | Seq l -> + let l,c = Xlist.fold l ([],true) (fun (l,c) t -> + let t,d = disambiguate_capitalics t in + t :: l, c && d) in + Seq(List.rev l), c + | Variant l -> + let l2 = Xlist.fold l [] (fun l t -> + let t,d = disambiguate_capitalics t in + if d then t :: l else l) in + if l2 = [] then Variant l,false else Variant l2,true + let annotate name sentences = let tokens = flatten_sentences sentences in let tokens = simple_allign "" "" [] tokens in @@ -793,8 +850,21 @@ let annotate name sentences = let eniam_tokens = annotate_variants_par eniam_tokens in let m = annotate_paragraph name paragraph tokens eniam_tokens in let m = List.rev (Xlist.rev_map m (merge_paragraph name)) in + let m = List.rev (Xlist.fold m [] (fun m t -> + let t,_ = disambiguate_capitalics t in + t :: m)) in paragraph, m +let test_disambiguate_capitalics stats name typ channel entries = + prerr_endline name; + Xlist.fold entries stats (fun stats (id_div,has_ne,paragraphs) -> + Xlist.fold paragraphs stats (fun stats (paragraph,sentences) -> + let paragraph,tokens = annotate name sentences in + Xlist.fold tokens stats (fun stats t -> + let _,c = disambiguate_capitalics t in + if c then stats else StringQMap.add stats (Printf.sprintf "%s %s" (ENIAMtokens.string_of_tokens 0 t) paragraph)))) + + let selection = StringSet.of_list [(*"Rzeczpospolita";"200-4-000014";"040-2-000007";"120-2-900126";"120-2-910000001";"120-2-910000002";"120-4-900005"; "620-3-010001110";"620-3-010001449";"620-3-010001622";"620-3-010001727"; "620-3-010001731";"620-3-010001741";"620-3-010001854";"711-3-010000051";"711-3-010000056"; @@ -839,6 +909,8 @@ let _ = test_annotate name typ channel entries); *) (* ENIAM_NKJP.fold ENIAM_NKJP.nkjp_path () (fun () (name,typ,channel,entries) -> test_annotate name typ channel entries); *) + (* let stats = ENIAM_NKJP.fold_selected ENIAM_NKJP.nkjp_path selection [] [] StringQMap.empty (fun stats (name,typ,channel,entries) -> + test_disambiguate_capitalics stats name typ channel entries) in *) (* let stats = StringQMap.fold stats [] (fun stats k v -> (v,k) :: stats) in Xlist.iter (Xlist.sort stats compare) (fun (v,k) -> Printf.printf "%d\t%s\n" v k); *) (* ignore(Sys.command "mpg123 \"../../Inne/gong/gong_00m_30s.mp3\""); *) diff --git a/morphology/doc/model2.pdf b/morphology/doc/model2.pdf index 1ece0e0..c31b9b0 100644 Binary files a/morphology/doc/model2.pdf and b/morphology/doc/model2.pdf differ diff --git a/morphology/resources/alt_supplement.tab b/morphology/resources/alt_supplement.tab index 9fe48ec..f75e5ad 100644 --- a/morphology/resources/alt_supplement.tab +++ b/morphology/resources/alt_supplement.tab @@ -2,4 +2,5 @@ się się qub siebie siebie siebie:acc.gen sobie siebie siebie:dat.loc sobą siebie siebie:inst +to to pred -- libgit2 0.22.2