validateMorphology.ml 10.6 KB
(*
 *  ENIAM_NKJP, an interface for National Corpus of Polish (NKJP).
 *  Copyright (C) 2017 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
 *  Copyright (C) 2017 Institute of Computer Science Polish Academy of Sciences
 *
 *  This library is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *)

open ENIAMtokenizerTypes
open Xstd
open ValidateTokenizer

let rec select_interp = function (* przejście z m1 do m1.p1 *)
    "n" :: l,["n1"] :: ll -> ["n1"] :: (select_interp (l,ll))
  | "n" :: l,["n2"] :: ll -> ["n2"] :: (select_interp (l,ll))
  | "n" :: l,["p2"] :: ll -> ["p2"] :: (select_interp (l,ll))
  | "n" :: l,["p3"] :: ll -> ["p3"] :: (select_interp (l,ll))
  | "n" :: l,["n1";"n2"] :: ll -> ["n1";"n2"] :: (select_interp (l,ll))
  | "n" :: l,["m1";"m2";"m3";"f";"n1";"n2";"p1";"p2";"p3"] :: ll -> ["n1";"n2";"p2";"p3"] :: (select_interp (l,ll))
  | "n" :: l,["m1";"m2";"m3";"f";"n1";"n2";"p1";"p2"] :: ll -> ["n1";"n2";"p2"] :: (select_interp (l,ll))
  | "n" :: l,["m1";"m2";"m3";"n1";"n2"] :: ll -> ["n1";"n2"] :: (select_interp (l,ll))
  | "n" :: l,["m1";"m2";"m3";"f";"n1";"n2"] :: ll -> ["n1";"n2"] :: (select_interp (l,ll))
  | "n" :: l,["m2";"m3";"f";"n1";"n2";"p2";"p3"] :: ll -> ["n1";"n2";"p2";"p3"] :: (select_interp (l,ll))
  | "m1" :: l,["_"] :: ll -> ["m1"] :: (select_interp (l,ll))
  | "m2" :: l,["_"] :: ll -> ["m2"] :: (select_interp (l,ll))
  | "m3" :: l,["_"] :: ll -> ["m3"] :: (select_interp (l,ll))
  | "f" :: l,["_"] :: ll -> ["f"] :: (select_interp (l,ll))
  | "n" :: l,["_"] :: ll -> ["n1";"n2";"p2";"p3"] :: (select_interp (l,ll))
  | a :: l,al :: ll -> if Xlist.mem al a then [a] :: (select_interp (l,ll)) else raise Not_found
  | [],[] -> []
  | _ -> raise Not_found

let lowercase s = function
    AllSmall _ -> s
  | SmallLetter _ -> s
  | FirstCap(_,_,c,l) ->
      if Xstring.check_prefix c s then
        l ^ Xstring.cut_prefix c s
      else failwith ("lowercase: " ^ s ^ " " ^ c)
  | t -> failwith ("lowercase: " ^ ENIAMtokens.string_of_token t)

let match_lemmatize stats t lemma cat interp =
  if cat = "brev" then stats else
  if t.token = Symbol "." then stats else
  let l = ENIAMpaths.lemmatize_token t in
  let l2 = Xlist.fold l [] (fun l2 t2 ->
    match t2.token with
      Lemma(lemma2,cat2,interp2) -> if lemma = lemma2 || lemma = lowercase lemma2 t.token then t2 :: l2 else l2
    | Proper(lemma2,cat2,interp2,_) -> if lemma = lemma2 || lemma = lowercase lemma2 t.token then t2 :: l2 else l2
    | _  -> t2 :: l2) in
  if l2 = [] then StringQMap.add stats ("no lemma: " ^ t.orth ^ " " ^ lemma) else
  let l3 = Xlist.fold l2 [] (fun l3 t ->
    match t.token with
      Lemma(lemma2,cat2,interp2) -> if cat = cat2 then t :: l3 else l3
    | Proper(lemma2,cat2,interp2,_) -> if cat = cat2 then t :: l3 else l3
    | _  -> t :: l3) in
  if l3 = [] then StringQMap.add stats ("no cat: " ^ t.orth ^ " " ^ lemma ^ " " ^ cat) else
  let l4 = Xlist.fold l3 [] (fun l4 t ->
    match t.token with
      Lemma(lemma2,cat2,interp2) ->
        let interp2 = Xlist.fold interp2 [] (fun interp2 interp3 -> try select_interp (interp,interp3) :: interp2 with Not_found -> interp2) in
        if interp2 = [] then l4 else (
        if Xlist.size interp2 > 1 then print_endline "match_lemmatize: multiple interp";
        (try {t with token=Lemma(lemma2,cat2,interp2)} :: l4 with Not_found -> l4))
    | Proper(lemma2,cat2,interp2,sense2) ->
        let interp2 = Xlist.fold interp2 [] (fun interp2 interp3 -> try select_interp (interp,interp3) :: interp2 with Not_found -> interp2) in
        if interp2 = [] then l4 else (
        if Xlist.size interp2 > 1 then print_endline "match_lemmatize: multiple interp";
        (try {t with token=Proper(lemma2,cat2,interp2,sense2)} :: l4 with Not_found -> l4))
    | _  -> t :: l4) in
  match l4 with
    [] -> StringQMap.add stats ("no interp: " ^ t.orth ^ " " ^ lemma ^ " " ^ cat ^ " " ^ String.concat ":" interp ^ "\n" ^ String.concat "\n" (Xlist.map l ENIAMtokens.string_of_token_env))
  (* | [{token=AllSmall _}] -> StringQMap.add stats ("no Lemma: " ^ t.orth ^ " " ^ lemma ^ " " ^ cat ^ " " ^ String.concat ":" interp ^ "\n" ^ String.concat "\n" (Xlist.map l ENIAMtokens.string_of_token_env)) *)
  | [{token=AllSmall _}] -> StringQMap.add stats "no Lemma"
  | [{token=FirstCap _}] -> StringQMap.add stats ("no Lemma 2: " ^ t.orth ^ " " ^ lemma ^ " " ^ cat ^ " " ^ String.concat ":" interp ^ "\n" ^ String.concat "\n" (Xlist.map l ENIAMtokens.string_of_token_env))
  (* | [{token=FirstCap _}] -> StringQMap.add stats "no Lemma2" *)
  | [{token=Lemma _};{token=AllSmall _}] -> stats
  | [{token=Lemma _};{token=SmallLetter _}] -> stats
  | [{token=Lemma _};{token=FirstCap _}] -> stats
  | l -> StringQMap.add stats ("multiple interp: " ^ t.orth ^ " " ^ lemma ^ " " ^ cat ^ "\n" ^ String.concat "\n" (Xlist.map l ENIAMtokens.string_of_token_env))

let is_lemmatizable = function
  | AllSmall _ -> true
  | SmallLetter _ -> true
  | FirstCap _ -> true
  | AllCap _ -> true
  | CapLetter _ -> true
  | SomeCap _ -> true
  | t -> false

let validate_token stats = function
    AT(t,[sent,orth,lemma,"brev",interp]) -> StringQMap.add stats "brev"
  | AT(t,l(*[sent,orth,lemma,cat,interp]*)) ->
      if is_lemmatizable t.token then
        StringQMap.add stats "lemmatizable" else StringQMap.add stats "non lemmatizable"
         (*match_lemmatize stats t lemma cat interp*)
  (* | AT(_,l) as t -> StringQMap.add stats ("validate_token: " ^ string_of_atoken t)*)
  | AV(tl,l) as t -> StringQMap.add stats ("validate_token: " ^ string_of_atoken t)
  | AR(stat,tl,l) as t -> StringQMap.add stats ("validate_token: " ^ string_of_atoken t)
  (* | _ -> StringQMap.add stats "validate_token: ni" *)

let validate_morphology stats name typ channel entries =
  prerr_endline name;
  Xlist.fold entries stats (fun stats (id_div,has_ne,paragraphs) ->
    Xlist.fold paragraphs stats (fun stats (paragraph,sentences) ->
      let paragraph,tokens = annotate name sentences in
      (* print_endline paragraph; *)
      (*let s = "W Specjalnym Ośrodku Szkolno-Wychowawczym" in
      if String.length paragraph >= String.length s && String.sub paragraph 0 (String.length s) = s then*)
        Xlist.fold tokens stats validate_token
      (*else stats*)))

let selection = StringSet.of_list [(*"Rzeczpospolita";"200-4-000014";"040-2-000007";"120-2-900126";"120-2-910000001";"120-2-910000002";"120-4-900005";
"620-3-010001110";"620-3-010001449";"620-3-010001622";"620-3-010001727";
"620-3-010001731";"620-3-010001741";"620-3-010001854";"711-3-010000051";"711-3-010000056";
"711-3-010000079";"720-3-010000217";"720-3-010000335";"720-3-010000341";"forumowisko.pl_18535";"forumowisko.pl_424";"";"";"";"";"";"";"";"";"";"";"";"";"";"";"";"";"";*)
  (*"040-2-000001";"040-2-000007";"040-4-000000103";"120-2-000003";"120-2-000007";"120-2-000009";"120-2-000010";"120-2-900017";"120-2-900041";"120-2-900044";"120-2-900083";
  "120-2-900092";"120-2-900094";"120-2-900123";"120-2-910000011";"120-4-900000001";"120-4-900008";"120-4-900010";"130-3-900001";"130-3-910001";"130-5-000000267";
  "130-5-000000406";"130-5-000000817";"130-5-000001188";"130-5-000001274";"130-5-000001338";"130-5-000001628";"130-5-000001742";"200-1-000011";"200-1-000026";"200-2-000078";
  "200-2-000173";"200-2-000175";"200-4-000000307";"200-4-000000316";"310-2-000007";"320-2-000000094";"320-2-000034";"320-2-000064";"320-3-000226";"330-2-000000030";
  "330-2-000000033";"330-2-000000200";"330-2-000000213";"330-2-000003";"330-2-000013";"620-3-010000057";"620-3-010000838";"620-3-010001103";"620-3-010001107";"620-3-010001108";
  "620-3-010001109";"620-3-010001125";"620-3-010001274";"620-3-010001448";"620-3-010001732";"620-3-010001772";"711-3-010000021";"712-1-900003";"712-1-900004";"720-3-000071";
  "720-3-010000323";"DP1999";"DP2002";"DP2003";"EkspressWieczorny";"forumowisko.pl_20218";"forumowisko.pl_42911";"forumowisko.pl_724";"GazetaGoleniowska";"GazetaTczewska";
  "NIE";"SuperExpress";"TrybunaSlaska";*)
  (* "120-2-000009";"120-2-000010";"120-2-000012";"120-2-900019";"120-2-900041";"120-2-900044";"120-2-900092";"120-2-900123";"120-2-910000011";"120-4-900000001";"120-4-900001";
  "120-4-900008";"130-3-900001";"130-5-000000267";"130-5-000000817";"130-5-000001188";"130-5-000001274";"130-5-000001628";"130-5-000001635";"130-5-000001742";"200-1-000011";
  "200-2-000078";"200-2-000181";"200-4-000000314";"200-4-000026";"200-4-000059";"310-2-000007";"320-2-000000087";"320-2-000000094";"320-2-000034";"330-2-000013";"620-3-010000057";
  "620-3-010000099";"620-3-010000838";"620-3-010000839";"620-3-010001729";"620-3-010001743";"620-3-010001853";"620-3-010001873";"620-3-010001895";"711-3-010000021";"720-3-000071";
  "720-3-010000323";"720-3-010000337";"DP2000";"EkspressWieczorny";"forumowisko.pl_12517";"forumowisko.pl_20218";"forumowisko.pl_42911";"GazetaTczewska";"SuperExpress" *)
  (* "120-2-900092";"120-4-900000001";"120-4-900008";"130-3-900001";"200-2-000078";"200-4-000059";"330-2-000013";"720-3-000071";"720-3-010000337";"EkspressWieczorny" *)
  (* "110-4-000000102";"120-2-000006";"120-2-900032";"120-2-900035";"130-3-900005";"130-3-910001";
  "130-5-000000507";"130-5-000000765";"130-5-000001156";"200-2-000191";"330-2-000000030";
  "620-3-010000835";"620-3-010001772";"DP1999";"GazetaGoleniowska";"GazetaMalborska";"KOT";
  "KurierKwidzynski";"NIE";"Rzeczpospolita";"TrybunaSlaska" *)
  (* "110-4-000000102";"120-2-000006";"120-2-900032";"130-5-000000507";"130-5-000001156";
  "620-3-010000835";"GazetaGoleniowska";"KurierKwidzynski";"NIE";"Rzeczpospolita"; *)
  "110-4-000000102";"KurierKwidzynski";
  (* "620-3-010001496"; *)
]

let _ =
  ENIAMtokenizer.initialize ();
  ENIAMinflexion.initialize ();
  let stats = ENIAM_NKJP.fold_selected ENIAM_NKJP.nkjp_path selection [] [] StringQMap.empty (fun stats (name,typ,channel,entries) ->
    validate_morphology stats name typ channel entries) in
  (* let stats = ENIAM_NKJP.fold ENIAM_NKJP.nkjp_path StringQMap.empty (fun stats (name,typ,channel,entries) ->
    validate_segmentation stats name typ channel entries) in *)
  let stats = StringQMap.fold stats [] (fun stats k v -> (v,k) :: stats) in
  Xlist.iter (Xlist.sort stats compare) (fun (v,k) -> Printf.printf "%d\t%s\n" v k);
  ()