validateTokenizer.ml 8.14 KB
(*
 *  ENIAM_NKJP, an interface for National Corpus of Polish (NKJP).
 *  Copyright (C) 2017 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
 *  Copyright (C) 2017 Institute of Computer Science Polish Academy of Sciences
 *
 *  This library is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *)

open ENIAMtokenizerTypes
open Xstd

let space = {empty_token_env with orth=" "; token=Symbol " "}
let query_beg = {empty_token_env with token=Interp "<query>"}
let query_end = {empty_token_env with token=Interp "</query>"}
let sencence_beg = {empty_token_env with token=Interp "<sentence>"}
let sencence_end = {empty_token_env with token=Interp "</sentence>"}
let clause_beg = {empty_token_env with token=Interp "<clause>"}
let clause_end = {empty_token_env with token=Interp "</clause>"}

type sent = SentBeg | SentEnd | Inside

let set_sent_end = function
    (_,beg,len,no_spaces,real_orth,orth,lemma,cat,interp) :: l,_ ->
      (SentEnd,beg,len,no_spaces,real_orth,orth,lemma,cat,interp) :: l
  | _ -> failwith "set_sent_end"

let flatten_sentences sentences =
  List.rev (Xlist.fold sentences [] (fun l (id_s,tokens,named_tokens) ->
    set_sent_end (Xlist.fold tokens (l,SentBeg) (fun (l,sent) (beg,len,no_spaces,real_orth,orth,lemma,cat,interp) ->
      (sent,beg,len,no_spaces,real_orth,orth,lemma,cat,interp) :: l, Inside))))

let make_token orth lemma cat interp =
  {empty_token_env with
         orth=orth;
         token=Lemma(lemma,cat,[Xlist.map interp (fun s -> [s])])}

let suffixes = StringSet.of_list ["by"; "ż"; "ń"; "że"; "%"; "BY"; "ś"; "li"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ]
(* let prefixes = StringSet.of_list [
  (*"\""; "-"; "("; "„"; "/"; "."; "+"; "«"; "''"; "»"; "["; "–"; "'";
  "’"; ":"; "“"; ","; ")";*) ""; ""; ""; ""; ""; ""; ] *)

let is_space_required prev_orth prev_cat orth cat =
  if cat = "interp" || cat = "aglt" || prev_cat = "interp" || prev_cat = "" || StringSet.mem suffixes orth then false else (
  let prev_char = List.hd (List.rev (Xunicode.classified_chars_of_utf8_string prev_orth)) in
  let cur_char = List.hd (Xunicode.classified_chars_of_utf8_string orth) in
  match prev_char,cur_char with
    Xunicode.Sign a,Xunicode.Sign b -> (*print_endline ("is_space_required 1: " ^ prev_orth ^ " " ^ orth ^ " " ^ a ^ " " ^ b);*) true
  | _,Xunicode.Sign _ -> false
  | Xunicode.Sign _,_ -> false
  | Xunicode.Digit _,Xunicode.Digit _ -> true
  | Xunicode.Digit _,_ -> false
  | _,Xunicode.Digit _ -> false
  | Xunicode.Small _,Xunicode.Small _ -> true
  | Xunicode.ForeignSmall _,Xunicode.Small _ -> true
  | Xunicode.Capital _,Xunicode.Capital _ -> true
  | Xunicode.Small _,Xunicode.Capital _ -> true
  | Xunicode.Capital _,Xunicode.Small _ -> true
  | Xunicode.ForeignCapital _,Xunicode.Small _ -> true
  | a,b -> failwith ("is_space_required: " ^ prev_orth ^ " " ^ orth ^ " " ^ Xunicode.to_string a ^ " " ^ Xunicode.to_string b))

let rec allign prev_orth prev_cat rev = function
    (SentBeg,0,_,_,_,orth,lemma,cat,interp) :: l ->
       allign orth cat ((make_token orth lemma cat interp) :: clause_beg :: sencence_beg :: query_beg :: rev) l
  | (_,0,_,_,_,orth,lemma,cat,interp) :: l -> failwith "allign"
  | (sent,beg,_,no_spaces,_,orth,lemma,cat,interp) :: l ->
       let rev =
         if no_spaces > 0 then space :: rev else
         if is_space_required prev_orth prev_cat orth cat then space :: rev else rev in
       let rev = if sent = SentBeg then clause_beg :: sencence_beg :: rev else rev in
       let rev = (make_token orth lemma cat interp) :: rev in
       let rev = if sent = SentEnd then sencence_end :: clause_end :: rev else rev in
       allign orth cat rev l
  | [] -> List.rev (query_end :: rev)

let rec set_lengths n rev = function
    t :: l ->
       let len =
         if t.token = Interp "<query>" || t.token = Interp "</query>" then factor else
         Xlist.size (Xunicode.utf8_chars_of_utf8_string t.orth) * factor in
       set_lengths (n+len) ({t with beg=n; len=len; next=n+len} :: rev) l
  | [] -> List.rev rev

(* FIXME: poprawić interpretacje przecinka i innych znaków interpunkcyjnych *)
let rec set_special_tokens_lengths rev = function
    ({token=Interp "<sentence>"} as sent) :: ({token=Interp "<clause>"} as cl) :: t :: l ->
       let sent = {sent with len=1; next=sent.beg+1} in
       let cl = {cl with beg=sent.next; len=1; next=sent.next+1} in
       let t = {t with beg=t.beg+2; len=t.len-2} in
       set_special_tokens_lengths (Token t :: Token cl :: Token sent :: rev) l
  | ({orth="."; token=Lemma(".","interp",[[]])} as dot) :: ({token=Interp "</clause>"} as cl) :: {token=Interp "</sentence>"} :: l ->
       let cl = {cl with beg=dot.beg; len=20; next=dot.beg+20} in
       let dot = {dot with beg=cl.next; len=80; token= Interp "</sentence>"} in
       set_special_tokens_lengths (Token dot :: Token cl :: rev) l
  | t :: l -> set_special_tokens_lengths (Token t :: rev) l
  | [] -> List.rev rev

let render_paragraph tokens =
  String.concat "" (List.rev (Xlist.rev_map tokens (fun t -> t.orth)))

let rec get_next = function
    Token t -> t.next
  | Seq [] -> failwith "get_next"
  | Seq l -> get_next (List.hd (List.rev l))
  | Variant [] -> failwith "get_next"
  | Variant l -> get_next (List.hd l)

let make_seq  = function
    [] -> failwith "make_seq"
  | [t] -> t
  | l -> Seq l

let rec match_token_sequence erev nrev rev = function
    et :: ets, nt :: nts ->
      let enext = get_next et in
      let nnext = get_next nt in
      if enext = nnext then
        match_token_sequence [] [] ((List.rev (et :: erev), List.rev (nt :: nrev)) :: rev) (ets,nts)
      else if enext < nnext then
        match_token_sequence (et :: erev) nrev rev (ets, nt :: nts)
      else match_token_sequence erev (nt :: nrev) rev (et :: ets, nts)
  | [],[] -> Xlist.fold rev [] (fun l (et,nt) -> (make_seq et, make_seq nt) :: l)
  | _ -> failwith "match_token_sequence"

let rec compare_tokens = function
    Token et, Token nt ->
       et.orth = nt.orth && et.beg = nt.beg && et.len = nt.len && et.next = nt.next
  | et,Variant l ->
       Xlist.fold l true (fun b nt ->
         compare_tokens (et,nt) && b)
  | Variant l,nt ->
       Xlist.fold l false (fun b et ->
         compare_tokens (et,nt) || b)
  | Seq[et], nt -> compare_tokens (et,nt)
  | et, Seq[nt] -> compare_tokens (et,nt)
  | Seq(et::ets),Seq(nt::nts) -> if compare_tokens (et,nt) then compare_tokens (Seq ets,Seq nts) else false
  | _ -> false

let validate stats name typ channel entries =
  (* if name = "120-2-900066" then ( *)
  print_endline name;
  Xlist.fold entries stats (fun stats (id_div,has_ne,paragraphs) ->
    Xlist.fold paragraphs stats (fun stats (paragraph,sentences) ->
      let tokens = flatten_sentences sentences in
      let tokens = allign "" "" [] tokens in
      let paragraph = render_paragraph tokens in
      let tokens = set_lengths 0 [] tokens in
      let tokens = set_special_tokens_lengths [] tokens in
      let tokens = ENIAMpatterns.remove_spaces [] tokens in
      let eniam_tokens = ENIAMtokenizer.parse paragraph in
      let l = match_token_sequence [] [] [] (eniam_tokens,tokens) in
      Xlist.fold l stats (fun stats (eniam_token,nkjp_token) ->
        if compare_tokens (eniam_token,nkjp_token) then stats else (
          let s = Printf.sprintf "%s" (ENIAMtokens.string_of_tokens 0 eniam_token) in
          let t = Printf.sprintf "%s" (ENIAMtokens.string_of_tokens 0 nkjp_token) in
          Printf.printf "%s\n%s\n\n%!"  s t;
          StringQMap.add stats (s ^ "\n" ^ t)))))


(*let _ =
  let stats = ENIAM_NKJP.fold ENIAM_NKJP.nkjp_path StringQMap.empty (fun stats (name,typ,channel,entries) ->
    validate stats name typ channel entries) in
  ()*)