validateTokenizer.ml 7.27 KB
(*
 *  ENIAM_NKJP, an interface for National Corpus of Polish (NKJP).
 *  Copyright (C) 2017 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
 *  Copyright (C) 2017 Institute of Computer Science Polish Academy of Sciences
 *
 *  This library is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *)

open ENIAMtokenizerTypes
open Xstd

let space = {empty_token_env with orth=" "; token=Symbol " "}
let query_beg = {empty_token_env with token=Interp "<query>"}
let query_end = {empty_token_env with token=Interp "</query>"}
let sencence_beg = {empty_token_env with token=Interp "<sentence>"}
let sencence_end = {empty_token_env with token=Interp "</sentence>"}
let clause_beg = {empty_token_env with token=Interp "<clause>"}
let clause_end = {empty_token_env with token=Interp "</clause>"}

type sent = SentBeg | SentEnd | Inside

let set_sent_end = function
    (_,beg,len,no_spaces,real_orth,orth,lemma,cat,interp) :: l,_ ->
      (SentEnd,beg,len,no_spaces,real_orth,orth,lemma,cat,interp) :: l
  | _ -> failwith "set_sent_end"

let flatten_sentences sentences =
  List.rev (Xlist.fold sentences [] (fun l (id_s,tokens,named_tokens) ->
    set_sent_end (Xlist.fold tokens (l,SentBeg) (fun (l,sent) (beg,len,no_spaces,real_orth,orth,lemma,cat,interp) ->
      (sent,beg,len,no_spaces,real_orth,orth,lemma,cat,interp) :: l, Inside))))

let make_token orth lemma cat interp =
  {empty_token_env with
         orth=orth;
         token=Lemma(lemma,cat,[Xlist.map interp (fun s -> [s])])}

let suffixes = StringSet.of_list ["by"; "ż"; "ń"; "że"; "%"; "BY"; "ś"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ]
(* let prefixes = StringSet.of_list [
  (*"\""; "-"; "("; "„"; "/"; "."; "+"; "«"; "''"; "»"; "["; "–"; "'";
  "’"; ":"; "“"; ","; ")";*) ""; ""; ""; ""; ""; ""; ] *)


let rec allign prev_orth prev_cat rev = function
    (SentBeg,0,_,_,_,orth,lemma,cat,interp) :: l ->
       allign orth cat ((make_token orth lemma cat interp) :: clause_beg :: sencence_beg :: query_beg :: rev) l
  | (_,0,_,_,_,orth,lemma,cat,interp) :: l -> failwith "allign"
  | (sent,beg,_,no_spaces,_,orth,lemma,cat,interp) :: l ->
       let rev =
         if no_spaces > 0 then space :: rev
         else if cat = "interp" || cat = "aglt" || prev_cat = "interp" || StringSet.mem suffixes orth (*|| StringSet.mem prefixes prev_orth*) then rev
         else (
         (* print_endline ("allign: " ^ prev_orth ^ " " ^ orth); *)
         space :: rev) in
       let rev = if sent = SentBeg then clause_beg :: sencence_beg :: rev else rev in
       let rev = (make_token orth lemma cat interp) :: rev in
       let rev = if sent = SentEnd then sencence_end :: clause_end :: rev else rev in
       allign orth cat rev l
  | [] -> List.rev (query_end :: rev)

let rec set_lengths n rev = function
    t :: l ->
       let len =
         if t.token = Interp "<query>" || t.token = Interp "</query>" then factor else
         Xlist.size (Xunicode.utf8_chars_of_utf8_string t.orth) * factor in
       set_lengths (n+len) ({t with beg=n; len=len; next=n+len} :: rev) l
  | [] -> List.rev rev

let render_paragraph tokens =
  String.concat "" (List.rev (Xlist.rev_map tokens (fun t -> t.orth)))

let rec get_next = function
    Token t -> t.next
  | Seq [] -> failwith "get_next"
  | Seq l -> get_next (List.hd (List.rev l))
  | Variant [] -> failwith "get_next"
  | Variant l -> get_next (List.hd l)

let rec match_tokens erev nrev rev = function
    et :: ets, nt :: nts ->
      let next = get_next et in
      if next = nt.next then
        match_tokens [] [] ((List.rev (et :: erev), List.rev (nt :: nrev)) :: rev) (ets,nts)
      else if next < nt.next then
        match_tokens (et :: erev) nrev rev (ets, nt :: nts)
      else match_tokens erev (nt :: nrev) rev (et :: ets, nts)
  | [],[] -> List.rev rev
  | _ -> failwith "match_tokens"

(*  let compare_token et t =
  et.orth=t.orth && et.beg=t.beg && et.len=t.len && et.next=t.next && et.token=t.token

let get_beg = function
    Token t -> t.beg
  | Seq [] -> failwith "get_beg"
  | Seq l -> get_beg (List.hd l)
  | Variant [] -> failwith "get_next"
  | Variant l -> get_beg (List.hd l)

let rec compare_tokens stats = function
    Token et :: ets, t :: ts ->
       if compare_token et t then compare_tokens stats (ets,ts) else (
       Printf.printf "%s\n%s\n\n" (ENIAMtokens.string_of_token_env et) (ENIAMtokens.string_of_token_env t);
       stats)
  | Variant l :: ets, ts -> failwith "compare_tokens 4"
  | Seq l :: ets, ts -> failwith "compare_tokens 3"
  | [], ts -> failwith "compare_tokens 2"
  | _, [] -> failwith "compare_tokens 1"

let rec get_subsequence_rec next rev = function
    t :: tokens -> if t.next = next then List.rev (t :: rev) else get_subsequence_rec next (t :: rev) tokens
  | [] -> failwith "get_subsequence_rec"

let get_subsequence beg next = function
    t :: tokens -> if t.beg = beg then get_subsequence_rec next [] (t :: tokens) else failwith "get_subsequence 2"
  | [] -> failwith "get_subsequence 1"

let compare_token stats tokens = function
    Token et :: ets, t :: ts ->
       if compare_token et t then compare_tokens stats (ets,ts) else (
       Printf.printf "%s\n%s\n\n" (ENIAMtokens.string_of_token_env et) (ENIAMtokens.string_of_token_env t);
       stats)
  | Variant l :: ets, ts -> failwith "compare_tokens 4"
  | Seq l :: ets, ts -> failwith "compare_tokens 3"
  | [], ts -> failwith "compare_tokens 2"
  | _, [] -> failwith "compare_tokens 1"

let rec compare_tokens stats tokens = function
    et :: ets ->
      let ts,tokens = get_subsequence (get_beg et) (get_next et) tokens in
      compare_token stats ts et
  | [] -> if tokens = [] then stats else failwith "compare_tokens 1"*)

let rec compare_tokens stats = function
    (ets,nts) :: l ->
       Xlist.iter ets (fun et -> Printf.printf "%s\n" (ENIAMtokens.string_of_tokens 0 et));
       Xlist.iter nts (fun nt -> Printf.printf "%s\n" (ENIAMtokens.string_of_token_env nt));
       print_endline "";
       compare_tokens stats l
  | [] -> stats

let validate stats name typ channel entries =
  (* if name = "120-2-900066" then ( *)
  print_endline name;
  Xlist.fold entries stats (fun stats (id_div,has_ne,paragraphs) ->
    Xlist.fold paragraphs stats (fun stats (paragraph,sentences) ->
      let tokens = flatten_sentences sentences in
      let tokens = allign "" "" [] tokens in
      let tokens = set_lengths 0 [] tokens in
      let paragraph = render_paragraph tokens in
      let tokens = remove_spaces [] tokens in
      let eniam_tokens = ENIAMtokenizer.parse paragraph in
      let l = match_tokens [] [] [] (eniam_tokens,tokens) in
      compare_tokens stats l))


let _ =
  let stats = ENIAM_NKJP.fold ENIAM_NKJP.nkjp_path StringQMap.empty (fun stats (name,typ,channel,entries) ->
    validate stats name typ channel entries) in
  ()