validateTokenizer.ml 34.4 KB

Edit Raw Blame History

(*
 *  ENIAM_NKJP, an interface for National Corpus of Polish (NKJP).
 *  Copyright (C) 2017 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
 *  Copyright (C) 2017 Institute of Computer Science Polish Academy of Sciences
 *
 *  This library is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *)

open ENIAMtokenizerTypes
open Xstd

(* Marker attached to every token of a flattened paragraph.
   [SentBeg] is given to the first token of a sentence, [Inside] to the
   following ones; [set_sent_end] turns the last token of a sentence into
   [SentEnd], or into [SentBegEnd] when that token was also the first one.
   [Space] is used only by the synthetic [space] token. *)
type sent = SentBeg | SentEnd | Inside | SentBegEnd | Space

(* Marks the head of a reversed token accumulator as the end of its sentence:
   [Inside] becomes [SentEnd] and [SentBeg] becomes [SentBegEnd].
   The argument is a pair whose second component is ignored.
   Fails with "set_sent_end" when the list is empty or its head carries any
   other marker. *)
let set_sent_end (rev_tokens, _) =
  match rev_tokens with
  | (Inside, beg, len, no_spaces, real_orth, orth, lemma, cat, interp) :: rest ->
      (SentEnd, beg, len, no_spaces, real_orth, orth, lemma, cat, interp) :: rest
  | (SentBeg, beg, len, no_spaces, real_orth, orth, lemma, cat, interp) :: rest ->
      (SentBegEnd, beg, len, no_spaces, real_orth, orth, lemma, cat, interp) :: rest
  | _ -> failwith "set_sent_end"

(* Overwrites the second field ([beg]) of the first token with 0 and leaves
   every other token untouched.
   Fails with "set_beg_as_zero" on an empty list. *)
let set_beg_as_zero tokens =
  match tokens with
  | [] -> failwith "set_beg_as_zero"
  | (sent, _, len, no_spaces, real_orth, orth, lemma, cat, interp) :: rest ->
      (sent, 0, len, no_spaces, real_orth, orth, lemma, cat, interp) :: rest

(* Concatenates the tokens of all sentences into a single list in their
   original order.  Each token is prefixed with a [sent] marker: the first
   token of a sentence gets [SentBeg], the following ones [Inside], and the
   last one is then rewritten by [set_sent_end].  Finally the [beg] field of
   the very first token is set to 0 by [set_beg_as_zero].
   Only the token list of each sentence triple is used. *)
let flatten_sentences sentences =
  (* Pushes one token onto the reversed accumulator; every token after the
     first one of a sentence is tagged [Inside]. *)
  let push_token (acc, marker) (beg, len, no_spaces, real_orth, orth, lemma, cat, interp) =
    (marker, beg, len, no_spaces, real_orth, orth, lemma, cat, interp) :: acc, Inside in
  let push_sentence acc (_id_s, tokens, _named_tokens) =
    set_sent_end (Xlist.fold tokens (acc, SentBeg) push_token) in
  let rev_tokens = Xlist.fold sentences [] push_sentence in
  set_beg_as_zero (List.rev rev_tokens)

(* Synthetic token (marker, orth, lemma, cat, interp) inserted by
   [simple_allign] wherever a space has to be rendered between two tokens. *)
let space = (Space, " ", " ", "sp", [])

(* Orths that [is_space_required] never separates from the preceding token.
   The empty string is kept as a member on purpose: the original list
   contained it (many times, as placeholders), and membership of "" makes
   [is_space_required] return [false] for a token with an empty orth. *)
let suffixes =
  StringSet.of_list [
    "by"; "ż"; "ń"; "że"; "%"; "BY"; "ś"; "li";
    "";
  ]
(* let prefixes = StringSet.of_list [
  (*"\""; "-"; "("; "„"; "/"; "."; "+"; "«"; "''"; "»"; "["; "–"; "'";
  "’"; ":"; "“"; ","; ")";*) ""; ""; ""; ""; ""; ""; ] *)

(* Decides whether a space has to be rendered between the previous token
   ([prev_orth], [prev_cat]) and the current one ([orth], [cat]).
   No space is required when either token has category "interp", when the
   current token has category "aglt", when there is no previous token
   ([prev_cat] is empty) or when [orth] belongs to [suffixes].
   Otherwise the decision depends on the class of the last character of
   [prev_orth] and of the first character of [orth].
   Fails with a message starting with "is_space_required: " for a pair of
   character classes that is not listed below. *)
let is_space_required prev_orth prev_cat orth cat =
  let never_separated =
    cat = "interp" || cat = "aglt" || prev_cat = "interp" || prev_cat = ""
    || StringSet.mem suffixes orth in
  if never_separated then false
  else begin
    let last_of_prev =
      List.hd (List.rev (Xunicode.classified_chars_of_utf8_string prev_orth)) in
    let first_of_cur =
      List.hd (Xunicode.classified_chars_of_utf8_string orth) in
    (* The order of the cases matters: the more specific pairs have to be
       tried before the wildcard cases that follow them. *)
    match last_of_prev, first_of_cur with
    | Xunicode.Sign _, Xunicode.Sign _ -> true
    | _, Xunicode.Sign _
    | Xunicode.Sign _, _ -> false
    | Xunicode.Digit _, Xunicode.Digit _ -> true
    | Xunicode.Digit _, _
    | _, Xunicode.Digit _ -> false
    | Xunicode.Small _, Xunicode.Small _
    | Xunicode.ForeignSmall _, Xunicode.Small _
    | Xunicode.Capital _, Xunicode.Capital _
    | Xunicode.Small _, Xunicode.Capital _
    | Xunicode.Capital _, Xunicode.Small _
    | Xunicode.ForeignCapital _, Xunicode.Small _ -> true
    | a, b ->
        failwith ("is_space_required: " ^ prev_orth ^ " " ^ orth ^ " "
                  ^ Xunicode.to_string a ^ " " ^ Xunicode.to_string b)
  end

(* Converts flattened tokens into (marker, orth, lemma, cat, interp) tuples,
   inserting the [space] token before a token whenever its [no_spaces] field
   is positive or [is_space_required] says so.
   [prev_orth] and [prev_cat] describe the previously emitted token and [rev]
   is the reversed output built so far.
   A token whose [beg] is 0 is emitted without any space check and must be
   marked [SentBeg] or [SentBegEnd]; any other marker fails with a message
   starting with "allign 1: ". *)
let rec simple_allign prev_orth prev_cat rev = function
  | [] -> List.rev rev
  | ((SentBeg | SentBegEnd) as sent, 0, _, _, _, orth, lemma, cat, interp) :: rest ->
      simple_allign orth cat ((sent, orth, lemma, cat, interp) :: rev) rest
  | (_, 0, _, _, _, orth, _, _, _) :: _ -> failwith ("allign 1: " ^ orth)
  | (sent, _, _, no_spaces, _, orth, lemma, cat, interp) :: rest ->
      (* [||] short-circuits, so [is_space_required] is consulted only when
         the corpus itself does not report a space. *)
      let needs_space =
        no_spaces > 0 || is_space_required prev_orth prev_cat orth cat in
      let rev = if needs_space then space :: rev else rev in
      simple_allign orth cat ((sent, orth, lemma, cat, interp) :: rev) rest

(* Rebuilds the paragraph text by concatenating the orths of all tokens
   (space tokens included) in order. *)
let render_paragraph tokens =
  let rev_orths = Xlist.rev_map tokens (fun (_, orth, _, _, _) -> orth) in
  String.concat "" (List.rev rev_orths)

(* Drops every token marked [Space] from the list, keeping the order of the
   remaining tokens.  [rev] is a reversed prefix that is prepended (in
   restored order) to the result; callers pass []. *)
let remove_spaces rev tokens =
  let keep_non_space acc tok =
    match tok with
    | (Space, _, _, _, _) -> acc
    | _ -> tok :: acc in
  List.rev (Xlist.fold tokens rev keep_non_space)

(* let rec simplify_eniam_tokens = function
    Token t -> if t.orth = "" then [] else [t.orth]
  | Seq l -> List.flatten (List.rev (Xlist.rev_map l simplify_eniam_tokens))
  | Variant l ->
      let l = Xlist.map l simplify_eniam_tokens in
      let set = Xlist.fold l StringSet.empty (fun set l ->
        StringSet.add set (String.concat "\t" l)) in
      (match StringSet.size set with
        0 -> failwith "simplify_eniam_tokens: empty Variant"
      | 1 -> List.hd l
      | _ -> failwith ("simplify_eniam_tokens: multiple interpretations\n" ^ String.concat "\n" (StringSet.to_list set))) *)

(* Result of matching ENIAM tokens against a list of NKJP tokens
   (see [match_orth_sequence]).
   [Matched rest]: the match succeeded and [rest] holds the NKJP tokens that
   were not consumed.
   [NotMatched (ets, rest)]: the match stopped; [ets] holds the ENIAM tokens
   and [rest] the NKJP tokens at the point of failure. *)
type matched =
    Matched of (sent * string * string * string * string list) list
  | NotMatched of tokens list * (sent * string * string * string * string list) list

(* Returns the surface text of an ENIAM token tree: the orth of a [Token],
   the concatenation of the parts of a [Seq], and the text of the first
   alternative of a [Variant].
   Fails with "get_tokens_orth" on an empty [Variant]. *)
let rec get_tokens_orth = function
  | Token t -> t.orth
  | Seq parts -> String.concat "" (Xlist.map parts get_tokens_orth)
  | Variant [] -> failwith "get_tokens_orth"
  | Variant (first :: _) -> get_tokens_orth first

(* Lists the orths of an ENIAM token tree for diagnostic output.
   Tokens with an empty orth are skipped.  The alternatives of a [Variant]
   are each rendered with "$" between orths and deduplicated; a single
   distinct rendering is returned as is, several are wrapped as
   "[ a | b ]" in one list element. *)
let rec string_of_eniam_token_orths_rec = function
  | Token {orth = ""; _} -> []
  | Token t -> [t.orth]
  | Seq parts ->
      List.flatten (Xlist.map parts string_of_eniam_token_orths_rec)
  | Variant alternatives ->
      let render alt =
        String.concat "$" (string_of_eniam_token_orths_rec alt) in
      let distinct =
        Xlist.fold alternatives StringSet.empty (fun acc alt ->
          StringSet.add acc (render alt)) in
      (match StringSet.to_list distinct with
       | [only] -> [only]
       | several -> ["[ " ^ String.concat " | " several ^ " ]"])

(* Renders a list of ENIAM tokens as their orths joined with "$". *)
let string_of_eniam_token_orths l =
  let orths = string_of_eniam_token_orths_rec (Seq l) in
  String.concat "$" orths

(* Same traversal as [string_of_eniam_token_orths_rec], but the pieces are
   joined with a quote-semicolon-quote separator, so that the caller can
   wrap the result in quotes and obtain the syntax of an OCaml string list
   (the format used by [match_and_combine] for its report lines). *)
let rec string_of_eniam_token_orths2_rec = function
  | Token {orth = ""; _} -> []
  | Token t -> [t.orth]
  | Seq parts ->
      List.flatten (Xlist.map parts string_of_eniam_token_orths2_rec)
  | Variant alternatives ->
      let render alt =
        String.concat "\";\"" (string_of_eniam_token_orths2_rec alt) in
      let distinct =
        Xlist.fold alternatives StringSet.empty (fun acc alt ->
          StringSet.add acc (render alt)) in
      (match StringSet.to_list distinct with
       | [only] -> [only]
       | several -> ["[ \"" ^ String.concat "\" | \"" several ^ "\" ]"])

(* Renders a list of ENIAM tokens as their orths joined with the
   quote-semicolon-quote separator. *)
let string_of_eniam_token_orths2 l =
  let orths = string_of_eniam_token_orths2_rec (Seq l) in
  String.concat "\";\"" orths

(* Renders a list of NKJP tokens as their orths separated by single spaces. *)
let string_of_nkjp_token_orths l =
  let orths = Xlist.map l (fun (_, orth, _, _, _) -> orth) in
  String.concat " " orths

(* Renders a list of NKJP tokens as their orths joined with the
   quote-semicolon-quote separator. *)
let string_of_nkjp_token_orths2 l =
  let orths = Xlist.map l (fun (_, orth, _, _, _) -> orth) in
  String.concat "\";\"" orths

(* [match_orth_sequence (et, nkjp)] matches one ENIAM token tree against a
   prefix of the NKJP token list, comparing orths only.
   - A token with an empty orth matches without consuming anything.
   - Any other token has to equal the orth of the first NKJP token.
   - A [Seq] matches its parts one after another.
   - A [Variant] tries every alternative against the same NKJP list; when
     several alternatives match, the result of the last matching one is
     returned.
   Returns [Matched rest] or [NotMatched (ets, rest)] (see [matched]). *)
let rec match_orth_sequence (et, nkjp) =
  match et, nkjp with
  | Token {orth = ""; _}, _ -> Matched nkjp
  | Token t, [] -> NotMatched ([Token t], [])
  | Token t, (_, orth, _, _, _) :: rest ->
      if t.orth = orth then Matched rest else NotMatched ([Token t], nkjp)
  | Seq parts, _ -> match_orth_sequence_list nkjp parts
  | Variant alternatives, _ ->
      let no_match = NotMatched ([Variant alternatives], nkjp) in
      Xlist.fold alternatives no_match (fun best alt ->
        (* Printf.printf "try match %s --- %s\n%!" (string_of_eniam_token_orths [alt]) (string_of_nkjp_token_orths nkjp); *)
        match match_orth_sequence (alt, nkjp) with
        | Matched _ as hit -> hit
        | NotMatched _ -> best)

(* [match_orth_sequence_list l ets] matches the ENIAM tokens [ets] one by one
   against the NKJP list [l].  On failure the ENIAM tokens that were not yet
   tried are appended to the ones reported by the failing match. *)
and match_orth_sequence_list l = function
  | [] -> Matched l
  | first :: pending ->
      (match match_orth_sequence (first, l) with
       | Matched rest -> match_orth_sequence_list rest pending
       | NotMatched (failed, rest) -> NotMatched (failed @ pending, rest))

(* let rec match_orth_sequence = function
    {orth=""} :: ets, l -> match_orth_sequence (ets,l)
  | Token t :: ets, (sent,orth,lemma,cat,interp) :: l ->
      if t.orth = orth then match_orth_sequence (ets,l) else
      false, Token t :: ets, (sent,orth,lemma,cat,interp) :: l
  | Seq s :: ets, l ->
     let b,ets1,l1 = match_orth_sequence (
  | [],[] -> true,[],[]
  | ets,l -> false,ets,l *)

(* Reads a tab-separated rule file.  Every row must have exactly two
   columns; each column is split on single spaces and the rows are returned
   as (first column, second column) pairs in file order.
   Fails with "load_rules" on a row with a different number of columns. *)
let load_rules filename =
  let parse_row acc = function
    | [e; n] -> (Xstring.split " " e, Xstring.split " " n) :: acc
    | _ -> failwith "load_rules" in
  List.rev (File.fold_tab filename [] parse_row)

(* Built-in rules for tokenization differences that involve spaces.
   Every entry is a triple (stat, e_pat, n_pat): [apply_rules] matches
   [n_pat] against the orths of the NKJP tokens and [e_pat] against the orths
   of the ENIAM tokens, and on success counts the occurrence under the key
   [stat].
   The last ten entries were present without a [stat] label, which made the
   list ill-typed (pairs mixed with triples).  They are tagged
   "unclassified" here so that they stay active and are easy to find in the
   output; NOTE(review): replace the tag with "both-correct",
   "eniam-correct" or "nkjp-correct" once each case has been judged. *)
let space_rules = [
  "nkjp-correct",["'99 - 26"],["'99";"-";"26"];
  "both-correct",["1 VII 1945"],["1";"VII";"1945"];
  "both-correct",["1 VII 1960"],["1";"VII";"1960"];
  "both-correct",["1,50 - 2"],["1";",";"50";"-";"2"];
  "both-correct",["10 - 17"],["10";"-";"17"];
  "both-correct",["10.00 – 15.00"],["10";".";"00";"–";"15";".";"00"];
  "both-correct",["10:10 - 13:15"],["10";":";"10";"-";"13";":";"15"];
  "nkjp-correct",["11 - 2001"],["11";"-";"2001"];
  "both-correct",["136 - 159"],["136";"-";"159"];
  "both-correct",["14 XII 1920"],["14";"XII";"1920"];
  "both-correct",["1500 - 800"],["1500";"-";"800"];
  "both-correct",["16 III 1945"],["16";"III";"1945"];
  "both-correct",["1973 - 1976"],["1973";"-";"1976"];
  "both-correct",["1981 - 1985"],["1981";"-";"1985"];
  "both-correct",["1998 - 1999"],["1998";"-";"1999"];
  "nkjp-correct",["2003 - 18"],["2003";"-";"18"];
  "both-correct",["22 - 24"],["22";"-";"24"];
  "both-correct",["24 I 1945"],["24";"I";"1945"];
  "both-correct",["24 III 1945"],["24";"III";"1945"];
  "both-correct",["24 VIII 1985"],["24";"VIII";"1985"];
  "both-correct",["25 - 30"],["25";"-";"30"];
  "eniam-correct",["261";"ha"],["261 ha"];
  "both-correct",["27 I 1945"],["27";"I";"1945"];
  "both-correct",["29 IV 1863"],["29";"IV";"1863"];
  "nkjp-correct",["3 - 2";"+";"4"],["3";"-";"2";"+";"4"];
  "both-correct",["3 - 4"],["3";"-";"4"];
  "both-correct",["300 - 400"],["300";"-";"400"];
  "both-correct",["31 I 1945"],["31";"I";"1945"];
  "nkjp-correct",["31";",";"6"],["31, 6"];
  "eniam-correct",["396 093"],["396";"093"];
  "both-correct",["4 VIII 1904"],["4";"VIII";"1904"];
  "both-correct",["5 - 8"],["5";"-";"8"];
  "eniam-correct",["510 256 732"],["510";"256";"732"];
  "eniam-correct",["517 193"],["517";"193"];
  "nkjp-correct",["52-52";"c"],["52";"-";"52c"];
  "both-correct",["6 - 11"],["6";"-";"11"];
  "nkjp-correct",["6 - 2"],["6";"-";"2"];
  "both-correct",["6 - 6,5"],["6";"-";"6,5"];
  "both-correct",["8.00 - 14.00"],["8";".";"00";"-";"14";".";"00"];
  "both-correct",["8.00 - 16.00"],["8";".";"00";"-";"16";".";"00"];
  "both-correct",["9 - 11"],["9";"-";"11"];
  "both-correct",["9 IV 1241"],["9";"IV";"1241"];
  "nkjp-correct",["K";"6-3 400"],["K6-3";"400"];
  "both-correct",["XVI - XVIII"],["XVI";"-";"XVIII"];
  "nkjp-correct",["ai";"pi";"si";"si"],["ai pi si si"];
  "nkjp-correct",["ce";"o";"dwa"],["ce o dwa"];
  "nkjp-correct",["co";"najmniej"],["co najmniej"];
  "nkjp-correct",["co";"najwyżej"],["co najwyżej"];
  "nkjp-correct",["dżi";"pi";"es"],["dżi pi es"];
  "both-correct",["m";"2"],["m 2"];
  "both-correct",["m";"3"],["m 3"];
  "both-correct",["m.";"in."],["m. in."];
  "nkjp-correct",["te";"fał";"en"],["te fał en"];
  "nkjp-correct",["te";"fał";"enu"],["te fał enu"];
  "nkjp-correct",["te";"fał";"pe"],["te fał pe"];
  "nkjp-correct",["techend";"trejt";"loj";"kropka";"bloks";"pot";"kom"],["techend trejt loj kropka bloks pot kom"];
  "nkjp-correct",["tik";"taka"],["tik taka"];
  "nkjp-correct",["w";"w";"w";"polskie";"radio";"euro";"kropka";"pe";"el"],["w w w polskie radio euro kropka pe el"];
  "nkjp-correct",["à";"la"],["à la"];
  "nkjp-correct",["à";"propos"],["à propos"];
  "both-correct",["m.";"in."],["m. in";"."];
  "nkjp-correct",["PM";"63"],["PM 63"];
  "both-correct",["R";".";"P";"."],["R. P."];
  "nkjp-correct",["1950 - 54.547"],["1950";"-";"54.547"];
  "nkjp-correct",["1950 - 82.756"],["1950";"-";"82.756"];
  "both-correct",["0,8 - 2,0"],["0,8";"-";"2,0"];
  "both-correct",["100 - 200"],["100";"-";"200"];
  "both-correct",["15 - 17"],["15";"-";"17"];
  "nkjp-correct",["2004 - 17,5"],["2004";"-";"17,5"];
  "nkjp-correct",["2005 - 17"],["2005";"-";"17"];
  "eniam-correct",["30";"m"],["30 m"];
  "nkjp-correct",["K";"6-2 400"],["K6-2";"400"];
  "nkjp-correct",["WIG";"20"],["WIG 20"];
  "eniam-correct",["Z";"DALA"],["Z DALA"];
  "nkjp-correct",["ha";"de"],["ha de"];
  "nkjp-correct",["o";"em"],["o em"];
  "nkjp-correct",["pe";"ka";"o"],["pe ka o"];
  "nkjp-correct",["pe";"pe";"en"],["pe pe en"];
  "both-correct",["w";".";"c";"."],["w. c";"."];
  "nkjp-correct",["jor";"self"],["jor self"];
(*  "both-correct",["655 tys."],["655";"tys";"."];
  "both-correct",["100 tys."],["100";"tys";"."];
  "both-correct",["1,4 tys."],["1,4";"tys";"."];
  "both-correct",["2,94 mln"],["2,94";"mln"];
  "both-correct",["900 tys."],["900";"tys";"."];
  "both-correct",["9,5 tys."],["9,5";"tys";"."];
  "both-correct",["800 tys."],["800";"tys";"."];
  "both-correct",["800 mln"],["800";"mln"];
  "both-correct",["80 tys."],["80";"tys";"."];
  "both-correct",["8 tys."],["8";"tys";"."];
  "both-correct",["750 tys."],["750";"tys";"."];
  "both-correct",["75 tys."],["75";"tys";"."];
  "both-correct",["70-75 tys."],["70";"-";"75";"tys";"."];*)
  "both-correct",["7.30 - 15.30"],["7";".";"30";"-";"15";".";"30"];
(*  "both-correct",["7,5 tys."],["7,5";"tys";"."];
  "both-correct",["7,3 mld"],["7,3";"mld"];
  "both-correct",["65 mln"],["65";"mln"];
  "both-correct",["600 tys."],["600";"tys";"."];
  "both-correct",["600 mld"],["600";"mld"];
  "both-correct",["6 tys."],["6";"tys";"."];
  "both-correct",["6 mln"],["6";"mln"];
  "both-correct",["550 tys."],["550";"tys";"."];
  "both-correct",["550 mln"],["550";"mln"];
  "both-correct",["530 mln"],["530";"mln"];
  "both-correct",["500 tys."],["500";"tys";"."];
  "both-correct",["50 tys."],["50";"tys";"."];
  "both-correct",["50 mln"],["50";"mln"];*)
  (* Entries below had no label in the original list; see the header. *)
  "unclassified",["(032) 51 30 86"],["(";"032";")";"51";"30";"86"];
  "unclassified",["0-46 855-45-26"],["0-46";"855-45-26"];
  "unclassified",["02-651"],["02";"-";"651"];
  "unclassified",["10-11 mld"],["10";"-";"11";"mld"];
  "unclassified",["1:100";"000"],["1";":";"100 000"];
  "unclassified",["2,5-3 mln"],["2,5";"-";"3";"mln"];
  "unclassified",["22.12. - 20.01"],["22";".";"12";".";"-";"20";".";"01"];
  "unclassified",["30-40 mln"],["30";"-";"40";"mln"];
  "unclassified",["40-50 tys."],["40";"-";"50";"tys";"."];
  "unclassified",["70-75 tys."],["70";"-";"75";"tys";"."];

  ]

(* All rules tried by [apply_rules], in order: the rules loaded from the
   three data files (each tagged with the category named after its file)
   followed by the built-in [space_rules].
   The files are read when this module is initialised.  The resulting list
   does not depend on which file is read first. *)
let rules =
  let tagged stat path =
    Xlist.map (load_rules path) (fun (e, n) -> stat, e, n) in
  let nkjp_correct = tagged "nkjp-correct" "data/nkjp-correct.tab" in
  let eniam_correct = tagged "eniam-correct" "data/eniam-correct.tab" in
  let both_correct = tagged "both-correct" "data/both-correct.tab" in
  both_correct @ eniam_correct @ nkjp_correct @ space_rules

(* [match_n_pat (pat, tokens)] checks that the orths (second field) of the
   first tokens are exactly the strings of [pat], in order, and returns the
   tokens that follow the matched prefix.
   Raises [Not_found] when an orth differs or the tokens run out before the
   pattern does. *)
let rec match_n_pat (pat, tokens) =
  match pat, tokens with
  | [], rest -> rest
  | _ :: _, [] -> raise Not_found
  | expected :: pat_rest, (_, orth, _, _, _) :: rest ->
      if expected = orth then match_n_pat (pat_rest, rest)
      else raise Not_found

(* [match_e_pat pat et] consumes from the front of [pat] the orths produced
   by the ENIAM token tree [et] and returns the part of [pat] that is left.
   - A token with an empty orth consumes nothing.
   - Any other token has to equal the first string of [pat].
   - A [Seq] consumes its parts one after another.
   - A [Variant] tries every alternative; the alternatives that match are
     grouped by the length of the pattern they leave.  Exactly one distinct
     length is accepted; more than one fails with "match_e_pat".
   Raises [Not_found] when the tree does not match the pattern. *)
let rec match_e_pat pat = function
  | Token {orth = ""; _} -> pat
  | Token t ->
      (match pat with
       | [] -> raise Not_found
       | expected :: pat_rest ->
           if t.orth = expected then pat_rest else raise Not_found)
  | Seq parts -> Xlist.fold parts pat match_e_pat
  | Variant alternatives ->
      let remainders =
        Xlist.fold alternatives [] (fun acc alt ->
          try match_e_pat pat alt :: acc with Not_found -> acc) in
      (* Keyed by length, so remainders of equal length collapse to one. *)
      let by_length =
        Xlist.fold remainders IntMap.empty (fun map remainder ->
          IntMap.add map (Xlist.size remainder) remainder) in
      let candidates =
        IntMap.fold by_length [] (fun acc _ remainder -> remainder :: acc) in
      (match candidates with
       | [] -> raise Not_found
       | [remainder] -> remainder
       | _ -> failwith "match_e_pat")

(* Consumes the whole pattern [pat] from the front of the ENIAM token list
   [ets] using [match_e_pat] and returns the tokens that were not needed.
   Raises [Not_found] when the tokens run out before the pattern does or a
   token does not match. *)
let rec match_e_pat_list pat ets =
  match pat, ets with
  | [], _ -> ets
  | _, [] -> raise Not_found
  | _, first :: ets_rest -> match_e_pat_list (match_e_pat pat first) ets_rest

(* Tries the rules in order and applies the first one whose NKJP pattern
   matches a prefix of [l] and whose ENIAM pattern matches a prefix of [ets].
   Returns the statistics with the category of that rule counted once more,
   together with the unconsumed ENIAM and NKJP tokens.
   Raises [Not_found] when no rule applies. *)
let rec apply_rules stats ets l = function
  | [] -> raise Not_found
  | (stat, e_pat, n_pat) :: remaining ->
      let outcome =
        try
          let l_rest = match_n_pat (n_pat, l) in
          let ets_rest = match_e_pat_list e_pat ets in
          Some (StringQMap.add stats stat, ets_rest, l_rest)
        with Not_found -> None in
      (match outcome with
       | Some result -> result
       | None -> apply_rules stats ets l remaining)

(* Collects the shortest non-empty run of ENIAM tokens and the shortest run
   of NKJP tokens whose concatenated orths have the same length in bytes.
   [e_pref] / [n_pref] are the texts gathered so far, [e_rev] / [n_rev] the
   gathered tokens in reverse order, [ets] / [l] the remaining input.
   Returns (gathered ENIAM tokens, gathered NKJP tokens, remaining ENIAM
   tokens, remaining NKJP tokens).  When one side runs out before the
   lengths agree, everything that is left is put into the gathered lists and
   both remainders are empty. *)
let rec combine e_pref n_pref e_rev n_rev ets l =
  let e_len = String.length e_pref in
  let n_len = String.length n_pref in
  if e_len < n_len then
    (* The ENIAM side is behind: take one more ENIAM token. *)
    (match ets with
     | [] -> List.rev e_rev, List.rev n_rev @ l, [], []
     | t :: ets_rest ->
         combine (e_pref ^ get_tokens_orth t) n_pref (t :: e_rev) n_rev ets_rest l)
  else if e_len > n_len then
    (* The NKJP side is behind: take one more NKJP token. *)
    (match l with
     | [] -> List.rev e_rev @ ets, List.rev n_rev, [], []
     | ((_, orth, _, _, _) as tok) :: l_rest ->
         combine e_pref (n_pref ^ orth) e_rev (tok :: n_rev) ets l_rest)
  else if e_pref = "" then
    (* Nothing gathered yet: start with one token from each side. *)
    (match ets, l with
     | [], _ -> List.rev e_rev, List.rev n_rev @ l, [], []
     | _, [] -> List.rev e_rev @ ets, List.rev n_rev, [], []
     | t :: ets_rest, ((_, orth, _, _, _) as tok) :: l_rest ->
         combine (get_tokens_orth t) orth (t :: e_rev) (tok :: n_rev) ets_rest l_rest)
  else List.rev e_rev, List.rev n_rev, ets, l

(* Handles the case where the first ENIAM token has an orth of the form
   "N tys.", "N tys", "N mln" or "N mld" and the NKJP list starts with the
   same parts as separate tokens ("N", the abbreviation and, for "tys.",
   the dot).  On success the occurrence is counted under the key "tys."
   (for all four abbreviations) and the remaining ENIAM and NKJP tokens are
   returned.
   Raises [Not_found] in every other case. *)
let match_tys stats ets l =
  match ets with
  | [] -> raise Not_found
  | first :: ets_rest ->
      let accept number orth l_rest =
        if number = orth then StringQMap.add stats "tys.", ets_rest, l_rest
        else raise Not_found in
      (match Xstring.split " " (get_tokens_orth first), l with
       | [a; "tys."], (_, b, _, _, _) :: (_, "tys", _, _, _) :: (_, ".", _, _, _) :: l_rest ->
           accept a b l_rest
       | [a; "tys"], (_, b, _, _, _) :: (_, "tys", _, _, _) :: l_rest ->
           accept a b l_rest
       | [a; "mln"], (_, b, _, _, _) :: (_, "mln", _, _, _) :: l_rest ->
           accept a b l_rest
       | [a; "mld"], (_, b, _, _, _) :: (_, "mld", _, _, _) :: l_rest ->
           accept a b l_rest
       | _ -> raise Not_found)

(* Compares the ENIAM tokens [ets] with the NKJP tokens [l] and records every
   difference in [stats].
   At each point where the orths stop matching, the following are tried in
   order: [match_tys], then [apply_rules] with [rules], and finally
   [combine], whose pair of differing runs is recorded in the syntax of a
   rule without a category.  Matching then resumes after the difference.
   When the ENIAM tokens are exhausted but NKJP tokens remain, a
   "match_and_combine: " entry with [name], the leftover orths and
   [paragraph] is recorded. *)
let rec match_and_combine name paragraph stats ets l =
  match match_orth_sequence_list l ets with
  | Matched [] -> stats
  | Matched leftover ->
      StringQMap.add stats
        ("match_and_combine: " ^ name ^ "\t" ^ string_of_nkjp_token_orths leftover ^ "\t" ^ paragraph)
  | NotMatched (ets, l) ->
      let known_difference =
        try Some (match_tys stats ets l)
        with Not_found ->
          (try Some (apply_rules stats ets l rules) with Not_found -> None) in
      let stats, ets, l =
        match known_difference with
        | Some progress -> progress
        | None ->
            let e_tokens, n_tokens, ets, l = combine "" "" [] [] ets l in
            (* Alternative report keys used during development:
               string_of_eniam_token_orths e_tokens ^ "\t" ^ string_of_nkjp_token_orths n_tokens ^ "\t" ^ name
               string_of_eniam_token_orths e_tokens ^ "\t" ^ string_of_nkjp_token_orths n_tokens ^ "\t" ^ paragraph *)
            let report =
              "[\"" ^ string_of_eniam_token_orths2 e_tokens ^ "\"],[\""
              ^ string_of_nkjp_token_orths2 n_tokens ^ "\"];" in
            StringQMap.add stats report, ets, l in
      match_and_combine name paragraph stats ets l


(* Validates the ENIAM tokenizer on one NKJP text.
   The text name is written to stderr.  For every paragraph the NKJP tokens
   are flattened and aligned, the paragraph text is rebuilt from them (the
   paragraph string stored in the corpus entry is not used), that text is
   tokenized with [ENIAMtokenizer.parse], and the two tokenizations are
   compared with [match_and_combine].
   [typ] and [channel] are accepted for the callback interface and ignored.
   Returns the updated statistics. *)
let validate_segmentation stats name typ channel entries =
  prerr_endline name;
  let check_paragraph stats (_paragraph, sentences) =
    let aligned = simple_allign "" "" [] (flatten_sentences sentences) in
    let rendered = render_paragraph aligned in
    (* Printf.printf "rend:\t%s\n" rendered; *)
    let nkjp_tokens = remove_spaces [] aligned in
    let eniam_tokens = ENIAMtokenizer.parse rendered in
    match_and_combine name rendered stats eniam_tokens nkjp_tokens in
  Xlist.fold entries stats (fun stats (_id_div, _has_ne, paragraphs) ->
    Xlist.fold paragraphs stats check_paragraph)

(*let eniam_correct = StringSet.of_list (File.load_lines "data/eniam-correct.tab")
let nkjp_correct = StringSet.of_list (File.load_lines "data/nkjp-correct.tab")

let space = {empty_token_env with orth=" "; token=Symbol " "}
let query_beg = {empty_token_env with token=Interp "<query>"}
let query_end = {empty_token_env with token=Interp "</query>"}
let sencence_beg = {empty_token_env with token=Interp "<sentence>"}
let sencence_end = {empty_token_env with token=Interp "</sentence>"}
let clause_beg = {empty_token_env with token=Interp "<clause>"}
let clause_end = {empty_token_env with token=Interp "</clause>"}


let set_sent_end = function
    (Inside,beg,len,no_spaces,real_orth,orth,lemma,cat,interp) :: l,_ ->
      (SentEnd,beg,len,no_spaces,real_orth,orth,lemma,cat,interp) :: l
  | (SentBeg,beg,len,no_spaces,real_orth,orth,lemma,cat,interp) :: l,_ ->
      (SentBegEnd,beg,len,no_spaces,real_orth,orth,lemma,cat,interp) :: l
  | _ -> failwith "set_sent_end"

let set_beg_as_zero = function
    (sent,_,len,no_spaces,real_orth,orth,lemma,cat,interp) :: l ->
      (sent,0,len,no_spaces,real_orth,orth,lemma,cat,interp) :: l
  | [] -> failwith "set_beg_as_zero"

let flatten_sentences sentences =
  List.rev (Xlist.fold sentences [] (fun l (id_s,tokens,named_tokens) ->
    set_sent_end (Xlist.fold tokens (l,SentBeg) (fun (l,sent) (beg,len,no_spaces,real_orth,orth,lemma,cat,interp) ->
      (sent,beg,len,no_spaces,real_orth,orth,lemma,cat,interp) :: l, Inside))))

let make_token orth lemma cat interp =
  {empty_token_env with
         orth=orth;
         token=Lemma(lemma,cat,[Xlist.map interp (fun s -> [s])])}

let suffixes = StringSet.of_list ["by"; "ż"; "ń"; "że"; "%"; "BY"; "ś"; "li"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ]
(* let prefixes = StringSet.of_list [
  (*"\""; "-"; "("; "„"; "/"; "."; "+"; "«"; "''"; "»"; "["; "–"; "'";
  "’"; ":"; "“"; ","; ")";*) ""; ""; ""; ""; ""; ""; ] *)

let is_space_required prev_orth prev_cat orth cat =
  if cat = "interp" || cat = "aglt" || prev_cat = "interp" || prev_cat = "" || StringSet.mem suffixes orth then false else (
  let prev_char = List.hd (List.rev (Xunicode.classified_chars_of_utf8_string prev_orth)) in
  let cur_char = List.hd (Xunicode.classified_chars_of_utf8_string orth) in
  match prev_char,cur_char with
    Xunicode.Sign a,Xunicode.Sign b -> (*print_endline ("is_space_required 1: " ^ prev_orth ^ " " ^ orth ^ " " ^ a ^ " " ^ b);*) true
  | _,Xunicode.Sign _ -> false
  | Xunicode.Sign _,_ -> false
  | Xunicode.Digit _,Xunicode.Digit _ -> true
  | Xunicode.Digit _,_ -> false
  | _,Xunicode.Digit _ -> false
  | Xunicode.Small _,Xunicode.Small _ -> true
  | Xunicode.ForeignSmall _,Xunicode.Small _ -> true
  | Xunicode.Capital _,Xunicode.Capital _ -> true
  | Xunicode.Small _,Xunicode.Capital _ -> true
  | Xunicode.Capital _,Xunicode.Small _ -> true
  | Xunicode.ForeignCapital _,Xunicode.Small _ -> true
  | a,b -> failwith ("is_space_required: " ^ prev_orth ^ " " ^ orth ^ " " ^ Xunicode.to_string a ^ " " ^ Xunicode.to_string b))

let rec allign prev_orth prev_cat rev = function
    (SentBeg,0,_,_,_,orth,lemma,cat,interp) :: l ->
       allign orth cat ((make_token orth lemma cat interp) :: clause_beg :: sencence_beg :: query_beg :: rev) l
  | (SentBegEnd,0,_,_,_,orth,lemma,cat,interp) :: l ->
       allign orth cat (List.rev [query_beg;sencence_beg;clause_beg;make_token orth lemma cat interp;clause_end;sencence_end]) l
  | (_,0,_,_,_,orth,lemma,cat,interp) :: l -> failwith ("allign 1: " ^ orth)
  | (sent,beg,_,no_spaces,_,orth,lemma,cat,interp) :: l ->
       let rev =
         if no_spaces > 0 then space :: rev else
         if is_space_required prev_orth prev_cat orth cat then space :: rev else rev in
       if sent = SentBegEnd then
         let rev = (List.rev [sencence_beg;clause_beg;make_token orth lemma cat interp;clause_end;sencence_end]) @ rev in
         allign orth cat rev l
       else
       let rev = if sent = SentBeg then clause_beg :: sencence_beg :: rev else rev in
       let rev = (make_token orth lemma cat interp) :: rev in
       let rev = if sent = SentEnd then sencence_end :: clause_end :: rev else rev in
       allign orth cat rev l
  | [] -> List.rev (query_end :: rev)

let rec set_lengths n rev = function
    t :: l ->
       let len =
         if t.token = Interp "<query>" || t.token = Interp "</query>" then factor else
         Xlist.size (Xunicode.utf8_chars_of_utf8_string t.orth) * factor in
       set_lengths (n+len) ({t with beg=n; len=len; next=n+len} :: rev) l
  | [] -> List.rev rev

(* FIXME: fix the interpretation of the comma and other punctuation marks *)
let rec set_special_tokens_lengths rev = function
    ({token=Interp "<sentence>"} as sent) :: ({token=Interp "<clause>"} as cl) :: t :: l ->
       let sent = {sent with len=1; next=sent.beg+1} in
       let cl = {cl with beg=sent.next; len=1; next=sent.next+1} in
       let t = {t with beg=t.beg+2; len=t.len-2} in
       set_special_tokens_lengths (Token t :: Token cl :: Token sent :: rev) l
  | ({orth="."; token=Lemma(".","interp",[[]])} as dot) :: ({token=Interp "</clause>"} as cl) :: {token=Interp "</sentence>"} :: l ->
       let cl = {cl with beg=dot.beg; len=20; next=dot.beg+20} in
       let dot = {dot with beg=cl.next; len=80; token= Interp "</sentence>"} in
       set_special_tokens_lengths (Token dot :: Token cl :: rev) l
  | t :: l -> set_special_tokens_lengths (Token t :: rev) l
  | [] -> List.rev rev

let render_paragraph tokens =
  String.concat "" (List.rev (Xlist.rev_map tokens (fun t -> t.orth)))

let rec get_next = function
    Token t -> t.next
  | Seq [] -> failwith "get_next"
  | Seq l -> get_next (List.hd (List.rev l))
  | Variant [] -> failwith "get_next"
  | Variant l -> get_next (List.hd l)

let rec get_beg = function
    Token t -> t.beg
  | Seq [] -> failwith "get_beg"
  | Seq l -> get_beg (List.hd l)
  | Variant [] -> failwith "get_beg"
  | Variant l -> get_beg (List.hd l)

let make_seq  = function
    [] -> failwith "make_seq"
  | [t] -> t
  | l -> Seq l

let rec match_token_sequence erev nrev rev = function
    et :: ets, nt :: nts ->
      let enext = get_next et in
      let nnext = get_next nt in
      if enext = nnext then
        match_token_sequence [] [] ((List.rev (et :: erev), List.rev (nt :: nrev)) :: rev) (ets,nts)
      else if enext < nnext then
        match_token_sequence (et :: erev) nrev rev (ets, nt :: nts)
      else match_token_sequence erev (nt :: nrev) rev (et :: ets, nts)
  | [],[] -> Xlist.fold rev [] (fun l (et,nt) -> (make_seq et, make_seq nt) :: l)
  | ets,nts ->
      let s = Printf.sprintf "%s" (ENIAMtokens.string_of_tokens 0 (Seq ets)) in
      let t = Printf.sprintf "%s" (ENIAMtokens.string_of_tokens 0 (Seq nts)) in
      (*failwith*)print_endline (Printf.sprintf "match_token_sequence: %s\n\n%s\n" s t); []

let rec compare_tokens = function
    Token et, Token nt ->
       et.orth = nt.orth && et.beg = nt.beg && et.len = nt.len && et.next = nt.next
  | et,Variant l ->
       Xlist.fold l true (fun b nt ->
         compare_tokens (et,nt) && b)
  | Variant l,nt ->
       Xlist.fold l false (fun b et ->
         compare_tokens (et,nt) || b)
  | Seq[et], nt -> compare_tokens (et,nt)
  | et, Seq[nt] -> compare_tokens (et,nt)
  | Seq(et::ets),Seq(nt::nts) -> if compare_tokens (et,nt) then compare_tokens (Seq ets,Seq nts) else false
  | _ -> false

let rec shift_token_rec beg = function
    Token t -> Token{t with beg=t.beg-beg; next=t.next-beg}
  | Seq l -> Seq(Xlist.map l (shift_token_rec beg))
  | Variant l -> Variant(Xlist.map l (shift_token_rec beg))

let shift_token t =
  let beg = get_beg t in
  shift_token_rec beg t

let string_of_tokens_complete eniam_token nkjp_token =
  let s = ENIAMtokens.string_of_tokens 0 (shift_token eniam_token) in
  let t = ENIAMtokens.string_of_tokens 0 (shift_token nkjp_token) in
  s ^ "\n" ^ t

let rec string_of_tokens_simple = function
    Token t -> if t.orth = "" then ENIAMtokens.get_orth t.token(*failwith "string_of_tokens_simple"*) else t.orth
  | Seq l -> String.concat " " (Xlist.map l string_of_tokens_simple)
  | Variant l ->
      (match StringSet.to_list (StringSet.of_list (Xlist.map l string_of_tokens_simple)) with
        [] -> failwith "string_of_tokens_simple"
      | [s] -> s
      | l -> "[" ^ String.concat "; " l ^ "]")

let string_of_tokens_simple eniam_token nkjp_token =
  try
    string_of_tokens_simple eniam_token ^ " <---> " ^
    string_of_tokens_simple nkjp_token
  with _ -> "EMPTY ORTH"

let validate addition_fun stats name typ channel entries =
  print_endline name;
  Xlist.fold entries stats (fun stats (id_div,has_ne,paragraphs) ->
    (* if id_div = 3 then *)
    Xlist.fold paragraphs stats (fun stats (paragraph,sentences) ->
      (* Printf.printf "%d\t%s\n" id_div paragraph; *)
      let tokens = flatten_sentences sentences in
      let tokens = allign "" "" [] (set_beg_as_zero tokens) in
      let paragraph = render_paragraph tokens in
      (* Printf.printf "rend:\t%s\n" paragraph; *)
      let tokens = set_lengths 0 [] tokens in
      let tokens = set_special_tokens_lengths [] tokens in
      let tokens = ENIAMpatterns.remove_spaces [] tokens in
      let eniam_tokens = ENIAMtokenizer.parse paragraph in
      (* Printf.printf "eniam_tokens: %s\n" (ENIAMtokens.string_of_tokens 0 (Seq eniam_tokens));
      Printf.printf "tokens: %s\n" (ENIAMtokens.string_of_tokens 0 (Seq tokens)); *)
      let l = match_token_sequence [] [] [] (eniam_tokens,tokens) in
      Xlist.fold l stats (fun stats (eniam_token,nkjp_token) ->
        if compare_tokens (eniam_token,nkjp_token) then stats else
        if StringSet.mem eniam_correct (string_of_tokens_simple eniam_token nkjp_token) then stats else
        if StringSet.mem nkjp_correct (string_of_tokens_simple eniam_token nkjp_token) then stats else
          StringQMap.add stats (addition_fun eniam_token nkjp_token)
        (*
          let s = Printf.sprintf "%s" (ENIAMtokens.string_of_tokens 0 (shift_token eniam_token)) in
          let t = Printf.sprintf "%s" (ENIAMtokens.string_of_tokens 0 (shift_token nkjp_token)) in
          (* Printf.printf "%s\n%s\n\n%!" s t; *)
          StringQMap.add stats (s ^ "\n" ^ t)*))) (*else stats*))

let validate_sementation addition_fun stats name typ channel entries =
  print_endline name;
  Xlist.fold entries stats (fun stats (id_div,has_ne,paragraphs) ->
    (* if id_div = 3 then *)
    Xlist.fold paragraphs stats (fun stats (paragraph,sentences) ->
      (* Printf.printf "%d\t%s\n" id_div paragraph; *)
      let tokens = flatten_sentences sentences in
      let tokens = allign "" "" [] (set_beg_as_zero tokens) in
      let paragraph = render_paragraph tokens in
      (* Printf.printf "rend:\t%s\n" paragraph; *)
      let tokens = set_lengths 0 [] tokens in
      let tokens = set_special_tokens_lengths [] tokens in
      let tokens = ENIAMpatterns.remove_spaces [] tokens in
      let eniam_tokens = ENIAMtokenizer.parse paragraph in
      (* Printf.printf "eniam_tokens: %s\n" (ENIAMtokens.string_of_tokens 0 (Seq eniam_tokens));
      Printf.printf "tokens: %s\n" (ENIAMtokens.string_of_tokens 0 (Seq tokens)); *)
      let l = match_token_orth_sequence [] [] [] (eniam_tokens,tokens) in
      Xlist.fold l stats (fun stats (eniam_token,nkjp_token) ->
        if compare_tokens (eniam_token,nkjp_token) then stats else
        if StringSet.mem eniam_correct (string_of_tokens_simple eniam_token nkjp_token) then stats else
        if StringSet.mem nkjp_correct (string_of_tokens_simple eniam_token nkjp_token) then stats else
          StringQMap.add stats (addition_fun eniam_token nkjp_token)
        (*
          let s = Printf.sprintf "%s" (ENIAMtokens.string_of_tokens 0 (shift_token eniam_token)) in
          let t = Printf.sprintf "%s" (ENIAMtokens.string_of_tokens 0 (shift_token nkjp_token)) in
          (* Printf.printf "%s\n%s\n\n%!" s t; *)
          StringQMap.add stats (s ^ "\n" ^ t)*))) (*else stats*))
*)
(* Names of the NKJP texts processed by the driver below (passed to
   [ENIAM_NKJP.fold_selected]).  The two commented-out lists inside the
   brackets are presumably earlier selections kept for reference. *)
let selection = StringSet.of_list [(*"Rzeczpospolita";"200-4-000014";"040-2-000007";"120-2-900126";"120-2-910000001";"120-2-910000002";"120-4-900005";
"620-3-010001110";"620-3-010001449";"620-3-010001622";"620-3-010001727";
"620-3-010001731";"620-3-010001741";"620-3-010001854";"711-3-010000051";"711-3-010000056";
"711-3-010000079";"720-3-010000217";"720-3-010000335";"720-3-010000341";"forumowisko.pl_18535";"forumowisko.pl_424";"";"";"";"";"";"";"";"";"";"";"";"";"";"";"";"";"";*)
  (* "040-2-000001";"040-2-000007";"040-4-000000103";"120-2-000003";"120-2-000007";"120-2-000009";"120-2-000010";"120-2-900017";"120-2-900041";"120-2-900044";"120-2-900083";
  "120-2-900092";"120-2-900094";"120-2-900123";"120-2-910000011";"120-4-900000001";"120-4-900008";"120-4-900010";"130-3-900001";"130-3-910001";"130-5-000000267";
  "130-5-000000406";"130-5-000000817";"130-5-000001188";"130-5-000001274";"130-5-000001338";"130-5-000001628";"130-5-000001742";"200-1-000011";"200-1-000026";"200-2-000078";
  "200-2-000173";"200-2-000175";"200-4-000000307";"200-4-000000316";"310-2-000007";"320-2-000000094";"320-2-000034";"320-2-000064";"320-3-000226";"330-2-000000030";
  "330-2-000000033";"330-2-000000200";"330-2-000000213";"330-2-000003";"330-2-000013";"620-3-010000057";"620-3-010000838";"620-3-010001103";"620-3-010001107";"620-3-010001108";
  "620-3-010001109";"620-3-010001125";"620-3-010001274";"620-3-010001448";"620-3-010001732";"620-3-010001772";"711-3-010000021";"712-1-900003";"712-1-900004";"720-3-000071";
  "720-3-010000323";"DP1999";"DP2002";"DP2003";"EkspressWieczorny";"forumowisko.pl_20218";"forumowisko.pl_42911";"forumowisko.pl_724";"GazetaGoleniowska";"GazetaTczewska";
  "NIE";"SuperExpress";"TrybunaSlaska" *)
  "120-2-000009";"120-2-000010";"120-2-000012";"120-2-900019";"120-2-900041";"120-2-900044";"120-2-900092";"120-2-900123";"120-2-910000011";"120-4-900000001";"120-4-900001";
  "120-4-900008";"130-3-900001";"130-5-000000267";"130-5-000000817";"130-5-000001188";"130-5-000001274";"130-5-000001628";"130-5-000001635";"130-5-000001742";"200-1-000011";
  "200-2-000078";"200-2-000181";"200-4-000000314";"200-4-000026";"200-4-000059";"310-2-000007";"320-2-000000087";"320-2-000000094";"320-2-000034";"330-2-000013";"620-3-010000057";
  "620-3-010000099";"620-3-010000838";"620-3-010000839";"620-3-010001729";"620-3-010001743";"620-3-010001853";"620-3-010001873";"620-3-010001895";"711-3-010000021";"720-3-000071";
  "720-3-010000323";"720-3-010000337";"DP2000";"EkspressWieczorny";"forumowisko.pl_12517";"forumowisko.pl_20218";"forumowisko.pl_42911";"GazetaTczewska";"SuperExpress"
]

(* Program entry point: initialises the ENIAM tokenizer, validates the
   segmentation of the NKJP texts listed in [selection] and prints every
   collected statistic as "count<TAB>key", sorted in ascending order of
   (count, key). *)
let () =
  ENIAMtokenizer.initialize ();
  let validate_text stats (name, typ, channel, entries) =
    validate_segmentation stats name typ channel entries in
  let stats =
    ENIAM_NKJP.fold_selected ENIAM_NKJP.nkjp_path selection [] []
      StringQMap.empty validate_text in
  (* Alternative drivers used during development:
     - whole corpus instead of [selection]:
         ENIAM_NKJP.fold ENIAM_NKJP.nkjp_path StringQMap.empty validate_text
     - the position-based comparison from the commented-out code above:
         validate string_of_tokens_complete stats name typ channel entries
         validate string_of_tokens_simple stats name typ channel entries
       (the former printed the count and the key on separate lines). *)
  let by_count = StringQMap.fold stats [] (fun acc k v -> (v, k) :: acc) in
  let ordered = Xlist.sort by_count compare in
  Xlist.iter ordered (fun (v, k) -> Printf.printf "%d\t%s\n" v k);
  ()