Commit f86c717ecaa3eff28984864dc3251eb26b0fb2fc

Authored by Wojciech Jaworski
1 parent aedafaff

porównywanie sekwencji tokenów

Showing 1 changed file with 170 additions and 0 deletions
NKJP2/validateTokenizer.ml 0 → 100644
  1 +(*
  2 + * ENIAM_NKJP, an interface for National Corpus of Polish (NKJP).
  3 + * Copyright (C) 2017 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
  4 + * Copyright (C) 2017 Institute of Computer Science Polish Academy of Sciences
  5 + *
  6 + * This library is free software: you can redistribute it and/or modify
  7 + * it under the terms of the GNU Lesser General Public License as published by
  8 + * the Free Software Foundation, either version 3 of the License, or
  9 + * (at your option) any later version.
  10 + *
  11 + * This library is distributed in the hope that it will be useful,
  12 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14 + * GNU Lesser General Public License for more details.
  15 + *
  16 + * You should have received a copy of the GNU Lesser General Public License
  17 + * along with this program. If not, see <http://www.gnu.org/licenses/>.
  18 + *)
  19 +
  20 +open ENIAMtokenizerTypes
  21 +open Xstd
  22 +
  23 +let space = {empty_token_env with orth=" "; token=Symbol " "}
  24 +let query_beg = {empty_token_env with token=Interp "<query>"}
  25 +let query_end = {empty_token_env with token=Interp "</query>"}
  26 +let sencence_beg = {empty_token_env with token=Interp "<sentence>"}
  27 +let sencence_end = {empty_token_env with token=Interp "</sentence>"}
  28 +let clause_beg = {empty_token_env with token=Interp "<clause>"}
  29 +let clause_end = {empty_token_env with token=Interp "</clause>"}
  30 +
  31 +type sent = SentBeg | SentEnd | Inside
  32 +
  33 +let set_sent_end = function
  34 + (_,beg,len,no_spaces,real_orth,orth,lemma,cat,interp) :: l,_ ->
  35 + (SentEnd,beg,len,no_spaces,real_orth,orth,lemma,cat,interp) :: l
  36 + | _ -> failwith "set_sent_end"
  37 +
  38 +let flatten_sentences sentences =
  39 + List.rev (Xlist.fold sentences [] (fun l (id_s,tokens,named_tokens) ->
  40 + set_sent_end (Xlist.fold tokens (l,SentBeg) (fun (l,sent) (beg,len,no_spaces,real_orth,orth,lemma,cat,interp) ->
  41 + (sent,beg,len,no_spaces,real_orth,orth,lemma,cat,interp) :: l, Inside))))
  42 +
  43 +let make_token orth lemma cat interp =
  44 + {empty_token_env with
  45 + orth=orth;
  46 + token=Lemma(lemma,cat,[Xlist.map interp (fun s -> [s])])}
  47 +
  48 +let suffixes = StringSet.of_list ["by"; "ż"; "ń"; "że"; "%"; "BY"; "ś"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ]
  49 +(* let prefixes = StringSet.of_list [
  50 + (*"\""; "-"; "("; "„"; "/"; "."; "+"; "«"; "''"; "»"; "["; "–"; "'";
  51 + "’"; ":"; "“"; ","; ")";*) ""; ""; ""; ""; ""; ""; ] *)
  52 +
  53 +
  54 +let rec allign prev_orth prev_cat rev = function
  55 + (SentBeg,0,_,_,_,orth,lemma,cat,interp) :: l ->
  56 + allign orth cat ((make_token orth lemma cat interp) :: clause_beg :: sencence_beg :: query_beg :: rev) l
  57 + | (_,0,_,_,_,orth,lemma,cat,interp) :: l -> failwith "allign"
  58 + | (sent,beg,_,no_spaces,_,orth,lemma,cat,interp) :: l ->
  59 + let rev =
  60 + if no_spaces > 0 then space :: rev
  61 + else if cat = "interp" || cat = "aglt" || prev_cat = "interp" || StringSet.mem suffixes orth (*|| StringSet.mem prefixes prev_orth*) then rev
  62 + else (
  63 + (* print_endline ("allign: " ^ prev_orth ^ " " ^ orth); *)
  64 + space :: rev) in
  65 + let rev = if sent = SentBeg then clause_beg :: sencence_beg :: rev else rev in
  66 + let rev = (make_token orth lemma cat interp) :: rev in
  67 + let rev = if sent = SentEnd then sencence_end :: clause_end :: rev else rev in
  68 + allign orth cat rev l
  69 + | [] -> List.rev (query_end :: rev)
  70 +
  71 +let rec set_lengths n rev = function
  72 + t :: l ->
  73 + let len =
  74 + if t.token = Interp "<query>" || t.token = Interp "</query>" then factor else
  75 + Xlist.size (Xunicode.utf8_chars_of_utf8_string t.orth) * factor in
  76 + set_lengths (n+len) ({t with beg=n; len=len; next=n+len} :: rev) l
  77 + | [] -> List.rev rev
  78 +
  79 +let render_paragraph tokens =
  80 + String.concat "" (List.rev (Xlist.rev_map tokens (fun t -> t.orth)))
  81 +
  82 +let rec get_next = function
  83 + Token t -> t.next
  84 + | Seq [] -> failwith "get_next"
  85 + | Seq l -> get_next (List.hd (List.rev l))
  86 + | Variant [] -> failwith "get_next"
  87 + | Variant l -> get_next (List.hd l)
  88 +
  89 +let rec match_tokens erev nrev rev = function
  90 + et :: ets, nt :: nts ->
  91 + let next = get_next et in
  92 + if next = nt.next then
  93 + match_tokens [] [] ((List.rev (et :: erev), List.rev (nt :: nrev)) :: rev) (ets,nts)
  94 + else if next < nt.next then
  95 + match_tokens (et :: erev) nrev rev (ets, nt :: nts)
  96 + else match_tokens erev (nt :: nrev) rev (et :: ets, nts)
  97 + | [],[] -> List.rev rev
  98 + | _ -> failwith "match_tokens"
  99 +
  100 +(* let compare_token et t =
  101 + et.orth=t.orth && et.beg=t.beg && et.len=t.len && et.next=t.next && et.token=t.token
  102 +
  103 +let get_beg = function
  104 + Token t -> t.beg
  105 + | Seq [] -> failwith "get_beg"
  106 + | Seq l -> get_beg (List.hd l)
  107 + | Variant [] -> failwith "get_next"
  108 + | Variant l -> get_beg (List.hd l)
  109 +
  110 +let rec compare_tokens stats = function
  111 + Token et :: ets, t :: ts ->
  112 + if compare_token et t then compare_tokens stats (ets,ts) else (
  113 + Printf.printf "%s\n%s\n\n" (ENIAMtokens.string_of_token_env et) (ENIAMtokens.string_of_token_env t);
  114 + stats)
  115 + | Variant l :: ets, ts -> failwith "compare_tokens 4"
  116 + | Seq l :: ets, ts -> failwith "compare_tokens 3"
  117 + | [], ts -> failwith "compare_tokens 2"
  118 + | _, [] -> failwith "compare_tokens 1"
  119 +
  120 +let rec get_subsequence_rec next rev = function
  121 + t :: tokens -> if t.next = next then List.rev (t :: rev) else get_subsequence_rec next (t :: rev) tokens
  122 + | [] -> failwith "get_subsequence_rec"
  123 +
  124 +let get_subsequence beg next = function
  125 + t :: tokens -> if t.beg = beg then get_subsequence_rec next [] (t :: tokens) else failwith "get_subsequence 2"
  126 + | [] -> failwith "get_subsequence 1"
  127 +
  128 +let compare_token stats tokens = function
  129 + Token et :: ets, t :: ts ->
  130 + if compare_token et t then compare_tokens stats (ets,ts) else (
  131 + Printf.printf "%s\n%s\n\n" (ENIAMtokens.string_of_token_env et) (ENIAMtokens.string_of_token_env t);
  132 + stats)
  133 + | Variant l :: ets, ts -> failwith "compare_tokens 4"
  134 + | Seq l :: ets, ts -> failwith "compare_tokens 3"
  135 + | [], ts -> failwith "compare_tokens 2"
  136 + | _, [] -> failwith "compare_tokens 1"
  137 +
  138 +let rec compare_tokens stats tokens = function
  139 + et :: ets ->
  140 + let ts,tokens = get_subsequence (get_beg et) (get_next et) tokens in
  141 + compare_token stats ts et
  142 + | [] -> if tokens = [] then stats else failwith "compare_tokens 1"*)
  143 +
  144 +let rec compare_tokens stats = function
  145 + (ets,nts) :: l ->
  146 + Xlist.iter ets (fun et -> Printf.printf "%s\n" (ENIAMtokens.string_of_tokens 0 et));
  147 + Xlist.iter nts (fun nt -> Printf.printf "%s\n" (ENIAMtokens.string_of_token_env nt));
  148 + print_endline "";
  149 + compare_tokens stats l
  150 + | [] -> stats
  151 +
  152 +let validate stats name typ channel entries =
  153 + (* if name = "120-2-900066" then ( *)
  154 + print_endline name;
  155 + Xlist.fold entries stats (fun stats (id_div,has_ne,paragraphs) ->
  156 + Xlist.fold paragraphs stats (fun stats (paragraph,sentences) ->
  157 + let tokens = flatten_sentences sentences in
  158 + let tokens = allign "" "" [] tokens in
  159 + let tokens = set_lengths 0 [] tokens in
  160 + let paragraph = render_paragraph tokens in
  161 + let tokens = remove_spaces [] tokens in
  162 + let eniam_tokens = ENIAMtokenizer.parse paragraph in
  163 + let l = match_tokens [] [] [] (eniam_tokens,tokens) in
  164 + compare_tokens stats l))
  165 +
  166 +
  167 +let _ =
  168 + let stats = ENIAM_NKJP.fold ENIAM_NKJP.nkjp_path StringQMap.empty (fun stats (name,typ,channel,entries) ->
  169 + validate stats name typ channel entries) in
  170 + ()
... ...