ENIAM_NKJP.ml 16.6 KB
(*
 *  ENIAM_NKJP, an interface for National Corpus of Polish (NKJP).
 *  Copyright (C) 2017 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
 *  Copyright (C) 2017 Institute of Computer Science Polish Academy of Sciences
 *
 *  This library is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *)

open Xstd

type id = {corref: string; prefix: string; suffix: string; numbers: int list}

let empty_id = {corref = ""; prefix = ""; suffix = ""; numbers = []}

let parse_id id =
  (* if String.length s = 0 then empty_id else *)
  if String.length id < 6 then failwith "parse_id: za krótkie id"  else
  let corref,id = match Xstring.split "#" id with
      [corref;id] -> corref,id
    | [id] -> "",id
    | _ -> failwith ("parse_id 1: " ^ id) in
  let prefix,id = match Xstring.split "_" id with
      [prefix;id] -> prefix,id
    | _ -> failwith ("parse_id 2: " ^ id) in
  let suffix,id = match Xstring.split "-" id with
      [id;suffix] -> suffix,id
    | _ -> failwith ("parse_id 3: " ^ id) in
  let numbers =  try Xlist.map (Xstring.split "\\." id) int_of_string with _ -> failwith ("parse_id 4: " ^ id) in
  {corref=corref; prefix=prefix; suffix=suffix; numbers=numbers}

let process_header_type typ =
  if Xstring.check_prefix "#typ_" typ then Xstring.cut_prefix "#typ_" typ
  else failwith ("process_header_type: " ^ typ)

let process_header_channel c =
  if Xstring.check_prefix "#kanal_" c then Xstring.cut_prefix "#kanal_" c
  else failwith ("process_header_channel: " ^ c)

let load_header path name =
  match Xml.parse_file (path ^ name ^ "/header.xml") with
      Xml.Element("teiHeader",_,[Xml.Element("fileDesc",[],_);
                   Xml.Element("profileDesc",[],[Xml.Element("textClass",[],[
                     Xml.Element("catRef",["scheme","#taxonomy-NKJP-type";"target",typ],[]);
                     Xml.Element("catRef",["scheme","#taxonomy-NKJP-channel";"target",channel],[])])]);
                     Xml.Element("revisionDesc",_,_)]) ->
        process_header_type typ,process_header_channel channel
    | _ -> failwith "load_header"

let get_folders path =
  Xlist.sort (Xlist.fold (Array.to_list (Sys.readdir path)) [] (fun l folder ->
    if Sys.is_directory (path ^ folder) then folder :: l else l)) compare

let load_paragraph = function
    Xml.Element("ab",["n",_;"xml:id",id_ab],[Xml.PCData paragraph]) ->
      parse_id id_ab,paragraph
  | xml -> failwith ("load_text_entry: " ^ Xml.to_string_fmt xml)

let load_text_entry = function
    Xml.Element("div",["xml:id",id_div;"decls",_],paragraphs) ->
      parse_id id_div,List.rev (Xlist.rev_map paragraphs load_paragraph)
  | xml -> failwith ("load_text_entry: " ^ Xml.to_string_fmt xml)

let load_text path name =
  match Xml.parse_file (path ^ name ^ "/text.xml") with
      Xml.Element("teiCorpus", _,[Xml.Element("xi:include",_,_);
                   Xml.Element("TEI",[],[Xml.Element("xi:include",_,_);
                     Xml.Element("text",["xml:id","txt_text";"xml:lang","pl"],[Xml.Element("body",["xml:id","txt_body"],entries)])])]) ->
        List.rev (Xlist.rev_map entries load_text_entry)
    | _ -> failwith "load_text"

let remove_rejected rev = function
    Xml.Element("seg",["corresp",_;"nkjp:rejected","true";"xml:id",_],[]) -> rev
  | Xml.Element("seg",["corresp",_;"nkjp:nps","true";"nkjp:rejected","true";"xml:id",_],[]) -> rev
  | Xml.Element("seg",["corresp",_;"xml:id",_],[]) as xml -> xml :: rev
  | Xml.Element("seg",["corresp",_;"nkjp:nps","true";"xml:id",_],[]) as xml -> xml :: rev
  | Xml.Element("nkjp:paren",[],Xml.Element("seg",["corresp",_;"nkjp:rejected","true";"xml:id",_],[]) :: _) -> rev
  | Xml.Element("nkjp:paren",[],Xml.Element("seg",["corresp",_;"nkjp:nps","true";"nkjp:rejected","true";"xml:id",_],[]) :: _) -> rev
  | Xml.Element("nkjp:paren",[],Xml.Element("seg",["corresp",_;"xml:id",_],[]) :: _) as xml -> xml :: rev
  | Xml.Element("nkjp:paren",[],Xml.Element("seg",["corresp",_;"nkjp:nps","true";"xml:id",_],[]) :: _) as xml -> xml :: rev
  | xml -> failwith ("remove_rejected: " ^ Xml.to_string_fmt xml)

let rec load_segm_token = function
    Xml.Element("seg",["corresp",corresp;"xml:id",id_seg],[]) ->
      [corresp,false,parse_id id_seg]
  | Xml.Element("seg",["corresp",corresp;"nkjp:nps","true";"xml:id",id_seg],[]) ->
      [corresp,true,parse_id id_seg]
  | Xml.Element("nkjp:paren",[],tokens) -> List.flatten (Xlist.map tokens load_segm_token)
  | Xml.Element("choice",[],alt) as xml ->
      let alt = Xlist.fold alt [] remove_rejected in
      (match alt with
        [token] -> load_segm_token token
      | _ -> failwith ("load_segm_token 2: " ^ Xml.to_string_fmt xml))
  | xml -> failwith ("load_segm_token 1: " ^ Xml.to_string_fmt xml)

let load_segm_sentence = function
    Xml.Element("s",["xml:id",id_s],tokens) ->
      parse_id id_s,List.flatten (List.rev (Xlist.rev_map tokens load_segm_token))
  | xml -> failwith ("load_segm_sentence: " ^ Xml.to_string_fmt xml)

let load_segm_entry = function
    Xml.Element("p",["corresp",corresp;"xml:id",id_p],sentences) ->
      parse_id corresp,parse_id id_p,List.rev (Xlist.rev_map sentences load_segm_sentence)
  | xml -> failwith ("load_segm_entry: " ^ Xml.to_string_fmt xml)

let load_segmentation path name =
  match Xml.parse_file (path ^ name ^ "/ann_segmentation.xml") with
      Xml.Element("teiCorpus", _,[Xml.Element("xi:include",_,_);
                   Xml.Element("TEI",[],[Xml.Element("xi:include",_,_);
                     Xml.Element("text",["xml:id","segm_text";"xml:lang","pl"],[Xml.Element("body",["xml:id","segm_body"],entries)])])]) ->
        List.rev (Xlist.rev_map entries load_segm_entry)
    | _ -> failwith "load_segmentation"

let load_disamb = function
    Xml.Element("fs",["feats",_;"type","tool_report"],
      [Xml.Element("f",["fVal",_;"name","choice"],_);
       Xml.Element("f",["name","interpretation"],[Xml.Element("string",[],[Xml.PCData interp])])]) ->
         interp
  | xml -> failwith ("load_disamb: " ^ Xml.to_string_fmt xml)

let load_morph_token = function
    Xml.Element("seg",["corresp",corresp;"xml:id",id_seg],[Xml.Element("fs",["type","morph"],
      [Xml.Element("f",["name","orth"],[Xml.Element("string",[],[Xml.PCData orth])]);
       Xml.Element("f",["name","interps"],_);Xml.Element("f",["name","disamb"],[disamb])])]) ->
      parse_id corresp,parse_id id_seg,orth,load_disamb disamb
  | Xml.Element("seg",["corresp",corresp;"xml:id",id_seg],[Xml.Element("fs",["type","morph"],
      [Xml.Element("f",["name","orth"],[Xml.Element("string",[],[Xml.PCData orth])]);
       Xml.Element("f",["name","nps"],[Xml.Element("binary",["value","true"],[])]);
       Xml.Element("f",["name","interps"],_);Xml.Element("f",["name","disamb"],[disamb])])]) ->
      parse_id corresp,parse_id id_seg,orth,load_disamb disamb
  | xml -> failwith ("load_morph_token: " ^ Xml.to_string_fmt xml)

let load_morph_sentence = function
    Xml.Element("s",["corresp",corresp;"xml:id",id_s],tokens) ->
      parse_id corresp,parse_id id_s,List.rev (Xlist.rev_map tokens load_morph_token)
  | xml -> failwith ("load_morph_sentence: " ^ Xml.to_string_fmt xml)

let load_morph_entry = function
    Xml.Element("p",["corresp",corresp;"xml:id",id_p],sentences) ->
      parse_id corresp,parse_id id_p,List.rev (Xlist.rev_map sentences load_morph_sentence)
  | xml -> failwith ("load_morph_entry: " ^ Xml.to_string_fmt xml)

let load_morphosyntax path name =
  match Xml.parse_file (path ^ name ^ "/ann_morphosyntax.xml") with
      Xml.Element("teiCorpus", _,[Xml.Element("xi:include",_,_);
                   Xml.Element("TEI",[],[Xml.Element("xi:include",_,_);
                     Xml.Element("text",[],[Xml.Element("body",[],entries)])])]) ->
        List.rev (Xlist.rev_map entries load_morph_entry)
    | _ -> failwith "load_morphosyntax"

let parse_seg_corresp corresp =
  if not (Xstring.check_prefix "text.xml#string-range(" corresp) then failwith "parse_seg_corresp" else
  if not (Xstring.check_sufix ")" corresp) then failwith "parse_seg_corresp" else
  let corresp = Xstring.cut_prefix "text.xml#string-range(" corresp in
  let corresp = Xstring.cut_sufix ")" corresp in
  let id,beg,len = match Xstring.split "," corresp with
    [id;beg;len] -> parse_id id, int_of_string beg, int_of_string len
  | _ -> failwith "parse_seg_corresp" in
  let id_div,id_ab = match id with
    {corref=""; prefix="txt"; numbers=[id_div;id_ab]; suffix="ab"} -> id_div,id_ab
  | _ -> failwith "parse_seg_corresp" in
  id_div,id_ab,beg,len

let pos_set = StringSet.of_list
         ["subst";"depr";"ppron12";"ppron3";"siebie";"prep";"adj";"adjc";"adjp";"adja";"num";
          "adv";"ger";"pact";"ppas";"fin";"bedzie";"praet";"winien";"impt";
          "imps";"pred";"aglt";"inf";"pcon";"pant";"qub";"comp";"conj";"interj";"burk";"interp";
          "brev";"xxx";"numcol"]

let parse_disamb disamb =
  if disamb = "::interp" then ":","interp",[] else
  if disamb = ":-):interp" then ":-)","interp",[] else
  (* if Xstring.check_sufix ":interp" disamb then  Xstring.cut_sufix ":interp" disamb, "interp", [] else *)
  match Xstring.split_delim ":" disamb with
    lemma1 :: lemma2 :: "subst" :: interp -> lemma1 ^ ":" ^ lemma2,"subst",interp
  | lemma1 :: lemma2 :: lemma3 :: "subst" :: interp -> lemma1 ^ ":" ^ lemma2 ^ ":" ^ lemma3,"subst",interp
  | lemma :: pos :: interp ->
        if StringSet.mem pos_set pos then lemma,pos,interp
        else failwith ("parse_disamb: " ^ disamb)
  | _ -> failwith "parse_disamb"

let rec merge_tokens name id_p rev = function
    (corresp,nps,{corref=""; prefix="segm"; numbers=[id_segm_p;id_segm_s]; suffix="seg"}) :: segmentation,
    ({corref="ann_segmentation.xml"; prefix="segm"; numbers=[c_segm_p;c_segm_s]; suffix="seg"},
     {corref=""; prefix="morph"; numbers=[id_morph_p;id_morph_s]; suffix="seg"},orth,disamb) :: morphosyntax ->
        (* if id_p <> id_segm_p then Printf.printf "merge_tokens inconsistent numbering: %s segm_%d-p segm_%d.%d-s\n" name id_p id_segm_p id_segm_s; *)
        if id_segm_p <> c_segm_p || id_segm_p <> id_morph_p then failwith "merge_tokens 2" else
        if id_segm_s <> c_segm_s || c_segm_s <> id_morph_s then failwith "merge_tokens 3" else
        let id_div,id_ab,beg,len = parse_seg_corresp corresp in(
        (* if id_div <> id_p then (*failwith*)print_endline (Printf.sprintf "merge_tokens 4: %s %d %s" name id_p corresp); (*else*) *)
        let lemma,cat,interp = parse_disamb disamb in
        merge_tokens name id_p ((id_div,id_ab,(beg,len,nps,orth,lemma,cat,interp)) :: rev) (segmentation,morphosyntax))
  | [],[] -> List.rev rev
  | _ -> failwith "merge_tokens 1"

let rec split_sentences id_div id_ab rev rev2 = function
    (id_div2,id_ab2,token) :: l ->
       if id_div = id_div2 && id_ab = id_ab2 then split_sentences id_div id_ab (token :: rev) rev2 l else
       split_sentences id_div2 id_ab2 [token] ((id_div,id_ab,List.rev rev) :: rev2) l
  | [] -> List.rev ((id_div,id_ab,List.rev rev) :: rev2)

let rec merge_sentences name id_p rev = function
    ({corref=""; prefix="segm"; numbers=[id_segm_p;id_segm_s]; suffix="s"},segm_tokens) :: segmentation,
    ({corref="ann_segmentation.xml"; prefix="segm"; numbers=[c_segm_p;c_segm_s]; suffix="s"},
     {corref=""; prefix="morph"; numbers=[id_morph_p;id_morph_s]; suffix="s"},morph_tokens) :: morphosyntax ->
        (* if id_p <> id_segm_p then Printf.printf "merge_sentences inconsistent numbering: %s segm_%d-p segm_%d.%d-s\n" name id_p id_segm_p id_segm_s; *)
        if id_segm_p <> c_segm_p || id_segm_p <> id_morph_p then failwith "merge_sentences 2" else
        if id_segm_s <> c_segm_s || c_segm_s <> id_morph_s then failwith "merge_sentences 3" else
        let tokens = merge_tokens name id_p [] (segm_tokens,morph_tokens) in
        let id_s = string_of_int id_segm_p ^ "." ^ string_of_int id_segm_s in
        if tokens = [] then failwith "merge_sentences 4" else
        let id_div,id_ab,token = List.hd tokens in
        let l = match split_sentences id_div id_ab [token] [] tokens with
          [id_div,id_ab,tokens] -> [id_div,id_ab,id_s,tokens]
        | [id_div1,id_ab1,tokens1;id_div2,id_ab2,tokens2] -> [id_div2,id_ab2,id_s^"b",tokens2;id_div1,id_ab1,id_s^"a",tokens1]
        | [id_div1,id_ab1,tokens1;id_div2,id_ab2,tokens2;id_div3,id_ab3,tokens3] -> [id_div3,id_ab3,id_s^"c",tokens3;id_div2,id_ab2,id_s^"b",tokens2;id_div1,id_ab1,id_s^"a",tokens1]
        | _ -> failwith (Printf.sprintf "merge_sentences 5: %s %d %d" name id_div id_ab) in
        merge_sentences name id_p (l @ rev) (segmentation,morphosyntax)
  | [],[] -> List.rev rev
  | _ -> failwith "merge_sentences"

let rec merge_paragraph id_div id_ab rev = function
    (id_div2,id_ab2,id_s,tokens) :: sentences ->
      if id_div <> id_div2 || id_ab <> id_ab2 then List.rev rev, (id_div2,id_ab2,id_s,tokens) :: sentences
      else merge_paragraph id_div id_ab ((id_s,tokens) :: rev) sentences
  | [] -> List.rev rev, []

let rec merge_paragraphs name id_p rev = function
    ({corref=""; prefix="txt"; numbers=[id_div;id_ab]; suffix="ab"},paragraph) :: paragraphs,
    (id_div2,id_ab2,id_s,tokens) :: sentences ->
       if id_div <> id_div2 && id_ab <> id_ab2 then failwith "merge_paragraphs 1" else
       let l,sentences = merge_paragraph id_div id_ab [id_s,tokens] sentences in
       (* Printf.printf "%d.%d: %s\n" id_div id_ab (String.concat " " (Xlist.map l fst)); *)
       merge_paragraphs name id_p ((paragraph,l) :: rev) (paragraphs,sentences)
  | [],[] -> List.rev rev
  | _ -> failwith ("merge_paragraphs 2: " ^ name ^ " " ^ string_of_int id_p)

let rec merge_entries name rev = function
    ({corref=""; prefix="txt"; numbers=[id_div]; suffix="div"},paragraphs) :: text,
    ({corref="text.xml"; prefix="txt"; numbers=[c_div]; suffix="div"},
     {corref=""; prefix="segm"; numbers=[id_segm_p]; suffix="p"},segm_sentences) :: segmentation,
    ({corref="ann_segmentation.xml"; prefix="segm"; numbers=[c_segm_p]; suffix="p"},
     {corref=""; prefix="morph"; numbers=[id_morph_p]; suffix="p"},morph_sentences) :: morphosyntax ->
        if id_div <> c_div || c_div <> id_segm_p || id_segm_p <> c_segm_p || c_segm_p <> id_morph_p then failwith "merge_entries 2" else
        let sentences = merge_sentences name id_div [] (segm_sentences,morph_sentences) in
        let paragraphs = merge_paragraphs name id_div [] (paragraphs,sentences) in
        merge_entries name ((id_div,paragraphs) :: rev) (text,segmentation,morphosyntax)
  | [],[],[] -> List.rev rev
  | _ -> failwith "merge_entries"

let nkjp_path = "../../NLP resources/NKJP-PodkorpusMilionowy-1.2/"

let _ =
  let names = get_folders nkjp_path in
  Xlist.iter names (fun name ->
    if name = "030-2-000000012" then () else
    (* print_endline name; *)
    let typ,channel = load_header nkjp_path name in
    (* print_endline typ; *)
    (* print_endline channel; *)
    (* print_endline (typ ^ "\t" ^ channel); *)
    let text = load_text nkjp_path name in
    let segmentation = load_segmentation nkjp_path name in
    let morphosyntax = load_morphosyntax nkjp_path name in
    let entries = merge_entries name [] (text,segmentation,morphosyntax) in
    ())

(*
frekwencje typów:
    127 fakt
     56 inf-por
    283 konwers
      2 listy
    376 lit
      1 lit_poezja
     80 media
    175 nd
    161 net_interakt
    227 net_nieinterakt
     20 nklas
   1986 publ
      8 qmow
    387 urzed

frekwencje kanałów
    388 internet
    817 ksiazka
    363 mowiony
    146 prasa
   1744 prasa_dziennik
    398 prasa_inne
      5 prasa_miesiecznik
     28 prasa_tygodnik

frekwencje łączne typów-kanałów
    127 fakt    ksiazka
     56 inf-por ksiazka
    283 konwers mowiony
      2 listy   ksiazka
    376 lit     ksiazka
      1 lit_poezja      ksiazka
     80 media   mowiony
    175 nd      ksiazka
    161 net_interakt    internet
    227 net_nieinterakt internet
     20 nklas   ksiazka
     60 publ    ksiazka
    146 publ    prasa
   1744 publ    prasa_dziennik
      3 publ    prasa_inne
      5 publ    prasa_miesiecznik
     28 publ    prasa_tygodnik
      8 qmow    prasa_inne
    387 urzed   prasa_inne

    *)