ENIAMsubsyntaxStringOf.ml 3.74 KB
(*
 *  ENIAMsubsyntax: MWE, abbreviation and sentence detection for Polish
 *  Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
 *  Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
 *
 *  This library is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *)

open ENIAMsubsyntaxTypes
open Printf

(** [mode m] returns the textual name of the processing mode [m].
    Every constructor is rendered as its own identifier. *)
let mode m =
  match m with
  | Raw -> "Raw"
  | Struct -> "Struct"
  | CONLL -> "CONLL"
  | ENIAM -> "ENIAM"
  | Mate -> "Mate"
  | Swigra -> "Swigra"
  | POLFIE -> "POLFIE"

(** [token_extarray t] renders the tokens stored in the extensible array [t],
    one per line. Each line holds the token index (right-aligned to three
    columns) followed by [ENIAMtokens.string_of_token_env] of that token.

    NOTE(review): [Int.fold] is not the stdlib one; it is assumed to visit
    the indices 0 .. size-1 in increasing order, which is why the
    accumulated list is reversed before joining. Confirm against xlib. *)
let token_extarray t =
  let last_id = ExtArray.size t - 1 in
  let render_line id =
    let tok = ExtArray.get t id in
    Printf.sprintf "%3d %s" id (ENIAMtokens.string_of_token_env tok) in
  let rev_lines =
    Int.fold 0 last_id [] (fun acc id -> render_line id :: acc) in
  String.concat "\n" (List.rev rev_lines)

(** [token_list paths] renders every element of [paths] with
    [ENIAMtokens.string_of_token_env] and joins the results with newlines.
    The commented-out fragments below are a disabled [last] argument and
    the line that used to print it. *)
let token_list paths (*last*) =
  let lines =
    Xlist.map paths (fun tok -> ENIAMtokens.string_of_token_env tok) in
  String.concat "\n" lines
  (* ^ (if last < 0 then "" else Printf.sprintf "\nlast=%d" last) *)

(** [struct_sentence spaces t paths last] renders a table describing the
    edges in [paths].

    - [spaces] is prepended to the header, to every row and to the footer.
    - [t] is the token array; each edge id is looked up with [ExtArray.get].
    - [paths] is a list of [(id, lnode, rnode)] triples; it is sorted with
      the polymorphic [compare] before printing.
    - [last] is printed in the closing last=N line.

    Each row shows id, lnode, rnode, the orth of the token and the token
    itself as rendered by [ENIAMtokens.string_of_token]. *)
let struct_sentence spaces t paths last =
  let header = spaces ^ " id lnode rnode orth token\n" in
  let footer = sprintf "\n%s last=%d" spaces last in
  let rows =
    Xlist.map (List.sort compare paths) (fun (id, lnode, rnode) ->
      let tok = ExtArray.get t id in
      let orth = tok.ENIAMtokenizerTypes.orth in
      let rendered = ENIAMtokens.string_of_token tok.ENIAMtokenizerTypes.token in
      sprintf "%s%3d %5d %5d %s %s" spaces id lnode rnode orth rendered) in
  header ^ String.concat "\n" rows ^ footer

(** [dep_sentence spaces t paths] renders a table describing the dependency
    array [paths].

    - [spaces] is prepended to the header and to every row.
    - [t] is the token array; each token id is looked up with [ExtArray.get].
    - [paths] is an array indexed by conll_id whose cells are
      [(id, super, label)] triples.

    Each row shows id, conll_id, super, label, the orth of the token and the
    token itself as rendered by [ENIAMtokens.string_of_token].

    NOTE(review): [Int.fold] is not the stdlib one; it is assumed to visit
    the indices 0 .. length-1 in increasing order, hence the final
    [List.rev]. Confirm against xlib. *)
let dep_sentence spaces t paths =
  let header = spaces ^ " id conll_id super label orth token \n" in
  let render_row conll_id =
    let id, super, label = paths.(conll_id) in
    let tok = ExtArray.get t id in
    let orth = tok.ENIAMtokenizerTypes.orth in
    let rendered = ENIAMtokens.string_of_token tok.ENIAMtokenizerTypes.token in
    sprintf "%s%3d %8d %5d %s %s %s" spaces id conll_id super label orth rendered in
  let rev_rows =
    Int.fold 0 (Array.length paths - 1) [] (fun acc conll_id ->
      render_row conll_id :: acc) in
  header ^ String.concat "\n" (List.rev rev_rows)

(** [sentence spaces t s] renders the sentence [s] as text, using the token
    array [t] for the tabular variants.

    - Raw sentences are printed on one line after [spaces].
    - Struct and dependency sentences print a heading indented by [spaces]
      and then a table whose indent is a fixed eight-space string (the
      table does not use [spaces]).
    - Quoted sentences print a heading, then for every nested record an
      id/beg/len/next line and the nested sentence, both with a fixed
      six-space indent.
    - Alternatives are joined with newlines; each one is prefixed by
      [spaces], the mode name, and then the nested sentence rendered with
      an empty indent. *)
let rec sentence spaces t sent =
  match sent with
  | RawSentence raw -> spaces ^ "RawSentence: " ^ raw
  | StructSentence (paths, last) ->
      let table = struct_sentence "        " t paths last in
      spaces ^ "StructSentence:\n" ^ table
  | DepSentence paths ->
      let table = dep_sentence "        " t paths in
      spaces ^ "DepSentence:\n" ^ table
  | QuotedSentences sentences ->
      let items =
        Xlist.map sentences (fun p ->
          sprintf "      id=%s beg=%d len=%d next=%d\n%s"
            p.id p.beg p.len p.next (sentence "      " t p.sentence)) in
      spaces ^ "QuotedSentences:\n" ^ String.concat "\n" items
  | AltSentence variants ->
      let items =
        Xlist.map variants (fun (m, s) ->
          sprintf "%sAltSentence mode=%s %s" spaces (mode m) (sentence "" t s)) in
      String.concat "\n" items

(** [paragraph spaces t p] renders the paragraph [p] as text, using the
    token array [t] when rendering nested sentences.

    - Raw paragraphs are printed on one line after [spaces].
    - Struct paragraphs print a heading indented by [spaces], then for
      every sentence record an id/beg/len/next line and the sentence
      itself, both with a fixed four-space indent.
    - Alternatives are joined with newlines; each one is prefixed by
      [spaces], the mode name, and then the nested paragraph rendered with
      an empty indent. *)
let rec paragraph spaces t par =
  match par with
  | RawParagraph raw -> spaces ^ "RawParagraph: " ^ raw
  | StructParagraph sentences ->
      let items =
        Xlist.map sentences (fun p ->
          sprintf "    id=%s beg=%d len=%d next=%d\n%s"
            p.id p.beg p.len p.next (sentence "    " t p.sentence)) in
      spaces ^ "StructParagraph:\n" ^ String.concat "\n" items
  | AltParagraph variants ->
      let items =
        Xlist.map variants (fun (m, p) ->
          sprintf "%sAltParagraph mode=%s %s" spaces (mode m) (paragraph "" t p)) in
      String.concat "\n" items

(** [text spaces t te] renders the text [te], using the token array [t]
    when rendering nested paragraphs.

    - Raw text is printed on one line after [spaces].
    - Struct text prints a heading indented by [spaces], then every
      paragraph rendered with a fixed two-space indent, joined by newlines.
    - Alternatives are joined with newlines; each one is prefixed by
      [spaces], the mode name, and then the nested text rendered with an
      empty indent. *)
let rec text spaces t doc =
  match doc with
  | RawText raw -> spaces ^ "RawText: " ^ raw
  | StructText paragraphs ->
      let items = Xlist.map paragraphs (fun p -> paragraph "  " t p) in
      spaces ^ "StructText:\n" ^ String.concat "\n" items
  | AltText variants ->
      let items =
        Xlist.map variants (fun (m, te) ->
          sprintf "%sAltText mode=%s %s" spaces (mode m) (text "" t te)) in
      String.concat "\n" items