(* ENIAMsubsyntaxTypes.ml 2.42 KB *)
(*
 *  ENIAMsubsyntax: MWE, abbreviation and sentence detection for Polish
 *  Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
 *  Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
 *
 *  This library is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *)

open ENIAMtokenizerTypes

(** Tag attached to each alternative in [AltSentence], [AltParagraph] and
    [AltText].  The declaration order of the constructors matches the
    integer ranks assigned by [int_of_mode], so keep both in sync. *)
type mode =
  | Raw
  | Struct
  | CONLL
  | ENIAM
  | Mate
  | Swigra
  | POLFIE

(* Mutually recursive representation of sentences and paragraphs:
   a sentence may embed paragraph records (QuotedSentences), and a
   paragraph record in turn carries a sentence. *)
type sentence =
  | RawSentence of string
  (* | CONLL of conll list *)
  | StructSentence of (int * int * int) list * int
    (* (id * lnode * rnode) list * last *)
  | DepSentence of (int * int * string) array
    (* (id * super * label) conll_id *)
  | QuotedSentences of paragraph_record list
  (* | NKJP1M of nkjp1m list *)
  (* | Skladnica of skladnica_tree *)
  | AltSentence of (mode * sentence) list
    (* alternative versions of one sentence, each tagged with a mode;
       original note: string = label, e.g. raw, nkjp, krzaki *)

(* pbeg and plen are counted in unicode characters (times 100 ???) *)
and paragraph_record = {
  pid: string;
  pbeg: int;
  plen: int;
  pnext: int;
  psentence: sentence;
  pfile_prefix: string;
}

and paragraph =
  | RawParagraph of string
  | StructParagraph of paragraph_record list (* sentences *)
  | AltParagraph of (mode * paragraph) list

(** A whole text: either the raw string, a list of paragraphs, or a list
    of alternative versions each tagged with a [mode]. *)
type text =
  | RawText of string
  | StructText of paragraph list
    (* * token_record ExtArray.t *)
    (* paragraphs * tokens *)
  | AltText of (mode * text) list

(* Locations of the data tables, all built by appending a fixed suffix to
   resource_path.  What each table contains is not visible from this file;
   the names suggest abbreviations, fixed expressions, complete entries,
   multi-word expressions and lemma frequencies -- TODO confirm. *)

let brev_filename =
  let suffix = "/subsyntax/brev.tab" in
  resource_path ^ suffix

let fixed_filename =
  let suffix = "/Walenty/fixed.tab" in
  resource_path ^ suffix

let complete_entries_filename =
  let suffix = "/subsyntax/complete_entries.tab" in
  resource_path ^ suffix

let mwe_filename =
  let suffix = "/subsyntax/mwe.tab" in
  resource_path ^ suffix

let lemma_frequencies_filename =
  let suffix = "/subsyntax/NKJP1M-lemma-freq.tab" in
  resource_path ^ suffix

(** [int_of_mode m] is the fixed integer rank of [m], from 0 for [Raw]
    up to 6 for [POLFIE].  The match lists every constructor so that the
    compiler flags any mode added later. *)
let int_of_mode m =
  match m with
  | Raw -> 0
  | Struct -> 1
  | CONLL -> 2
  | ENIAM -> 3
  | Mate -> 4
  | Swigra -> 5
  | POLFIE -> 6

(** [compare_mode x y] orders two modes by their [int_of_mode] ranks.
    The result is whatever [compare] yields for those two integers. *)
let compare_mode x y =
  let rank_x = int_of_mode x in
  let rank_y = int_of_mode y in
  compare rank_x rank_y