ENIAMsubsyntaxTypes.ml
2.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
(*
* ENIAMsubsyntax: MWE, abbreviation and sentence detection for Polish
* Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
* Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
*
* This library is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*)
open ENIAMtokenizerTypes
(* Tag saying which representation of a piece of text is stored.
   It labels the alternatives kept in AltSentence, AltParagraph and AltText.
   NOTE(review): the constructor names look like names of formats and
   parsers (CONLL, Mate, Swigra, POLFIE); confirm the exact meaning of each
   one with the code that produces them.
   The declaration order is kept unchanged, because polymorphic comparison
   on this type depends on it. *)
type mode =
  | Raw
  | Struct
  | CONLL
  | ENIAM
  | Mate
  | Swigra
  | POLFIE
(* A sentence, stored in one of several representations. The three types
   below are mutually recursive: a quoted sentence holds paragraph records,
   and every paragraph record holds a sentence. *)
type sentence =
  | RawSentence of string
  (* | CONLL of conll list *)
  | StructSentence of (int * int * int) list * int
      (* (id * lnode * rnode) list * last *)
  | DepSentence of (int * int * string) array
      (* (id * super * label) conll_id
         NOTE(review): presumably the array index is the CONLL id; confirm *)
  | QuotedSentences of paragraph_record list
  (* | NKJP1M of nkjp1m list *)
  (* | Skladnica of skladnica_tree *)
  | AltSentence of (mode * sentence) list
      (* alternative representations of one sentence, each tagged with its
         mode; the original note read: string = label, e.g. raw, nkjp,
         krzaki *)

(* One sentence together with its identifier, position data and file
   prefix. beg and len are counted in Unicode characters ( * 100 ???) *)
and paragraph_record = {
  pid : string;
  pbeg : int;
  plen : int;
  pnext : int;
  psentence : sentence;
  pfile_prefix : string;
}

(* A paragraph: the raw string, a list of sentence records, or a list of
   alternative representations each tagged with its mode. *)
and paragraph =
  | RawParagraph of string
  | StructParagraph of paragraph_record list (* sentences *)
  | AltParagraph of (mode * paragraph) list
(* A whole text: the raw string, a list of paragraphs, or a list of
   alternative representations each tagged with its mode. *)
type text =
  | RawText of string
  | StructText of paragraph list
      (* * token_record ExtArray.t *)
      (* paragraphs * tokens; the token component is currently disabled *)
  | AltText of (mode * text) list
(* Locations of the data files used by this library. Every path is
   resource_path with a fixed suffix appended. resource_path is not defined
   in this file; presumably it comes from the opened ENIAMtokenizerTypes
   module (TODO confirm). *)

(* NOTE(review): brev.tab presumably holds the abbreviation table; confirm. *)
let brev_filename =
  resource_path ^ "/subsyntax/brev.tab"

(* The only file here that lives under the Walenty directory. *)
let fixed_filename =
  resource_path ^ "/Walenty/fixed.tab"

let complete_entries_filename =
  resource_path ^ "/subsyntax/complete_entries.tab"

(* NOTE(review): mwe.tab presumably holds multi-word expressions; confirm. *)
let mwe_filename =
  resource_path ^ "/subsyntax/mwe.tab"

let lemma_frequencies_filename =
  resource_path ^ "/subsyntax/NKJP1M-lemma-freq.tab"
(* Maps every mode to a distinct integer, numbering the constructors from
   0 (Raw) up to 6 (POLFIE). The match lists each constructor explicitly,
   so adding a new mode makes the compiler report this function. *)
let int_of_mode mode =
  match mode with
  | Raw -> 0
  | Struct -> 1
  | CONLL -> 2
  | ENIAM -> 3
  | Mate -> 4
  | Swigra -> 5
  | POLFIE -> 6
(* Orders two modes by passing the integers that int_of_mode assigns to
   them to compare. *)
let compare_mode x y =
  let rank_x = int_of_mode x in
  let rank_y = int_of_mode y in
  compare rank_x rank_y