ENIAMsubsyntaxTypes.ml
3.09 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
(*
* ENIAMsubsyntax: MWE, abbreviation and sentence detection for Polish
* Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
* Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
*
* This library is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*)
open ENIAMtokenizerTypes
(* Tag identifying one variant of a piece of text. The Alt* constructors
   (AltSentence, AltParagraph, AltText) pair a mode with each alternative
   representation they carry. int_of_mode numbers the constructors 0 to 6
   in the order they are declared here. *)
type mode =
Raw | Struct | CONLL | ENIAM | Mate | Swigra | POLFIE
(* Mutually recursive representations of sentences and paragraphs.
   A sentence is either a raw string, a structured form, a list of quoted
   sentences, or a list of alternative versions tagged with a mode. *)
type sentence =
RawSentence of string
(* | CONLL of conll list *)
| StructSentence of (int * int * int) list * int (* (id * lnode * rnode) list * last *)
| DepSentence of (int * int * string) array (* (id * super * label) conll_id *)
| QuotedSentences of sentence_env list
(* | NKJP1M of nkjp1m list *)
(* | Skladnica of skladnica_tree *)
| AltSentence of (mode * sentence) list (* string = label, e.g. raw, nkjp, krzaki; NOTE(review): this note mentions a string label but the tag is a mode, so it looks stale - confirm *)
(* A sentence together with its identifier, position data and file prefix.
   NOTE(review): the meaning of next and file_prefix is not visible in this file - confirm with the callers. *)
and sentence_env = {id: string; beg: int; len: int; next: int; sentence: sentence; file_prefix: string} (* beg and len are counted in unicode characters ( * 100 ???) *)
(* A paragraph is a raw string, a list of sentence records, or a list of
   alternative versions tagged with a mode. *)
and paragraph =
RawParagraph of string
| StructParagraph of sentence_env list (* sentences *)
| AltParagraph of (mode * paragraph) list
(* A whole text: a raw string, a list of paragraphs, or a list of
   alternative versions tagged with a mode. *)
type text =
RawText of string
| StructText of paragraph list (* * token_record ExtArray.t*) (* paragraphs * tokens; the token component is commented out in the type *)
| AltText of (mode * text) list
(* Directory holding the user data files: the value of the
   ENIAM_USER_DATA_PATH environment variable when it is set, otherwise the
   relative directory data. *)
let data_path =
  match Sys.getenv "ENIAM_USER_DATA_PATH" with
  | user_dir -> user_dir
  | exception Not_found -> "data"
(* Paths of the dictionary and table files referred to by this library.
   Names built from resource_path point below the subsyntax and Walenty
   subdirectories; resource_path is not defined in this file (presumably it
   comes from the opened ENIAMtokenizerTypes - TODO confirm).
   Names built from data_path point into the directory selected by the
   ENIAM_USER_DATA_PATH environment variable (see data_path). *)
let brev_filename = resource_path ^ "/subsyntax/brev.tab"
let fixed_filename = resource_path ^ "/Walenty/fixed.tab"
(* let complete_entries_filename = resource_path ^ "/subsyntax/complete_entries.tab" *)
(* Multi-word expression tables located in the user data directory
   (presumably MWE lists, judging by the file names - confirm). *)
let mwe_filename = data_path ^ "/mwe.tab"
let mwe2_filename = data_path ^ "/mwe2.tab"
(* .dic dictionaries and the lemma frequency table shipped under resource_path. *)
let sawa_filename = resource_path ^ "/subsyntax/SAWA.dic"
let sejf_filename = resource_path ^ "/subsyntax/SEJF.dic"
let sejfek_filename = resource_path ^ "/subsyntax/SEJFEK.dic"
let lemma_frequencies_filename = resource_path ^ "/subsyntax/NKJP1M-lemma-freq.tab"
(* Disabled bindings pointing at the undated proper-name tables; the active
   bindings below use file names that carry a date suffix. *)
(* let proper_names_filename = resource_path ^ "/subsyntax/proper_names_sgjp_polimorf.tab"
let proper_names_filename2 = resource_path ^ "/subsyntax/proper_names.tab" *)
let proper_names_filename = resource_path ^ "/subsyntax/proper_names_sgjp_polimorf_20151020.tab"
let proper_names_filename2 = resource_path ^ "/subsyntax/proper_names_20160104.tab"
(* Third proper-name source, read from the user data directory. *)
let proper_names_filename3 = data_path ^ "/ne.tab"
(* Rank of a mode: the constructors are numbered 0 to 6 in the order in
   which the mode type declares them. *)
let int_of_mode mode =
  match mode with
  | Raw -> 0
  | Struct -> 1
  | CONLL -> 2
  | ENIAM -> 3
  | Mate -> 4
  | Swigra -> 5
  | POLFIE -> 6
(* Ordering on modes obtained by comparing their int_of_mode ranks.
   The result is negative, zero or positive, following the convention of
   the standard compare function. *)
let compare_mode x y =
  let rank_x = int_of_mode x in
  let rank_y = int_of_mode y in
  compare rank_x rank_y