|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
|
(*
* ENIAM: Categorial Syntactic-Semantic Parser for Polish
* Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
* Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*)
open Xstd
(*let single_sense_flag = ref false(*true*)
let single_frame_flag = ref false(*true*)*)
(*type pos_record = {interp: string list list list; attrs: string list; proper: string list; senses: string list}
type dict = {lemmas: pos_record StringMap.t StringMap.t; dbeg: int; dlen: int}*)
(* type selector = Orth of string | Pos of string (*| All *) *)
|
|
30
31
|
(* Length of a single character in the text.
   NOTE(review): presumably the unit in which token positions (beg, len,
   next) are counted, i.e. one character spans [factor] position units;
   confirm against the code that fills in token_record. *)
let factor = 100
(* String-valued labels carried by every token (field [e] of token_record).
   NOTE(review): the field names suggest grammatical categories; the set of
   admissible values is not visible here. *)
type labels = {
  number : string;
  case : string;
  gender : string;
  person : string;
  aspect : string;
}
|
|
42
|
|
|
43
44
45
46
|
(*type type_arg =
TArg of string
| TWith of type_arg list
|
|
47
|
and type_term =
|
|
48
|
TConst of string * type_arg list
|
|
49
|
| TMod of type_term * type_term
|
|
50
51
|
| TName of string
| TVariant of type_term * type_term*)
|
|
52
53
54
55
|
(* Semantic annotation stored in the [semantics] field of token_record. *)
type semantics =
    Normal
  | Special of string list
  (* | SpecialNoun of type_arg list * type_term
  | SpecialMod of string * (type_arg list * type_term)*)
  | PrepSemantics of (string * string * StringSet.t * string list) list (* role, role_attr, hipero, sel_prefs *)
|
|
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
|
(* Content of a single token. The comment after each constructor names its
   arguments in order. *)
type token =
    SmallLetter of string (* orth *)
  | CapLetter of string * string (* orth * lowercase *)
  | AllSmall of string (* orth *)
  | AllCap of string * string * string (* orth * lowercase * all lowercase *)
  | FirstCap of string * string * string * string (* orth * all lowercase * first letter uppercase * first letter lowercase *)
  | SomeCap of string (* orth *)
  | RomanDig of string * string (* value * cat *)
  | Interp of string (* orth *)
  | Symbol of string (* orth *)
  | Dig of string * string (* value * cat *)
  | Other2 of string (* orth *)
  | Lemma of string * string * string list list list (* lemma * cat * interp *)
  | Proper of string * string * string list list list * string list (* lemma * cat * interp * senses *)
  (* | Sense of string * string * string list list list * (string * string * string list) list (* lemma * cat * interp * senses *) *)
  | Compound of string * token list (* sense * components *)
  | Sentence of token_record list * int

(* The text is represented as a collection of token_record values, each
   holding the information about one token. *)
and token_record = {
  orth: string; (* sequence of characters of the original text that makes up the token *)
  beg: int; (* start position of the token *)
  len: int; (* length of the token *)
  next: int; (* start position of the next token *)
  token: token; (* content of the token *)
  attrs: string list; (* additional attributes *)
  weight: float;
  e: labels;
  id: int;
  (* lemma: string;
  pos: string;
  tags: string list list list;*)
  valence: (int * WalTypes.frame) list;
  simple_valence: (int * WalTypes.frame) list;
  senses: (string * string list * float) list;
  lroles: string * string;
  semantics: semantics;
  lnode: int;
  rnode: int;
}
|
|
103
|
(* Tokens are kept in a data structure that allows their sequences to be
   searched efficiently; the data structure itself carries no information. *)
type tokens =
  | Token of token_record
  | Variant of tokens list
  | Seq of tokens list
(* Pattern element.
   NOTE(review): the constructor names look like abbreviations of token
   kinds, and the string argument is presumably the text to match; neither
   is established by anything visible in this file - confirm at the use
   sites. *)
type pat =
  | L
  | CL
  | D of string
  | C of string
  | S of string
  | RD of string
  | O of string
(* Default value of type labels: every field is the empty string. *)
let empty_labels =
  let unset = "" in
  {
    number = unset;
    case = unset;
    gender = unset;
    person = unset;
    aspect = unset;
  }
|
|
119
|
|
|
120
|
(* Default token record: empty orth, all positions and the id set to 0, an
   empty Symbol as content, zero weight, empty labels and lists, Normal
   semantics, and both node indices set to -1. *)
let empty_token = {
  orth = ""; beg = 0; len = 0; next = 0; token = Symbol ""; weight = 0.; e = empty_labels;
  id = 0; attrs = []; valence = []; simple_valence = []; senses = [];
  lroles = ("", ""); semantics = Normal; lnode = (-1); rnode = (-1) }
(* type conll = unit
type skladnica_tree = unit
type nkjp1m = {
orth: string; (* sekwencja znaków pierwotnego tekstu składająca się na token *)
beg: int; (* pozycja początkowa tokenu *)
len: int; (* długość tokenu *)
token: token; (* treść tokenu *)
} *)
|
|
134
|
(* Label attached to each alternative in the AltSentence, AltParagraph and
   AltText constructors. *)
type mode =
    Raw | Struct | CONLL
|
|
136
137
|
(*type sentence =
|
|
138
139
140
141
142
143
144
|
RawSentence of string
(* | CONLL of conll list *)
| StructSentence of token_record list * int * int
(* | NKJP1M of nkjp1m list *)
(* | Skladnica of skladnica_tree *)
| AltSentence of (string * sentence) list (* string = etykieta np raw, nkjp, krzaki *)
|
|
145
|
type paragraph_record = {pid: string; pbeg: int; plen: int; psentence: sentence} (* beg i len liczone po znakach unicode ( * 100 ???) *)
|
|
146
147
148
149
150
151
152
153
|
type paragraph =
RawParagraph of string
| StructParagraph of paragraph_record list
| AltParagraph of (string * paragraph) list
type text =
RawText of string
|
|
154
|
| StructText of paragraph list*)
|
|
155
|
|
|
156
157
158
159
160
161
162
|
(* nkjp1m layers to analyse:
header
text
ann_segmentation
ann_morphosyntax
ann_named
*)
|
|
163
164
165
166
167
168
169
|
(* sentences are extracted to the outside *)
(* the supra-sentential structure is processed before the dependency structure *)
(* there is a risk of an explosion of interpretations *)
type sentence =
    RawSentence of string
  (* | CONLL of conll list *)
  | StructSentence of token_record list * int (* paths * last *)
  | ORSentence of token_record list * int * int * paragraph
  (* | NKJP1M of nkjp1m list *)
  (* | Skladnica of skladnica_tree *)
  | AltSentence of (mode * sentence) list (* mode labels each alternative (this note formerly read: string = label, e.g. raw, nkjp, krzaki) *)

and paragraph_record = {pid: string; pbeg: int; plen: int; psentence: sentence} (* pbeg and plen are counted in Unicode characters (times 100 ???) *)

and paragraph =
    RawParagraph of string
  | StructParagraph of paragraph_record list * int (* sentences * next_id *)
  | AltParagraph of (mode * paragraph) list
|
|
182
183
184
185
|
(* A whole text: a raw string, a list of paragraphs, or a list of
   mode-labelled alternative texts. *)
type text =
    RawText of string
  | StructText of paragraph list
  | AltText of (mode * text) list
|
|
187
188
189
190
191
192
193
194
195
196
197
198
199
|
(* sentences represented as tokens *)
(* the supra-sentential structure is processed after the dependency structure *)
(* problem with OR interjections *)
(*type paragraph =
RawParagraph of string
| StructParagraph of token_record list
| AltParagraph of (string * paragraph) list
type text =
RawText of string
| StructText of paragraph list*)
|