Blame view

pre/preTypes.ml 6.5 KB
Wojciech Jaworski authored
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
(*
 *  ENIAM: Categorial Syntactic-Semantic Parser for Polish
 *  Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
 *  Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *)

open Xstd

(*let single_sense_flag = ref false(*true*)
let single_frame_flag = ref false(*true*)*)

(*type pos_record = {interp: string list list list; attrs: string list; proper: string list; senses: string list}

type dict = {lemmas: pos_record StringMap.t StringMap.t; dbeg: int; dlen: int}*)

(* type selector = Orth of string | Pos of string (*| All    *) *)
Wojciech Jaworski authored
30
31

(* Length of a single character in the text.
   NOTE(review): positions and lengths ([beg], [len], [pbeg], [plen]) are
   presumably expressed in these units, i.e. [factor] per character --
   confirm against the code that fills those fields. *)
let factor = 100

(* String-valued labels carried by a token (field [e] of [token_record]).
   NOTE(review): the field names suggest grammatical categories; the set of
   admissible values is not visible in this file. *)
type labels =
  { number : string;
    case : string;
    gender : string;
    person : string;
    aspect : string }
Wojciech Jaworski authored
42
Wojciech Jaworski authored
43
44
45
46
(*type type_arg =
    TArg of string
  | TWith of type_arg list
Wojciech Jaworski authored
47
and type_term =
Wojciech Jaworski authored
48
    TConst of string * type_arg list
Wojciech Jaworski authored
49
  | TMod of type_term * type_term
Wojciech Jaworski authored
50
51
  | TName of string
  | TVariant of type_term * type_term*)
Wojciech Jaworski authored
52
53
54
55


(* Semantic treatment attached to a token (field [semantics] of
   [token_record]).  [Normal] is the value used by [empty_token]. *)
type semantics =
  | Normal
  | Special of string list
(*  | SpecialNoun of type_arg list * type_term
  | SpecialMod of string * (type_arg list * type_term)*)
  | PrepSemantics of (string * string * StringSet.t * string list) list
      (* role, role_attr, hipero, sel_prefs *)
Wojciech Jaworski authored
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81

(* Content of a single token.  The comment after each constructor names its
   arguments in order. *)
type token =
  | SmallLetter of string  (* orth *)
  | CapLetter of string * string  (* orth * lowercase *)
  | AllSmall of string  (* orth *)
  | AllCap of string * string * string  (* orth * lowercase * all lowercase *)
  | FirstCap of string * string * string * string
      (* orth * all lowercase * first letter uppercase * first letter lowercase *)
  | SomeCap of string  (* orth *)
  | RomanDig of string * string  (* value * cat *)
  | Interp of string  (* orth *)
  | Symbol of string  (* orth *)
  | Dig of string * string  (* value * cat *)
  | Other2 of string  (* orth *)
  | Lemma of string * string * string list list list  (* lemma * cat * interp *)
  | Proper of string * string * string list list list * string list
      (* lemma * cat * interp * senses *)
(*   | Sense of string * string * string list list list * (string * string * string list) list	(* lemma * cat * interp * senses *) *)
  | Compound of string * token list  (* sense * components *)
  | Sentence of token_record list * int

(* The text is represented as a set of [token_record] values holding
   information about the individual tokens. *)
and token_record = {
  orth: string;  (* character sequence of the original text that makes up the token *)
  beg: int;  (* start position of the token *)
  len: int;  (* length of the token *)
  next: int;  (* start position of the next token *)
  token: token;  (* content of the token *)
  attrs: string list;  (* additional attributes *)
  weight: float;
  e: labels;
  id: int;
(*  lemma: string;
  pos: string;
  tags: string list list list;*)
  valence: (int * WalTypes.frame) list;
  simple_valence: (int * WalTypes.frame) list;
  senses: (string * string list * float) list;
  lroles: string * string;
  semantics: semantics;
  lnode: int;
  rnode: int;
  }
Wojciech Jaworski authored
103
(* Tokens are placed in a data structure that allows their sequences to be
   searched efficiently; the data structure itself carries no information. *)
type tokens =
  | Token of token_record
  | Variant of tokens list
  | Seq of tokens list

(* Pattern element.
   NOTE(review): the constructors look like abbreviations of [token]
   constructors (e.g. [D] ~ [Dig], [RD] ~ [RomanDig]) -- confirm against the
   code that matches on [pat]. *)
type pat =
  | L
  | CL
  | D of string
  | C of string
  | S of string
  | RD of string
  | O of string

(* Default [labels] value: every field is the empty string. *)
let empty_labels =
  let blank = "" in
  { number = blank;
    case = blank;
    gender = blank;
    person = blank;
    aspect = blank }
Wojciech Jaworski authored
119
Wojciech Jaworski authored
120
(* Default [token_record]: empty strings and lists, zero positions, weight
   and id, an empty [Symbol] token, [empty_labels], [Normal] semantics, and
   [-1] for both [lnode] and [rnode]. *)
let empty_token = {
  orth=""; beg=0; len=0; next=0; token=Symbol ""; weight=0.; e=empty_labels;
  id=0; attrs=[]; valence=[]; simple_valence=[]; senses=[];
  lroles="",""; semantics=Normal; lnode=(-1); rnode=(-1)}

(* type conll = unit
type skladnica_tree = unit
type nkjp1m = {
  orth: string;		(* sekwencja znaków pierwotnego tekstu składająca się na token *)
  beg: int; 		(* pozycja początkowa tokenu *)
  len: int; 		(* długość tokenu *)
  token: token; 	(* treść tokenu *)
} *)
Wojciech Jaworski authored
134
(* Tag that labels each alternative in [AltSentence], [AltParagraph] and
   [AltText]. *)
type mode =
  | Raw
  | Struct
  | CONLL
Wojciech Jaworski authored
136
137

(*type sentence =
Wojciech Jaworski authored
138
139
140
141
142
143
144
    RawSentence of string
  (* | CONLL of conll list *)
  | StructSentence of token_record list * int * int
  (* | NKJP1M of nkjp1m list *)
  (* | Skladnica of skladnica_tree *)
  | AltSentence of (string * sentence) list  (* string = etykieta np raw, nkjp, krzaki *)
Wojciech Jaworski authored
145
type paragraph_record = {pid: string; pbeg: int; plen: int; psentence: sentence} (* beg i len liczone po znakach unicode ( * 100 ???) *)
Wojciech Jaworski authored
146
147
148
149
150
151
152
153

type paragraph =
    RawParagraph of string
  | StructParagraph of paragraph_record list
  | AltParagraph of (string * paragraph) list

type text =
    RawText of string
Wojciech Jaworski authored
154
  | StructText of paragraph list*)
Wojciech Jaworski authored
155
Wojciech Jaworski authored
156
157
158
159
160
161
162
(* nkjp1m layers to analyse:
header
text
ann_segmentation
ann_morphosyntax
ann_named
*)
Wojciech Jaworski authored
163
164
165
166
167
168
169

(* Sentences are pulled out to the top level. *)
(* The supra-sentential structure is processed before the dependency
   structure. *)
(* There is a risk of an explosion of interpretations. *)
type sentence =
  | RawSentence of string
  (* | CONLL of conll list *)
  | StructSentence of token_record list * int  (* paths * last *)
  | ORSentence of token_record list * int * int * paragraph
  (* | NKJP1M of nkjp1m list *)
  (* | Skladnica of skladnica_tree *)
  | AltSentence of (mode * sentence) list
      (* alternatives, each tagged with a [mode]; the original note described
         the tag as a string label, e.g. raw, nkjp, krzaki *)

(* beg and len are counted in unicode characters ( * 100 ???) *)
and paragraph_record = {pid: string; pbeg: int; plen: int; psentence: sentence}

and paragraph =
  | RawParagraph of string
  | StructParagraph of paragraph_record list * int  (* sentences * next_id *)
  | AltParagraph of (mode * paragraph) list
Wojciech Jaworski authored
182
183
184
185

(* A whole text: a raw string, a list of paragraphs, or a list of
   alternatives each tagged with a [mode]. *)
type text =
  | RawText of string
  | StructText of paragraph list
  | AltText of (mode * text) list
Wojciech Jaworski authored
187
188
189
190
191
192
193
194
195
196
197
198
199


(* sentences represented as tokens *)
(* supra-sentential structure processed after the dependency structure *)
(* problem with parenthetical OR insertions *)
(*type paragraph =
    RawParagraph of string
  | StructParagraph of token_record list
  | AltParagraph of (string * paragraph) list

type text =
    RawText of string
  | StructText of paragraph list*)