(*
 *  ENIAM: Categorial Syntactic-Semantic Parser for Polish
 *  Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
 *  Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *)

open Xstd

(*let single_sense_flag = ref false(*true*)
let single_frame_flag = ref false(*true*)*)

(*type pos_record = {interp: string list list list; attrs: string list; proper: string list; senses: string list}

type dict = {lemmas: pos_record StringMap.t StringMap.t; dbeg: int; dlen: int}*)

(* type selector = Orth of string | Pos of string (*| All *) *)

(* Length of a single character in the text. *)
let factor = 100

(* Bundle of five string-valued labels attached to every token_record
   (field e). See empty_labels for the all-empty value. *)
type labels = {
  number: string;
  case: string;
  gender: string;
  person: string;
  aspect: string;
}

(*type type_arg =
    TArg of string
  | TWith of type_arg list

and type_term =
    TConst of string * type_arg list
  | TMod of type_term * type_term
  | TName of string
  | TVariant of type_term * type_term*)

(* Kind of semantic treatment stored in the semantics field of a
   token_record; empty_token uses Normal. *)
type semantics =
  | Normal
  | Special of string list
  (* | SpecialNoun of type_arg list * type_term
     | SpecialMod of string * (type_arg list * type_term) *)
  | PrepSemantics of (string * string * StringSet.t * string list) list
      (* role,role_attr,hipero,sel_prefs *)

(* Content of a single token. The trailing comment on each constructor
   names its components, in order. *)
type token =
  | SmallLetter of string (* orth *)
  | CapLetter of string * string (* orth * lowercase *)
  | AllSmall of string (* orth *)
  | AllCap of string * string * string
      (* orth * lowercase * all lowercase *)
  | FirstCap of string * string * string * string
      (* orth * all lowercase * first letter uppercase * first letter lowercase *)
  | SomeCap of string (* orth *)
  | RomanDig of string * string (* value * cat *)
  | Interp of string (* orth *)
  | Symbol of string (* orth *)
  | Dig of string * string (* value * cat *)
  | Other2 of string (* orth *)
  | Lemma of string * string * string list list list
      (* lemma * cat * interp *)
  | Proper of string * string * string list list list * string list
      (* lemma * cat * interp * senses *)
  (* | Sense of string * string * string list list list * (string * string * string list) list (* lemma * cat * interp * senses *) *)
  | Compound of string * token list (* sense * components *)
  | Sentence of token_record list * int

(* The text is represented as a set of token_record objects holding
   information about the individual tokens. *)
and token_record = {
  orth: string; (* sequence of characters of the original text that makes up the token *)
  beg: int;     (* start position of the token *)
  len: int;     (* length of the token *)
  next: int;    (* start position of the next token *)
  token: token; (* content of the token *)
  attrs: string list; (* additional attributes *)
  weight: float;
  e: labels;
  id: int;
  (* lemma: string;
     pos: string;
     tags: string list list list;*)
  valence: (int * WalTypes.frame) list;
  simple_valence: (int * WalTypes.frame) list;
  senses: (string * string list * float) list;
  lroles: string * string;
  semantics: semantics;
  lnode: int; (* empty_token initialises this to -1 *)
  rnode: int; (* empty_token initialises this to -1 *)
}

(* Tokens are placed in a data structure that allows efficient searching
   for their sequences; the data structure by itself carries no
   information. *)
type tokens =
  | Token of token_record
  | Variant of tokens list
  | Seq of tokens list

(* Pattern elements.
   NOTE(review): the constructors presumably correspond to token kinds
   (letter, capital letter, digit, ...) - confirm against the code that
   consumes them. *)
type pat =
  | L
  | CL
  | D of string
  | C of string
  | S of string
  | RD of string
  | O of string

(* Labels record with every field set to the empty string. *)
let empty_labels = {
  number = "";
  case = "";
  gender = "";
  person = "";
  aspect = "";
}

(* Default token_record: empty strings and lists, zero positions, weight
   and id, Symbol "" as content, Normal semantics, and -1 for both lnode
   and rnode. Fields are listed in declaration order. *)
let empty_token = {
  orth = "";
  beg = 0;
  len = 0;
  next = 0;
  token = Symbol "";
  attrs = [];
  weight = 0.;
  e = empty_labels;
  id = 0;
  valence = [];
  simple_valence = [];
  senses = [];
  lroles = ("", "");
  semantics = Normal;
  lnode = -1;
  rnode = -1;
}

(*
type conll = unit
type skladnica_tree = unit

type nkjp1m = {
    orth: string; (* sequence of characters of the original text that makes up the token *)
    beg: int; (* start position of the token *)
    len: int; (* length of the token *)
    token: token; (* content of the token *)
    }
*)

(* Label distinguishing the alternatives stored in AltSentence,
   AltParagraph and AltText. *)
type mode =
  | Raw
  | Struct
  | CONLL

(*type sentence =
    RawSentence of string
  (* | CONLL of conll list *)
  | StructSentence of token_record list * int * int
  (* | NKJP1M of nkjp1m list *)
  (* | Skladnica of skladnica_tree *)
  | AltSentence of (string * sentence) list  (* string = label, e.g. raw, nkjp, krzaki *)

type paragraph_record = {pid: string; pbeg: int; plen: int; psentence: sentence} (* beg and len are counted in unicode characters ( * 100 ???) *)

type paragraph =
    RawParagraph of string
  | StructParagraph of paragraph_record list
  | AltParagraph of (string * paragraph) list

type text =
    RawText of string
  | StructText of paragraph list*)

(* nkjp1m layers to analyse: header text ann_segmentation ann_morphosyntax ann_named *)

(* sentences extracted to the outside *)
(* supra-sentential structure processed before the dependency structure *)
(* there is a risk of an explosion of interpretations *)
type sentence =
  | RawSentence of string
  (* | CONLL of conll list *)
  | StructSentence of token_record list * int (* paths * last *)
  | ORSentence of token_record list * int * int * paragraph
  (* | NKJP1M of nkjp1m list *)
  (* | Skladnica of skladnica_tree *)
  | AltSentence of (mode * sentence) list
      (* the first component labels the alternative; in the older,
         commented-out version it was a string such as raw, nkjp, krzaki *)

and paragraph_record = {
  pid: string;
  pbeg: int;
  plen: int;
  psentence: sentence;
} (* beg and len are counted in unicode characters ( * 100 ???) *)

and paragraph =
  | RawParagraph of string
  | StructParagraph of paragraph_record list * int (* sentences * next_id *)
  | AltParagraph of (mode * paragraph) list

type text =
  | RawText of string
  | StructText of paragraph list
  | AltText of (mode * text) list

(* sentences represented as tokens *)
(* supra-sentential structure processed after the dependency structure *)
(* problem with OR parentheticals *)
(*type paragraph =
    RawParagraph of string
  | StructParagraph of token_record list
  | AltParagraph of (string * paragraph) list

type text =
    RawText of string
  | StructText of paragraph list*)