ENIAMsemLexicon.ml
4.05 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
(*
* ENIAM_LCGlexicon is a library that provides LCG lexicon for Polish
* Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
* Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
*
* This library is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*)
open Xstd
open ENIAM_LCGtypes
open Lexer
open ENIAMwalTypes
open ENIAMlexSemanticsTypes
(** [remove_comments line] returns [line] cut just before its first hash
    character. A line that contains no hash character is returned unchanged. *)
let remove_comments line =
  match String.index line '#' with
  | hash_pos -> String.sub line 0 hash_pos
  | exception Not_found -> line
(** [manage_tokens segments] pairs every argument with the role that follows it.

    Every segment after the first has to begin with a [T role] token, which is
    the role of the preceding argument; the rest of that segment is the next
    argument. The final segment has to be exactly [[T role]].

    Returns the list of [(argument, role)] pairs in input order.
    Fails with [Failure "manage_tokens"] when the input has any other shape. *)
let manage_tokens segments =
  let rec pair_up acc = function
    | [arg; [T role]] -> List.rev ((arg, role) :: acc)
    | arg :: (T role :: next_arg) :: rest ->
        (* The tail of the role-bearing segment becomes the next argument. *)
        pair_up ((arg, role) :: acc) (next_arg :: rest)
    | _ -> failwith "manage_tokens"
  in
  pair_up [] segments
(** [parse_dir p tokens] reads the direction marker at the head of [tokens].

    Returns the tokens after the marker together with [p] whose [dir] field is
    updated: [T "/"] gives [Forward_], [T "\\"] gives [Backward_] and [T "|"]
    gives [Both_].
    Fails with [Failure] (message prefixed with the function name and followed
    by the printed token list) when [tokens] starts with anything else. *)
let parse_dir p tokens =
  let unexpected () =
    failwith ("parse_dir: " ^ Lexer.string_of_token_list tokens) in
  match tokens with
  | T marker :: rest ->
      (match marker with
       | "/" -> rest, {p with dir = Forward_}
       | "\\" -> rest, {p with dir = Backward_}
       | "|" -> rest, {p with dir = Both_}
       | _ -> unexpected ())
  | _ -> unexpected ()
(** [parse_multi p tokens] recognises an optional leading [T "?"] token.

    When it is present, the token is consumed and [p] is returned with
    [is_necessary] set to [Multi]; otherwise both [tokens] and [p] are returned
    unchanged. *)
let parse_multi p tokens =
  match tokens with
  | T "?" :: rest -> rest, {p with is_necessary = Multi}
  | _ -> tokens, p
(** [parse_morf p tokens] folds one alternative of an argument into [p].

    - The single token [T "1"] sets [is_necessary] to [Opt] and adds no morf.
    - Anything else is split on the star token; each part must be a single
      [T s] token and becomes [Atom s]. The atoms are wrapped as
      [LCG (Tensor atoms)] and prepended to [p.morfs].

    Fails with [Failure] when a part is not a single [T] token. *)
let parse_morf p tokens =
  match tokens with
  | [T "1"] -> {p with is_necessary = Opt}
  | _ ->
      let atom_of_factor = function
        | [T s] -> Atom s
        | factor ->
            failwith ("parse_morf: " ^ Lexer.string_of_token_list factor)
      in
      let factors = Lexer.split_symbol (T "*") [] tokens in
      let atoms = Xlist.map factors atom_of_factor in
      {p with morfs = LCG (Tensor atoms) :: p.morfs}
(** [parse_arg tokens p] parses one argument description into the position [p].

    The direction marker is read first (see [parse_dir]), then the optional
    multiplicity marker (see [parse_multi]). If what remains is exactly one
    parenthesised group, its content is split on the plus token and every
    alternative is folded into the position with [parse_morf]; otherwise the
    remaining tokens are handed to [parse_morf] as a single alternative. *)
let parse_arg tokens p =
  (* Printf.printf "parse_arg: %s\n" (Lexer.string_of_token_list tokens); *)
  let after_dir, p = parse_dir p tokens in
  let body, p = parse_multi p after_dir in
  match Lexer.find_brackets ["(",")"] [] body with
  | [B ("(", ")", inner)] ->
      let alternatives = Lexer.split_symbol (T "+") [] inner in
      Xlist.fold alternatives p parse_morf
  | unbracketed -> parse_morf p unbracketed
(** [parse_role p name] records the role annotation [name] in the position [p].

    - ["adjunct"] sets [gf] to [ADJUNCT] and ["nosem"] sets [gf] to [NOSEM].
    - ["unk"], ["Count"] and ["Measure"] are stored verbatim in the [role] field.

    Fails with [Failure] for every other name. *)
let parse_role p name =
  match name with
  | "adjunct" -> {p with gf = ADJUNCT}
  | "nosem" -> {p with gf = NOSEM}
  | ("unk" | "Count" | "Measure") as role -> {p with role = role}
  | unknown -> failwith ("parse_role: " ^ unknown)
(** [parse_entry entry] turns the tokens of one lexicon entry into a pair
    [(symbol, positions)].

    An entry starts with [T symbol; T ":"]. If it is followed by the single
    token [T "null"], the position list is empty. Otherwise the rest is split on
    the colon token, paired into arguments and roles by [manage_tokens], and
    each pair is converted with [parse_role] followed by [parse_arg], starting
    from [empty_position] with [is_necessary] set to [Req].

    Fails with [Failure] when the entry does not start with a symbol and a
    colon. *)
let parse_entry entry =
  match entry with
  | [T symbol; T ":"; T "null"] -> symbol, []
  | T symbol :: T ":" :: rest ->
      (* Printf.printf "parse_entry: %s\n" (Lexer.string_of_token_list tokens); *)
      let position_of (arg, role) =
        let initial = parse_role {empty_position with is_necessary = Req} role in
        parse_arg arg initial
      in
      let arg_role_pairs = manage_tokens (Lexer.split_symbol (T ":") [] rest) in
      symbol, Xlist.map arg_role_pairs position_of
  | _ -> failwith ("parse_entry: " ^ Lexer.string_of_token_list entry)
(** [load_lexicon filename] reads a lexicon file and returns a string map from
    each entry symbol to its list of positions.

    Steps: load the lines, strip the comment part of every line with
    [remove_comments], tokenise each line with [Lexer.split], discard the space,
    tab and carriage-return tokens, split the token stream into entries on the
    semicolon token and parse every entry with [parse_entry].

    Fails with [Failure] (message prefixed with the function name, followed by
    the symbol) when a symbol is defined more than once; failures of the
    parsing helpers propagate unchanged. *)
let load_lexicon filename =
  (* Separator pattern handed to Lexer.split; kept exactly as it was written.
     NOTE(review): the last alternative is a lone backslash - confirm that the
     regular-expression engine behind Lexer.split accepts it in that position. *)
  let separators =
    "\\]\\| \\|\t\\|\r\\|\\?\\|:\\|;\\|&\\|!\\|=\\|}\\|{\\|,\\|\\*\\|/\\|\\+\\|)\\|(\\||\\|\\[\\|\\" in
  (* Accumulator step that keeps every token except the whitespace ones. *)
  let drop_blank kept = function
    | T " " | T "\t" | T "\r" -> kept
    | tok -> tok :: kept
  in
  let raw_lines = File.load_lines filename in
  let code_lines = List.rev (Xlist.rev_map raw_lines remove_comments) in
  let tokens_per_line =
    List.rev (Xlist.rev_map code_lines (Lexer.split separators)) in
  let all_tokens = List.flatten tokens_per_line in
  let tokens = List.rev (Xlist.fold all_tokens [] drop_blank) in
  let entries = Lexer.split_symbol (T ";") [] tokens in
  Xlist.fold entries StringMap.empty (fun lexicon entry ->
    let symbol, positions = parse_entry entry in
    StringMap.add_inc lexicon symbol positions (fun _ ->
      failwith ("load_lexicon: " ^ symbol)))
(* Lexicon built by load_lexicon from the file resources/lexicon-pl.dic.
   This is a top-level binding, so the file is read when the module is
   initialised and any failure raised by load_lexicon surfaces at that point.
   NOTE(review): the path is relative - presumably resolved against the current
   working directory of the process; confirm against File.load_lines. *)
let sem_lexicon = load_lexicon "resources/lexicon-pl.dic"
(** [extend_frame symbol frame] returns [frame] with the positions registered
    for [symbol] in [sem_lexicon] placed in front of the positions the frame
    already has.

    Fails with [Failure] (message prefixed with the function name, followed by
    the symbol) when [symbol] is not present in the lexicon. *)
let extend_frame symbol frame =
  match StringMap.find sem_lexicon symbol with
  | extra -> {frame with positions = extra @ frame.positions}
  | exception Not_found -> failwith ("extend_frame: " ^ symbol)