(* interface.ml *)
(*
* ENIAMsubsyntax: MWE, abbreviation and sentence detection for Polish
* Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
* Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
*
* This library is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*)
open ENIAMsubsyntaxTypes
(** Directories holding the theory resources; the entry point stores this
    list into [ENIAMtokenizerTypes.theories_paths] at start-up.
    NOTE(review): these are absolute paths below /home/yacheu; confirm that
    they exist on the machine the program is deployed to. *)
let theories_paths =
  let root = "/home/yacheu/Dokumenty/ENIAM/theories" in
  (* Disabled entry: "theories/numbers" (relative path). *)
  List.map (fun theory -> root ^ "/" ^ theory) [ "numbers"; "persons" ]
(** Output formats selectable from the command line. *)
type output =
  | Text
  | Xml
  | Html
  | Marsh
  | Graphviz
  | Conll

(** How the input is segmented before parsing.
    Beware: the constructor [None] declared here shadows, by default, the
    [None] of the built-in option type for the rest of this file. *)
type sentence_split =
  | Full
  | Partial
  | None

(** Currently selected output format; starts as [Text]. *)
let output : output ref = ref Text

(** [true] when requests are served over stdin/stdout, [false] when they are
    served over a socket. *)
let comm_stdio : bool ref = ref true

(** Currently selected segmentation mode; starts as [Full]. *)
let sentence_split : sentence_split ref = ref Full

(** Port number used when communication goes through sockets. *)
let port : int ref = ref 5439

(** Flag forwarded to the text parser; set by the par-names options. *)
let par_names : bool ref = ref false
(** Command-line option table handed to [Arg.parse]. Each entry is a
    (key, action, help text) triple. Options are processed in the order they
    appear on the command line, so a later option overrides an earlier one
    that writes to the same reference. *)
let spec_list =
  (* [choose cell v] builds an action that stores [v] into [cell]. *)
  let choose cell v = Arg.Unit (fun () -> cell := v) in
  [
    (* Input segmentation. *)
    ("-s", choose sentence_split Full, "Split input into sentences (default)");
    ("-a", choose sentence_split Partial, "Split input into paragraphs, do not split input into sentences");
    ("-n", choose sentence_split None, "Do not split input into sentences");
    (* Communication channel. *)
    ("-i", Arg.Set comm_stdio, "Communication using stdio (default)");
    ("-p", Arg.Int (fun p -> comm_stdio := false; port := p), "<port> Communication using sockets on given port number");
    (* Output format. *)
    ("-t", choose output Text, "Output as plain text (default)");
    ("-x", choose output Xml, "Output as XML");
    ("-m", choose output Marsh, "Output as marshalled Ocaml data structure");
    ("-h", choose output Html, "Output as HTML");
    ("--conll", choose output Conll, "Output as conll");
    ("-g", Arg.Unit (fun () -> output := Graphviz; sentence_split := None), "Output as graphviz dot file; turns sentence split off");
    (* Processing switches living in the library modules. *)
    ("--strong-disamb", Arg.Set ENIAMsubsyntaxTypes.strong_disambiguate_flag, "Perform strong disambiguation");
    ("--no-strong-disamb", Arg.Clear ENIAMsubsyntaxTypes.strong_disambiguate_flag, "Do not perform strong disambiguation (default)");
    ("--internet-mode", Arg.Set ENIAMtokenizerTypes.internet_mode, "Relaxed attitude towards interpunction");
    ("--no-internet-mode", Arg.Clear ENIAMtokenizerTypes.internet_mode, "Strict attitude towards interpunction (default)");
    ("--par-names", Arg.Set par_names, "Identifiers of paragraphs provided");
    ("--no-par-names", Arg.Clear par_names, "No identifiers of paragraphs provided (default)");
    (* "--proper-names", Arg.Unit (fun () -> ENIAMsubsyntaxTypes.recognize_proper_names:=true), "Recognize proper names (default)";
       "--no-proper-names", Arg.Unit (fun () -> ENIAMsubsyntaxTypes.recognize_proper_names:=false), "Do not recognize proper names";*)
    ("--merge-lemmata", Arg.Set ENIAMsubsyntaxTypes.merge_lemmata, "Merge lemmata (default)");
    ("--no-merge-lemmata", Arg.Clear ENIAMsubsyntaxTypes.merge_lemmata, "Do not merge lemmata");
    (* Concraft tagger; every option except the two negative ones also
       switches the tagger on. *)
    ("--concraft", Arg.Set concraft_enabled, "Enable Concraft tagger");
    ("--no-concraft", Arg.Clear concraft_enabled, "Disable Concraft tagger (default)");
    ("--concraft-port",
     Arg.Int (fun p ->
       concraft_enabled := true;
       concraft_port := p),
     (* The default shown is the value of the reference when this table is built. *)
     "<port> Connect to Concraft tagger on a given port (by default " ^ string_of_int !concraft_port ^ ")");
    ("--concraft-host",
     Arg.String (fun host ->
       concraft_enabled := true;
       concraft_host_name := host),
     "<hostname> Connect to Concraft tagger on a given host (by default localhost)");
    ("--concraft-disambiguate",
     Arg.Unit (fun () ->
       concraft_disambiguate := true;
       concraft_enabled := true),
     "Disambiguate using Concraft tagger");
    ("--no-concraft-disambiguate", Arg.Clear concraft_disambiguate, "Do not disambiguate using Concraft tagger (default)");
  ]
(** Header of the usage text passed to [Arg.parse]. *)
let usage_msg =
  String.concat "\n" [
    "Usage: subsyntax <options>";
    "Input is a sequence of lines. Empty line ends the sequence and invoke parsing. Double empty line shutdown parser.";
    "Options are:";
  ]

(** Banner written to stderr when the program starts. *)
let message =
  String.concat "\n" [
    "ENIAMsubsyntax: MWE, abbreviation and sentence detecion for Polish";
    "Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>";
    "Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences";
  ]

(** Handler for anonymous (non-option) command-line arguments: none are
    accepted.
    @raise Arg.Bad always, with the offending argument in the message. *)
let anon_fun arg =
  let complaint = Printf.sprintf "invalid argument: %s" arg in
  raise (Arg.Bad complaint)
(** [input_text channel] reads lines from [channel] up to (and consuming) the
    first empty line, or up to end of file, and returns the lines read joined
    with a newline character. The result is the empty string when the very
    first line is empty or the channel is already at end of file. *)
let input_text channel =
  (* End of file is treated exactly like an empty line. *)
  let next_line () = try input_line channel with End_of_file -> "" in
  let rec collect acc =
    match next_line () with
    | "" -> String.concat "\n" (List.rev acc)
    | line -> collect (line :: acc)
  in
  collect []
(** [main_loop in_chan out_chan] serves requests until an empty one arrives.
    Each round reads one block of text with [input_text], parses it according
    to [!sentence_split], writes the result to [out_chan] in the format
    selected by [!output], flushes [out_chan] and starts over.
    @raise Failure when [!output] is [Graphviz] or [Conll] while
    [!sentence_split] is [Full] or [Partial] (not implemented). *)
let rec main_loop in_chan out_chan =
  match input_text in_chan with
  | "" -> ()
  | text ->
    (* Every textual reply is terminated by an empty line. *)
    let reply body = output_string out_chan (body ^ "\n\n") in
    (match !sentence_split with
     | (Full | Partial) as mode ->
       (* The first argument is [true] only for full sentence splitting. *)
       let parsed_text, tokens, msg =
         ENIAMsubsyntax.catch_parse_text (mode = Full) !par_names text in
       (match !output with
        | Text ->
          (* An empty [msg] selects the token table; otherwise [msg] is printed instead. *)
          let details =
            if msg = "" then ENIAMsubsyntaxStringOf.token_extarray tokens
            else msg in
          reply (ENIAMsubsyntaxStringOf.text "" tokens parsed_text ^ "\n" ^ details)
        | Xml ->
          reply (Xml.to_string (ENIAMsubsyntaxXMLof.text_and_tokens parsed_text tokens msg))
        | Html ->
          reply (ENIAMsubsyntaxHTMLof.text_and_tokens parsed_text tokens msg)
        | Marsh -> Marshal.to_channel out_chan (parsed_text, tokens, msg) []
        | Graphviz -> failwith "main_loop: ni"
        | Conll -> failwith "main_loop: ni")
     | None ->
       let tokens, msg = ENIAMsubsyntax.catch_parse text in
       (* Formats without a slot for [msg]: when [msg] is not empty, echo the
          input followed by [msg] instead of the rendering. *)
       let reply_or_msg render =
         reply (if msg = "" then render () else text ^ "\n" ^ msg) in
       (match !output with
        | Text ->
          reply_or_msg (fun () -> ENIAMsubsyntaxStringOf.token_list tokens)
        | Conll ->
          reply_or_msg (fun () -> ENIAMsubsyntaxStringOf.token_list_conll tokens)
        | Graphviz ->
          reply_or_msg (fun () -> ENIAMsubsyntaxGraphOf.token_list tokens)
        | Xml ->
          reply (Xml.to_string (ENIAMsubsyntaxXMLof.token_list tokens msg))
        | Html -> reply (ENIAMsubsyntaxHTMLof.token_list tokens msg)
        | Marsh -> Marshal.to_channel out_chan (tokens, msg) []));
    flush out_chan;
    main_loop in_chan out_chan
(** Program entry point: prints the banner, registers the theory paths,
    parses the command line, initialises the library and then serves requests
    either on stdin/stdout or on a socket bound to [!port]. *)
let () =
  prerr_endline message;
  (* Set before option parsing and library initialisation. *)
  ENIAMtokenizerTypes.theories_paths := theories_paths;
  Arg.parse spec_list anon_fun usage_msg;
  ENIAMsubsyntax.initialize ();
  Gc.compact ();
  prerr_endline "Ready!";
  match !comm_stdio with
  | true -> main_loop stdin stdout
  | false ->
    (* Listen on every local address. *)
    let listen_on = Unix.ADDR_INET (Unix.inet_addr_any, !port) in
    Unix.establish_server main_loop listen_on