CONLL.ml
3.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
(*open Types*)
let string_of_token token =
String.concat "\t" [string_of_int token.c_id;
token.c_orth;
token.c_lemma;
token.c_cat;
token.c_cat;
(if token.c_interp = []
then "_"
else String.concat "|" token.c_interp);
(if token.c_super = -1
then "_"
else string_of_int token.c_super);
(if token.c_label = ""
then "_"
else token.c_label);
"_";"_"]
let string_of_sentence sentence =
String.concat "\n" (sentence.s_id :: (List.map string_of_token sentence.s_tokens)) ^ "\n"
(******************)
exception Empty_line
exception Empty_sentence
exception Id_line of string
let load_token stream =
let line = input_line stream in
if line = ""
then raise Empty_line
else if line.[0] = '#'
then
let id = String.sub line 8 ((String.length line)-17) in
raise (Id_line id)
else
let pom = Str.split (Str.regexp "\t") line in
{ c_id = int_of_string (List.nth pom 0);
c_orth = List.nth pom 1;
c_lemma = List.nth pom 2;
c_cat = List.nth pom 3;
c_interp = (let interp = List.nth pom 5 in
if interp = "_"
then []
else Str.split (Str.regexp "|") interp);
c_super = -1; c_label = ""; c_beg = -1; c_len = -1}
let load_sentence stream =
let rec pom rev_tokens id =
try
let token = load_token stream in
pom (token :: rev_tokens) id
with Id_line new_id -> pom rev_tokens new_id
| Empty_line -> rev_tokens, id
| End_of_file -> if rev_tokens = []
then raise Empty_sentence
else rev_tokens, id in
let rev_tokens, id = pom [] "" in
{s_id = id; s_text = ""; s_tokens = (List.rev rev_tokens)}
let load_corpus stream =
let rec pom res =
try
let conll_sentence = load_sentence stream in
pom (conll_sentence :: res)
with _ -> res in
pom []
(******************)
exception ErrorInfoFile
module Info =
struct
type t = string
let compare a b = Pervasives.compare a b
end
module InfoMap = Map.Make(Info)
let info_file = "resources/info_sentences.txt"
let file_in filename f =
let file = open_in filename in
let x = f file in
close_in file;
x
let load_file filename =
let size = (Unix.stat filename).Unix.st_size in
let buf = Bytes.create size in
file_in filename (fun file ->
ignore (really_input file buf 0 size));
buf
let info = Str.split (Str.regexp "\n\n") @@ load_file info_file
let add_to_map map info_str =
match Str.split (Str.regexp "\n") info_str with
[id; text; info_token] -> InfoMap.add info_token (id, text) map
| _ -> raise ErrorInfoFile
let info_map =
List.fold_left add_to_map InfoMap.empty info
let match_sentence sentence =
let info_token = String.concat " " @@ List.map (fun x -> x.c_orth) sentence.s_tokens in
try
let id, text = InfoMap.find info_token info_map in
{s_id = id; s_text = text; s_tokens = sentence.s_tokens}
with _ -> sentence
let match_corpus corpus =
List.map match_sentence corpus