CONLL.ml
3.64 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
open PreTypes
let string_of_token token =
String.concat "\t" [string_of_int token.c_id;
token.c_orth;
token.c_lemma;
token.c_cat;
token.c_cat;
(if token.c_interp = []
then "_"
else String.concat "|" token.c_interp);
(if token.c_super = -1
then "_"
else string_of_int token.c_super);
(if token.c_label = ""
then "_"
else token.c_label);
"_";"_"]
let string_of_sentence sentence =
String.concat "\n" (sentence.s_id :: (List.map string_of_token sentence.s_tokens)) ^ "\n"
(******************)
exception Empty_line
exception Empty_sentence
exception Id_line of string
let load_token stream =
let line = input_line stream in
if line = ""
then raise Empty_line
else if line.[0] = '#'
then
let id = String.sub line 8 ((String.length line)-17) in
raise (Id_line id)
else
match Str.split (Str.regexp "\t") line with
[id; orth; lemma; cat; cat2; interp; super; label; "_"; "_"] ->
if cat <> cat2 then failwith ("load_token: " ^ line) else
let interp = if interp = "_" then [[]] else
[Xlist.map (Xstring.split_delim "|" interp) (fun tag -> [tag])]
(* {empty_token with id = id; orth = orth; token = Lemma(lemma,cat,interp)}
c_lemma = List.nth pom 2;
c_cat = List.nth pom 3;
c_interp = (let interp = List.nth pom 5 in
if interp = "_"
then []
else Str.split (Str.regexp "|") interp);
c_super = -1; c_label = ""; c_beg = -1; c_len = -1} *)
| _ -> failwith ("load_token: " ^ line)
let load_sentence stream =
let rec pom rev_tokens id =
try
let token = load_token stream in
pom (token :: rev_tokens) id
with Id_line new_id -> pom rev_tokens new_id
| Empty_line -> rev_tokens, id
| End_of_file -> if rev_tokens = []
then raise Empty_sentence
else rev_tokens, id in
let rev_tokens, id = pom [] "" in
{s_id = id; s_text = ""; s_tokens = (List.rev rev_tokens)}
(* {pid=pi; pbeg=-1; plen=-1; psentence=StructSentence(List.rev rev_tokens,-1)} *)
let load_corpus stream =
let rec pom res =
try
let conll_sentence = load_sentence stream in
pom (conll_sentence :: res)
with _ -> res in
pom []
(******************)
exception ErrorInfoFile
module Info =
struct
type t = string
let compare a b = Pervasives.compare a b
end
module InfoMap = Map.Make(Info)
let info_file = "resources/info_sentences.txt"
let file_in filename f =
let file = open_in filename in
let x = f file in
close_in file;
x
let load_file filename =
let size = (Unix.stat filename).Unix.st_size in
let buf = Bytes.create size in
file_in filename (fun file ->
ignore (really_input file buf 0 size));
buf
let info = Str.split (Str.regexp "\n\n") @@ load_file info_file
let add_to_map map info_str =
match Str.split (Str.regexp "\n") info_str with
[id; text; info_token] -> InfoMap.add info_token (id, text) map
| _ -> raise ErrorInfoFile
let info_map =
List.fold_left add_to_map InfoMap.empty info
let match_sentence sentence =
let info_token = String.concat " " @@ List.map (fun x -> x.c_orth) sentence.s_tokens in
try
let id, text = InfoMap.find info_token info_map in
{s_id = id; s_text = text; s_tokens = sentence.s_tokens}
(* {pid=sentence.pid; psentence=AltSentence[Raw, RawSentence text; CONLL, psentence.s_tokens]} *)
with _ -> sentence
let match_corpus corpus =
List.map match_sentence corpus