resources.ml
3.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
(*open Types*)
(* Ordered-key module for sentence identifiers (plain strings), in the shape
   expected by [Map.Make]. *)
module Id =
struct
  type t = string

  (* [String.compare] orders strings exactly like the polymorphic comparison
     does, but does not go through the [Pervasives] module, which has been
     deprecated and is absent from recent standard libraries. *)
  let compare a b = String.compare a b
end
(* Maps keyed by sentence identifier (a string, see [Id]). *)
module IdMap = Map.Make(Id)
(* Root directory handed to [get_filenames] to locate the sentence files.
   NOTE(review): the binding is named "frazowa" while the path says
   "walencyjna" - confirm which resource is intended. *)
let skladnica_frazowa_filename = "resources/skladnica_walencyjna"
(* [get_filenames path] lists every entry located exactly two directory
   levels below [path], returning each one as "path/dir1/dir2/entry".
   Entries named ".DS_Store" are skipped at all three levels.

   The order of the result depends on the order in which [Sys.readdir]
   reports directory entries.

   Raises [Sys_error] (from [Sys.readdir]) when [path], or any non-skipped
   entry on the first two levels, cannot be read as a directory. *)
let get_filenames path =
  (* Names found in [dir], without the ".DS_Store" entries. *)
  let entries dir =
    List.filter
      (fun name -> name <> ".DS_Store")
      (Array.to_list (Sys.readdir dir)) in
  (* Prepend the full path of every entry of [dir] to [acc]. *)
  let add_files acc dir =
    List.fold_left (fun acc file -> (dir ^ "/" ^ file) :: acc) acc (entries dir) in
  (* Descend one more level: collect the files of every sub-folder of [dir]. *)
  let add_subfolders acc dir =
    List.fold_left
      (fun acc sub -> add_files acc (dir ^ "/" ^ sub))
      acc
      (entries dir) in
  List.fold_left
    (fun acc top -> add_subfolders acc (path ^ "/" ^ top))
    []
    (entries path)
(* [add_to_map map xml] returns [map] extended with one binding taken from
   [xml]. The document must be a "forest" element whose first attribute is
   "sent_id" and whose first child is an attribute-less "text" element
   holding a single text node; the attribute value becomes the key and the
   text becomes the value.

   Raises [Failure "add_to_map"] for any other document shape. *)
let add_to_map map xml =
  match xml with
  | Xml.Element
      ( "forest",
        ("sent_id", id) :: _,
        Xml.Element ("text", [], [Xml.PCData sentence]) :: _ ) ->
    IdMap.add id sentence map
  | _ -> failwith "add_to_map"
(* Progress counter: its current value is printed on stdout before each file
   is parsed, then it is incremented. Starts at 1. *)
let i = ref 1

(* [sentencesIdText] is map(id,text): one binding per file found under
   [skladnica_frazowa_filename], built by parsing each file with
   [Xml.parse_file] and feeding the result to [add_to_map].
   [number_of_sentences_skladnica_frazowa] is the number of files found.

   Both values are derived from a single directory scan, so the count always
   refers to exactly the files that were parsed, and the tree is not walked
   a second time. Evaluated once, when the module is initialised. *)
let sentencesIdText, number_of_sentences_skladnica_frazowa =
  let filenames = get_filenames skladnica_frazowa_filename in
  let id_to_text =
    List.fold_left
      (fun acc filename ->
        print_endline (string_of_int !i);
        incr i;
        add_to_map acc (Xml.parse_file filename))
      IdMap.empty
      filenames in
  id_to_text, List.length filenames
(*************************************************************************************************************)
(* Ordered-key module for lists of strings, in the shape expected by
   [Map.Make]. *)
module Info =
struct
  type t = string list

  (* This [let] is not recursive: the [compare] on the right-hand side is the
     standard polymorphic comparison. Referring to it unqualified keeps the
     ordering unchanged while avoiding the [Pervasives] module, which has
     been deprecated and is absent from recent standard libraries. *)
  let compare (a : t) (b : t) = compare a b
end
(* Maps keyed by a list of strings (see [Info]). *)
module InfoMap = Map.Make(Info)
(* Path of the CoNLL file that is loaded when this module is initialised. *)
let krzaki_filename = "resources/krzaki.conll"
(* [file_in filename f] opens [filename] for reading, applies [f] to the
   channel and returns its result. The channel is closed before returning,
   including when [f] raises, in which case the exception is re-raised
   unchanged.

   Raises [Sys_error] if the file cannot be opened, plus whatever [f]
   raises. *)
let file_in filename f =
  let file = open_in filename in
  match f file with
  | x ->
    close_in file;
    x
  | exception e ->
    (* Do not let a secondary close error mask the original exception. *)
    close_in_noerr file;
    raise e
(* [load_file filename] returns the whole content of [filename] as a string.

   The length is taken from the already opened channel, so the size and the
   data come from the same file handle, and the result is a [string] (what
   [Str.split] in [load_krzaki] expects) rather than a mutable byte buffer.

   Raises [Sys_error] if the file cannot be opened or measured, and
   [End_of_file] if fewer characters than the reported length can be read. *)
let load_file filename =
  file_in filename (fun file ->
    really_input_string file (in_channel_length file))
(* [load_krzaki filename] reads [filename] in full and cuts its content at
   every blank line (two consecutive newlines), returning the pieces in file
   order. *)
let load_krzaki filename =
  let blank_line = Str.regexp "\n\n" in
  load_file filename |> Str.split blank_line
(* let krzaki = Str.split (Str.regexp "\n\n") (load_file filename) in
rev_map (fun krzak ->
print_endline ("krzak: " ^ krzak); krzak) krzaki *)
(* [split_word stringname] builds a token record from one tab-separated
   line. Fields are taken by position:
     0 -> c_id (integer), 1 -> c_orth, 2 -> c_lemma, 3 -> c_cat,
     5 -> c_interp (further split on '|'), 6 -> c_super (integer),
     7 -> c_label.
   The field at position 4 is not used, and any field past position 7 is
   ignored. [c_beg] and [c_len] are always set to -1.

   Raises [Failure] when the line has fewer than 8 fields, or when field 0 or
   field 6 is not an integer. *)
let split_word stringname =
  (* A single pattern match replaces the repeated [List.nth] lookups and
     lets a short line be reported with a meaningful message. *)
  match Str.split (Str.regexp "\t") stringname with
  | id :: orth :: lemma :: cat :: _ :: interp :: super :: label :: _ ->
    { c_id = int_of_string id;
      c_orth = orth;
      c_lemma = lemma;
      c_cat = cat;
      (* In [Str] syntax a bare "|" is a literal bar, not an alternation. *)
      c_interp = Str.split (Str.regexp "|") interp;
      c_super = int_of_string super;
      c_label = label;
      c_beg = -1;
      c_len = -1 }
  | _ ->
    failwith
      ("split_word: expected at least 8 tab-separated fields in: " ^ stringname)
(* [split_krzak stringname] builds a sentence record from one block of
   newline-separated lines.

   The first line is a header: the sentence id is what remains of it after
   dropping its first 8 and its last 9 characters. Every following line is
   parsed with [split_word]. The sentence text is looked up by id in
   [sentencesIdText]; when the id is unknown it is printed on stderr and the
   text is set to "not_found".

   Raises [Failure] if the block contains no line or if a token line is
   rejected by [split_word], and [Invalid_argument] if the header is shorter
   than 17 characters. *)
let split_krzak stringname =
  match Str.split (Str.regexp "\n") stringname with
  | [] -> failwith "split_krzak: empty sentence block"
  | header :: token_lines ->
    (* Slice the header line itself: [Str.split] drops a leading newline, so
       offsets into [stringname] would be shifted in that case. *)
    let s_id = String.sub header 8 (String.length header - 17) in
    { s_id = s_id;
      s_text =
        (* Only a missing key is expected here; anything else must surface. *)
        (try IdMap.find s_id sentencesIdText
         with Not_found -> (prerr_endline s_id; "not_found"));
      s_tokens = List.map split_word token_lines }
(* [parse_krzaki list_of_string] converts every block of [list_of_string]
   into a sentence record with [split_krzak], keeping the input order. *)
let parse_krzaki list_of_string = List.map split_krzak list_of_string
(* [number_of_sentences_krzaki] is the number of blank-line separated blocks
   found in [krzaki_filename]; [data_conll] is the list of sentence records
   parsed from those same blocks (conll_sequence list).

   The file is read and split only once, so both values are guaranteed to
   describe the same content. Evaluated once, when the module is
   initialised. *)
let number_of_sentences_krzaki, data_conll =
  let krzaki = load_krzaki krzaki_filename in
  let count = List.length krzaki in
  let sentences = parse_krzaki krzaki in
  count, sentences
(* map(form_sequence,conll_sentence): every sentence of [data_conll] indexed
   by the list of its tokens' [c_orth] values. Sentences are inserted in list
   order, so when two sentences share the same form sequence the later one
   replaces the earlier one. *)
let conll_info =
  let forms_of sentence =
    List.map (fun token -> token.c_orth) sentence.s_tokens in
  List.fold_left
    (fun acc sentence -> InfoMap.add (forms_of sentence) sentence acc)
    InfoMap.empty
    data_conll
(*let info_file () =
let oc = open_out "info_sentences.txt" in
List.iter (fun (key, sentence) ->
output_string oc (sentence.s_id^"\n"^sentence.s_text^"\n"^(String.concat " " key)^"\n\n")) (InfoMap.bindings conll_info)*)
(*let frazowa_info =
let got_info = List.map (fun (_, sentence) -> sentence.s_id, sentence.s_text) (InfoMap.bindings conll_info) in
List.fold_left (fun map (id, text) -> if List.mem (id, text) got_info
then map
else IdMap.add text id map) IdMap.empty (IdMap.bindings sentencesIdText) *)