resources.ml
3.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
open Xstd
open Types
(* Root path handed to [get_filenames] when [sentencesIdText] is built. It is
   passed to [Sys.readdir], so despite the "_filename" suffix it must name a
   directory.
   NOTE(review): the identifier says "frazowa" while the path says
   "walencyjna" - confirm which resource is intended. *)
let skladnica_frazowa_filename = "resources/skladnica_walencyjna"
(** [get_filenames path] lists the entries located exactly three directory
    levels below [path], each returned as [path/dir1/dir2/entry].

    Entries named [".DS_Store"] are ignored at every level. At the two
    intermediate levels, entries that are not directories are skipped as
    well, because calling [Sys.readdir] on them would raise [Sys_error].

    The result is built by consing, so it is in reverse traversal order;
    [Sys.readdir] itself returns entries in no specified order.

    @raise Sys_error if [path] or a visited directory cannot be read. *)
let get_filenames path =
  (* [collect levels dir acc] conses onto [acc] every entry found [levels]
     directory levels below [dir]. *)
  let rec collect levels dir acc =
    Array.fold_left
      (fun acc entry ->
        if entry = ".DS_Store" then acc
        else
          let full = dir ^ "/" ^ entry in
          if levels = 1 then full :: acc
          else if Sys.is_directory full then collect (levels - 1) full acc
          else acc)
      acc (Sys.readdir dir)
  in
  collect 3 path []
(** [add_to_map map xml] binds a sentence id to its text in [map].

    [xml] must be a [forest] element whose first attribute is [sent_id] and
    whose first child is an attribute-less [text] element holding exactly one
    [PCData] node; the attribute value is the key and the character data is
    the value.

    @raise Failure with message ["add_to_map"] for any other shape. *)
let add_to_map map forest =
  match forest with
  | Xml.Element
      ( "forest",
        ("sent_id", id) :: _,
        Xml.Element ("text", [], [ Xml.PCData contents ]) :: _ ) ->
      StringMap.add map id contents
  | _ -> failwith "add_to_map"
(* Progress counter: its current value is printed to stdout and then
   incremented once per file while [sentencesIdText] is being built. *)
let i = ref 1

(* map(id,text): sentence id -> sentence text, filled by [add_to_map] from
   every file returned by [get_filenames skladnica_frazowa_filename].
   [number_of_sentences_skladnica_frazowa] is the number of those files; both
   values are derived from a single listing so that the directory tree is
   walked only once. *)
let sentencesIdText, number_of_sentences_skladnica_frazowa =
  let filenames = get_filenames skladnica_frazowa_filename in
  let id_to_text =
    Xlist.fold filenames StringMap.empty (fun acc filename ->
        print_endline (string_of_int !i);
        incr i;
        add_to_map acc (Xml.parse_file filename))
  in
  (id_to_text, List.length filenames)
(*************************************************************************************************************)
(* Ordered key type for [InfoMap]: a list of strings compared structurally.
   The unqualified [compare] is used instead of [Pervasives.compare] because
   the [Pervasives] module is deprecated and no longer exists in OCaml 5. *)
module Info =
struct
  type t = string list
  let compare (a : t) (b : t) = compare a b
end

(* Maps keyed by string lists. *)
module InfoMap = Map.Make (Info)
(* Path of the file that [load_krzaki] reads when [data_conll] and
   [number_of_sentences_krzaki] are computed at module initialisation. *)
let krzaki_filename = "resources/krzaki.conll"
(** [load_krzaki filename] reads [filename] through [File.load_file_gen] and
    splits the loaded contents on the delimiter ["\n\n"], returning the
    resulting chunks. *)
let load_krzaki filename =
  let contents = File.load_file_gen filename in
  Xstring.split_delim "\n\n" contents
(** [split_word stringname] parses one tab-separated token row into a token
    record.

    Fields are taken by position: 0 -> [c_id] (integer), 1 -> [c_orth],
    2 -> [c_lemma], 3 -> [c_cat], 5 -> [c_interp] (split on ["|"]),
    6 -> [c_super] (integer), 7 -> [c_label]. Field 4 and any field after
    the eighth are ignored. [c_beg] and [c_len] are both set to [-1].

    @raise Failure if the row has fewer than eight fields, or if field 0 or
    field 6 is not an integer literal. *)
let split_word stringname =
  (* A single list pattern replaces repeated [List.nth] lookups and reports
     short rows explicitly. *)
  match Xstring.split_delim "\t" stringname with
  | id :: orth :: lemma :: cat :: _ :: interp :: super :: label :: _ ->
      { c_id = int_of_string id;
        c_orth = orth;
        c_lemma = lemma;
        c_cat = cat;
        c_interp = Xstring.split_delim "|" interp;
        c_super = int_of_string super;
        c_label = label;
        c_beg = -1;
        c_len = -1 }
  | _ ->
      failwith
        ("split_word: expected at least 8 tab-separated fields: " ^ stringname)
(** [split_krzak stringname] parses one newline-separated sentence chunk.

    The first line is the header; the sentence id is the header without its
    first 8 and its last 9 characters. Every following line is parsed with
    [split_word]. The sentence text is looked up in [sentencesIdText]; when
    the id is not bound there, the id is written to stderr and the text is
    set to ["not_found"].

    @raise Invalid_argument if the header is shorter than 17 characters.
    @raise Failure if the chunk yields no lines, or if a token row is
    rejected by [split_word]. *)
let split_krzak stringname =
  match Xstring.split_delim "\n" stringname with
  | [] -> failwith "split_krzak: empty sentence chunk"
  | header :: rows ->
      (* 17 = 8 leading + 9 trailing characters removed from the header. *)
      let s_id = String.sub header 8 (String.length header - 17) in
      { s_id;
        s_text =
          (* Only a missing key is expected here; any other exception is
             allowed to propagate instead of being silently swallowed. *)
          (try StringMap.find sentencesIdText s_id
           with Not_found -> prerr_endline s_id; "not_found");
        s_tokens = Xlist.map rows split_word }
(** [parse_krzaki list_of_string] applies [split_krzak] to each element of
    [list_of_string] and returns the list of results. *)
let parse_krzaki list_of_string = Xlist.map list_of_string split_krzak
(* conll_sequence list: every chunk of [krzaki_filename], as returned by
   [load_krzaki], parsed with [parse_krzaki]. The file is loaded and split
   once here rather than once per derived value. *)
let data_conll = parse_krzaki (load_krzaki krzaki_filename)

(* Number of sentence chunks in [krzaki_filename]. [parse_krzaki] maps over
   the chunks, so the parsed list has one element per chunk. *)
let number_of_sentences_krzaki = List.length data_conll
(* map(form_sequence,conll_sentence): each sentence of [data_conll] is stored
   under the list of [c_orth] values of its tokens. A sentence added under a
   key that is already bound replaces the earlier binding. *)
let conll_info =
  let orth_key sentence = List.map (fun tok -> tok.c_orth) sentence.s_tokens in
  Xlist.fold data_conll InfoMap.empty
    (fun acc sentence -> InfoMap.add (orth_key sentence) sentence acc)
(*let info_file () =
let oc = open_out "info_sentences.txt" in
List.iter (fun (key, sentence) ->
output_string oc (sentence.s_id^"\n"^sentence.s_text^"\n"^(String.concat " " key)^"\n\n")) (InfoMap.bindings conll_info)*)
(*let frazowa_info =
let got_info = List.map (fun (_, sentence) -> sentence.s_id, sentence.s_text) (InfoMap.bindings conll_info) in
List.fold_left (fun map (id, text) -> if List.mem (id, text) got_info
then map
else StringMap.add map text id) StringMap.empty (StringMap.bindings sentencesIdText) *)