(* conllParser.ml *)
(*open Types*)
(* Path of the input file that processSkladnica loads and splits into
   sentences. *)
let skladnica_zaleznosciowa_filename = "resources/skladnica_zaleznosciowa.conll"
(* Output channel for the per-sentence report written by print_info.
   It is opened as soon as this module is initialised; print_info flushes it
   after every sentence, and nothing in the code shown here closes it. *)
let oc = open_out "resources/info_sentences.txt"
(* Placeholder token: every string field is empty, every integer field is 0
   and the interpretation list is empty.
   getSentence pads the token list with it (two in front, one as the
   successor of the last token), and maybe_add_space treats a previous token
   whose c_orth is empty as the start of the sentence. *)
let empty_token = { c_id = 0; c_orth = ""; c_lemma = ""; c_cat = "";
c_interp = []; c_super = 0; c_label = ""; c_beg = 0; c_len = 0}
(* Module-level state shared by consecutive calls to maybe_add_space. *)
(* True while an ASCII double quote has been emitted as an opening quote and
   the matching closing quote has not been seen yet. *)
let quote_open = ref false
(* True right after a hyphen was emitted as a joining hyphen; the next token
   is then appended without a leading space and the flag is cleared. *)
let hyphenated = ref false
(* Clears both flags. getSentence calls this before it starts assembling a
   sentence, so state from the previous sentence does not leak over. *)
let reset () =
quote_open := false;
hyphenated := false
(** [maybe_add_space pre_previous previous token next] returns the text to
    append for [token] while a sentence is being rebuilt: its [c_orth],
    preceded by a single space unless one of the rules below says the token
    attaches directly to what was emitted before it.

    [pre_previous] and [previous] are the two tokens before [token], [next] is
    the one after it. The function reads and updates the module-level flags
    [quote_open] and [hyphenated], so it must be called on the tokens of one
    sentence in order, after [reset ()]. *)
let maybe_add_space pre_previous previous token next =
  let orth = token.c_orth in
  let prev_orth = previous.c_orth in
  (* An ASCII double quote as the very first token opens a quotation. *)
  if prev_orth = "" && orth = "\"" then quote_open := true;
  (* Cases in which the token is glued to the preceding text. *)
  let attaches_to_previous =
    token.c_cat = "aglt"
    || (orth = "by" && previous.c_cat = "praet")
    || (prev_orth = "\"" && !quote_open)
    || prev_orth = "("
    || prev_orth = "„"
    || prev_orth = ""
    || orth = "ń" (* the expression nań *)
    || (orth = "że" && (prev_orth = "czym" || prev_orth = "Czym"))
       (* the expression czymże *)
    (* Disabled rule: (token.c_orth = "r" && token.c_cat = "brev"),
       the abbreviation r. as in 1991r. *)
    || (pre_previous.c_cat = "adj"
        && prev_orth = "."
        && token.c_cat = "num"
        && token.c_interp = ["pl"; "nom"; "f"; "rec"])
       (* time of day, e.g. 13.15 *)
  in
  if attaches_to_previous then orth
  else if !hyphenated then begin
    (* First token after a joining hyphen: no space, then clear the flag. *)
    hyphenated := false;
    orth
  end
  else
    match orth with
    (* Closing punctuation never takes a leading space. *)
    | "." | "…" | "?" | "!" | "," | ":" | ";" | ")" | "”" -> orth
    | "-" ->
      let joins_words =
        previous.c_cat = "adja"
        || (previous.c_cat = "subst"
            && next.c_cat = "subst"
            && previous.c_interp = next.c_interp)
      in
      if joins_words then (hyphenated := true; "-") else " -"
    | "\"" ->
      (* Toggle the quotation state: a closing quote is glued to the text,
         an opening quote is preceded by a space. *)
      if !quote_open then (quote_open := false; "\"")
      else (quote_open := true; " \"")
    | other -> " " ^ other
(* FIXME: quotation marks *)
(** [getSentence tokens] rebuilds the text of a sentence from its tokens by
    appending, from left to right, the piece that [maybe_add_space] returns
    for each token. The spacing state is cleared with [reset ()] first.

    Every token is presented to [maybe_add_space] together with its two
    predecessors and its successor; [empty_token] stands in for neighbours
    that do not exist. Returns [""] for an empty token list. *)
let getSentence tokens =
  (* A Buffer avoids re-copying the whole accumulated text for every token. *)
  let text = Buffer.create 128 in
  let rec walk = function
    | pre_previous :: (previous :: token :: rest as tail) ->
      let next =
        match rest with
        | [] -> empty_token
        | following :: _ -> following
      in
      Buffer.add_string text (maybe_add_space pre_previous previous token next);
      walk tail
    (* Fewer than three elements left: only padding/context remains. *)
    | [ _; _ ] | [ _ ] | [] -> ()
  in
  reset ();
  walk (empty_token :: empty_token :: tokens);
  Buffer.contents text
(** [split_word stringname] parses one tab-separated token line into a token
    record.

    Columns used (0-based): 0 -> [c_id], 1 -> [c_orth], 2 -> [c_lemma],
    3 -> [c_cat], 5 -> [c_interp] (split on the bar character), 6 -> [c_super],
    7 -> [c_label]. Column 4 and any columns after 7 are ignored.
    [c_beg] and [c_len] are set to [-1].

    @raise Failure if the line has fewer than 8 columns, or if column 0 or 6
    is not an integer literal accepted by [int_of_string]. *)
let split_word stringname =
  match Str.split (Str.regexp "\t") stringname with
  | id :: orth :: lemma :: cat :: _ :: interp :: super :: label :: _ ->
    { c_id = int_of_string id;
      c_orth = orth;
      c_lemma = lemma;
      c_cat = cat;
      (* In Str syntax an unescaped bar is a literal character. *)
      c_interp = Str.split (Str.regexp "|") interp;
      c_super = int_of_string super;
      c_label = label;
      c_beg = -1;
      c_len = -1 }
  | _ ->
    (* Report the offending line instead of an anonymous list-index failure. *)
    failwith
      ("split_word: expected at least 8 tab-separated fields in: " ^ stringname)
(** [any_difference string1 string2] is [false] when [string1] equals
    [string2], or equals [string2] with one space inserted before its last
    byte (e.g. a sentence-final full stop detached from the text); it is
    [true] otherwise.

    An empty [string2] is reported as different from any non-empty [string1]
    instead of raising [Invalid_argument] from [String.sub]. *)
let any_difference string1 string2 =
  if string1 = string2 then false
  else
    match String.length string2 with
    | 0 -> true
    | len ->
      let last = len - 1 in
      String.sub string2 0 last ^ " " ^ String.sub string2 last 1 <> string1
(** [find_info tokens] builds the sentence record for [tokens].

    The list of the tokens' [c_orth] values is looked up in [conll_info]:
    - if an entry is found and its text is not ["not_found"], its id and text
      are used;
    - if an entry is found with text ["not_found"], its id is kept and the
      text becomes ["Auto-generated text: "] followed by the text rebuilt by
      [getSentence];
    - if the lookup raises, the id is ["Id not found"] and the text is the
      rebuilt text.
    In every case [s_tokens] is [tokens]. *)
let find_info tokens =
  let text_generated = getSentence tokens in
  try
    let key = List.map (fun token -> token.c_orth) tokens in
    let found = (*Resources.*)InfoMap.find key (*Resources.*)conll_info in
    (* Disabled debug output:
       if any_difference text text_generated && text <> "not_found"
       then print_endline (text ^ "\n" ^ text_generated ^ "\n\n"); *)
    let s_text =
      match found.s_text with
      | "not_found" -> "Auto-generated text: " ^ text_generated
      | stored -> stored
    in
    { s_id = found.s_id; s_text; s_tokens = tokens }
  with _ ->
    (* NOTE(review): this handler catches every exception, not only a failed
       lookup; InfoMap is defined elsewhere, so confirm which exceptions its
       find can raise before narrowing it. *)
    (* Disabled debug output:
       prerr_endline ("Id not found\n" ^ text_generated ^ "\n\n"); *)
    { s_id = "Id not found"; s_text = text_generated; s_tokens = tokens }
(** [process_sentence sentenceString] turns the text block of one sentence
    (one token line per row, rows separated by newlines) into a sentence
    record: each row is parsed with [split_word] and the resulting token list
    is handed to [find_info]. *)
let process_sentence sentenceString =
  sentenceString
  |> Str.split (Str.regexp "\n")
  |> List.map split_word
  |> find_info
(** [print_info sentence] processes the raw text block [sentence] with
    [process_sentence] and writes a three-line record to [oc]: the sentence
    id, the sentence text, and the tokens' [c_orth] values joined by single
    spaces, followed by an empty line. The channel is flushed afterwards. *)
let print_info sentence =
  let processed = process_sentence sentence in
  let forms = List.map (fun token -> token.c_orth) processed.s_tokens in
  Printf.fprintf oc "%s\n%s\n%s\n\n"
    processed.s_id processed.s_text (String.concat " " forms);
  flush oc
(** Loads the file named by [skladnica_zaleznosciowa_filename], splits its
    contents on blank lines into sentence blocks and runs [print_info] on
    each of them in order.

    This is a value, not a function: the whole run happens once, when this
    binding is evaluated during module initialisation. *)
let processSkladnica =
  (*Resources.*)load_file skladnica_zaleznosciowa_filename
  |> Str.split (Str.regexp "\n\n")
  |> List.iter print_info