Commit 0b6dd720b5d555d212ecb7927d1cf27c27c9acc4
1 parent: 5007a782
uruchomienie MateParsera
Showing 2 changed files with 140 additions and 137 deletions
corpora/CONLL.ml
1 | 1 | open Xstd |
2 | 2 | open PreTypes |
3 | 3 | |
4 | -let alternative_string f mode alts = if List.exists (fun (m,_) -> mode = m) alts | |
5 | - then f (snd @@ List.find (fun (m,_) -> m = mode) alts) | |
6 | - else f (snd @@ List.find (fun (m,_) -> m = Struct) alts) | |
7 | - | |
8 | -let string_of_token mode token id super label = | |
9 | - let decompose_lemma = function | |
10 | - | Lemma(a,b,c) -> a,b,if c = [[]] | |
11 | - then "_" | |
12 | - else String.concat "][" @@ Xlist.map c (fun x -> | |
13 | - String.concat "|" @@ Xlist.map x ( fun y -> | |
14 | - String.concat "." y)) | |
15 | - | _ -> failwith ("string_of_token: not Lemma") in | |
16 | - match mode with | |
17 | - | Raw -> token.orth | |
18 | - | Struct -> failwith ("function string_of_token for mode Struct is not defined") | |
19 | - | CONLL -> let lemma,cat,interp = decompose_lemma token.token in | |
20 | - String.concat "\t" [id; | |
21 | - token.orth; lemma; cat; cat; interp; "_"; "_"; | |
22 | - string_of_int token.beg; string_of_int token.len] | |
23 | - | Mate -> let lemma,cat,interp = decompose_lemma token.token in | |
24 | - String.concat "\t" [id; | |
25 | - token.orth; lemma; lemma; cat; cat; interp; interp; "_"; "_"; "_"; "_"; "_"; "_"] | |
26 | - | _ -> failwith "string_of_token: ni" | |
27 | - | |
28 | -let string_of_paths mode tokens paths = | |
29 | - Array.fold_left (fun acc (id,super,label) -> | |
30 | - acc ^ "\n" ^ (string_of_token mode (ExtArray.get tokens id) (string_of_int id) super label)) "" paths | |
31 | - | |
32 | -let rec string_of_sentence mode tokens = function | |
33 | - RawSentence s -> if mode = Raw then s else "" | |
34 | - | StructSentence (_,tokens, _) -> failwith ("string_of_sentence: StructSentence") (*String.concat "\n" @@ Xlist.map tokens (fun x -> string_of_token mode x)*) | |
35 | - | DepSentence (_, paths) -> string_of_paths mode tokens paths | |
36 | - | QuotedSentences _ -> failwith ("string_of_sentence: QuotedSentences") | |
37 | - | AltSentence alts -> alternative_string (string_of_sentence mode tokens) mode alts | |
38 | - | |
39 | -let string_of_p_record mode tokens p_record = | |
40 | - (if p_record.pid = "" then "" else p_record.pid ^ "\n") ^ | |
41 | - string_of_sentence mode tokens p_record.psentence | |
42 | - | |
43 | -(*let rec string_of_paragraph mode tokens = function | |
44 | - RawParagraph s -> if mode = Raw then s else "" | |
45 | - | StructParagraph p_records -> String.concat "\n\n" @@ Xlist.map p_records (string_of_p_record mode tokens) | |
46 | - | AltParagraph alts -> alternative_string (string_of_paragraph mode) mode alts | |
47 | - | |
48 | -let rec string_of_text mode tokens = function | |
49 | - RawText s -> if mode = Raw then s else "" | |
50 | - | StructText (paragraphs,_) -> String.concat "\n\n" @@ Xlist.map paragraphs (string_of_paragraph mode tokens) | |
51 | - | AltText alts -> alternative_string (string_of_text mode) mode alts*) | |
52 | - | |
53 | - | |
54 | -(******************) | |
55 | - | |
56 | -let establish_next tokens paths = | |
57 | - let n = ExtArray.size tokens in | |
58 | - Int.iter 1 (n - 2) (fun i -> | |
59 | - let f = ExtArray.get tokens i in | |
60 | - let s = ExtArray.get tokens (i+1) in | |
61 | - ExtArray.set tokens i {f with next = s.beg}); | |
62 | - let last = ExtArray.get tokens (n-1) in | |
63 | - ExtArray.set tokens (n-1) {last with next = last.beg + last.len} | |
64 | - | |
65 | - | |
66 | - (*let rec pom res = function | |
67 | - h :: t -> let next = if res = [] | |
68 | - then h.beg+h.len | |
69 | - else (List.hd res).beg in | |
70 | - pom ({h with next = next} :: res) t | |
71 | - | [] -> res in | |
72 | - pom [] rev_tokens*) | |
73 | - | |
74 | -let rec establish_for_token i text tokens = function | |
75 | - (id,_,_) :: t as l-> | |
76 | - let h = ExtArray.get tokens id in | |
77 | - if Xstring.check_prefix " " text | |
78 | - then establish_for_token (i+100) (Xstring.cut_prefix " " text) tokens l | |
79 | - else if Xstring.check_prefix h.orth text | |
80 | - then | |
81 | - let n = (List.length @@ Xunicode.utf8_chars_of_utf8_string h.orth) * 100 in | |
82 | - let n_h = {h with beg = i ; len = n} in | |
83 | - ExtArray.set tokens id n_h; | |
84 | - establish_for_token (i+n) (Xstring.cut_prefix h.orth text) tokens t | |
85 | - else failwith ("establish_for_token :" ^ h.orth ^ " " ^ text) | |
86 | - | [] -> 100, i | |
87 | - | |
88 | -let rec establish_lengths text paths tokens = | |
89 | - let pbeg, plen = establish_for_token 100 text tokens (List.tl (Array.to_list paths)) in | |
90 | - establish_next tokens paths; | |
91 | - pbeg, plen-100 | |
92 | - | |
93 | -(******************) | |
94 | - | |
95 | -exception ErrorInfoFile of string | |
96 | - | |
97 | -let info_file = "../corpora/info_sentences.txt" | |
98 | - | |
99 | -let info = Xstring.split "\n\n" @@ File.load_file_gen info_file | |
100 | - | |
101 | -let add_to_map map info_str = | |
102 | - match Xstring.split "\n" info_str with | |
103 | - [id; text; info_token] -> StringMap.add map info_token (id, text) | |
104 | - | _ -> raise (ErrorInfoFile info_str) | |
105 | - | |
106 | -let info_map = | |
107 | - Xlist.fold info StringMap.empty add_to_map | |
108 | - | |
109 | -let match_sentence (p_record,tokens) = | |
110 | - let rec info_token s = match s with | |
111 | - RawSentence text -> failwith ("match_sentence: " ^ text) | |
112 | - | StructSentence (_, tokens, n) -> failwith ("match_sentence: StructSentence") (*String.concat " " @@ List.map (fun x -> x.orth) tokens*) | |
113 | - | DepSentence (_, paths) -> String.concat " " @@ List.map (fun (id,_,_) -> (ExtArray.get tokens id).orth) (List.tl (Array.to_list paths)), paths | |
114 | - | QuotedSentences _ -> failwith ("match_sentence: QuotedSentences") | |
115 | - | AltSentence alts -> failwith ("match_sentence: AltSentence") | |
116 | - (*if List.exists (fun (mode, s) -> mode = CONLL) alts | |
117 | - then info_token (snd (List.find (fun (mode, s) -> mode = CONLL) alts)) | |
118 | - else failwith ("match_sentence: no CONLL mode in AltSentence")*) in | |
119 | - let info_token, paths = info_token p_record.psentence in | |
120 | - try | |
121 | - let id, text = StringMap.find info_map info_token in | |
122 | - let beg, len = establish_lengths text paths tokens (* -1, -1, p_record.psentence *) in | |
123 | - AltText[Raw,RawText text;CONLL,StructText([StructParagraph[{pid = id; pbeg = beg; plen = len; pnext = beg+len; | |
124 | - psentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence("", paths)]}]],tokens)] | |
125 | -(* {s_id = id; s_text = text; s_tokens = sentence.s_tokens} *) | |
126 | - with _ -> AltText[CONLL,StructText([StructParagraph[p_record]],tokens)] | |
127 | - | |
128 | -let match_corpus corpus = | |
4 | +let alternative_string f mode alts = if List.exists (fun (m,_) -> mode = m) alts | |
5 | + then f (snd @@ List.find (fun (m,_) -> m = mode) alts) | |
6 | + else f (snd @@ List.find (fun (m,_) -> m = PreTypes.Struct) alts) | |
7 | + | |
8 | +let string_of_token mode token conll_id super label = | |
9 | + let decompose_lemma = function | |
10 | + | PreTypes.Lemma(a,b,c) -> a,b,if c = [[]] | |
11 | + then "_" | |
12 | + else String.concat "][" @@ Xlist.map c (fun x -> | |
13 | + String.concat "|" @@ Xlist.map x ( fun y -> | |
14 | + String.concat "." y)) | |
15 | + | t -> failwith ("string_of_token: not Lemma") in | |
16 | + match mode with | |
17 | + | PreTypes.Raw -> token.PreTypes.orth | |
18 | + | PreTypes.Struct -> failwith ("function string_of_token for mode Struct is not defined") | |
19 | + | PreTypes.CONLL -> let lemma,cat,interp = decompose_lemma token.PreTypes.token in | |
20 | + String.concat "\t" [string_of_int conll_id; | |
21 | + token.PreTypes.orth; lemma; cat; cat; interp; "_"; "_"; | |
22 | + string_of_int token.PreTypes.beg; string_of_int token.PreTypes.len] | |
23 | + | PreTypes.Mate -> let lemma,cat,interp = decompose_lemma token.PreTypes.token in | |
24 | + String.concat "\t" [string_of_int conll_id; | |
25 | + token.PreTypes.orth; lemma; lemma; cat; cat; interp; interp; "_"; "_"; "_"; "_"; "_"; "_"] | |
26 | + | _ -> failwith "string_of_token: ni" | |
27 | + | |
28 | +let string_of_paths mode tokens paths = | |
29 | + let l = Int.fold 1 (Array.length paths - 1) [] (fun l conll_id -> | |
30 | + let id,super,label = paths.(conll_id) in | |
31 | + (string_of_token mode (ExtArray.get tokens id) conll_id super label) :: l) in | |
32 | + String.concat "\n" (List.rev l) ^ "\n\n" | |
33 | + | |
34 | +let rec string_of_sentence mode tokens = function | |
35 | + RawSentence s -> if mode = Raw then s else "" | |
36 | + | StructSentence (_,tokens, _) -> failwith ("string_of_sentence: StructSentence") (*String.concat "\n" @@ Xlist.map tokens (fun x -> string_of_token mode x)*) | |
37 | + | DepSentence (_, paths) -> string_of_paths mode tokens paths | |
38 | + | QuotedSentences _ -> failwith ("string_of_sentence: QuotedSentences") | |
39 | + | AltSentence alts -> alternative_string (string_of_sentence mode tokens) mode alts | |
40 | + | |
41 | +let string_of_p_record mode tokens p_record = | |
42 | + (if p_record.pid = "" then "" else p_record.pid ^ "\n") ^ | |
43 | + string_of_sentence mode tokens p_record.psentence | |
44 | + | |
45 | +(*let rec string_of_paragraph mode tokens = function | |
46 | + RawParagraph s -> if mode = Raw then s else "" | |
47 | + | StructParagraph p_records -> String.concat "\n\n" @@ Xlist.map p_records (string_of_p_record mode tokens) | |
48 | + | AltParagraph alts -> alternative_string (string_of_paragraph mode) mode alts | |
49 | + | |
50 | +let rec string_of_text mode tokens = function | |
51 | + RawText s -> if mode = Raw then s else "" | |
52 | + | StructText (paragraphs,_) -> String.concat "\n\n" @@ Xlist.map paragraphs (string_of_paragraph mode tokens) | |
53 | + | AltText alts -> alternative_string (string_of_text mode) mode alts*) | |
54 | + | |
55 | + | |
56 | +(******************) | |
57 | + | |
58 | +let establish_next tokens paths = | |
59 | + let n = ExtArray.size tokens in | |
60 | + Int.iter 1 (n - 2) (fun i -> | |
61 | + let f = ExtArray.get tokens i in | |
62 | + let s = ExtArray.get tokens (i+1) in | |
63 | + ExtArray.set tokens i {f with next = s.beg}); | |
64 | + let last = ExtArray.get tokens (n-1) in | |
65 | + ExtArray.set tokens (n-1) {last with next = last.beg + last.len} | |
66 | + | |
67 | + | |
68 | + (*let rec pom res = function | |
69 | + h :: t -> let next = if res = [] | |
70 | + then h.beg+h.len | |
71 | + else (List.hd res).beg in | |
72 | + pom ({h with next = next} :: res) t | |
73 | + | [] -> res in | |
74 | + pom [] rev_tokens*) | |
75 | + | |
76 | +let rec establish_for_token i text tokens = function | |
77 | + (id,_,_) :: t as l-> | |
78 | + let h = ExtArray.get tokens id in | |
79 | + if Xstring.check_prefix " " text | |
80 | + then establish_for_token (i+100) (Xstring.cut_prefix " " text) tokens l | |
81 | + else if Xstring.check_prefix h.orth text | |
82 | + then | |
83 | + let n = (List.length @@ Xunicode.utf8_chars_of_utf8_string h.orth) * 100 in | |
84 | + let n_h = {h with beg = i ; len = n} in | |
85 | + ExtArray.set tokens id n_h; | |
86 | + establish_for_token (i+n) (Xstring.cut_prefix h.orth text) tokens t | |
87 | + else failwith ("establish_for_token :" ^ h.orth ^ " " ^ text) | |
88 | + | [] -> 100, i | |
89 | + | |
90 | +let rec establish_lengths text paths tokens = | |
91 | + let pbeg, plen = establish_for_token 100 text tokens (List.tl (Array.to_list paths)) in | |
92 | + establish_next tokens paths; | |
93 | + pbeg, plen-100 | |
94 | + | |
95 | +(******************) | |
96 | + | |
97 | +exception ErrorInfoFile of string | |
98 | + | |
99 | +let info_file = "../corpora/info_sentences.txt" | |
100 | + | |
101 | +let info = Xstring.split "\n\n" @@ File.load_file_gen info_file | |
102 | + | |
103 | +let add_to_map map info_str = | |
104 | + match Xstring.split "\n" info_str with | |
105 | + [id; text; info_token] -> StringMap.add map info_token (id, text) | |
106 | + | _ -> raise (ErrorInfoFile info_str) | |
107 | + | |
108 | +let info_map = | |
109 | + Xlist.fold info StringMap.empty add_to_map | |
110 | + | |
111 | +let match_sentence (p_record,tokens) = | |
112 | + let rec info_token s = match s with | |
113 | + RawSentence text -> failwith ("match_sentence: " ^ text) | |
114 | + | StructSentence (_, tokens, n) -> failwith ("match_sentence: StructSentence") (*String.concat " " @@ List.map (fun x -> x.orth) tokens*) | |
115 | + | DepSentence (_, paths) -> String.concat " " @@ List.map (fun (id,_,_) -> (ExtArray.get tokens id).orth) (List.tl (Array.to_list paths)), paths | |
116 | + | QuotedSentences _ -> failwith ("match_sentence: QuotedSentences") | |
117 | + | AltSentence alts -> failwith ("match_sentence: AltSentence") | |
118 | + (*if List.exists (fun (mode, s) -> mode = CONLL) alts | |
119 | + then info_token (snd (List.find (fun (mode, s) -> mode = CONLL) alts)) | |
120 | + else failwith ("match_sentence: no CONLL mode in AltSentence")*) in | |
121 | + let info_token, paths = info_token p_record.psentence in | |
122 | + try | |
123 | + let id, text = StringMap.find info_map info_token in | |
124 | + let beg, len = establish_lengths text paths tokens (* -1, -1, p_record.psentence *) in | |
125 | + AltText[Raw,RawText text;CONLL,StructText([StructParagraph[{pid = id; pbeg = beg; plen = len; pnext = beg+len; | |
126 | + psentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence("", paths)]}]],tokens)] | |
127 | +(* {s_id = id; s_text = text; s_tokens = sentence.s_tokens} *) | |
128 | + with _ -> AltText[CONLL,StructText([StructParagraph[p_record]],tokens)] | |
129 | + | |
130 | +let match_corpus corpus = | |
129 | 131 | Xlist.map corpus match_sentence |
130 | 132 | |
131 | 133 | (******************) |
... | ... |
parser/exec.ml
... | ... | @@ -198,10 +198,10 @@ let conll_parse_sentence timeout test_only_flag id paths tokens = |
198 | 198 | let result = {result with lex_time=time3 -. time2} in |
199 | 199 | try |
200 | 200 | (* print_endline "conll_parse_sentence 1"; *) |
201 | - LCGlatexOf.print_references "results/" "references1" references; | |
201 | + (* LCGlatexOf.print_references "results/" "references1" references; *) | |
202 | 202 | let parsed_dep_chart = LCGchart.dep_parse dep_chart references timeout time_fun in (* uwaga: niejawna zmiana imperatywna w references *) |
203 | 203 | (* print_endline "conll_parse_sentence 2"; *) |
204 | - LCGlatexOf.print_references "results/" "references2" references; | |
204 | + (* LCGlatexOf.print_references "results/" "references2" references; *) | |
205 | 205 | let time4 = time_fun () in |
206 | 206 | let result = if test_only_flag then result else {result with parsed_dep_chart=parsed_dep_chart} in |
207 | 207 | let result = {result with parse_time=time4 -. time3} in |
... | ... | @@ -253,6 +253,10 @@ let conll_parse_sentence timeout test_only_flag id paths tokens = |
253 | 253 | |
254 | 254 | let mate_in, mate_out = Unix.open_process "java -jar ../dependencyParser/basic/mate-tools/dist/anna-3.5.jar -model ../dependencyParser/basic/mate-tools/examples/160622_Polish_MateParser.mdl -test" |
255 | 255 | |
256 | +let get_paths = function | |
257 | + {PreTypes.psentence=PreTypes.DepSentence(_,paths)},_ -> paths | |
258 | + | _ -> failwith "get_paths" | |
259 | + | |
256 | 260 | let rec parse_sentence timeout test_only_flag mode tokens = function |
257 | 261 | RawSentence s -> RawSentence s |
258 | 262 | | StructSentence(id,paths,last) -> |
... | ... | @@ -272,19 +276,16 @@ let rec parse_sentence timeout test_only_flag mode tokens = function |
272 | 276 | let result = {empty_eniam_parse_result with status=Parsed; term=graph} in |
273 | 277 | ENIAMSentence result, next_id *) |
274 | 278 | | Mate -> |
275 | - (*print_endline "parse_sentence 1"; | |
276 | - let conll = CONLL.string_of_sentence PreTypes.Mate (PreTypes.StructSentence(paths,last)) in | |
279 | + print_endline "parse_sentence 1"; | |
280 | + let conll = CONLL.string_of_paths PreTypes.Mate tokens paths in | |
277 | 281 | print_endline "parse_sentence 2"; |
278 | - printf "%s\n" conll; | |
282 | + printf "|%s|\n" conll; | |
279 | 283 | Printf.fprintf mate_out "%s\n\n%!" conll; |
280 | 284 | print_endline "parse_sentence 3"; |
281 | - let conll = CONLL.load_sentence mate_in in | |
282 | - print_endline "parse_sentence 4";*) | |
283 | - (*konwersja na strukturę danych*) | |
284 | - (* FIXME: tu trzeba wstawić konwersję na tekstowy format CONLL, | |
285 | - uruchomienie MateParser i | |
286 | - powtórną konwersję wyniku. *) | |
287 | - RawSentence "" | |
285 | + let new_paths = get_paths (CONLL.load_sentence mate_in) in | |
286 | + print_endline "parse_sentence 4"; | |
287 | + let result = conll_parse_sentence timeout test_only_flag id new_paths tokens in | |
288 | + CONLLSentence result | |
288 | 289 | | _ -> failwith "parse_sentence") |
289 | 290 | | QuotedSentences sentences -> |
290 | 291 | let sentences = Xlist.rev_map sentences (fun p -> |
... | ... |