Commit 7dc3851b214629035560071402796b957b9bf7de
1 parent e2dcc521
Fix the Not_found error
Showing 2 changed files with 103 additions and 80 deletions
corpora/CONLL.ml
 open Xstd
 open PreTypes

-let string_of_token token =
- let lemma,cat,interp = match token.token with
- Lemma(a,b,c) -> a,b,if c = [[]]
+let alternative_string f mode alts = if List.exists (fun (m,_) -> mode = m) alts
+ then f mode (snd @@ List.find (fun (m,_) -> m = mode) alts)
+ else f mode (snd @@ List.find (fun (m,_) -> m = Struct) alts)
+
+let string_of_token mode token = match mode with
+ | Raw -> token.orth
+ | Struct -> failwith ("function string_of_token for mode Struct is not defined")
+ | CONLL -> let lemma,cat,interp = match token.token with
+ | Lemma(a,b,c) -> a,b,if c = [[]]
 then "_"
 else String.concat "][" @@ Xlist.map c (fun x ->
 String.concat "|" @@ Xlist.map x ( fun y ->
 String.concat "." y))
- | _ -> failwith ("string_of_token: not Lemma") in
- String.concat "\t" [string_of_int token.id;
- token.orth; lemma; cat; cat; interp; "_"; "_"; "_"; "_"]
-
-let string_of_sentence sentence =
- let rec pom = function
- RawSentence text -> failwith ("string_of_sentence: " ^ text)
- | StructSentence (tokens, n) -> String.concat "\n" @@ List.map (fun x -> string_of_token x) tokens
+ | _ -> failwith ("string_of_token: not Lemma") in
+ String.concat "\t" [string_of_int token.id;
+ token.orth; lemma; cat; cat; interp; "_"; "_";
+ string_of_int token.beg; string_of_int token.len]
+
+let rec string_of_sentence mode = function
+ RawSentence s -> if mode = Raw then s else ""
+ | StructSentence (tokens, _) -> String.concat "\n" @@ Xlist.map tokens (fun x -> string_of_token mode x)
 | ORSentence (_,_,_,_) -> failwith ("string_of_sentence: ORSentence")
- | AltSentence alts -> if List.exists (fun (mode, s) -> mode = CONLL) alts
- then pom (snd (List.find (fun (mode, s) -> mode = CONLL) alts))
- else failwith ("string_of_sentence: no CONLL mode in AltSentence") in
- (if sentence.pid = ""
- then ""
- else sentence.pid ^ "\n") ^ (pom sentence.psentence)
+ | AltSentence alts -> alternative_string string_of_sentence mode alts

-(******************)
+let string_of_p_record mode p_record =
+ (if p_record.pid = "" then "" else p_record.pid ^ "\n") ^
+ string_of_sentence mode p_record.psentence

-exception Empty_line
-exception Empty_sentence
-exception Id_line of string
+let rec string_of_paragraph mode = function
+ RawParagraph s -> if mode = Raw then s else ""
+ | StructParagraph (p_records, _) -> String.concat "\n\n" @@ Xlist.map p_records (string_of_p_record mode)
+ | AltParagraph alts -> alternative_string string_of_paragraph mode alts

-let load_token stream =
- let line = input_line stream in
- if line = ""
- then raise Empty_line
- else if line.[0] = '#'
- then
- if Xstring.check_prefix "# trees/" line && Xstring.check_sufix ".xml.trees" line
- then let id = Xstring.cut_prefix "# trees/" @@ Xstring.cut_sufix ".xml.trees" line in
- raise (Id_line id)
- else failwith ("load_token: " ^ line)
- else
- match Xstring.split "\t" line with
- [id; orth; lemma; cat; cat2; interp; super; label; "_"; "_"] ->
- (*if cat <> cat2
- then failwith ("load_token: " ^ line)
- else *)
- let interp = if interp = "_"
- then [[]]
- else [Xlist.map (Xstring.split_delim "|" interp) (fun tag -> [tag])] in
- {empty_token with id = int_of_string id; orth = orth; token = Lemma(lemma,cat,interp)}
- | _ -> failwith ("load_token: " ^ line)
-(* {c_id = List.nth pom 1;
- c_lemma = List.nth pom 2;
- c_cat = List.nth pom 3;
- c_interp = (let interp = List.nth pom 5 in
- if interp = "_"
- then []
- else Str.split (Str.regexp "|") interp);
- c_super = -1; c_label = ""; c_beg = -1; c_len = -1} *)
+let rec string_of_text mode = function
+ RawText s -> if mode = Raw then s else ""
+ | StructText paragraphs -> String.concat "\n\n" @@ Xlist.map paragraphs (string_of_paragraph mode)
+ | AltText alts -> alternative_string string_of_text mode alts

-let load_sentence stream =
- let rec pom rev_tokens id =
- try
- let token = load_token stream in
- pom (token :: rev_tokens) id
- with Id_line new_id -> pom rev_tokens new_id
- | Empty_line -> rev_tokens, id
- | End_of_file -> if rev_tokens = []
- then raise End_of_file
- else rev_tokens, id in
- let rev_tokens, id = pom [] "" in
- {pid = id; pbeg = -1; plen = -1; psentence = StructSentence(List.rev rev_tokens,-1)}
-(* {s_id = id; s_text = ""; s_tokens = (List.rev rev_tokens)} *)
-
-let load_corpus stream =
- let rec pom res =
- try
- let conll_sentence = load_sentence stream in
- pom (conll_sentence :: res)
- with e -> print_endline (Printexc.to_string e); res in
- pom []

 (******************)

@@ -121,7 +79,7 @@ let add_to_map map info_str =
 let info_map =
 Xlist.fold info StringMap.empty add_to_map

-let match_sentence sentence =
+let match_sentence p_record =
 let rec info_token s = match s with
 RawSentence text -> failwith ("match_sentence: " ^ text)
 | StructSentence (tokens, n) -> String.concat " " @@ List.map (fun x -> x.orth) tokens
@@ -129,14 +87,79 @@ let match_sentence sentence =
 | AltSentence alts -> if List.exists (fun (mode, s) -> mode = CONLL) alts
 then info_token (snd (List.find (fun (mode, s) -> mode = CONLL) alts))
 else failwith ("match_sentence: no CONLL mode in AltSentence") in
- let info_token = info_token sentence.psentence in
- (* try *)
+ let info_token = info_token p_record.psentence in
+ try
 let id, text = StringMap.find info_map info_token in
- let pbeg, plen, n_sentence = establish_lengths text sentence.psentence (* -1, -1, sentence.psentence *) in
- AltText[Raw,RawText text;CONLL,StructText([StructParagraph([{pid = sentence.pid; pbeg = pbeg; plen = plen;
+ let beg, len, n_sentence = establish_lengths text p_record.psentence (* -1, -1, p_record.psentence *) in
+ AltText[Raw,RawText text;CONLL,StructText([StructParagraph([{pid = id; pbeg = beg; plen = len;
 psentence = AltSentence[Raw, RawSentence text; CONLL, n_sentence]}],-1)])]
 (* {s_id = id; s_text = text; s_tokens = sentence.s_tokens} *)
- (* with _ -> sentence *)
+ with _ -> StructText([StructParagraph([p_record],-1)])

 let match_corpus corpus =
 Xlist.map corpus match_sentence
+
+(******************)
+
+exception Empty_line
+exception Empty_sentence
+exception Id_line of string
+
+let load_token in_channel =
+ let fail line =
+ (* failwith ("load_token: " ^ line) *)
+ () in
+ let n_token id orth lemma cat interp =
+ let interp = if interp = "_"
+ then [[]]
+ else [Xlist.map (Xstring.split_delim "|" interp) (fun tag -> [tag])] in
+ {empty_token with id = int_of_string id; orth = orth; token = Lemma(lemma,cat,interp)} in
+ let line = input_line in_channel in
+ if line = ""
+ then raise Empty_line
+ else if line.[0] = '#'
+ then
+ if Xstring.check_prefix "# trees/" line && Xstring.check_sufix ".xml.trees" line
+ then let id = Xstring.cut_prefix "# trees/" @@ Xstring.cut_sufix ".xml.trees" line in
+ raise (Id_line id)
+ else failwith ("load_token: " ^ line)
+ else
+ match Xstring.split "\t" line with
+ [id; orth; lemma; cat; cat2; interp; super; label; "_"; "_"] ->
+ (if cat <> cat2 then fail line; n_token id orth lemma cat interp)
+ | id :: orth :: lemma :: cat :: cat2 :: interp :: e ->
+ (fail line; n_token id orth lemma cat interp)
+ | _ -> failwith ("load_token: " ^ line)
+(* {c_id = List.nth pom 1;
+ c_lemma = List.nth pom 2;
+ c_cat = List.nth pom 3;
+ c_interp = (let interp = List.nth pom 5 in
+ if interp = "_"
+ then []
+ else Str.split (Str.regexp "|") interp);
+ c_super = -1; c_label = ""; c_beg = -1; c_len = -1} *)
+
+let load_sentence in_channel =
+ let rec pom rev_tokens id =
+ try
+ let token = load_token in_channel in
+ pom (token :: rev_tokens) id
+ with Id_line new_id -> pom rev_tokens new_id
+ | Empty_line -> rev_tokens, id
+ | End_of_file -> if rev_tokens = []
+ then raise End_of_file
+ else rev_tokens, id in
+ let rev_tokens, id = pom [] "" in
+ {pid = id; pbeg = -1; plen = -1; psentence = StructSentence(List.rev rev_tokens,-1)}
+(* {s_id = id; s_text = ""; s_tokens = (List.rev rev_tokens)} *)
+
+let load_corpus in_channel =
+ let rec pom res =
+ try
+ let conll_sentence = load_sentence in_channel in
+ pom (conll_sentence :: res)
+ with End_of_file -> res in
+ (* match_corpus @@ *) List.rev @@ pom []
+
+
+
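A minimal usage sketch of the reworked CONLL.ml interface (not part of the commit): it assumes the mode constructors Raw/Struct/CONLL and the p_record fields used above come from PreTypes, and that the CoNLL data can be read from a plain input channel; the name print_conll_file and its path argument are hypothetical.

let print_conll_file path =
  let ic = open_in path in
  (* load_corpus returns the p_record list in file order (it reverses its accumulator) *)
  let corpus = load_corpus ic in
  close_in ic;
  (* re-serialize each sentence; CONLL mode prints one tab-separated token per line,
     prefixed by the sentence pid when it is non-empty *)
  List.iter (fun p_record -> print_endline (string_of_p_record CONLL p_record)) corpus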
parser/exec.ml
@@ -142,8 +142,8 @@ let process_query ic oc timeout test_only_flag id full_query max_n =
 let time2 = time_fun () in
 let result = {result with pre_time1=pre_time1; pre_time2=time2 -. time1;
 paths_size=let _,_,next_id = paths in next_id-1} in
- if msg <> "" then {result with status=PreprocessingError; msg=msg} else
- try
+ (*if msg <> "" then*) {result with status=PreprocessingError; msg=msg} (*else*)
+ (*try
 let graph = LCGlexicon.create query paths in
 let graph,references,next_reference = LCGchart.lazify graph in
 let time3 = time_fun () in
@@ -222,7 +222,7 @@ let process_query ic oc timeout test_only_flag id full_query max_n =
 {result with status=ParseError; msg=Printexc.to_string e; parse_time=time4 -. time3}
 with e ->
 let time3 = time_fun () in
- {result with status=LexiconError; msg=Printexc.to_string e; lex_time=time3 -. time2}
+ {result with status=LexiconError; msg=Printexc.to_string e; lex_time=time3 -. time2}*)

 let print_result file result =
 Printf.fprintf file "query: %s\n" result.query;
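A hedged sketch of how a caller might inspect the returned result after this change (not from the repository): with the parsing stage commented out, process_query always returns status=PreprocessingError, so only the first branch below would fire. The report name is hypothetical; the status constructors and the msg/query fields are taken from the diff above.

let report result = match result.status with
  | PreprocessingError -> Printf.printf "preprocessing failed for %s: %s\n" result.query result.msg
  | LexiconError | ParseError -> Printf.printf "parsing failed for %s: %s\n" result.query result.msg
  | _ -> Printf.printf "query %s processed\n" result.query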