Commit 0b0d4af3a9070341c24d391ddcd082f4cf5e15c9
Merge branch 'dep_trees' into integration
Showing
13 changed files
with
586 additions
and
662 deletions
LCGlexicon/resources/lexicon-pl.dic
... | ... | @@ -7,7 +7,7 @@ |
7 | 7 | month-lex month-interval year-interval roman roman-interval |
8 | 8 | hour-minute-interval hour-interval obj-id match-result |
9 | 9 | url email day-month day year date hour hour-minute |
10 | - się nie by s <root> or or2 <colon> <speaker> <speaker-end> <squery> | |
10 | + się nie by s <root> <conll_root> or or2 <colon> <speaker> <speaker-end> <squery> | |
11 | 11 | |
12 | 12 | @WEIGHTS |
13 | 13 | symbol_weight=1 |
... | ... | @@ -272,6 +272,8 @@ pos=unk: np*number*case*gender*person; |
272 | 272 | # [LCGrenderer.make_frame false tokens lex_sems [] schema_list ["<conll_root>"] d batrs] |
273 | 273 | # | lemma,c,l -> failwith ("process_interp: " ^ lemma ^ ":" ^ c ^ ":" ^ (String.concat ":" (Xlist.map l (String.concat ".")))) in |
274 | 274 | |
275 | +lemma=<conll_root>,pos=interp: <conll_root>/(ip*T*T*T+cp*int*T+np*sg*voc*T*T+interj); | |
276 | + | |
275 | 277 | pos=sinterj: BRACKET interj; |
276 | 278 | |
277 | 279 | lemma=</sentence>,pos=interp: BRACKET s\?(ip*T*T*T+cp*int*T+np*sg*voc*T*T+interj); |
... | ... |
LCGparser/ENIAM_LCGrules.ml
... | ... | @@ -446,8 +446,8 @@ let backward_cross_composition references args functs = |
446 | 446 | let rules = [ |
447 | 447 | backward_application; |
448 | 448 | forward_application; |
449 | - backward_cross_composition; | |
450 | - forward_cross_composition; | |
449 | + (* backward_cross_composition; *) | |
450 | + (* forward_cross_composition; *) | |
451 | 451 | ] |
452 | 452 | |
453 | 453 | let rec flatten_functor2 l seml = function |
... | ... |
corpora/CONLL.ml
... | ... | @@ -3,133 +3,55 @@ open ENIAMsubsyntaxTypes |
3 | 3 | open ENIAMtokenizerTypes |
4 | 4 | |
5 | 5 | let alternative_string f mode alts = if List.exists (fun (m,_) -> mode = m) alts |
6 | - then f (snd @@ List.find (fun (m,_) -> m = mode) alts) | |
7 | - else f (snd @@ List.find (fun (m,_) -> m = Struct) alts) | |
8 | - | |
9 | -let string_of_token mode token conll_id super label = | |
10 | - let decompose_lemma = function | |
11 | - | Lemma(a,b,c) -> a,b,if c = [[]] | |
12 | - then "_" | |
13 | - else String.concat "][" @@ Xlist.map c (fun x -> | |
14 | - String.concat "|" @@ Xlist.map x ( fun y -> | |
15 | - String.concat "." y)) | |
16 | - | t -> failwith ("string_of_token: not Lemma") in | |
17 | - match mode with | |
18 | - | Raw -> token.orth | |
19 | - | Struct -> failwith ("function string_of_token for mode Struct is not defined") | |
20 | - | CONLL -> let lemma,cat,interp = decompose_lemma token.token in | |
21 | - String.concat "\t" [string_of_int conll_id; | |
22 | - token.orth; lemma; cat; cat; interp; "_"; "_"; | |
23 | - string_of_int token.beg; string_of_int token.len] | |
24 | - | Mate -> let lemma,cat,interp = decompose_lemma token.token in | |
25 | - String.concat "\t" [string_of_int conll_id; | |
26 | - token.orth; lemma; lemma; cat; cat; interp; interp; "_"; "_"; "_"; "_"; "_"; "_"] | |
27 | - | _ -> failwith "string_of_token: ni" | |
28 | - | |
29 | -let string_of_paths mode tokens paths = | |
30 | - let l = Int.fold 1 (Array.length paths - 1) [] (fun l conll_id -> | |
31 | - let id,super,label = paths.(conll_id) in | |
32 | - (string_of_token mode (ExtArray.get tokens id) conll_id super label) :: l) in | |
33 | - String.concat "\n" (List.rev l) ^ "\n\n" | |
34 | - | |
35 | -let rec string_of_sentence mode tokens = function | |
36 | - RawSentence s -> if mode = Raw then s else "" | |
37 | - | StructSentence (tokens, _) -> failwith ("string_of_sentence: StructSentence") (*String.concat "\n" @@ Xlist.map tokens (fun x -> string_of_token mode x)*) | |
38 | - | DepSentence (paths) -> string_of_paths mode tokens paths | |
39 | - | QuotedSentences _ -> failwith ("string_of_sentence: QuotedSentences") | |
40 | - | AltSentence alts -> alternative_string (string_of_sentence mode tokens) mode alts | |
41 | - | |
42 | -let string_of_p_record mode tokens p_record = | |
43 | - (if p_record.id = "" then "" else p_record.id ^ "\n") ^ | |
44 | - string_of_sentence mode tokens p_record.sentence | |
45 | - | |
46 | -(*let rec string_of_paragraph mode tokens = function | |
47 | - RawParagraph s -> if mode = Raw then s else "" | |
48 | - | StructParagraph p_records -> String.concat "\n\n" @@ Xlist.map p_records (string_of_p_record mode tokens) | |
49 | - | AltParagraph alts -> alternative_string (string_of_paragraph mode) mode alts | |
50 | - | |
51 | -let rec string_of_text mode tokens = function | |
52 | - RawText s -> if mode = Raw then s else "" | |
53 | - | StructText (paragraphs,_) -> String.concat "\n\n" @@ Xlist.map paragraphs (string_of_paragraph mode tokens) | |
54 | - | AltText alts -> alternative_string (string_of_text mode) mode alts*) | |
55 | - | |
56 | - | |
57 | -(******************) | |
58 | -(*** | |
59 | -let establish_next tokens paths = | |
60 | - let n = ExtArray.size tokens in | |
61 | - Int.iter 1 (n - 2) (fun i -> | |
62 | - let f = ExtArray.get tokens i in | |
63 | - let s = ExtArray.get tokens (i+1) in | |
64 | - ExtArray.set tokens i {f with next = s.beg}); | |
65 | - let last = ExtArray.get tokens (n-1) in | |
66 | - ExtArray.set tokens (n-1) {last with next = last.beg + last.len} | |
67 | - | |
68 | - | |
69 | - (*let rec pom res = function | |
70 | - h :: t -> let next = if res = [] | |
71 | - then h.beg+h.len | |
72 | - else (List.hd res).beg in | |
73 | - pom ({h with next = next} :: res) t | |
74 | - | [] -> res in | |
75 | - pom [] rev_tokens*) | |
76 | - | |
77 | -let rec establish_for_token i text tokens = function | |
78 | - (id,_,_) :: t as l-> | |
79 | - let h = ExtArray.get tokens id in | |
80 | - if Xstring.check_prefix " " text | |
81 | - then establish_for_token (i+100) (Xstring.cut_prefix " " text) tokens l | |
82 | - else if Xstring.check_prefix h.orth text | |
83 | - then | |
84 | - let n = (List.length @@ Xunicode.utf8_chars_of_utf8_string h.orth) * 100 in | |
85 | - let n_h = {h with beg = i ; len = n} in | |
86 | - ExtArray.set tokens id n_h; | |
87 | - establish_for_token (i+n) (Xstring.cut_prefix h.orth text) tokens t | |
88 | - else failwith ("establish_for_token :" ^ h.orth ^ " " ^ text) | |
89 | - | [] -> 100, i | |
90 | - | |
91 | -let rec establish_lengths text paths tokens = | |
92 | - let pbeg, plen = establish_for_token 100 text tokens (List.tl (Array.to_list paths)) in | |
93 | - establish_next tokens paths; | |
94 | - pbeg, plen-100 | |
95 | - | |
96 | -(******************) | |
97 | - | |
98 | -exception ErrorInfoFile of string | |
99 | - | |
100 | -let info_file = "../corpora/info_sentences.txt" | |
101 | - | |
102 | -let info = Xstring.split "\n\n" @@ File.load_file_gen info_file | |
103 | - | |
104 | -let add_to_map map info_str = | |
105 | - match Xstring.split "\n" info_str with | |
106 | - [id; text; info_token] -> StringMap.add map info_token (id, text) | |
107 | - | _ -> raise (ErrorInfoFile info_str) | |
108 | - | |
109 | -let info_map = | |
110 | - Xlist.fold info StringMap.empty add_to_map | |
111 | - | |
112 | -let match_sentence (p_record,tokens) = | |
113 | - let rec info_token s = match s with | |
114 | - RawSentence text -> failwith ("match_sentence: " ^ text) | |
115 | - | StructSentence (tokens, n) -> failwith ("match_sentence: StructSentence") (*String.concat " " @@ List.map (fun x -> x.orth) tokens*) | |
116 | - | DepSentence (paths) -> String.concat " " @@ List.map (fun (id,_,_) -> (ExtArray.get tokens id).orth) (List.tl (Array.to_list paths)), paths | |
117 | - | QuotedSentences _ -> failwith ("match_sentence: QuotedSentences") | |
118 | - | AltSentence alts -> failwith ("match_sentence: AltSentence") | |
119 | - (*if List.exists (fun (mode, s) -> mode = CONLL) alts | |
120 | - then info_token (snd (List.find (fun (mode, s) -> mode = CONLL) alts)) | |
121 | - else failwith ("match_sentence: no CONLL mode in AltSentence")*) in | |
122 | - let info_token, paths = info_token p_record.psentence in | |
123 | - try | |
124 | - let id, text = StringMap.find info_map info_token in | |
125 | - let beg, len = establish_lengths text paths tokens (* -1, -1, p_record.psentence *) in | |
126 | - AltText[Raw,RawText text;CONLL,StructText([StructParagraph[{pid = id; pbeg = beg; plen = len; pnext = beg+len; pfile_prefix=""; | |
127 | - psentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence paths]}]],tokens)] | |
128 | -(* {s_id = id; s_text = text; s_tokens = sentence.s_tokens} *) | |
129 | - with _ -> AltText[CONLL,StructText([StructParagraph[p_record]],tokens)] | |
130 | - | |
131 | -let match_corpus corpus = | |
132 | - Xlist.map corpus match_sentence***) | |
6 | + then f (snd @@ List.find (fun (m,_) -> m = mode) alts) | |
7 | + else f (snd @@ List.find (fun (m,_) -> m = Struct) alts) | |
8 | + | |
(** [string_of_token mode token conll_id super label] renders one token as a
    line of text in the format selected by [mode]:
    - [Raw]: the token's orthographic form only;
    - [CONLL]: ten tab-separated columns (id, orth, lemma, cat, cat, interp,
      "_", "_", beg, len);
    - [Mate]: fourteen tab-separated columns with lemma/cat/interp doubled
      and the remaining columns filled with "_".
    [super] and [label] are accepted but not used in the output.
    @raise Failure for [Struct], for any other mode, and (in [CONLL]/[Mate]
    mode) when [token.token] is not a [Lemma]. *)
let string_of_token mode token conll_id super label =
  (* Interpretations are joined with "][", alternatives inside one
     interpretation with "|", and the tags of one alternative with ".".
     The single empty interpretation [[[]]] is printed as "_". *)
  let render_interp interp =
    if interp = [[]] then "_"
    else
      String.concat "][" @@ Xlist.map interp (fun alternative ->
        String.concat "|" @@ Xlist.map alternative (fun tag ->
          String.concat "." tag)) in
  let decompose_lemma = function
    | Lemma (lemma, cat, interp) -> lemma, cat, render_interp interp
    | _ -> failwith ("string_of_token: not Lemma") in
  match mode with
  | Raw -> token.orth
  | Struct -> failwith ("function string_of_token for mode Struct is not defined")
  | CONLL ->
    let lemma, cat, interp = decompose_lemma token.token in
    String.concat "\t"
      [string_of_int conll_id; token.orth; lemma; cat; cat; interp; "_"; "_";
       string_of_int token.beg; string_of_int token.len]
  | Mate ->
    let lemma, cat, interp = decompose_lemma token.token in
    String.concat "\t"
      [string_of_int conll_id; token.orth; lemma; lemma; cat; cat;
       interp; interp; "_"; "_"; "_"; "_"; "_"; "_"]
  | _ -> failwith "string_of_token: ni"
28 | + | |
(** [string_of_paths mode tokens paths] prints every entry of [paths] except
    the one at index 0, one line per entry via [string_of_token], in index
    order. The lines are joined with "\n" and the result ends with a blank
    line ("\n\n"). Each entry is [(id, super, label)], where [id] is used to
    fetch the token from [tokens]. *)
let string_of_paths mode tokens paths =
  let render_row conll_id =
    let id, super, label = paths.(conll_id) in
    string_of_token mode (ExtArray.get tokens id) conll_id super label in
  (* Rows are accumulated in reverse and flipped once at the end. *)
  let rev_rows =
    Int.fold 1 (Array.length paths - 1) [] (fun acc conll_id ->
      render_row conll_id :: acc) in
  String.concat "\n" (List.rev rev_rows) ^ "\n\n"
34 | + | |
(** [string_of_sentence mode tokens sentence] renders [sentence] in [mode].
    - [RawSentence s] yields [s] in [Raw] mode and "" otherwise;
    - [DepSentence paths] is printed with [string_of_paths];
    - [AltSentence alts] delegates to [alternative_string], which picks the
      alternative to print;
    - [StructSentence] and [QuotedSentences] are not supported.
    @raise Failure for [StructSentence] and [QuotedSentences]. *)
let rec string_of_sentence mode tokens sentence =
  match sentence with
  | RawSentence s -> if mode = Raw then s else ""
  | StructSentence (_, _) -> failwith ("string_of_sentence: StructSentence")
  | DepSentence paths -> string_of_paths mode tokens paths
  | QuotedSentences _ -> failwith ("string_of_sentence: QuotedSentences")
  | AltSentence alts ->
    alternative_string (string_of_sentence mode tokens) mode alts
41 | + | |
(** [string_of_p_record mode tokens p_record] prints the record's sentence
    with [string_of_sentence], preceded by the record id on its own line
    when the id is not empty. *)
let string_of_p_record mode tokens p_record =
  let header =
    if p_record.id = "" then "" else p_record.id ^ "\n" in
  header ^ string_of_sentence mode tokens p_record.sentence
45 | + | |
46 | +(*let rec string_of_paragraph mode tokens = function | |
47 | + RawParagraph s -> if mode = Raw then s else "" | |
48 | + | StructParagraph p_records -> String.concat "\n\n" @@ Xlist.map p_records (string_of_p_record mode tokens) | |
49 | + | AltParagraph alts -> alternative_string (string_of_paragraph mode) mode alts | |
50 | + | |
51 | +let rec string_of_text mode tokens = function | |
52 | + RawText s -> if mode = Raw then s else "" | |
53 | + | StructText (paragraphs,_) -> String.concat "\n\n" @@ Xlist.map paragraphs (string_of_paragraph mode tokens) | |
54 | + | AltText alts -> alternative_string (string_of_text mode) mode alts*) | |
133 | 55 | |
134 | 56 | (******************) |
135 | 57 | |
... | ... | @@ -207,15 +129,6 @@ let establish_next tokens paths = |
207 | 129 | let last = ExtArray.get tokens (n-1) in |
208 | 130 | ExtArray.set tokens (n-1) {last with next = last.beg + last.len} |
209 | 131 | |
210 | - | |
211 | - (*let rec pom res = function | |
212 | - h :: t -> let next = if res = [] | |
213 | - then h.beg+h.len | |
214 | - else (List.hd res).beg in | |
215 | - pom ({h with next = next} :: res) t | |
216 | - | [] -> res in | |
217 | - pom [] rev_tokens*) | |
218 | - | |
219 | 132 | let rec establish_for_token i text tokens = function |
220 | 133 | (id,_,_) :: t as l-> |
221 | 134 | let h = ExtArray.get tokens id in |
... | ... | @@ -245,15 +158,15 @@ exception ErrorInfoFile of string |
245 | 158 | |
246 | 159 | let info_file = "../corpora/info_sentences2.txt" |
247 | 160 | |
248 | -let info = Xstring.split "\n\n" @@ File.load_file_gen info_file | |
(** [info ()] loads [info_file] and splits its contents on blank lines
    ("\n\n"). The file is read on every call, so nothing is loaded at module
    initialisation time. *)
let info () =
  let contents = File.load_file_gen info_file in
  Xstring.split "\n\n" contents
249 | 162 | |
(** [add_to_map map info_str] parses one three-line record
    (id, text, info_token) and binds [info_token] to [(id, text)] in [map].
    @raise ErrorInfoFile carrying [info_str] when the record does not consist
    of exactly three lines. *)
let add_to_map map info_str =
  let fields = Xstring.split "\n" info_str in
  match fields with
  | [id; text; info_token] -> StringMap.add map info_token (id, text)
  | _ -> raise (ErrorInfoFile info_str)
254 | 167 | |
255 | -let info_map = | |
256 | - Xlist.fold (List.tl info) StringMap.empty add_to_map | |
(** [info_map ()] returns the map from info-token strings to [(id, text)]
    built from the records returned by [info ()], skipping the first record.

    The map is built on the first successful call and cached: callers invoke
    this once per sentence, and rebuilding it each time would re-read and
    re-parse the whole info file. A failed build is not cached, so a later
    call tries again.
    @raise Failure when [info ()] returns no records ([List.tl] on an empty
    list).
    @raise ErrorInfoFile when a record is malformed (see [add_to_map]). *)
let info_map =
  let cache = ref None in
  fun () ->
    match !cache with
    | Some map -> map
    | None ->
      let map = Xlist.fold (List.tl (info ())) StringMap.empty add_to_map in
      cache := Some map;
      map
257 | 170 | |
258 | 171 | let match_sentence (p_record,tokens) = |
259 | 172 | let rec info_token s = match s with |
... | ... | @@ -268,7 +181,7 @@ let match_sentence (p_record,tokens) = |
268 | 181 | let info_token, paths = info_token p_record.sentence in |
269 | 182 | (* try *) |
270 | 183 | let id, text = try |
271 | - StringMap.find info_map info_token | |
184 | + StringMap.find (info_map ()) info_token | |
272 | 185 | with |
273 | 186 | | _ -> p_record.id, get_text tokens in |
274 | 187 | let beg, len = establish_lengths text paths tokens (* -1, -1, p_record.psentence *) in |
... | ... | @@ -282,7 +195,7 @@ let match_corpus corpus = |
282 | 195 | [] -> [] |
283 | 196 | | a::l -> try |
284 | 197 | let r = f a in r :: pom f l |
285 | - with e -> (*print_endline (Printexc.to_string e);*) pom f l in | |
198 | + with e -> pom f l in | |
286 | 199 | pom match_sentence corpus |
287 | 200 | |
288 | 201 | (******************) |
... | ... | @@ -304,7 +217,6 @@ let load_token in_channel = |
304 | 217 | else [Xlist.map (Xstring.split_delim "|" interp) (fun tag -> [tag])] in |
305 | 218 | {empty_token_env with orth = orth; token = Lemma(lemma,cat,interp);}, int_of_string id, int_of_super super, label in |
306 | 219 | let line = input_line in_channel in |
307 | - (* print_endline ("load_token: " ^ line); *) | |
308 | 220 | if line = "" |
309 | 221 | then raise Empty_line |
310 | 222 | else if line.[0] = '#' |
... | ... | @@ -329,30 +241,19 @@ let load_token in_channel = |
329 | 241 | let label = Xstring.cut_sufix "_" label_err in |
330 | 242 | n_token id orth lemma cat interp super label) |
331 | 243 | | _ -> failwith ("load_token: " ^ line) |
332 | -(* {c_id = List.nth pom 1; | |
333 | - c_lemma = List.nth pom 2; | |
334 | - c_cat = List.nth pom 3; | |
335 | - c_interp = (let interp = List.nth pom 5 in | |
336 | - if interp = "_" | |
337 | - then [] | |
338 | - else Str.split (Str.regexp "|") interp); | |
339 | - c_super = -1; c_label = ""; c_beg = -1; c_len = -1} *) | |
340 | 244 | |
341 | 245 | let load_sentence in_channel = |
342 | 246 | let tokens = ExtArray.make 100 empty_token_env in |
343 | 247 | let _ = ExtArray.add tokens {empty_token_env with token = Interp "<conll_root>"} in |
344 | 248 | let rec pom rev_paths id = |
345 | - (* print_endline "pom 1"; *) | |
346 | 249 | try |
347 | - (* print_endline "pom 2"; *) | |
348 | 250 | let token, conll_id, super, label = load_token in_channel in |
349 | 251 | let id_a = ExtArray.add tokens token in |
350 | 252 | if id_a <> conll_id then failwith "load_sentence: different ids" else |
351 | - (* print_endline "pom 3"; *) | |
352 | 253 | pom ((id_a,super,label) :: rev_paths) id |
353 | - with Id_line new_id -> (*print_endline "pom 4";*)pom rev_paths new_id | |
354 | - | Empty_line -> (*print_endline "pom 5";*)rev_paths, id | |
355 | - | End_of_file -> (*print_endline "pom 6";*)if rev_paths = [] | |
254 | + with Id_line new_id -> pom rev_paths new_id | |
255 | + | Empty_line -> rev_paths, id | |
256 | + | End_of_file -> if rev_paths = [] | |
356 | 257 | then raise End_of_file |
357 | 258 | else rev_paths, id in |
358 | 259 | let rev_paths, id = pom [] "" in |
... | ... | @@ -366,4 +267,4 @@ let load_corpus in_channel = |
366 | 267 | pom ((conll_sentence, tokens) :: res) |
367 | 268 | with End_of_file -> res |
368 | 269 | | e -> prerr_endline (Printexc.to_string e); res in |
369 | - (* match_corpus @@ *) List.rev @@ pom [] | |
270 | + List.rev @@ pom [] | |
... | ... |
corpora/CONLL_adapter.ml
1 | +open Xstd | |
2 | +open ENIAMsubsyntaxTypes | |
3 | +open ENIAMtokenizerTypes | |
1 | 4 | |
2 | -let convert_dep_tree id first_try paths tokens lex_sems = | |
3 | - let do_if cond f paths = if cond then f paths tokens else paths in | |
(** [if_lemma lemmas token] is [true] when [token] is a [Lemma] whose lemma
    string occurs in [lemmas]; every other kind of token yields [false]. *)
let if_lemma lemmas token =
  match token with
  | Lemma (lemma, _, _) -> List.mem lemma lemmas
  | _ -> false
8 | + | |
(** [if_cat cats token] is [true] when [token] is a [Lemma] whose category
    (second component) occurs in [cats]; every other kind of token yields
    [false]. *)
let if_cat cats token =
  match token with
  | Lemma (_, cat, _) -> List.mem cat cats
  | _ -> false
12 | + | |
(** [if_interps interps token] checks positional tags of [token].

    [interps] is a list of [(nr, value)] pairs. The result is [true] when,
    for every pair, at least one interpretation of [token] lists [value]
    among the alternatives stored at position [nr]. An empty [interps] list
    yields [true]. A position that does not exist in an interpretation
    (too short, or negative [nr]) simply does not match.

    Tokens other than [Lemma] are treated as having the single
    interpretation [[[]]], so no non-empty requirement matches them. *)
let if_interps interps token =
  let interp = match token with
    | Lemma (_, _, i) -> i
    | _ -> [[[]]] in
  (* Only the two exceptions [List.nth] can raise are handled here, instead
     of a catch-all that would also hide unrelated failures. *)
  let alternatives_at tags nr =
    try List.nth tags nr
    with Failure _ | Invalid_argument _ -> [] in
  let has_value nr value =
    List.exists (fun tags -> List.mem value (alternatives_at tags nr)) interp in
  List.for_all (fun (nr, value) -> has_value nr value) interps
25 | + | |
(** [correct_coordination1 paths tokens] re-attaches shared dependents of
    coordinating conjunctions. [paths] is modified in place and returned.

    Each entry of [paths] is [(id, super, label)]: [id] selects the token in
    [tokens] and [super] is an index into [paths].

    For every entry whose token has category "conj" and which has more than
    two children (children are taken from a snapshot of [paths] made on
    entry), each rule of the table below is tried. A rule fires when exactly
    one child matches its left pattern and more than one other child matches
    one of its right patterns. The single left-pattern child is then
    re-attached to the first right-pattern child that lies on the same side
    of the conjunction's own head index as that child.

    Patterns have the form "cat" or "cat:case". The case is looked up at
    interpretation position 0 for "prep" and at position 1 for every other
    category.

    Any exception raised while processing one conjunction is swallowed and
    the remaining rule matches for that conjunction are skipped. *)
let correct_coordination1 paths tokens =
  (* Snapshot taken before any rewriting; every entry carries its index. *)
  let indexed_paths =
    List.mapi (fun i (id,super,label) -> (i,id,super,label)) (Array.to_list paths) in

  (* (left pattern, counter), (right patterns, counter); counters start at 0. *)
  let rules = [
    ("subst:nom",0),(["fin";"praet"],0);
    ("subst:acc",0),(["inf"],0);
    ("ppron3:nom",0),(["fin";"praet"],0);
    ("ppron3:acc",0),(["fin";"praet"],0);
    ("adv",0),(["fin";"praet"],0);
    ("adv",0),(["inf"],0);
    ("adv",0),(["adj"],0);
    ("prep",0),(["fin";"praet"],0);
    ("prep",0),(["inf"],0);
    ("prep",0),(["ppas"],0);
    ("prep",0),(["subst"],0);
    ("prep:gen",0),(["subst:gen"],0);
    ("adj:nom",0),(["fin";"praet"],0);
    ("adj:nom",0),(["subst:nom"],0);
    ("adj:gen",0),(["subst:gen"],0);
    ("adj:dat",0),(["subst:dat"],0);
    ("adj:acc",0),(["subst:acc"],0);
    ("adj:inst",0),(["subst:inst"],0);
    ("adj:loc",0),(["subst:loc"],0);
    ("subst:gen",0),(["subst:nom"],0);
    (* ("subst:gen",0),(["subst:gen"],0); *)
    ("subst:gen",0),(["subst:dat"],0);
    ("subst:gen",0),(["subst:acc"],0);
    ("subst:gen",0),(["subst:inst"],0);
    ("subst:gen",0),(["subst:loc"],0);
    ("ppron3:gen",0),(["subst:nom"],0);
    ("ppron3:gen",0),(["subst:dat"],0);
    ("ppron3:gen",0),(["subst:acc"],0);
    ("ppron3:gen",0),(["subst:inst"],0);
    ("ppron3:gen",0),(["subst:loc"],0);
    ("qub",0),(["fin";"praet"],0);
    ("qub",0),(["subst"],0);
    ("qub",0),(["adj"],0);
    ("pact",0),(["subst"],0);
    ("ppas",0),(["subst"],0)
  ] in

  let find_dependents sons =
    (* Does the token of this indexed entry match a "cat" / "cat:case" pattern? *)
    let matches (_,id,_,_) pattern =
      let token () = (ExtArray.get tokens id).token in
      match Xstring.split ":" pattern with
      | ["prep";case] -> if_cat ["prep"] (token ()) && if_interps [0,case] (token ())
      | [cat;case] -> if_cat [cat] (token ()) && if_interps [1,case] (token ())
      | [cat] -> if_cat [cat] (token ())
      | _ -> failwith "is (in correct_coordination1)" in

    (* Bump, for every rule, the counter of the side this child matches;
       the left pattern takes precedence over the right patterns. *)
    let count_son counters son =
      Xlist.map counters (fun ((one,a),(rest,b)) ->
        if matches son one then (one,a + 1), (rest,b)
        else if List.exists (matches son) rest then (one,a), (rest,b + 1)
        else (one,a), (rest,b)) in

    let counters = Xlist.fold sons rules count_son in
    let selected = List.filter (fun ((_,a),(_,b)) -> a = 1 && b > 1) counters in
    Xlist.map selected (fun ((one,_),(rest,_)) ->
      List.find (fun son -> matches son one) sons,
      List.filter (fun son -> List.exists (fun p -> matches son p) rest) sons) in

  let establish_neighbour super ((i_d,id_d,_,label_d),sons) =
    (* True when the candidate and the dependent are both before or both
       after index [super]. *)
    let same_side (i_s,_,_,_) =
      (super < i_d && super < i_s) ||
      (super > i_d && super > i_s) in
    let (i_n,_,_,_) = List.find same_side sons in
    paths.(i_d) <- (id_d, i_n, label_d) in

  let examine_coords (_,_,super,_) sons =
    try
      let dependents = find_dependents sons in
      Xlist.iter dependents (establish_neighbour super)
    with
    | _ -> () in

  Array.iteri (fun i (id,super,label) ->
    if if_cat ["conj"] (ExtArray.get tokens id).token then begin
      let sons = List.filter (fun (_,_,s,_) -> s = i) indexed_paths in
      if List.length sons > 2 then examine_coords (i,id,super,label) sons
    end) paths;
  paths
114 | + | |
(** [correct_coordination2 paths tokens] rewrites flat coordinations with
    more than two children into nested ones. It works on a copy of [paths]
    and returns that copy; the argument is left untouched.

    An entry counts as a coordinator when its token has category "conj" or
    its orthographic form is ",". For each coordinator, a comma directly
    before it that has no children is first replaced by [(0,-1,"")]. If the
    coordinator has more than two children, they are split into those before
    and those after it, and each side is re-attached step by step (see
    [find_father]).

    NOTE(review): tokens are looked up with the path index [i], not with the
    token id stored in the entry, unlike [correct_coordination1]. The two
    differ once ids have been rewritten (e.g. by [replace_tokens]) — confirm
    this is intended.

    @raise Failure "find_father" when one side of a coordinator has no
    children, or when a child that has to become a nested coordinator is
    neither a "conj" nor a ",". *)
let correct_coordination2 paths tokens =
  let result = Array.copy paths in
  (* Recomputed on every use, so it reflects the rewrites done so far. *)
  let indexed () =
    List.mapi (fun i (id,super,label) -> (i,id,super,label)) (Array.to_list result) in

  let is_coordinator i =
    if_cat ["conj"] (ExtArray.get tokens i).token ||
    (ExtArray.get tokens i).orth = "," in

  (* let ps a sons =
    print_endline a;
    List.iter (fun (i,_,_,_) -> print_endline (ExtArray.get tokens i).orth) sons;
    print_endline "" in *)

  let rec correct_rec (i,_,_,_) sons =
    let left_s, right_s = List.partition (fun (a,_,_,_) -> a < i) sons in
    (* The left side is processed starting from the child closest to [i]. *)
    find_father i (List.rev left_s);
    find_father i right_s

  and find_father i0 = function
    | [(i,id,_,label)] -> result.(i) <- (id,i0,label)
    | first :: (i,id,super,label) :: rest ->
      (* The second child is attached to [i0] and becomes the coordinator
         for the first child and the remaining ones. *)
      result.(i) <- (id,i0,label);
      if not (is_coordinator i) then failwith "find_father";
      let (a,_,_,_) = first in
      correct_rec (i,id,super,label)
        (if a < i then first :: rest else List.rev (first :: rest))
    | _ -> failwith "find_father" in

  let check_previous_for_interp i =
    if i >= 0 && (ExtArray.get tokens i).orth = "," &&
       not (List.exists (fun (_,super,_) -> super = i) (Array.to_list result))
    then result.(i) <- (0,-1,"") in

  Array.iteri (fun i (id,super,label) ->
    if is_coordinator i then begin
      check_previous_for_interp (i-1);
      let sons = List.filter (fun (_,_,s,_) -> s = i) (indexed ()) in
      if List.length sons > 2 then correct_rec (i,id,super,label) sons
    end) result;
  result
158 | + | |
(** [praet_qub_aglt paths tokens] re-attaches an "aglt" token whose head is
    the word "by": when the head of that "by" is a "praet" token, the "aglt"
    is attached directly to it. [paths] is modified in place and returned.
    Entries with a negative head index are skipped. *)
let praet_qub_aglt paths tokens =
  let reattach i (id,super,label) =
    if super >= 0 then begin
      let id_s, super_s, _ = paths.(super) in
      if if_cat ["aglt"] (ExtArray.get tokens id).token &&
         (ExtArray.get tokens id_s).orth = "by"
      then begin
        (* [super_s] is the head of "by", i.e. the grandparent of entry [i]. *)
        let id_gf, _, _ = paths.(super_s) in
        if if_cat ["praet"] (ExtArray.get tokens id_gf).token
        then paths.(i) <- (id,super_s,label)
      end
    end in
  Array.iteri reattach paths;
  paths
169 | + | |
(** [replace_tokens paths tokens] points some entries of [paths] at other
    tokens of [tokens], looked up by orthographic form. [paths] is modified
    in place and returned.

    - A "." whose head and head's head are both "." is treated as a
      three-dot sequence: the topmost entry is pointed at the token spelled
      "..." and the two lower ones are replaced by [(0,-1,"")].
    - A "brev" token is pointed at the token spelled like it with a trailing
      "." appended. No dot is appended when position 0 of its interpretation
      is "npun", or when its "."/"..." child is the last token of the
      sentence (no entry's token starts after it).

    Tokens are found with [find_token], which returns the highest index with
    the requested spelling, or 0 when there is none.

    NOTE(review): for a "brev" that is not "npun" and has no "."/"..." child,
    [List.find] raises [Not_found], which propagates to the caller — confirm
    this is intended. *)
let replace_tokens paths tokens =
(* for i = 0 to ExtArray.size tokens - 1 do
 print_endline (string_of_int i ^ ": "^ (ExtArray.get tokens i).orth)
done; *)
  let orth_of id = (ExtArray.get tokens id).orth in
  let find_token orth =
    Int.fold 0 (ExtArray.size tokens - 1) 0 (fun found i ->
      if orth_of i = orth then i else found) in

  let multidot i _ super0 _ =
    let id1, super1, _ = paths.(super0) in
    if super1 >= 0 then begin
      let id2, super2, label2 = paths.(super1) in
      if orth_of id1 = "." && orth_of id2 = "." then begin
        paths.(super1) <- (find_token "...", super2, label2);
        paths.(super0) <- (0,-1,"");
        paths.(i) <- (0,-1,"")
      end
    end in

  let brev i id super label =
    let if_the_last_dot () =
      let (id_dot, _, _) = List.find (fun (i2,s,_) ->
        s = i && (orth_of i2 = "." || orth_of i2 = "...")) (Array.to_list paths) in
      let dot_beg = (ExtArray.get tokens id_dot).beg in
      Array.fold_left (fun acc (i2,_,_) ->
        acc && (ExtArray.get tokens i2).beg <= dot_beg) true paths in

    let dot =
      if if_interps [0,"npun"] (ExtArray.get tokens id).token || if_the_last_dot ()
      then ""
      else "." in

    let n_orth = orth_of id ^ dot in
    paths.(i) <- (find_token n_orth,super,label) in

  Array.iteri (fun i (id,super,label) ->
    if orth_of id = "." then multidot i id super label;
    if if_cat ["brev"] (ExtArray.get tokens id).token then brev i id super label)
    paths;
  paths
209 | + | |
(** [replace_hyphens paths tokens] rewrites sentences that contain "-"
    tokens so that they use the "<or-sentence>", "</or-sentence>" and
    "</sentence>" interpunction tokens of [tokens] (presumably dash-marked
    direct speech — confirm with the grammar).

    Nothing is changed unless exactly one non-"interp" entry has head 0.
    Two layouts are handled:
    - one hyphen, at index 1: see [one_hyphen];
    - two hyphens, the first at index 1, with a dependency structure accepted
      by [is_dep_correct]: see [two_hyphens].

    The returned array is [paths] itself, modified in place, unless an entry
    had to be appended; in that case a new, longer array is returned and
    [paths] keeps the modifications made before the append.

    [find_token] and [find_specific_token] return the highest matching token
    index, or 0 when nothing matches. *)
let replace_hyphens paths tokens =
  let ref_paths = ref paths in
  let token_at i = ExtArray.get tokens i in
  let find_token token =
    Int.fold 0 (ExtArray.size tokens - 1) 0 (fun found i ->
      if (token_at i).token = token then i else found) in
  let find_specific_token token beg next =
    Int.fold 0 (ExtArray.size tokens - 1) 0 (fun found i ->
      if (token_at i).token = token &&
         beg <= (token_at i).beg &&
         (token_at i).next <= next
      then i else found) in

  (* Make the last entry a "</sentence>" token (replacing a final "." or
     appending a new entry) and attach the roots to it.
     TODO: sons instead of a single son *)
  let correct_last sons_of_zero =
    let i1,_,l1 = (!ref_paths).(Array.length !ref_paths - 1) in
    if (token_at i1).orth = "."
    then
      (!ref_paths).(Array.length !ref_paths - 1) <- (find_token (Interp "</sentence>"),1,l1)
    else begin
      ref_paths := Array.append !ref_paths [| (find_token (Interp "</sentence>"),1,"-") |];
      (!ref_paths).(Array.length !ref_paths - 2) <- (i1,Array.length !ref_paths - 1,l1)
    end;
    Xlist.iter sons_of_zero (fun son_of_zero ->
      let i2,_,l2 = (!ref_paths).(son_of_zero) in
      (!ref_paths).(son_of_zero) <- (i2,Array.length !ref_paths - 1,l2)) in

  (* Single leading hyphen: entry 1 becomes "<or-sentence>" with head 0. *)
  let one_hyphen sons_of_zero =
    let _,_,l2 = (!ref_paths).(1) in
    Xlist.iter sons_of_zero (fun son_of_zero ->
      let i1,_,l1 = (!ref_paths).(son_of_zero) in
      (!ref_paths).(son_of_zero) <- (i1,1,l1));
    (!ref_paths).(1) <- (find_token (Interp "<or-sentence>"),0,l2);
    correct_last sons_of_zero in

  (* Two hyphens: [first] becomes "<or-sentence>", [second] the
     "</or-sentence>" token located inside the span of the second hyphen,
     and a closing "</sentence>" entry is appended. All reads happen before
     the writes. *)
  let two_hyphens first second son parent =
    let _,_,l1 = (!ref_paths).(first) in
    let i2,_,l2 = (!ref_paths).(second) in
    let beg, next = (token_at i2).beg, (token_at i2).next in
    let i3,_,l3 = (!ref_paths).(son) in
    let i4,_,l4 = (!ref_paths).(parent) in
    ref_paths := Array.append !ref_paths [| (find_token (Interp "</sentence>"),first,"-") |];
    (!ref_paths).(first) <- (find_token (Interp "<or-sentence>"),0,l1);
    (!ref_paths).(second) <- (find_specific_token (Interp "</or-sentence>") beg next,first,l2);
    (!ref_paths).(son) <- (i3,second,l3);
    (!ref_paths).(parent) <- (i4,first,l4) in

  (* Scans the entries from index [i] on.
     out  = how many words in (a,b) have their parent beyond [b];
     zero = how many words (not interps) have parent 0;
     res  = (index, head) of the last word counted in [out].
     Returns [(ok, res)]. *)
  let rec is_dep_correct a b out zero res i (id,super,label) =
    (* print_endline ((string_of_int a) ^ " " ^ (string_of_int b) ^ " " ^ (string_of_int out) ^ " " ^ (string_of_int zero) ^ " " ^ (string_of_int i)); *)
    let continue_with out zero res =
      is_dep_correct a b out zero res (i+1) (!ref_paths).(i+1) in
    if out > 1 || zero > 1 ||
       (a < i && i < b && super < a && label <> "interp") ||
       (a < super && super < b && (i < a || b < i))
    then (false, res)
    else if i+1 = Array.length !ref_paths
    then (out = 1 && zero = 1, res)
    else if a < i && i < b && b < super
    then continue_with (out+1) zero (i,super)
    else if super = 0 && not (if_cat ["interp"] (token_at id).token)
    then continue_with out (zero+1) res
    else continue_with out zero res in

  (* Indices of the entries satisfying [pred], highest index first. *)
  let indices_where pred =
    snd @@ Array.fold_left (fun (i,acc) entry ->
      if pred entry then (i+1, i :: acc) else (i+1, acc)) (0,[]) !ref_paths in

  let hyphens = indices_where (fun (id,_,_) -> (token_at id).orth = "-") in

  let sons_of_zero = indices_where (fun (id,super,_) ->
    super = 0 && not (if_cat ["interp"] (token_at id).token)) in

  (if List.length sons_of_zero = 1 then
     match hyphens with
     | [1] -> one_hyphen sons_of_zero
     | [b; a] ->
       (* [hyphens] is in descending order, so [a] is the earlier hyphen. *)
       let is_good, (son,parent) = is_dep_correct a b 0 0 (0,0) 1 (!ref_paths).(1) in
       if a = 1 && is_good then two_hyphens a b son parent
     | _ -> ());
  !ref_paths
290 | + | |
(** [correct_interp_with_father_0 paths tokens] moves the children of certain
    commas up to head 0. An entry qualifies when its orthographic form is ","
    and either its head is 0 or its token is [Interp "<or-sentence>"] /
    [Interp "</or-sentence>"]. Every entry whose head is such an entry gets
    head 0 instead. [paths] is modified in place and returned. *)
let correct_interp_with_father_0 paths tokens =
  let lift_children i =
    Array.iteri (fun i1 (id1,super1,label1) ->
      if super1 = i then paths.(i1) <- (id1,0,label1)) paths in
  Array.iteri (fun i (id,super,_) ->
    let token = (ExtArray.get tokens id).token in
    if (super = 0 ||
        token = Interp "<or-sentence>" ||
        token = Interp "</or-sentence>") &&
       (ExtArray.get tokens id).orth = ","
    then lift_children i) paths;
  paths
300 | + | |
(** [remove_interps interp paths tokens] replaces by [(0,-1,"")] every entry
    whose token is spelled [interp] and which has no children. Children are
    determined from a snapshot of [paths] taken on entry. [paths] is modified
    in place and returned. *)
let remove_interps interp paths tokens =
  let snapshot = Array.to_list paths in
  let has_child i = List.exists (fun (_,super,_) -> super = i) snapshot in
  Array.iteri (fun i (id,_,_) ->
    if (ExtArray.get tokens id).orth = interp && not (has_child i)
    then paths.(i) <- (0,-1,"")) paths;
  paths
308 | + | |
(** [correct_passive_voice paths tokens] swaps a "praet" token with its
    "ppas" head: the "praet" takes over the head of the "ppas", the "ppas"
    is attached under the "praet", and every entry that was attached to the
    "ppas" is re-attached to the "praet". Entries with a negative head index
    are skipped. [paths] is modified in place and returned. *)
let correct_passive_voice paths tokens =
  let has_cat cats id = if_cat cats (ExtArray.get tokens id).token in
  Array.iteri (fun i (id,super,label) ->
    if super >= 0 then begin
      let id_s, super_s, label_s = paths.(super) in
      if has_cat ["praet"] id && has_cat ["ppas"] id_s then begin
        paths.(i) <- (id,super_s,label);
        paths.(super) <- (id_s,i,label_s);
        (* Move the remaining dependents of the participle under entry [i]. *)
        Array.iteri (fun i_p (id_p,super_p,label_p) ->
          if super_p = super
          then paths.(i_p) <- (id_p,i,label_p)) paths
      end
    end) paths;
  paths
321 | + | |
(** [swap_dep paths tokens] inverts selected head/dependent pairs: the
    dependent takes over the head of its former head, and the former head is
    attached under the dependent. [paths] is modified in place and returned.

    An entry is swapped with its head when one of the following holds:
    - it is a "comp" and its head is a verb-like category;
    - it is a childless "conj" and its head is a verb-like category;
    - it is a "ppron3" with "praep" at interpretation position 5;
    - its lemma is one of [adv_relators] and its head is a verb-like
      category or a "subst".
    For an [adv_relators] lemma whose head was a "subst" or "pred", the check
    is repeated against the new head.

    NOTE(review): the head's category is read with
    [ExtArray.get tokens super], i.e. the path index is used as a token id.
    Other passes read the id from [paths.(super)] first — confirm which one
    is intended. *)
let swap_dep paths tokens =
  let verb_cats = ["fin"; "praet"; "winien"; "pred"; "imps"; "ppas"] in
  let adv_relators = ["kto";"co";"ile";"czyj";"jaki";"który";
    "jak";"skąd";"dokąd";"gdzie";"którędy";"kiedy";"odkąd";"dlaczego";"czemu";"gdy"] in
  let change_dep i (id,super,label) =
    let id_S, super_S, label_S = paths.(super) in
    paths.(i) <- (id,super_S,label);
    (* The head slot holds an index into [paths], so the former head must
       point at [i]. It used to receive the token id [id], which is a
       different number once ids have been rewritten. *)
    paths.(super) <- (id_S, i, label_S) in
  let rec correct_dep i (id,super,label) =
    let token = (ExtArray.get tokens id).token in
    let super_has_cat cats = if_cat cats (ExtArray.get tokens super).token in
    let has_children () =
      List.exists (fun (_,s,_) -> s = i) (Array.to_list paths) in
    if (if_cat ["comp"] token && super_has_cat verb_cats) ||
       (if_cat ["conj"] token && super_has_cat verb_cats && not (has_children ())) ||
       (if_cat ["ppron3"] token && if_interps [5,"praep"] token) ||
       (if_lemma adv_relators token && super_has_cat (verb_cats @ ["subst"]))
    then change_dep i (id,super,label);
    (* [super] is still the head from before the swap; the recursion then
       looks at the entry as it is now. *)
    if if_lemma adv_relators token && super_has_cat ["subst"; "pred"]
    then correct_dep i paths.(i) in
  Array.iteri correct_dep paths; paths
345 | + | |
346 | + (* | |
347 | + correct_coordination1 -> the neighbour is the nearest word to the right; if a conjunction stands between that word and the current one, the nearest word to the left is used instead | |
348 | + coordination of the passive voice is not handled yet - neither for auxiliary verbs nor for participles | |
349 | + coordination of the dependents of subordinating conjunctions is not handled yet *) | |
350 | + | |
351 | +let convert_dep_tree id first_try paths tokens = (* Returns a transformed copy of paths: the full correction pipeline when first_try is true, swap_dep only otherwise. id is used only by the commented-out HTML dump at the end. *) | |
4 | 352 | let paths = Array.copy paths in (* the steps below mutate their argument, so they work on a copy of the array passed by the caller *) | |
5 | - let paths = do_if first_try TreeChange.replace_tokens paths in | |
6 | - let paths = do_if first_try (TreeChange.remove_interps ".") paths in | |
7 | - let paths = do_if first_try TreeChange.replace_hyphens paths in | |
8 | - let paths = do_if first_try TreeChange.correct_coordination1 paths in | |
9 | - let paths = do_if first_try TreeChange.correct_interp_with_father_0 paths in | |
10 | - let paths = do_if first_try TreeChange.correct_coordination2 paths in | |
11 | - let paths = do_if first_try (TreeChange.remove_interps ",") paths in | |
12 | - let paths = do_if first_try TreeChange.correct_passive_voice paths in | |
13 | - let paths = do_if first_try TreeChange.praet_qub_aglt paths in | |
14 | - let paths = do_if (not first_try) TreeChange.swap_dep paths in | |
15 | - File.file_out ("results/" ^ id ^ "/pre_text_modified_" ^ (string_of_bool first_try) ^ ".html") (fun file -> | |
16 | - fprintf file "%s\n" Visualization.html_header; | |
17 | - fprintf file "%s\n" (Visualization.html_of_dep_sentence tokens paths); | |
18 | - fprintf file "%s\n" Visualization.html_trailer); | |
19 | - (* let paths = do_if first_try TreeChange.replace_tokens paths in | |
20 | - let paths = do_if first_try TreeChange.replace_hyphens paths in | |
21 | - let paths = do_if first_try (TreeChange.remove_interps ".") paths in | |
22 | - let paths = do_if (not first_try) TreeChange.swap_dep paths in | |
23 | - let paths = do_if first_try TreeChange.correct_coordination1 paths in | |
24 | - let paths = try | |
25 | - do_if first_try TreeChange.correct_coordination2 paths | |
26 | - with | |
27 | - | _ -> ( | |
28 | - File.file_out ("results/" ^ id ^ "/pre_text_modified_" ^ (string_of_bool first_try) ^ ".html") (fun file -> | |
29 | - fprintf file "%s\n" Visualization.html_header; | |
30 | - fprintf file "%s\n" (Visualization.html_of_dep_sentence tokens paths); | |
31 | - fprintf file "%s\n" Visualization.html_trailer); | |
32 | - do_if first_try TreeChange.correct_interp_with_father_0 paths; | |
33 | - do_if first_try (TreeChange.remove_interps ",") paths; | |
34 | - File.file_out ("results/" ^ id ^ "/pre_text_modified_" ^ (string_of_bool first_try) ^ "2.html") (fun file -> | |
35 | - fprintf file "%s\n" Visualization.html_header; | |
36 | - fprintf file "%s\n" (Visualization.html_of_dep_sentence tokens paths); | |
37 | - fprintf file "%s\n" Visualization.html_trailer); | |
38 | - do_if first_try TreeChange.correct_coordination2 paths) in | |
39 | - let paths = do_if first_try TreeChange.praet_qub_aglt paths in | |
40 | - let paths = do_if first_try TreeChange.correct_interp_with_father_0 paths in | |
41 | - let paths = do_if first_try (TreeChange.remove_interps ",") paths in | |
42 | - let paths = do_if first_try (TreeChange.remove_interps "-") paths in | |
43 | - let paths = do_if first_try TreeChange.correct_passive_voice paths in | |
44 | - File.file_out ("results/" ^ id ^ "/pre_text_modified_" ^ (string_of_bool first_try) ^ ".html") (fun file -> | |
45 | - fprintf file "%s\n" Visualization.html_header; | |
46 | - fprintf file "%s\n" (Visualization.html_of_dep_sentence tokens paths); | |
47 | - fprintf file "%s\n" Visualization.html_trailer); *) | |
353 | + let paths = | |
354 | + if first_try (* first attempt: run the correction steps in the fixed order listed below *) | |
355 | + then | |
356 | + let pom = replace_tokens paths tokens in | |
357 | + let pom = (remove_interps ".") pom tokens in | |
358 | + let pom = replace_hyphens pom tokens in | |
359 | + let pom = correct_coordination1 pom tokens in | |
360 | + let pom = correct_interp_with_father_0 pom tokens in | |
361 | + let pom = correct_coordination2 pom tokens in | |
362 | + let pom = remove_interps "," pom tokens in | |
363 | + let pom = correct_passive_voice pom tokens in | |
364 | + praet_qub_aglt pom tokens | |
365 | + else (* any later attempt: none of the steps above, only the head and dependent swap *) | |
366 | + swap_dep paths tokens in | |
367 | + (* File.file_out ("results/" ^ id ^ "/pre_text_modified_" ^ (string_of_bool first_try) ^ ".html") (fun file -> | |
368 | + Printf.fprintf file "%s\n" Visualization.html_header; | |
369 | + Printf.fprintf file "%s\n" (Visualization.html_of_dep_sentence tokens paths); | |
370 | + Printf.fprintf file "%s\n" Visualization.html_trailer); *) | |
371 | + paths | |
... | ... |
diagnostics/LCGfields.ml renamed to corpora/LCGfields.ml
... | ... | @@ -83,7 +83,7 @@ let field_of_dependency_tree str_node fields dep_tree = |
83 | 83 | Array.fold_left (fun acc x -> |
84 | 84 | acc ^ (field_of_linear_term str_node field x) ^ "\n\t\t" ) "" dep_tree)) |
85 | 85 | |
86 | -let field_of_eniam_sentence fields tokens (result : eniam_parse_result) = | |
86 | +let field_of_eniam_sentence fields (result : eniam_parse_result) = | |
87 | 87 | match result.status with |
88 | 88 | Idle -> "Idle" |
89 | 89 | (* | PreprocessingError -> "PreprocessingError" *) |
... | ... | @@ -99,7 +99,7 @@ let field_of_eniam_sentence fields tokens (result : eniam_parse_result) = |
99 | 99 | | Parsed -> ignore ("Parsed\n\t\t" ^ (field_of_dependency_tree eniam fields result.dependency_tree)); "Parsed\n" |
100 | 100 | | _ -> failwith "field_of_eniam_sentence" |
101 | 101 | |
102 | -let field_of_conll_sentence fields tokens (result : conll_parse_result) = | |
102 | +let field_of_conll_sentence fields (result : conll_parse_result) = | |
103 | 103 | stat_map := StatMap.add !stat_map result.status; |
104 | 104 | match result.status with |
105 | 105 | Idle -> "Idle" |
... | ... | @@ -117,33 +117,36 @@ let field_of_conll_sentence fields tokens (result : conll_parse_result) = |
117 | 117 | | _ -> failwith "field_of_conll_sentence" |
118 | 118 | |
119 | 119 | |
120 | -let rec field_of_sentence fields tokens = function | |
120 | +let rec field_of_sentence fields = function | |
121 | 121 | RawSentence s -> s |
122 | 122 | | StructSentence _ -> "StructSentence" |
123 | 123 | | DepSentence _ -> "DepSentence" |
124 | - | ENIAMSentence result -> field_of_eniam_sentence fields tokens result | |
125 | - | CONLLSentence result -> field_of_conll_sentence fields tokens result | |
124 | + | ENIAMSentence result -> field_of_eniam_sentence fields result | |
125 | + | CONLLSentence result -> field_of_conll_sentence fields result | |
126 | 126 | | QuotedSentences sentences -> "QuotedSentences" |
127 | 127 | | AltSentence l -> String.concat "\n\t" (Xlist.map l (fun (m, s) -> |
128 | - Visualization.string_of_mode m ^ "\t" ^ (field_of_sentence fields tokens s))) | |
128 | + Visualization.string_of_mode m ^ "\t" ^ (field_of_sentence fields s))) | |
129 | 129 | | _ -> failwith "field_of_sentence: ni" |
130 | 130 | |
131 | -let rec field_of_paragraph fields tokens = function | |
131 | +let rec field_of_paragraph fields = function | |
132 | 132 | RawParagraph s -> print_endline "no fields detected: only raw paragraph"; s |
133 | 133 | | StructParagraph sentences -> |
134 | - String.concat "\n\t" (Xlist.map sentences (fun p -> field_of_sentence fields tokens p.psentence)) | |
134 | + String.concat "\n\t" (Xlist.map sentences (fun p -> field_of_sentence fields p.psentence)) | |
135 | 135 | | AltParagraph l -> |
136 | 136 | String.concat "\n" (Xlist.map (List.filter (fun (m,t) -> (*m = ENIAM ||*) m = CONLL) l) (fun (m,t) -> |
137 | - Visualization.string_of_mode m ^ "\n\t" ^ (field_of_paragraph fields tokens t))) | |
138 | - (* field_of_paragraph fields tokens (snd @@ List.find (fun (mode,text) -> mode = ENIAM || mode = CONLL) l) *) | |
137 | + Visualization.string_of_mode m ^ "\n\t" ^ (field_of_paragraph fields t))) | |
138 | + (* field_of_paragraph fields (snd @@ List.find (fun (mode,text) -> mode = ENIAM || mode = CONLL) l) *) | |
139 | 139 | |
140 | 140 | let rec print_fields_rec fields = function |
141 | - RawText s -> print_endline "no fields detected: only raw text"; | |
142 | -| StructText(paragraphs,tokens) -> | |
143 | - print_endline (String.concat "\n\n" (Xlist.map paragraphs (field_of_paragraph fields tokens)) ^ "\n") | |
141 | + RawText s -> s | |
142 | + (* print_endline "no fields detected: only raw text"; *) | |
143 | +| StructText(paragraphs) -> | |
144 | + String.concat "\n\n" (Xlist.map paragraphs (field_of_paragraph fields)) ^ "\n" | |
144 | 145 | | AltText l -> |
145 | - print_fields_rec fields (snd @@ List.find (fun (m,t) -> m = Struct (*|| m = ENIAM*) || m = CONLL) l) | |
146 | + String.concat "\n" (Xlist.map (List.filter (fun (m,t) -> m = Struct || m = CONLL) l) (fun (m,t) -> | |
147 | + Visualization.string_of_mode m ^ "\n\t" ^ (print_fields_rec fields t))) | |
148 | + (* print_fields_rec fields (snd @@ List.find (fun (m,t) -> m = Struct (*|| m = ENIAM*) || m = CONLL) l) *) | |
146 | 149 | |
147 | 150 | let print_fields fields text = |
148 | - print_fields_rec fields text | |
151 | + print_endline @@ print_fields_rec fields text | |
149 | 152 | (* ; print_field_map () *) |
... | ... |
corpora/makefile
... | ... | @@ -16,9 +16,9 @@ lib: |
16 | 16 | freq_test: |
17 | 17 | $(OCAMLOPT) -o freq_test $(OCAMLOPTFLAGS) $(MODS) freq_test.ml |
18 | 18 | |
19 | -test: CONLL.ml test_conll2.ml | |
19 | +test: CONLL.ml CONLL_adapter.ml test_conll2.ml | |
20 | 20 | mkdir -p results |
21 | - $(OCAMLOPT) -o test $(OCAMLOPTFLAGS) CONLL.ml test_conll2.ml | |
21 | + $(OCAMLOPT) -o test $(OCAMLOPTFLAGS) $^ | |
22 | 22 | |
23 | 23 | |
24 | 24 | .SUFFIXES: .mll .mly .ml .mli .cmo .cmi .cmx |
... | ... |
corpora/test_conll.ml
1 | +open Xstd | |
2 | +open ENIAMsubsyntaxTypes | |
3 | +open ENIAMtokenizerTypes | |
4 | +open LCGtypes | |
5 | +open ExecTypes | |
1 | 6 | |
2 | 7 | let empty_result = { |
3 | 8 | input_text=RawText ""; |
... | ... | @@ -146,7 +151,7 @@ let eniam_parse_sentence timeout test_only_flag paths last tokens lex_sems = |
146 | 151 | let rec conll_parse_sentence timeout test_only_flag id first_try paths tokens lex_sems = |
147 | 152 | let result = empty_conll_parse_result in |
148 | 153 | let time2 = time_fun () in |
149 | - let paths = CONLL_adapter.convert_dep_tree id first_try paths tokens lex_sems | |
154 | + let paths = CONLL_adapter.convert_dep_tree id first_try paths tokens lex_sems in | |
150 | 155 | try |
151 | 156 | let dep_chart = LCGlexicon.dep_create paths tokens lex_sems in |
152 | 157 | let dep_chart,references = LCGchart.dep_lazify dep_chart in |
... | ... | @@ -193,7 +198,7 @@ let rec conll_parse_sentence timeout test_only_flag id first_try paths tokens le |
193 | 198 | let time5 = time_fun () in |
194 | 199 | {result with status=ReductionError; msg=Printexc.to_string e; reduction_time=time5 -. time4} |
195 | 200 | else if first_try |
196 | - then conll_parse_sentence timeout test_only_flag id false paths tokens | |
201 | + then conll_parse_sentence timeout test_only_flag id false paths tokens lex_sems | |
197 | 202 | else {result with status=NotParsed} |
198 | 203 | with |
199 | 204 | Timeout t -> |
... | ... | @@ -201,7 +206,7 @@ let rec conll_parse_sentence timeout test_only_flag id first_try paths tokens le |
201 | 206 | {result with status=ParseTimeout; msg=Printf.sprintf "%f" t; parse_time=time4 -. time3} |
202 | 207 | | NotDepParsed(id_ndp,left,l,right) -> |
203 | 208 | if first_try |
204 | - then conll_parse_sentence timeout test_only_flag id false paths tokens | |
209 | + then conll_parse_sentence timeout test_only_flag id false paths tokens lex_sems | |
205 | 210 | else let time4 = time_fun () in |
206 | 211 | {result with status=NotParsed; not_parsed_dep_chart=(id_ndp,left,l,right); parse_time=time4 -. time3} |
207 | 212 | | e -> |
... | ... | @@ -210,7 +215,7 @@ let rec conll_parse_sentence timeout test_only_flag id first_try paths tokens le |
210 | 215 | with e -> (*print_endline (Printexc.to_string e);*) |
211 | 216 | let time3 = time_fun () in |
212 | 217 | if first_try |
213 | - then conll_parse_sentence timeout test_only_flag id false paths tokens | |
218 | + then conll_parse_sentence timeout test_only_flag id false paths tokens lex_sems | |
214 | 219 | else {result with status=LexiconError; msg=Printexc.to_string e; lex_time=time3 -. time2} |
215 | 220 | |
216 | 221 | |
... | ... | @@ -243,11 +248,7 @@ let get_paths old_paths = function |
243 | 248 | paths |
244 | 249 | | _ -> failwith "get_paths" |
245 | 250 | |
246 | -<<<<<<< HEAD | |
247 | -let rec parse_sentence timeout test_only_flag mode file_prefix tokens lex_sems = function | |
248 | -======= | |
249 | -let rec parse_sentence timeout test_only_flag mode id file_prefix tokens = function | |
250 | ->>>>>>> dep_trees | |
251 | +let rec parse_sentence timeout test_only_flag mode id file_prefix tokens lex_sems = function | |
251 | 252 | RawSentence s -> |
252 | 253 | (match mode with |
253 | 254 | Swigra -> |
... | ... | @@ -259,23 +260,15 @@ let rec parse_sentence timeout test_only_flag mode id file_prefix tokens = funct |
259 | 260 | | StructSentence(paths,last) -> |
260 | 261 | (match mode with |
261 | 262 | ENIAM -> |
262 | -<<<<<<< HEAD | |
263 | 263 | let result = eniam_parse_sentence timeout test_only_flag paths last tokens lex_sems in |
264 | -======= | |
265 | - let result = empty_eniam_parse_result in | |
266 | - (* let result = print_endline "eniam_parse_sentence"; eniam_parse_sentence timeout test_only_flag paths last tokens in *) | |
267 | ->>>>>>> dep_trees | |
264 | + (* let result = empty_eniam_parse_result in *) | |
268 | 265 | let result = {result with file_prefix = file_prefix_of_mode mode ^ file_prefix} in |
269 | 266 | ENIAMSentence result |
270 | 267 | | _ -> failwith "parse_sentence") |
271 | 268 | | DepSentence(paths) -> |
272 | 269 | (match mode with |
273 | 270 | CONLL -> |
274 | -<<<<<<< HEAD | |
275 | - let result = conll_parse_sentence timeout test_only_flag paths tokens lex_sems in | |
276 | -======= | |
277 | - let result = (*print_endline "conll_parse_sentence";*) conll_parse_sentence timeout test_only_flag id true paths tokens in | |
278 | ->>>>>>> dep_trees | |
271 | + let result = conll_parse_sentence timeout test_only_flag id true paths tokens lex_sems in | |
279 | 272 | let result = {result with |
280 | 273 | file_prefix = file_prefix_of_mode mode ^ file_prefix; |
281 | 274 | paths = paths} in |
... | ... | @@ -289,19 +282,15 @@ let rec parse_sentence timeout test_only_flag mode id file_prefix tokens = funct |
289 | 282 | if not Paths.config.Paths.mate_parser_enabled then DepSentence paths else ( |
290 | 283 | print_endline "parse_sentence 1"; |
291 | 284 | (* print_endline (Visualization.html_of_dep_sentence tokens paths); *) |
292 | - let conll = ENIAM_CONLL.string_of_paths ENIAMsubsyntaxTypes.Mate tokens paths in | |
285 | + let conll = CONLL.string_of_paths ENIAMsubsyntaxTypes.Mate tokens paths in | |
293 | 286 | print_endline "parse_sentence 2"; |
294 | 287 | (* printf "|%s|\n" conll; *) |
295 | 288 | Printf.fprintf mate_out "%s%!" conll; |
296 | 289 | print_endline "parse_sentence 3"; |
297 | - let new_paths = get_paths paths (ENIAM_CONLL.load_sentence mate_in) in | |
290 | + let new_paths = get_paths paths (CONLL.load_sentence mate_in) in | |
298 | 291 | print_endline "parse_sentence 4"; |
299 | 292 | (* print_endline (Visualization.html_of_dep_sentence tokens new_paths); *) |
300 | -<<<<<<< HEAD | |
301 | - let result = conll_parse_sentence timeout test_only_flag new_paths tokens lex_sems in | |
302 | -======= | |
303 | - let result = conll_parse_sentence timeout test_only_flag id true new_paths tokens in | |
304 | ->>>>>>> dep_trees | |
293 | + let result = conll_parse_sentence timeout test_only_flag id true new_paths tokens lex_sems in | |
305 | 294 | let result = {result with |
306 | 295 | file_prefix = file_prefix_of_mode mode ^ file_prefix; |
307 | 296 | paths=new_paths} in |
... | ... | @@ -309,66 +298,94 @@ let rec parse_sentence timeout test_only_flag mode id file_prefix tokens = funct |
309 | 298 | | _ -> failwith "parse_sentence") |
310 | 299 | | QuotedSentences sentences -> |
311 | 300 | let sentences = Xlist.rev_map sentences (fun p -> |
312 | -<<<<<<< HEAD | |
313 | - let sentence = parse_sentence timeout test_only_flag mode p.pfile_prefix tokens lex_sems p.psentence in | |
314 | -======= | |
315 | - let sentence = parse_sentence timeout test_only_flag mode id p.pfile_prefix tokens p.psentence in | |
316 | ->>>>>>> dep_trees | |
301 | + let sentence = parse_sentence timeout test_only_flag mode id p.pfile_prefix tokens lex_sems p.psentence in | |
317 | 302 | {p with psentence=sentence}) in |
318 | 303 | QuotedSentences(List.rev sentences) |
319 | 304 | | AltSentence l -> |
320 | 305 | let l = Xlist.rev_map l (fun (mode,sentence) -> |
321 | -<<<<<<< HEAD | |
322 | - mode, parse_sentence timeout test_only_flag mode file_prefix tokens lex_sems sentence) in | |
306 | + mode, parse_sentence timeout test_only_flag mode id file_prefix tokens lex_sems sentence) in | |
323 | 307 | AltSentence(List.rev l) |
324 | 308 | | _ -> failwith "parse_sentence" |
325 | 309 | |
326 | -let rec parse_paragraph timeout test_only_flag mode tokens lex_sems = function | |
310 | +let rec parse_paragraph timeout test_only_flag mode id tokens lex_sems = function | |
327 | 311 | RawParagraph s -> RawParagraph s |
328 | 312 | | StructParagraph sentences -> |
329 | 313 | let sentences = Xlist.rev_map sentences (fun p -> |
330 | - let sentence = parse_sentence timeout test_only_flag mode p.pfile_prefix tokens lex_sems p.psentence in | |
331 | -======= | |
332 | - mode, parse_sentence timeout test_only_flag mode id file_prefix tokens sentence) in | |
333 | - AltSentence(List.rev l) | |
334 | - | _ -> failwith "parse_sentence" | |
314 | + let sentence = parse_sentence timeout test_only_flag mode id p.pfile_prefix tokens lex_sems p.psentence in | |
315 | + {p with psentence=sentence}) in | |
316 | + StructParagraph(List.rev sentences) | |
317 | + | AltParagraph l -> | |
318 | + let l = Xlist.rev_map l (fun (mode,paragraph) -> | |
319 | + mode, parse_paragraph timeout test_only_flag mode id tokens lex_sems paragraph) in | |
320 | + AltParagraph(List.rev l) | |
321 | + | |
322 | +let rec parse_text timeout test_only_flag mode id tokens lex_sems = function (* Applies parse_paragraph to every paragraph of a text; raw text is returned unchanged and each alternative of an AltText is processed recursively. *) | |
323 | + RawText s -> RawText s | |
324 | + | StructText paragraphs -> | |
325 | + let paragraphs = Xlist.rev_map paragraphs (fun paragraph -> | |
326 | + parse_paragraph timeout test_only_flag mode id tokens lex_sems paragraph) in | |
327 | + StructText(List.rev paragraphs) (* rev_map reversed the list, List.rev restores the original paragraph order *) | |
328 | + | AltText l -> AltText(Xlist.map l (fun (mode,text) -> (* the mode stored with each alternative shadows the mode argument *) | |
329 | + mode, parse_text timeout test_only_flag mode id tokens lex_sems text)) | |
330 | + | |
331 | +let select_mode = function (* Picks one of two mode and sentence pairs. The cases are tried top to bottom, so the priority is CONLL, then ENIAM, then Swigra, then Mate; a Raw or Struct mode on either side raises Failure. *) | |
332 | + (Raw,_),_ -> failwith "select_mode" | |
333 | + | _,(Raw,_) -> failwith "select_mode" | |
334 | + | (Struct,_),_ -> failwith "select_mode" | |
335 | + | _,(Struct,_) -> failwith "select_mode" | |
336 | + | (CONLL,s),_ -> CONLL,s | |
337 | + | _,(CONLL,s) -> CONLL,s | |
338 | + | (ENIAM,s),_ -> ENIAM,s | |
339 | + | _,(ENIAM,s) -> ENIAM,s | |
340 | + | (Swigra,s),_ -> Swigra,s | |
341 | + | _,(Swigra,s) -> Swigra,s | |
342 | + | (Mate,s),_ -> Mate,s | |
343 | + | _,(Mate,s) -> Mate,s | |
344 | + | _ -> failwith "select_mode: ni" (* any pair of modes not listed above *) | |
335 | 345 | |
336 | -let rec parse_paragraph timeout test_only_flag mode id tokens = function | |
346 | +let rec select_sentences_sentence = function (* Returns a pair of the sentence with its alternatives narrowed down and a status. Raises Failure on sentences that carry no parse result. *) | |
347 | + RawSentence s -> failwith "select_sentences_sentence" | |
348 | + | StructSentence(paths,last) -> failwith "select_sentences_sentence" | |
349 | + | DepSentence paths -> failwith "select_sentences_sentence" | |
350 | + | QuotedSentences sentences -> | |
351 | + let sentences = Xlist.rev_map sentences (fun p -> | |
352 | + let sentence,_ = select_sentences_sentence p.psentence in (* the status of each quoted sentence is discarded *) | |
353 | + {p with psentence=sentence}) in | |
354 | + QuotedSentences(List.rev sentences), Parsed (* always reported as Parsed *) | |
355 | + | AltSentence l -> | |
356 | + let raw,selected = Xlist.fold l ([],[]) (fun (raw,selected) (mode,sentence) -> (* raw collects the Raw alternatives, selected holds at most one chosen alternative *) | |
357 | + if mode = Raw then (mode,sentence) :: raw, selected else | |
358 | + let sentence,status = select_sentences_sentence sentence in | |
359 | + if status <> Parsed && status <> NotTranslated then raw,selected else (* alternatives with any other status are dropped *) | |
360 | + match selected with | |
361 | + [] -> raw,[mode,sentence] | |
362 | + | [mode2,sentence2] -> raw,[select_mode ((mode,sentence),(mode2,sentence2))] (* two candidates: select_mode decides which one is kept *) | |
363 | + | _ -> failwith "select_sentences_sentence") in | |
364 | + AltSentence(raw @ selected), Parsed (* reported as Parsed even when selected is empty *) | |
365 | + | ENIAMSentence result -> ENIAMSentence result, result.status | |
366 | + | CONLLSentence result -> CONLLSentence result, result.status | |
367 | + | SemSentence result -> SemSentence result, result.status | |
368 | + | |
369 | +let rec select_sentences_paragraph = function | |
337 | 370 | RawParagraph s -> RawParagraph s |
338 | 371 | | StructParagraph sentences -> |
339 | 372 | let sentences = Xlist.rev_map sentences (fun p -> |
340 | - let sentence = parse_sentence timeout test_only_flag mode id p.pfile_prefix tokens p.psentence in | |
341 | ->>>>>>> dep_trees | |
373 | + let sentence,_ = select_sentences_sentence p.psentence in | |
342 | 374 | {p with psentence=sentence}) in |
343 | 375 | StructParagraph(List.rev sentences) |
344 | 376 | | AltParagraph l -> |
345 | 377 | let l = Xlist.rev_map l (fun (mode,paragraph) -> |
346 | -<<<<<<< HEAD | |
347 | - mode, parse_paragraph timeout test_only_flag mode tokens lex_sems paragraph) in | |
348 | - AltParagraph(List.rev l) | |
349 | - | |
350 | -let rec parse_text timeout test_only_flag mode tokens lex_sems = function | |
351 | -======= | |
352 | - mode, parse_paragraph timeout test_only_flag mode id tokens paragraph) in | |
378 | + mode, select_sentences_paragraph paragraph) in | |
353 | 379 | AltParagraph(List.rev l) |
354 | 380 | |
355 | -let rec parse_text timeout test_only_flag mode id = function | |
356 | ->>>>>>> dep_trees | |
381 | +let rec select_sentences_text = function | |
357 | 382 | RawText s -> RawText s |
358 | 383 | | StructText paragraphs -> |
359 | 384 | let paragraphs = Xlist.rev_map paragraphs (fun paragraph -> |
360 | -<<<<<<< HEAD | |
361 | - parse_paragraph timeout test_only_flag mode tokens lex_sems paragraph) in | |
385 | + select_sentences_paragraph paragraph) in | |
362 | 386 | StructText(List.rev paragraphs) |
363 | 387 | | AltText l -> AltText(Xlist.map l (fun (mode,text) -> |
364 | - mode, parse_text timeout test_only_flag mode tokens lex_sems text)) | |
365 | -======= | |
366 | - parse_paragraph timeout test_only_flag mode id tokens paragraph) in | |
367 | - StructText(List.rev paragraphs, tokens) | |
368 | - | AltText l -> AltText(Xlist.map l (fun (mode,text) -> | |
369 | - mode, parse_text timeout test_only_flag mode id text)) | |
370 | ->>>>>>> dep_trees | |
371 | - | |
388 | + mode, select_sentences_text text)) | |
372 | 389 | |
373 | 390 | let rec extract_query_text = function |
374 | 391 | RawText s -> s |
... | ... | @@ -392,11 +409,7 @@ let process_query pre_in pre_out timeout test_only_flag id full_query max_n = |
392 | 409 | let result = {result with pre_time1=pre_time1; pre_time2=time2 -. time1} in |
393 | 410 | if msg <> "" then {result with status=PreprocessingError; msg=msg} else ( |
394 | 411 | (* print_endline "process_query 3"; *) |
395 | -<<<<<<< HEAD | |
396 | - let parsed_text = parse_text timeout test_only_flag Struct tokens lex_sems (translate_text pre_text) in | |
397 | -======= | |
398 | - let parsed_text = parse_text timeout test_only_flag Struct id (translate_text pre_text) in | |
399 | ->>>>>>> dep_trees | |
412 | + let parsed_text = parse_text timeout test_only_flag Struct id tokens lex_sems (translate_text pre_text) in | |
400 | 413 | (* print_endline "process_query 4"; *) |
401 | 414 | let time3 = time_fun () in |
402 | 415 | let result = if test_only_flag then result else {result with status=Parsed; parsed_text=parsed_text} in |
... | ... | @@ -421,23 +434,50 @@ let process_query pre_in pre_out timeout test_only_flag id full_query max_n = |
421 | 434 | let result = {result with semantic_time=time4 -. time3} in |
422 | 435 | result) |
423 | 436 | |
437 | +let get_sock_addr host_name port = (* Resolves host_name and returns an ADDR_INET socket address for the given port. Unix.gethostbyname raises Not_found when the host is unknown. *) | |
438 | + let he = Unix.gethostbyname host_name in | |
439 | + let addr = he.Unix.h_addr_list in | |
440 | + Unix.ADDR_INET(addr.(0),port) (* only the first resolved address is used; indexing raises Invalid_argument if the list is empty *) | |
441 | + | |
442 | +let id_counter = ref 0 (* module-level counter behind get_id *) | |
443 | + | |
444 | +let get_id () = (* Returns a fresh identifier of the form ID_n; the counter is incremented first, so the first value is ID_1. *) | |
445 | + incr id_counter; | |
446 | + "ID_" ^ (string_of_int !id_counter) | |
447 | + | |
448 | +let get_query_id = function (* Returns the pid of the single sentence record of a CONLL query, or a fresh id from get_id when that pid is empty. Accepts an AltText whose CONLL alternative is either the only element or the second of two; any other shape raises Failure. *) | |
449 | + ENIAMsubsyntaxTypes.AltText[_;ENIAMsubsyntaxTypes.CONLL,ENIAMsubsyntaxTypes.StructText([ENIAMsubsyntaxTypes.StructParagraph[p]])] -> if p.ENIAMsubsyntaxTypes.pid = "" then get_id () else p.ENIAMsubsyntaxTypes.pid | |
450 | + | ENIAMsubsyntaxTypes.AltText[ENIAMsubsyntaxTypes.CONLL,ENIAMsubsyntaxTypes.StructText([ENIAMsubsyntaxTypes.StructParagraph[p]])] -> if p.ENIAMsubsyntaxTypes.pid = "" then get_id () else p.ENIAMsubsyntaxTypes.pid | |
451 | + | _ -> failwith "get_query_id" | |
452 | + | |
453 | +let process_id s = (* Normalises a sentence identifier for use as a results directory name. Identifiers starting with ID_ are returned unchanged; otherwise s must consist of exactly three parts separated by a slash, and any other input raises Failure carrying s. *) | |
454 | + if Xstring.check_prefix "ID_" s then s else | |
455 | + let a,b,c = match Xstring.split_delim "/" s with | |
456 | + [a;b;c] -> a,b,c | |
457 | + | _ -> failwith ("process_id: " ^ s) in | |
458 | + if Xstring.check_prefix "NKJP_1M_" a && Xstring.check_prefix "morph_" b && Xstring.check_sufix "-p" b && (* b is only validated, it does not appear in the result *) | |
459 | + Xstring.check_prefix "morph_" c && Xstring.check_sufix "-s" c then | |
460 | + Xstring.cut_prefix "NKJP_1M_" a ^ "." ^ Xstring.cut_sufix "-s" (Xstring.cut_prefix "morph_" c) (* presumably cut_prefix and cut_sufix strip the given affix; Xstring is defined outside this excerpt, confirm *) | |
461 | + else failwith ("process_id: " ^ s) | |
424 | 462 | |
425 | 463 | let process_conll_corpus filename = |
464 | + print_endline "process_conll_corpus: START"; | |
426 | 465 | let corpus = File.file_in filename (fun file -> CONLL.match_corpus (CONLL.load_corpus file)) in |
427 | - print_endline "process_conll_corpus"; | |
428 | - let corpus = [List.hd corpus] in | |
466 | + print_endline "process_conll_corpus: DONE"; | |
467 | + (* let corpus = [List.hd corpus] in *) | |
429 | 468 | let ic,oc = Unix.open_connection (get_sock_addr Paths.pre_host Paths.pre_port) in |
430 | - Xlist.iter corpus (fun query -> | |
469 | + print_endline "connection_opened"; | |
470 | + Xlist.iter corpus (fun (query,tokens) -> | |
431 | 471 | let id = process_id (get_query_id query) in |
432 | 472 | let path = "results/" ^ id ^ "/" in |
433 | 473 | ignore (Sys.command ("mkdir -p " ^ path)); |
434 | - let result = process_query ic oc 30. false "x" query 10 in | |
435 | - Visualization.print_html_text path "input_text" result.input_text; | |
436 | - Visualization.print_html_text path "pre_text" result.pre_text; | |
437 | - Visualization.print_html_text path "parsed_text" result.parsed_text; | |
438 | - Visualization.print_html_text path "selected_sent_text" result.selected_sent_text; | |
439 | - Visualization.print_html_text path "semantic_text" result.semantic_text; | |
440 | - Visualization.print_html_text path "selected_semantic_text" result.selected_semantic_text; | |
474 | + let result = process_query ic oc 30. false "x" (query,tokens) 10 in | |
475 | + (* Visualization.print_html_text path "input_text" result.input_text tokens; | |
476 | + Visualization.print_html_text path "pre_text" result.pre_text tokens; | |
477 | + Visualization.print_html_text path "parsed_text" result.parsed_text tokens; | |
478 | + Visualization.print_html_text path "selected_sent_text" result.selected_sent_text tokens; | |
479 | + Visualization.print_html_text path "semantic_text" result.semantic_text tokens; | |
480 | + Visualization.print_html_text path "selected_semantic_text" result.selected_semantic_text tokens; *) | |
441 | 481 | (* printf "input_text:\n%s\n" (Visualization.string_of_text result.input_text); |
442 | 482 | printf "pre_text:\n%s\n" (Visualization.string_of_text result.pre_text); *) |
443 | 483 | (* Exec.print_result stdout result; *) |
... | ... | @@ -445,13 +485,15 @@ let process_conll_corpus filename = |
445 | 485 | (* CompTrees.compare_results result.parsed_text; *) |
446 | 486 | (* Visualization.print_paths "results/" "paths" result.paths; *) |
447 | 487 | ()); |
448 | - Marshal.to_channel oc (PreTypes.RawText "",ExtArray.make 1 ENIAMtokenizerTypes.empty_token) []; | |
488 | + Marshal.to_channel oc (ENIAMsubsyntaxTypes.RawText "",ExtArray.make 1 ENIAMtokenizerTypes.empty_token) []; | |
449 | 489 | flush oc; |
450 | 490 | let _ = Unix.shutdown_connection ic in |
451 | 491 | () |
452 | 492 | |
453 | 493 | let _ = |
494 | + LCGfields.reset(); | |
454 | 495 | (* process_conll_corpus "../../NLP resources/Skladnica-zaleznosciowa-mod_130121.conll"; *) |
455 | - (* process_conll_corpus "../../NLP resources/skladnica_zaleznosciowa.conll"; *) | |
456 | - process_conll_corpus "../testy/skladnica-test1.conll"; | |
496 | + process_conll_corpus "../../NLP resources/skladnica_zaleznosciowa.conll"; | |
497 | + (* process_conll_corpus "../testy/skladnica-test1.conll"; *) | |
498 | + LCGfields.print_results(); | |
457 | 499 | () |
... | ... |
corpora/test_conll2.ml
... | ... | @@ -116,7 +116,7 @@ let test_example path id tokens lex_sems paths last = |
116 | 116 | let test_dep_example path id tokens lex_sems paths = |
117 | 117 | try |
118 | 118 | ENIAM_LCGreductions.reset_variant_label (); |
119 | - (* let paths = CONLL_adapter.convert_dep_tree id first_try paths tokens lex_sems in *) | |
119 | + let paths = CONLL_adapter.convert_dep_tree id (*first_try*) true paths tokens in | |
120 | 120 | ENIAMsubsyntaxHTMLof.print_dep_sentence path (id^"1_paths") tokens paths; |
121 | 121 | let chart = create_dep_chart tokens lex_sems paths in |
122 | 122 | ENIAM_LCGlatexOf.print_dep_chart path (id^"1_chart") "a1" chart; |
... | ... | @@ -150,7 +150,7 @@ let test_dep_example path id tokens lex_sems paths = |
150 | 150 | let rec parse_sentence name id tokens lex_sems = function |
151 | 151 | RawSentence s -> id |
152 | 152 | | StructSentence(paths,last) -> |
153 | - test_example ("results/" ^ name^"/") (string_of_int id ^ "_") tokens lex_sems paths last; | |
153 | + (* test_example ("results/" ^ name^"/") (string_of_int id ^ "_") tokens lex_sems paths last; *) | |
154 | 154 | id + 1 |
155 | 155 | | DepSentence(paths) -> |
156 | 156 | test_dep_example ("results/" ^ name ^ "/") (string_of_int id ^ "_") tokens lex_sems paths; |
... | ... |
diagnostics/treeChange.ml deleted
1 | -open Xstd | |
2 | -open PreTypes | |
3 | - | |
4 | -let if_lemma lemmas = function | |
5 | - Lemma(l,_,_) -> List.exists (fun x -> x = l) lemmas | |
6 | - | _ -> false | |
7 | - | |
(** [if_cat cats token] is [true] iff [token] is a [Lemma] whose second
    component (its category) occurs in the list [cats]. Any other kind of
    token gives [false]. *)
let if_cat cats token =
  match token with
  | Lemma (_, category, _) -> List.mem category cats
  | _ -> false
11 | - | |
(** [if_interps interps token] checks a list of [(position, value)]
    constraints against the interpretations carried by [token].

    A constraint [(nr, value)] holds when at least one interpretation of the
    token lists [value] among the alternatives found at position [nr]. The
    result is [true] iff every constraint holds, so an empty [interps] always
    gives [true].

    A token that is not a [Lemma] is treated as having the single
    interpretation [[[]]], on which every constraint fails.

    A position that does not exist in an interpretation (index too large or
    negative) makes that interpretation a non-match instead of raising. *)
let if_interps interps token =
  let interpretations =
    match token with
    | Lemma (_, _, i) -> i
    | _ -> [[[]]] in
  let satisfies nr value =
    List.exists
      (fun interpretation ->
        (* [List.nth] reports a missing position with [Failure] (list too
           short) or [Invalid_argument] (negative index). Only these two are
           mapped to "no match", so unrelated exceptions are no longer
           silently swallowed. *)
        try List.mem value (List.nth interpretation nr)
        with Failure _ | Invalid_argument _ -> false)
      interpretations in
  List.for_all (fun (nr, value) -> satisfies nr value) interps
24 | - | |
(** [correct_coordination1 paths tokens] re-attaches dependents that hang on a
    coordinating conjunction to one of the conjuncts.

    [paths] is an array of [(id, super, label)] triples: [id] is used as an
    index into [tokens], [super] is the position (in [paths]) of the parent.
    For every entry whose token has category ["conj"] and which has more than
    two sons, each rule of the table below is tried: a rule fires when exactly
    one son matches its dependent pattern and more than one son matches its
    head patterns. The matching dependent is then re-attached to the first
    head son accepted by [not_between].

    Patterns are ["cat"] or ["cat:case"]; the case is looked up at
    interpretation position 1, except for ["prep:case"] where position 0 is
    used.

    [paths] is modified in place and returned. Any exception raised while
    handling one conjunction is discarded and that conjunction is left as it
    was (best-effort behaviour). *)
let correct_coordination1 paths tokens =
  (* Snapshot taken before any rewriting: (position, id, super, label). *)
  let indexed_paths =
    List.mapi (fun pos (id, super, label) -> (pos, id, super, label))
      (Array.to_list paths) in

  (* A rule pairs a dependent pattern with a list of head patterns; each side
     carries a match counter that starts at zero. *)
  let rule dependent heads = (dependent, 0), (heads, 0) in
  let verbs = ["fin"; "praet"] in
  let rules = [
    rule "subst:nom" verbs;
    rule "subst:acc" ["inf"];
    rule "ppron3:nom" verbs;
    rule "ppron3:acc" verbs;
    rule "adv" verbs;
    rule "adv" ["inf"];
    rule "adv" ["adj"];
    rule "prep" verbs;
    rule "prep" ["inf"];
    rule "prep" ["ppas"];
    rule "prep" ["subst"];
    rule "prep:gen" ["subst:gen"];
    rule "adj:nom" verbs;
    rule "adj:nom" ["subst:nom"];
    rule "adj:gen" ["subst:gen"];
    rule "adj:dat" ["subst:dat"];
    rule "adj:acc" ["subst:acc"];
    rule "adj:inst" ["subst:inst"];
    rule "adj:loc" ["subst:loc"];
    rule "subst:gen" ["subst:nom"];
    (* the subst:gen / subst:gen rule is deliberately left out *)
    rule "subst:gen" ["subst:dat"];
    rule "subst:gen" ["subst:acc"];
    rule "subst:gen" ["subst:inst"];
    rule "subst:gen" ["subst:loc"];
    rule "ppron3:gen" ["subst:nom"];
    rule "ppron3:gen" ["subst:dat"];
    rule "ppron3:gen" ["subst:acc"];
    rule "ppron3:gen" ["subst:inst"];
    rule "ppron3:gen" ["subst:loc"];
    rule "qub" verbs;
    rule "qub" ["subst"];
    rule "qub" ["adj"];
    rule "pact" ["subst"];
    rule "ppas" ["subst"];
  ] in

  let token_of id = (ExtArray.get tokens id).token in

  (* For the given sons, returns one (dependent, candidate heads) pair per
     rule that fired. *)
  let find_dependents sons =
    (* Does the son's token match a "cat" / "cat:case" pattern? *)
    let is (_, id, _, _) pattern =
      match Xstring.split ":" pattern with
      | ["prep"; case] ->
          if_cat ["prep"] (token_of id) && if_interps [0, case] (token_of id)
      | [cat; case] ->
          if_cat [cat] (token_of id) && if_interps [1, case] (token_of id)
      | [cat] -> if_cat [cat] (token_of id)
      | _ -> failwith "is (in correct_coordination1)" in

    (* Bumps, for every rule, the counter of the side that [son] matches; the
       dependent side takes priority over the head side. *)
    let count_son counters son =
      Xlist.map counters (fun ((one, a), (rest, b)) ->
        if is son one then ((one, a + 1), (rest, b))
        else if List.exists (is son) rest then ((one, a), (rest, b + 1))
        else ((one, a), (rest, b))) in

    let counted = Xlist.fold sons rules count_son in
    let fired = List.filter (fun ((_, a), (_, b)) -> a = 1 && b > 1) counted in
    Xlist.map fired (fun ((one, _), (rest, _)) ->
      (List.find (fun son -> is son one) sons,
       List.filter (fun son -> List.exists (fun p -> is son p) rest) sons)) in

  (* Re-attaches the dependent to the first head lying on the same side of
     [super] as the dependent itself.
     NOTE(review): [super] is the parent position of the conjunction, not the
     position of the conjunction; confirm that this is the intended pivot. *)
  let establish_neighbour super ((i_d, id_d, _, label_d), heads) =
    let not_between (i_s, _, _, _) =
      (super < i_d && super < i_s) || (super > i_d && super > i_s) in
    let (i_n, _, _, _) = List.find not_between heads in
    paths.(i_d) <- (id_d, i_n, label_d) in

  let examine_coords super sons =
    try Xlist.iter (find_dependents sons) (establish_neighbour super)
    with _ -> () in

  Array.iteri (fun pos (id, super, _) ->
    if if_cat ["conj"] (token_of id) then begin
      let sons =
        List.filter (fun (_, _, parent, _) -> parent = pos) indexed_paths in
      if List.length sons > 2 then examine_coords super sons
    end) paths;
  paths
113 | - | |
(** [correct_coordination2 paths tokens] rebuilds flat coordinations as nested
    ones, working on a copy of [paths] (the argument itself is not modified)
    and returning that copy.

    Every position whose token is a ["conj"] or has orth [","] is visited. If
    the preceding position is a [","] with no sons it is first reset to
    [(0, -1, "")]. When the visited position has more than two sons they are
    split into those to the left and those to the right of it, and each side
    is chained by [find_father]: the element next to the current head is
    attached to it and, if more elements remain, must itself be a conjunction
    or a comma and becomes the head for the rest.

    Note that tokens are looked up here by position in [paths], not by the
    [id] stored in the triple.

    @raise Failure ["find_father"] when one side of a coordination is empty
    or when an inner element of the chain is neither a conjunction nor a
    comma. *)
let correct_coordination2 paths tokens =
  let result = Array.copy paths in
  (* Fresh (position, id, super, label) view of the current state. *)
  let indexed () =
    List.mapi (fun pos (id, super, label) -> (pos, id, super, label))
      (Array.to_list result) in

  let is_coordinator pos =
    if_cat ["conj"] (ExtArray.get tokens pos).token ||
    (ExtArray.get tokens pos).orth = "," in

  let rec correct_rec ((pos, _, _, _)) sons =
    let left, right = List.partition (fun (p, _, _, _) -> p < pos) sons in
    (* the left side is processed starting from the element closest to [pos] *)
    find_father pos (List.rev left);
    find_father pos right

  and find_father head = function
    | [(pos, id, _, label)] -> result.(pos) <- (id, head, label)
    | ((first_pos, _, _, _) as first) :: ((pos, id, _, label) as next) :: rest ->
        result.(pos) <- (id, head, label);
        if not (is_coordinator pos) then failwith "find_father";
        let remaining =
          if first_pos < pos then first :: rest
          else List.rev (first :: rest) in
        correct_rec next remaining
    | _ -> failwith "find_father" in

  (* Drops a childless comma standing at position [pos]. *)
  let check_previous_for_interp pos =
    if pos >= 0 && (ExtArray.get tokens pos).orth = "," &&
       not (List.exists (fun (_, super, _) -> super = pos)
              (Array.to_list result))
    then result.(pos) <- (0, -1, "") in

  Array.iteri (fun pos (id, super, label) ->
    if is_coordinator pos then begin
      check_previous_for_interp (pos - 1);
      let sons =
        List.filter (fun (_, _, parent, _) -> parent = pos) (indexed ()) in
      if List.length sons > 2 then correct_rec (pos, id, super, label) sons
    end) result;
  result
157 | - | |
(** [praet_qub_aglt paths tokens] lifts an ["aglt"] token that hangs on a
    token with orth ["by"] up to the parent of that ["by"], provided this
    parent is a ["praet"].

    Entries with a negative [super] are skipped. [paths] is modified in place
    and returned. *)
let praet_qub_aglt paths tokens =
  let lift pos (id, super, label) =
    if super >= 0 then begin
      let id_parent, super_parent, _ = paths.(super) in
      if if_cat ["aglt"] (ExtArray.get tokens id).token &&
         (ExtArray.get tokens id_parent).orth = "by"
      then begin
        let id_grandparent, _, _ = paths.(super_parent) in
        if if_cat ["praet"] (ExtArray.get tokens id_grandparent).token
        then paths.(pos) <- (id, super_parent, label)
      end
    end in
  Array.iteri lift paths;
  paths
168 | - | |
(** [replace_tokens paths tokens] re-points entries of [paths] at other tokens
    of [tokens], selected by their orth:

    - a ["."] whose parent and grandparent are both ["."] is merged: the
      grandparent entry is pointed at the ["..."] token and the other two
      entries are reset to [(0, -1, "")];
    - a ["brev"] token is pointed at the token whose orth is its own orth,
      followed by ["."] unless interpretation position 0 contains ["npun"] or
      the dot attached to it is the last one in the sentence.

    Tokens are located with [find_token], which yields the highest index with
    the requested orth, or [0] when there is none. [paths] is modified in
    place and returned.

    @raise Not_found if a ["brev"] without ["npun"] has no ["."] / ["..."]
    son. *)
let replace_tokens paths tokens =
  let orth_of id = (ExtArray.get tokens id).orth in

  let find_token orth =
    Int.fold 0 (ExtArray.size tokens - 1) 0 (fun found i ->
      if orth_of i = orth then i else found) in

  (* [pos] is a "." entry; its parent sits at [parent_pos]. *)
  let multidot pos parent_pos =
    let parent_id, grand_pos, _ = paths.(parent_pos) in
    if grand_pos >= 0 then begin
      let grand_id, grand_super, grand_label = paths.(grand_pos) in
      if orth_of parent_id = "." && orth_of grand_id = "." then begin
        paths.(grand_pos) <- (find_token "...", grand_super, grand_label);
        paths.(parent_pos) <- (0, -1, "");
        paths.(pos) <- (0, -1, "")
      end
    end in

  let brev pos id super label =
    (* True when no token referenced from [paths] begins after the dot that
       is attached to this abbreviation. *)
    let if_the_last_dot () =
      let (dot_id, _, _) =
        List.find (fun (son_id, son_super, _) ->
          son_super = pos && (orth_of son_id = "." || orth_of son_id = "..."))
          (Array.to_list paths) in
      Array.fold_left (fun ok (other_id, _, _) ->
        ok && (ExtArray.get tokens other_id).beg <= (ExtArray.get tokens dot_id).beg)
        true paths in
    let suffix =
      if if_interps [0, "npun"] (ExtArray.get tokens id).token || if_the_last_dot ()
      then ""
      else "." in
    let new_orth = orth_of id ^ suffix in
    paths.(pos) <- (find_token new_orth, super, label) in

  Array.iteri (fun pos (id, super, label) ->
    if orth_of id = "." then multidot pos super;
    if if_cat ["brev"] (ExtArray.get tokens id).token then brev pos id super label)
    paths;
  paths
208 | - | |
(** [replace_hyphens paths tokens] rewrites dialogue hyphens into
    [<or-sentence>] / [</or-sentence>] / [</sentence>] markers.

    Nothing is changed unless exactly one non-["interp"] entry has parent [0].
    Then:
    - if the only hyphen sits at position 1, it becomes [<or-sentence>] and
      the sentence is closed by [correct_last];
    - if there are two hyphens, the first at position 1, and
      [is_dep_correct] accepts the span between them, the pair becomes
      [<or-sentence>] ... [</or-sentence>] and a [</sentence>] entry is
      appended.

    The returned array is [paths] itself when no entry had to be appended
    (it is then modified in place); otherwise it is a fresh, longer array and
    [paths] may have been partially modified beforehand.

    Marker tokens are located by structural comparison with [Interp ...];
    the lookup yields [0] when the marker is absent from [tokens]. *)
let replace_hyphens paths tokens =
  let ref_paths = ref paths in

  (* Highest token index whose [token] field equals [token], or 0. *)
  let find_token token =
    Int.fold 0 (ExtArray.size tokens - 1) 0 (fun found i ->
      if (ExtArray.get tokens i).token = token then i else found) in

  (* Same, restricted to tokens lying within [beg, next]. *)
  let find_specific_token token beg next =
    Int.fold 0 (ExtArray.size tokens - 1) 0 (fun found i ->
      if (ExtArray.get tokens i).token = token &&
         beg <= (ExtArray.get tokens i).beg &&
         (ExtArray.get tokens i).next <= next
      then i else found) in

  (* Makes the last entry a [</sentence>] marker (replacing a final "." or
     appending a new entry) and hangs [son_of_zero] on it. *)
  let correct_last son_of_zero =
    let last = Array.length !ref_paths - 1 in
    let i1, _, l1 = (!ref_paths).(last) in
    let i2, _, l2 = (!ref_paths).(son_of_zero) in
    if (ExtArray.get tokens i1).orth = "." then begin
      (!ref_paths).(last) <- (find_token (Interp "</sentence>"), 1, l1);
      (!ref_paths).(son_of_zero) <- (i2, last, l2)
    end else begin
      ref_paths :=
        Array.append !ref_paths [| (find_token (Interp "</sentence>"), 1, "-") |];
      let new_last = Array.length !ref_paths - 1 in
      (!ref_paths).(new_last - 1) <- (i1, new_last, l1);
      (!ref_paths).(son_of_zero) <- (i2, new_last, l2)
    end in

  let one_hyphen sons_of_zero =
    let _, _, l2 = (!ref_paths).(1) in
    Xlist.iter sons_of_zero (fun son_of_zero ->
      let i1, _, l1 = (!ref_paths).(son_of_zero) in
      (!ref_paths).(son_of_zero) <- (i1, 1, l1));
    (!ref_paths).(1) <- (find_token (Interp "<or-sentence>"), 0, l2);
    (* The original code referred to [son_of_zero] here, outside the function
       that binds it, which cannot compile. The caller only invokes
       [one_hyphen] with exactly one root son, so closing the sentence for
       the elements of the list is the evident intent. *)
    Xlist.iter sons_of_zero correct_last in

  let two_hyphens first second son parent =
    let _, _, l1 = (!ref_paths).(first) in
    let i2, _, l2 = (!ref_paths).(second) in
    let beg, next = (ExtArray.get tokens i2).beg, (ExtArray.get tokens i2).next in
    let i3, _, l3 = (!ref_paths).(son) in
    let i4, _, l4 = (!ref_paths).(parent) in
    ref_paths :=
      Array.append !ref_paths [| (find_token (Interp "</sentence>"), first, "-") |];
    (!ref_paths).(first) <- (find_token (Interp "<or-sentence>"), 0, l1);
    (!ref_paths).(second) <-
      (find_specific_token (Interp "</or-sentence>") beg next, first, l2);
    (!ref_paths).(son) <- (i3, second, l3);
    (!ref_paths).(parent) <- (i4, first, l4) in

  (* Walks the entries from position [i] onwards.
     [out]  = how many words inside (a,b) have their parent beyond [b];
     [zero] = how many non-interp words have parent 0;
     [res]  = (position, parent) of the last word counted in [out].
     The span is accepted when both counters end up equal to 1. *)
  let rec is_dep_correct a b out zero res i (id, super, label) =
    if out > 1 || zero > 1 ||
       (a < i && i < b && super < a && label <> "interp") ||
       (a < super && super < b && (i < a || b < i))
    then (false, res)
    else if i + 1 = Array.length !ref_paths
    then ((out = 1 && zero = 1), res)
    else if a < i && i < b && b < super
    then is_dep_correct a b (out + 1) zero (i, super) (i + 1) (!ref_paths).(i + 1)
    else if super = 0 && not (if_cat ["interp"] (ExtArray.get tokens id).token)
    then is_dep_correct a b out (zero + 1) res (i + 1) (!ref_paths).(i + 1)
    else is_dep_correct a b out zero res (i + 1) (!ref_paths).(i + 1) in

  (* Positions (in descending order) of the entries accepted by [pred]. *)
  let positions_where pred =
    snd (Array.fold_left (fun (i, acc) entry ->
      if pred entry then (i + 1, i :: acc) else (i + 1, acc))
      (0, []) !ref_paths) in

  let hyphens =
    positions_where (fun (id, _, _) -> (ExtArray.get tokens id).orth = "-") in
  let sons_of_zero =
    positions_where (fun (id, super, _) ->
      super = 0 && not (if_cat ["interp"] (ExtArray.get tokens id).token)) in

  (match sons_of_zero, hyphens with
   | [_], [1] -> one_hyphen sons_of_zero
   | [_], [b; a] ->
       (* [hyphens] is in descending order, so [a] is the earlier hyphen *)
       let is_good, (son, parent) =
         is_dep_correct a b 0 0 (0, 0) 1 (!ref_paths).(1) in
       if a = 1 && is_good then two_hyphens a b son parent
   | _ -> ());
  !ref_paths
289 | - | |
(** [correct_interp_with_father_0 paths tokens] re-attaches to position [0]
    every son of a comma that either hangs on [0] itself or whose token is an
    [<or-sentence>] / [</or-sentence>] marker.

    [paths] is modified in place and returned. *)
let correct_interp_with_father_0 paths tokens =
  let detach_sons_of pos =
    Array.iteri (fun son_pos (son_id, son_super, son_label) ->
      if son_super = pos then paths.(son_pos) <- (son_id, 0, son_label)) paths in
  Array.iteri (fun pos (id, super, _) ->
    let entry = ExtArray.get tokens id in
    let top_level =
      super = 0 ||
      entry.token = Interp "<or-sentence>" ||
      entry.token = Interp "</or-sentence>" in
    if top_level && entry.orth = "," then detach_sons_of pos) paths;
  paths
299 | - | |
(** [remove_interps interp paths tokens] resets to [(0, -1, "")] every entry
    whose token has orth [interp] and which has no sons.

    Sons are looked up in a snapshot of [paths] taken before any entry is
    reset, so removals made during this call do not make further entries
    childless. [paths] is modified in place and returned. *)
let remove_interps interp paths tokens =
  let snapshot = Array.to_list paths in
  let has_sons pos = List.exists (fun (_, super, _) -> super = pos) snapshot in
  Array.iteri (fun pos (id, _, _) ->
    if (ExtArray.get tokens id).orth = interp && not (has_sons pos)
    then paths.(pos) <- (0, -1, "")) paths;
  paths
307 | - | |
(** [correct_passive_voice paths tokens] swaps a ["praet"] token with its
    ["ppas"] parent: the ["praet"] takes over the parent's own parent, the
    ["ppas"] becomes its son, and every entry that currently hangs on the
    ["ppas"] is moved under the ["praet"] as well.

    Entries with a negative [super] are skipped. [paths] is modified in place
    and returned. *)
let correct_passive_voice paths tokens =
  let promote pos (id, super, label) =
    if super >= 0 then begin
      let parent_id, parent_super, parent_label = paths.(super) in
      if if_cat ["praet"] (ExtArray.get tokens id).token &&
         if_cat ["ppas"] (ExtArray.get tokens parent_id).token
      then begin
        paths.(pos) <- (id, parent_super, label);
        paths.(super) <- (parent_id, pos, parent_label);
        (* move the remaining sons of the former parent under [pos] *)
        Array.iteri (fun other_pos (other_id, other_super, other_label) ->
          if other_super = super
          then paths.(other_pos) <- (other_id, pos, other_label)) paths
      end
    end in
  Array.iteri promote paths;
  paths
320 | - | |
(** [swap_dep paths tokens] reverses selected dependencies so that the
    dependent becomes the head of its former parent. An entry is swapped when
    one of the following holds:

    - it is a ["comp"] under a verbal form;
    - it is a ["conj"] without sons under a verbal form;
    - it is a ["ppron3"] with ["praep"] at interpretation position 5;
    - its lemma is one of the relative/interrogative words listed in
      [adv_relators] and its parent is a verbal form or a ["subst"].

    After that, an [adv_relators] entry whose original parent is a ["subst"]
    or ["pred"] is examined again with its updated triple.

    [paths] is modified in place and returned.

    NOTE(review): parents are looked up in [tokens] by their position, and
    [change_dep] stores the dependent's [id] (not its position) as the new
    parent of the former head; confirm that ids and positions coincide for
    the callers of this function. *)
let swap_dep paths tokens =
  let adv_relators =
    ["kto";"co";"ile";"czyj";"jaki";"który";
     "jak";"skąd";"dokąd";"gdzie";"którędy";"kiedy";"odkąd";"dlaczego";"czemu";"gdy"] in
  let verbal = ["fin"; "praet"; "winien"; "pred"; "imps"; "ppas"] in
  let token_at n = (ExtArray.get tokens n).token in

  let change_dep pos (id, super, label) =
    let parent_id, parent_super, parent_label = paths.(super) in
    paths.(pos) <- (id, parent_super, label);
    paths.(super) <- (parent_id, id, parent_label) in

  let rec correct_dep pos (id, super, label) =
    let has_sons () =
      List.exists (fun (_, parent, _) -> parent = pos) (Array.to_list paths) in
    if (if_cat ["comp"] (token_at id) && if_cat verbal (token_at super)) ||
       (if_cat ["conj"] (token_at id) && if_cat verbal (token_at super) &&
        not (has_sons ())) ||
       (if_cat ["ppron3"] (token_at id) &&
        if_interps [5, "praep"] (token_at id)) ||
       (if_lemma adv_relators (token_at id) &&
        if_cat ["fin"; "praet"; "winien"; "pred"; "imps"; "ppas"; "subst"]
          (token_at super))
    then change_dep pos (id, super, label);
    if if_lemma adv_relators (token_at id) &&
       if_cat ["subst"; "pred"] (token_at super)
    then correct_dep pos paths.(pos) in

  Array.iteri correct_dep paths;
  paths
344 | - | |
(* correct_coordination1: the neighbour is the nearest word to the right; if a
   conjunction stands between it and the current word, the nearest word to the
   left is taken instead.
   Not handled yet: coordination within the passive voice (both auxiliary
   verbs and participles).
   Not handled yet: coordination of the dependents of subordinating
   conjunctions. *)
parser/visualization.ml
... | ... | @@ -916,7 +916,7 @@ let rec html_of_text path tokens = function |
916 | 916 | sprintf "<tr><td>%s</td><td>%s</td></tr>" (string_of_mode mode) (html_of_text path tokens text))) ^ |
917 | 917 | "</table>" |
918 | 918 | |
919 | -let print_html_text path name text tokens lex_sems = | |
919 | +let print_html_text path name text tokens (*lex_sems*) = | |
920 | 920 | File.file_out (path ^ name ^ ".html") (fun file -> |
921 | 921 | fprintf file "%s\n" html_header; |
922 | 922 | fprintf file "%s<BR>\n" (html_of_text path tokens text); |
... | ... |
pre/makefile
... | ... | @@ -3,7 +3,7 @@ OCAMLOPT=ocamlopt |
3 | 3 | OCAMLDEP=ocamldep |
4 | 4 | INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I +eniam |
5 | 5 | OCAMLFLAGS=$(INCLUDES) -g |
6 | -OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa eniam-plWordnet.cmxa eniam-walenty.cmxa eniam-integration.cmxa eniam-lexSemantics.cmxa | |
6 | +OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa eniam-plWordnet.cmxa eniam-lcg-parser.cmxa eniam-lcg-lexicon.cmxa eniam-integration.cmxa eniam-lexSemantics.cmxa | |
7 | 7 | INSTALLDIR=`ocamlc -where` |
8 | 8 | |
9 | 9 | WAL= paths.ml |
... | ... |
pre/preProcessing.ml
... | ... | @@ -121,9 +121,9 @@ let parse_text = function |
121 | 121 | let lex_sems = ENIAMlexSemantics.assign tokens text in |
122 | 122 | text,tokens,lex_sems |
123 | 123 | | AltText[Raw,RawText query;CONLL,StructText[ |
124 | - StructParagraph[{psentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence dep_paths]} as p]]],tokens -> | |
124 | + StructParagraph[{sentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence dep_paths]} as p]]],tokens -> | |
125 | 125 | let m_dep_paths = Array.map (fun (id,_,_) -> id,-1,"") dep_paths in |
126 | - let conll = StructParagraph[{p with psentence = AltSentence([Raw, RawSentence text; CONLL, DepSentence dep_paths] | |
126 | + let conll = StructParagraph[{p with sentence = AltSentence([Raw, RawSentence text; CONLL, DepSentence dep_paths] | |
127 | 127 | @ if Paths.config.Paths.mate_parser_enabled then [Mate, DepSentence m_dep_paths] else [])}] in |
128 | 128 | let paths = ENIAMsubsyntax.parse query in |
129 | 129 | let sentences = ENIAMsentences.split_into_sentences "" query tokens paths in |
... | ... | @@ -135,7 +135,7 @@ let parse_text = function |
135 | 135 | |
136 | 136 | let rec main_loop in_chan out_chan = |
137 | 137 | (* print_endline "main_loop 1"; *) |
138 | - let query = (Marshal.from_channel in_chan : text * ENIAMtokenizerTypes.token_record ExtArray.t) in | |
138 | + let query = (Marshal.from_channel in_chan : text * ENIAMtokenizerTypes.token_env ExtArray.t) in | |
139 | 139 | (* print_endline "main_loop 2"; *) |
140 | 140 | if fst query = RawText "" then () else ( |
141 | 141 | (try |
... | ... | @@ -154,7 +154,7 @@ let rec main_loop in_chan out_chan = |
154 | 154 | (* print_endline "main_loop 7"; *) |
155 | 155 | Marshal.to_channel out_chan ( |
156 | 156 | RawText "", |
157 | - ExtArray.make 1 ENIAMtokenizerTypes.empty_token, | |
157 | + ExtArray.make 1 ENIAMtokenizerTypes.empty_token_env, | |
158 | 158 | ExtArray.make 1 ENIAMlexSemanticsTypes.empty_lex_sem, |
159 | 159 | Printexc.to_string e, |
160 | 160 | 0.) [])); |
... | ... |
testy/skladnica-test2.conll
... | ... | @@ -11,7 +11,7 @@ |
11 | 11 | 5 szanse szansa subst subst pl|acc|f 4 obj_th _ _ |
12 | 12 | 6 ? ? interp interp _ 4 punct _ _ |
13 | 13 | |
14 | -# trees/NKJP_1M_1202900095/morph_3-p/morph_3.46-s.xml.tree | |
14 | +# trees/NKJP_1M_1202900095/morph_3-p/morph_3.46-s.xml.trees | |
15 | 15 | 1 - - interp interp 0 _ _ _ |
16 | 16 | 2 Słoń słoń subst subst sg|nom|m2 4 _ _ _ |
17 | 17 | 3 - - interp interp 0 _ _ _ |
... | ... | @@ -19,7 +19,7 @@ |
19 | 19 | 5 Pinio Pinio subst subst sg|nom|m1 4 _ _ _ |
20 | 20 | 6 . . interp interp 0 _ _ _ |
21 | 21 | |
22 | -# trees/NKJP_1M_2002000114/morph_2-p/morph_2.72-s.xml.tree | |
22 | +# trees/NKJP_1M_2002000114/morph_2-p/morph_2.72-s.xml.trees | |
23 | 23 | 1 - - interp interp 0 _ _ _ |
24 | 24 | 2 Nie nie qub qub 3 _ _ _ |
25 | 25 | 3 mogę móc fin fin sg|pri|imperf 7 _ _ _ |
... | ... | @@ -29,7 +29,7 @@ |
29 | 29 | 7 zachrypiał zachrypieć praet praet sg|m1|perf 0 _ _ _ |
30 | 30 | 8 . . interp interp 0 _ _ _ |
31 | 31 | |
32 | -# trees/NKJP_1M_2002000028/morph_5-p/morph_5.40-s.xml.tree | |
32 | +# trees/NKJP_1M_2002000028/morph_5-p/morph_5.40-s.xml.trees | |
33 | 33 | 1 - - interp interp 0 _ _ _ |
34 | 34 | 2 Właśnie właśnie qub qub 4 _ _ _ |
35 | 35 | 3 to to subst subst sg|acc|n 4 _ _ _ |
... | ... | @@ -39,7 +39,7 @@ |
39 | 39 | 7 twardo twardo adv adv pos 6 _ _ _ |
40 | 40 | 8 . . interp interp 0 _ _ _ |
41 | 41 | |
42 | -# trees/NKJP_1M_1202000001/morph_3-p/morph_3.9-s.xml.tree | |
42 | +# trees/NKJP_1M_1202000001/morph_3-p/morph_3.9-s.xml.trees | |
43 | 43 | 1 CKM CKM subst subst sg|nom|n 0 _ _ _ |
44 | 44 | 2 : interp 0 _ _ _ |
45 | 45 | 3 Jak jak adv adv pos 5 _ _ _ |
... | ... | @@ -50,7 +50,7 @@ |
50 | 50 | 8 patrzeć patrzeć inf inf imperf 5 _ _ _ |
51 | 51 | 9 ? ? interp interp 0 _ _ _ |
52 | 52 | |
53 | -# trees/NKJP_1M_2001000023/morph_1-p/morph_1.61-s.xml.tree | |
53 | +# trees/NKJP_1M_2001000023/morph_1-p/morph_1.61-s.xml.trees | |
54 | 54 | 1 Pochylił pochylić praet praet sg|m1|perf 0 _ _ _ |
55 | 55 | 2 em być aglt aglt sg|pri|imperf|wok 1 _ _ _ |
56 | 56 | 3 się się qub qub 1 _ _ _ |
... | ... |