Commit 0b0d4af3a9070341c24d391ddcd082f4cf5e15c9

Authored by Wojciech Jaworski
2 parents d06dc00b 535b2498

Merge branch 'dep_trees' into integration

LCGlexicon/resources/lexicon-pl.dic
@@ -7,7 +7,7 @@ @@ -7,7 +7,7 @@
7 month-lex month-interval year-interval roman roman-interval 7 month-lex month-interval year-interval roman roman-interval
8 hour-minute-interval hour-interval obj-id match-result 8 hour-minute-interval hour-interval obj-id match-result
9 url email day-month day year date hour hour-minute 9 url email day-month day year date hour hour-minute
10 - się nie by s <root> or or2 <colon> <speaker> <speaker-end> <squery> 10 + się nie by s <root> <conll_root> or or2 <colon> <speaker> <speaker-end> <squery>
11 11
12 @WEIGHTS 12 @WEIGHTS
13 symbol_weight=1 13 symbol_weight=1
@@ -272,6 +272,8 @@ pos=unk: np*number*case*gender*person; @@ -272,6 +272,8 @@ pos=unk: np*number*case*gender*person;
272 # [LCGrenderer.make_frame false tokens lex_sems [] schema_list ["<conll_root>"] d batrs] 272 # [LCGrenderer.make_frame false tokens lex_sems [] schema_list ["<conll_root>"] d batrs]
273 # | lemma,c,l -> failwith ("process_interp: " ^ lemma ^ ":" ^ c ^ ":" ^ (String.concat ":" (Xlist.map l (String.concat ".")))) in 273 # | lemma,c,l -> failwith ("process_interp: " ^ lemma ^ ":" ^ c ^ ":" ^ (String.concat ":" (Xlist.map l (String.concat ".")))) in
274 274
  275 +lemma=<conll_root>,pos=interp: <conll_root>/(ip*T*T*T+cp*int*T+np*sg*voc*T*T+interj);
  276 +
275 pos=sinterj: BRACKET interj; 277 pos=sinterj: BRACKET interj;
276 278
277 lemma=</sentence>,pos=interp: BRACKET s\?(ip*T*T*T+cp*int*T+np*sg*voc*T*T+interj); 279 lemma=</sentence>,pos=interp: BRACKET s\?(ip*T*T*T+cp*int*T+np*sg*voc*T*T+interj);
LCGparser/ENIAM_LCGrules.ml
@@ -446,8 +446,8 @@ let backward_cross_composition references args functs = @@ -446,8 +446,8 @@ let backward_cross_composition references args functs =
446 let rules = [ 446 let rules = [
447 backward_application; 447 backward_application;
448 forward_application; 448 forward_application;
449 - backward_cross_composition;  
450 - forward_cross_composition; 449 + (* backward_cross_composition; *)
  450 + (* forward_cross_composition; *)
451 ] 451 ]
452 452
453 let rec flatten_functor2 l seml = function 453 let rec flatten_functor2 l seml = function
corpora/CONLL.ml
@@ -3,133 +3,55 @@ open ENIAMsubsyntaxTypes @@ -3,133 +3,55 @@ open ENIAMsubsyntaxTypes
3 open ENIAMtokenizerTypes 3 open ENIAMtokenizerTypes
4 4
5 let alternative_string f mode alts = if List.exists (fun (m,_) -> mode = m) alts 5 let alternative_string f mode alts = if List.exists (fun (m,_) -> mode = m) alts
6 - then f (snd @@ List.find (fun (m,_) -> m = mode) alts)  
7 - else f (snd @@ List.find (fun (m,_) -> m = Struct) alts)  
8 -  
9 -let string_of_token mode token conll_id super label =  
10 - let decompose_lemma = function  
11 - | Lemma(a,b,c) -> a,b,if c = [[]]  
12 - then "_"  
13 - else String.concat "][" @@ Xlist.map c (fun x ->  
14 - String.concat "|" @@ Xlist.map x ( fun y ->  
15 - String.concat "." y))  
16 - | t -> failwith ("string_of_token: not Lemma") in  
17 - match mode with  
18 - | Raw -> token.orth  
19 - | Struct -> failwith ("function string_of_token for mode Struct is not defined")  
20 - | CONLL -> let lemma,cat,interp = decompose_lemma token.token in  
21 - String.concat "\t" [string_of_int conll_id;  
22 - token.orth; lemma; cat; cat; interp; "_"; "_";  
23 - string_of_int token.beg; string_of_int token.len]  
24 - | Mate -> let lemma,cat,interp = decompose_lemma token.token in  
25 - String.concat "\t" [string_of_int conll_id;  
26 - token.orth; lemma; lemma; cat; cat; interp; interp; "_"; "_"; "_"; "_"; "_"; "_"]  
27 - | _ -> failwith "string_of_token: ni"  
28 -  
29 -let string_of_paths mode tokens paths =  
30 - let l = Int.fold 1 (Array.length paths - 1) [] (fun l conll_id ->  
31 - let id,super,label = paths.(conll_id) in  
32 - (string_of_token mode (ExtArray.get tokens id) conll_id super label) :: l) in  
33 - String.concat "\n" (List.rev l) ^ "\n\n"  
34 -  
35 -let rec string_of_sentence mode tokens = function  
36 - RawSentence s -> if mode = Raw then s else ""  
37 - | StructSentence (tokens, _) -> failwith ("string_of_sentence: StructSentence") (*String.concat "\n" @@ Xlist.map tokens (fun x -> string_of_token mode x)*)  
38 - | DepSentence (paths) -> string_of_paths mode tokens paths  
39 - | QuotedSentences _ -> failwith ("string_of_sentence: QuotedSentences")  
40 - | AltSentence alts -> alternative_string (string_of_sentence mode tokens) mode alts  
41 -  
42 -let string_of_p_record mode tokens p_record =  
43 - (if p_record.id = "" then "" else p_record.id ^ "\n") ^  
44 - string_of_sentence mode tokens p_record.sentence  
45 -  
46 -(*let rec string_of_paragraph mode tokens = function  
47 - RawParagraph s -> if mode = Raw then s else ""  
48 - | StructParagraph p_records -> String.concat "\n\n" @@ Xlist.map p_records (string_of_p_record mode tokens)  
49 - | AltParagraph alts -> alternative_string (string_of_paragraph mode) mode alts  
50 -  
51 -let rec string_of_text mode tokens = function  
52 - RawText s -> if mode = Raw then s else ""  
53 - | StructText (paragraphs,_) -> String.concat "\n\n" @@ Xlist.map paragraphs (string_of_paragraph mode tokens)  
54 - | AltText alts -> alternative_string (string_of_text mode) mode alts*)  
55 -  
56 -  
57 -(******************)  
58 -(***  
59 -let establish_next tokens paths =  
60 - let n = ExtArray.size tokens in  
61 - Int.iter 1 (n - 2) (fun i ->  
62 - let f = ExtArray.get tokens i in  
63 - let s = ExtArray.get tokens (i+1) in  
64 - ExtArray.set tokens i {f with next = s.beg});  
65 - let last = ExtArray.get tokens (n-1) in  
66 - ExtArray.set tokens (n-1) {last with next = last.beg + last.len}  
67 -  
68 -  
69 - (*let rec pom res = function  
70 - h :: t -> let next = if res = []  
71 - then h.beg+h.len  
72 - else (List.hd res).beg in  
73 - pom ({h with next = next} :: res) t  
74 - | [] -> res in  
75 - pom [] rev_tokens*)  
76 -  
77 -let rec establish_for_token i text tokens = function  
78 - (id,_,_) :: t as l->  
79 - let h = ExtArray.get tokens id in  
80 - if Xstring.check_prefix " " text  
81 - then establish_for_token (i+100) (Xstring.cut_prefix " " text) tokens l  
82 - else if Xstring.check_prefix h.orth text  
83 - then  
84 - let n = (List.length @@ Xunicode.utf8_chars_of_utf8_string h.orth) * 100 in  
85 - let n_h = {h with beg = i ; len = n} in  
86 - ExtArray.set tokens id n_h;  
87 - establish_for_token (i+n) (Xstring.cut_prefix h.orth text) tokens t  
88 - else failwith ("establish_for_token :" ^ h.orth ^ " " ^ text)  
89 - | [] -> 100, i  
90 -  
91 -let rec establish_lengths text paths tokens =  
92 - let pbeg, plen = establish_for_token 100 text tokens (List.tl (Array.to_list paths)) in  
93 - establish_next tokens paths;  
94 - pbeg, plen-100  
95 -  
96 -(******************)  
97 -  
98 -exception ErrorInfoFile of string  
99 -  
100 -let info_file = "../corpora/info_sentences.txt"  
101 -  
102 -let info = Xstring.split "\n\n" @@ File.load_file_gen info_file  
103 -  
104 -let add_to_map map info_str =  
105 - match Xstring.split "\n" info_str with  
106 - [id; text; info_token] -> StringMap.add map info_token (id, text)  
107 - | _ -> raise (ErrorInfoFile info_str)  
108 -  
109 -let info_map =  
110 - Xlist.fold info StringMap.empty add_to_map  
111 -  
112 -let match_sentence (p_record,tokens) =  
113 - let rec info_token s = match s with  
114 - RawSentence text -> failwith ("match_sentence: " ^ text)  
115 - | StructSentence (tokens, n) -> failwith ("match_sentence: StructSentence") (*String.concat " " @@ List.map (fun x -> x.orth) tokens*)  
116 - | DepSentence (paths) -> String.concat " " @@ List.map (fun (id,_,_) -> (ExtArray.get tokens id).orth) (List.tl (Array.to_list paths)), paths  
117 - | QuotedSentences _ -> failwith ("match_sentence: QuotedSentences")  
118 - | AltSentence alts -> failwith ("match_sentence: AltSentence")  
119 - (*if List.exists (fun (mode, s) -> mode = CONLL) alts  
120 - then info_token (snd (List.find (fun (mode, s) -> mode = CONLL) alts))  
121 - else failwith ("match_sentence: no CONLL mode in AltSentence")*) in  
122 - let info_token, paths = info_token p_record.psentence in  
123 - try  
124 - let id, text = StringMap.find info_map info_token in  
125 - let beg, len = establish_lengths text paths tokens (* -1, -1, p_record.psentence *) in  
126 - AltText[Raw,RawText text;CONLL,StructText([StructParagraph[{pid = id; pbeg = beg; plen = len; pnext = beg+len; pfile_prefix="";  
127 - psentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence paths]}]],tokens)]  
128 -(* {s_id = id; s_text = text; s_tokens = sentence.s_tokens} *)  
129 - with _ -> AltText[CONLL,StructText([StructParagraph[p_record]],tokens)]  
130 -  
131 -let match_corpus corpus =  
132 - Xlist.map corpus match_sentence***) 6 + then f (snd @@ List.find (fun (m,_) -> m = mode) alts)
  7 + else f (snd @@ List.find (fun (m,_) -> m = Struct) alts)
  8 +
  9 +let string_of_token mode token conll_id super label =
  10 + let decompose_lemma = function
  11 + | Lemma(a,b,c) -> a,b,if c = [[]]
  12 + then "_"
  13 + else String.concat "][" @@ Xlist.map c (fun x ->
  14 + String.concat "|" @@ Xlist.map x ( fun y ->
  15 + String.concat "." y))
  16 + | t -> failwith ("string_of_token: not Lemma") in
  17 + match mode with
  18 + | Raw -> token.orth
  19 + | Struct -> failwith ("function string_of_token for mode Struct is not defined")
  20 + | CONLL -> let lemma,cat,interp = decompose_lemma token.token in
  21 + String.concat "\t" [string_of_int conll_id;
  22 + token.orth; lemma; cat; cat; interp; "_"; "_";
  23 + string_of_int token.beg; string_of_int token.len]
  24 + | Mate -> let lemma,cat,interp = decompose_lemma token.token in
  25 + String.concat "\t" [string_of_int conll_id;
  26 + token.orth; lemma; lemma; cat; cat; interp; interp; "_"; "_"; "_"; "_"; "_"; "_"]
  27 + | _ -> failwith "string_of_token: ni"
  28 +
  29 +let string_of_paths mode tokens paths =
  30 + let l = Int.fold 1 (Array.length paths - 1) [] (fun l conll_id ->
  31 + let id,super,label = paths.(conll_id) in
  32 + (string_of_token mode (ExtArray.get tokens id) conll_id super label) :: l) in
  33 + String.concat "\n" (List.rev l) ^ "\n\n"
  34 +
  35 +let rec string_of_sentence mode tokens = function
  36 + RawSentence s -> if mode = Raw then s else ""
  37 + | StructSentence (tokens, _) -> failwith ("string_of_sentence: StructSentence") (*String.concat "\n" @@ Xlist.map tokens (fun x -> string_of_token mode x)*)
  38 + | DepSentence (paths) -> string_of_paths mode tokens paths
  39 + | QuotedSentences _ -> failwith ("string_of_sentence: QuotedSentences")
  40 + | AltSentence alts -> alternative_string (string_of_sentence mode tokens) mode alts
  41 +
  42 +let string_of_p_record mode tokens p_record =
  43 + (if p_record.id = "" then "" else p_record.id ^ "\n") ^
  44 + string_of_sentence mode tokens p_record.sentence
  45 +
  46 +(*let rec string_of_paragraph mode tokens = function
  47 + RawParagraph s -> if mode = Raw then s else ""
  48 + | StructParagraph p_records -> String.concat "\n\n" @@ Xlist.map p_records (string_of_p_record mode tokens)
  49 + | AltParagraph alts -> alternative_string (string_of_paragraph mode) mode alts
  50 +
  51 +let rec string_of_text mode tokens = function
  52 + RawText s -> if mode = Raw then s else ""
  53 + | StructText (paragraphs,_) -> String.concat "\n\n" @@ Xlist.map paragraphs (string_of_paragraph mode tokens)
  54 + | AltText alts -> alternative_string (string_of_text mode) mode alts*)
133 55
134 (******************) 56 (******************)
135 57
@@ -207,15 +129,6 @@ let establish_next tokens paths = @@ -207,15 +129,6 @@ let establish_next tokens paths =
207 let last = ExtArray.get tokens (n-1) in 129 let last = ExtArray.get tokens (n-1) in
208 ExtArray.set tokens (n-1) {last with next = last.beg + last.len} 130 ExtArray.set tokens (n-1) {last with next = last.beg + last.len}
209 131
210 -  
211 - (*let rec pom res = function  
212 - h :: t -> let next = if res = []  
213 - then h.beg+h.len  
214 - else (List.hd res).beg in  
215 - pom ({h with next = next} :: res) t  
216 - | [] -> res in  
217 - pom [] rev_tokens*)  
218 -  
219 let rec establish_for_token i text tokens = function 132 let rec establish_for_token i text tokens = function
220 (id,_,_) :: t as l-> 133 (id,_,_) :: t as l->
221 let h = ExtArray.get tokens id in 134 let h = ExtArray.get tokens id in
@@ -245,15 +158,15 @@ exception ErrorInfoFile of string @@ -245,15 +158,15 @@ exception ErrorInfoFile of string
245 158
246 let info_file = "../corpora/info_sentences2.txt" 159 let info_file = "../corpora/info_sentences2.txt"
247 160
248 -let info = Xstring.split "\n\n" @@ File.load_file_gen info_file 161 +let info () = Xstring.split "\n\n" @@ File.load_file_gen info_file
249 162
250 let add_to_map map info_str = 163 let add_to_map map info_str =
251 match Xstring.split "\n" info_str with 164 match Xstring.split "\n" info_str with
252 [id; text; info_token] -> StringMap.add map info_token (id, text) 165 [id; text; info_token] -> StringMap.add map info_token (id, text)
253 | _ -> raise (ErrorInfoFile info_str) 166 | _ -> raise (ErrorInfoFile info_str)
254 167
255 -let info_map =  
256 - Xlist.fold (List.tl info) StringMap.empty add_to_map 168 +let info_map () =
  169 + Xlist.fold (List.tl (info ())) StringMap.empty add_to_map
257 170
258 let match_sentence (p_record,tokens) = 171 let match_sentence (p_record,tokens) =
259 let rec info_token s = match s with 172 let rec info_token s = match s with
@@ -268,7 +181,7 @@ let match_sentence (p_record,tokens) = @@ -268,7 +181,7 @@ let match_sentence (p_record,tokens) =
268 let info_token, paths = info_token p_record.sentence in 181 let info_token, paths = info_token p_record.sentence in
269 (* try *) 182 (* try *)
270 let id, text = try 183 let id, text = try
271 - StringMap.find info_map info_token 184 + StringMap.find (info_map ()) info_token
272 with 185 with
273 | _ -> p_record.id, get_text tokens in 186 | _ -> p_record.id, get_text tokens in
274 let beg, len = establish_lengths text paths tokens (* -1, -1, p_record.psentence *) in 187 let beg, len = establish_lengths text paths tokens (* -1, -1, p_record.psentence *) in
@@ -282,7 +195,7 @@ let match_corpus corpus = @@ -282,7 +195,7 @@ let match_corpus corpus =
282 [] -> [] 195 [] -> []
283 | a::l -> try 196 | a::l -> try
284 let r = f a in r :: pom f l 197 let r = f a in r :: pom f l
285 - with e -> (*print_endline (Printexc.to_string e);*) pom f l in 198 + with e -> pom f l in
286 pom match_sentence corpus 199 pom match_sentence corpus
287 200
288 (******************) 201 (******************)
@@ -304,7 +217,6 @@ let load_token in_channel = @@ -304,7 +217,6 @@ let load_token in_channel =
304 else [Xlist.map (Xstring.split_delim "|" interp) (fun tag -> [tag])] in 217 else [Xlist.map (Xstring.split_delim "|" interp) (fun tag -> [tag])] in
305 {empty_token_env with orth = orth; token = Lemma(lemma,cat,interp);}, int_of_string id, int_of_super super, label in 218 {empty_token_env with orth = orth; token = Lemma(lemma,cat,interp);}, int_of_string id, int_of_super super, label in
306 let line = input_line in_channel in 219 let line = input_line in_channel in
307 - (* print_endline ("load_token: " ^ line); *)  
308 if line = "" 220 if line = ""
309 then raise Empty_line 221 then raise Empty_line
310 else if line.[0] = '#' 222 else if line.[0] = '#'
@@ -329,30 +241,19 @@ let load_token in_channel = @@ -329,30 +241,19 @@ let load_token in_channel =
329 let label = Xstring.cut_sufix "_" label_err in 241 let label = Xstring.cut_sufix "_" label_err in
330 n_token id orth lemma cat interp super label) 242 n_token id orth lemma cat interp super label)
331 | _ -> failwith ("load_token: " ^ line) 243 | _ -> failwith ("load_token: " ^ line)
332 -(* {c_id = List.nth pom 1;  
333 - c_lemma = List.nth pom 2;  
334 - c_cat = List.nth pom 3;  
335 - c_interp = (let interp = List.nth pom 5 in  
336 - if interp = "_"  
337 - then []  
338 - else Str.split (Str.regexp "|") interp);  
339 - c_super = -1; c_label = ""; c_beg = -1; c_len = -1} *)  
340 244
341 let load_sentence in_channel = 245 let load_sentence in_channel =
342 let tokens = ExtArray.make 100 empty_token_env in 246 let tokens = ExtArray.make 100 empty_token_env in
343 let _ = ExtArray.add tokens {empty_token_env with token = Interp "<conll_root>"} in 247 let _ = ExtArray.add tokens {empty_token_env with token = Interp "<conll_root>"} in
344 let rec pom rev_paths id = 248 let rec pom rev_paths id =
345 - (* print_endline "pom 1"; *)  
346 try 249 try
347 - (* print_endline "pom 2"; *)  
348 let token, conll_id, super, label = load_token in_channel in 250 let token, conll_id, super, label = load_token in_channel in
349 let id_a = ExtArray.add tokens token in 251 let id_a = ExtArray.add tokens token in
350 if id_a <> conll_id then failwith "load_sentence: different ids" else 252 if id_a <> conll_id then failwith "load_sentence: different ids" else
351 - (* print_endline "pom 3"; *)  
352 pom ((id_a,super,label) :: rev_paths) id 253 pom ((id_a,super,label) :: rev_paths) id
353 - with Id_line new_id -> (*print_endline "pom 4";*)pom rev_paths new_id  
354 - | Empty_line -> (*print_endline "pom 5";*)rev_paths, id  
355 - | End_of_file -> (*print_endline "pom 6";*)if rev_paths = [] 254 + with Id_line new_id -> pom rev_paths new_id
  255 + | Empty_line -> rev_paths, id
  256 + | End_of_file -> if rev_paths = []
356 then raise End_of_file 257 then raise End_of_file
357 else rev_paths, id in 258 else rev_paths, id in
358 let rev_paths, id = pom [] "" in 259 let rev_paths, id = pom [] "" in
@@ -366,4 +267,4 @@ let load_corpus in_channel = @@ -366,4 +267,4 @@ let load_corpus in_channel =
366 pom ((conll_sentence, tokens) :: res) 267 pom ((conll_sentence, tokens) :: res)
367 with End_of_file -> res 268 with End_of_file -> res
368 | e -> prerr_endline (Printexc.to_string e); res in 269 | e -> prerr_endline (Printexc.to_string e); res in
369 - (* match_corpus @@ *) List.rev @@ pom [] 270 + List.rev @@ pom []
corpora/CONLL_adapter.ml
  1 +open Xstd
  2 +open ENIAMsubsyntaxTypes
  3 +open ENIAMtokenizerTypes
1 4
2 -let convert_dep_tree id first_try paths tokens lex_sems =  
3 - let do_if cond f paths = if cond then f paths tokens else paths in 5 +let if_lemma lemmas = function
  6 + Lemma(l,_,_) -> List.exists (fun x -> x = l) lemmas
  7 + | _ -> false
  8 +
  9 +let if_cat cats = function
  10 + Lemma(_,cat,_) -> List.exists (fun x -> x = cat) cats
  11 + | _ -> false
  12 +
  13 +let if_interps interps token =
  14 + let interp = match token with
  15 + Lemma(_,_,i) -> i
  16 + | _ -> [[[]]] in
  17 + let if_interp nr value =
  18 + List.exists (fun x ->
  19 + try
  20 + List.exists (fun y ->
  21 + y = value) (List.nth x nr)
  22 + with _ -> false
  23 + ) interp in
  24 + Xlist.fold interps true (fun acc (nr,value) -> acc && (if_interp nr value))
  25 +
  26 +let correct_coordination1 paths tokens =
  27 + let paths_ls = List.mapi (fun i (id,super,label) ->
  28 + (i,id,super,label)) (Array.to_list paths) in
  29 +
  30 + let l = [("subst:nom",0),(["fin";"praet"],0);
  31 + ("subst:acc",0),(["inf"],0);
  32 + ("ppron3:nom",0),(["fin";"praet"],0);
  33 + ("ppron3:acc",0),(["fin";"praet"],0);
  34 + ("adv",0),(["fin";"praet"],0);
  35 + ("adv",0),(["inf"],0);
  36 + ("adv",0),(["adj"],0);
  37 + ("prep",0),(["fin";"praet"],0);
  38 + ("prep",0),(["inf"],0);
  39 + ("prep",0),(["ppas"],0);
  40 + ("prep",0),(["subst"],0);
  41 + ("prep:gen",0),(["subst:gen"],0);
  42 + ("adj:nom",0),(["fin";"praet"],0);
  43 + ("adj:nom",0),(["subst:nom"],0);
  44 + ("adj:gen",0),(["subst:gen"],0);
  45 + ("adj:dat",0),(["subst:dat"],0);
  46 + ("adj:acc",0),(["subst:acc"],0);
  47 + ("adj:inst",0),(["subst:inst"],0);
  48 + ("adj:loc",0),(["subst:loc"],0);
  49 + ("subst:gen",0),(["subst:nom"],0);
  50 + (* ("subst:gen",0),(["subst:gen"],0); *)
  51 + ("subst:gen",0),(["subst:dat"],0);
  52 + ("subst:gen",0),(["subst:acc"],0);
  53 + ("subst:gen",0),(["subst:inst"],0);
  54 + ("subst:gen",0),(["subst:loc"],0);
  55 + ("ppron3:gen",0),(["subst:nom"],0);
  56 + ("ppron3:gen",0),(["subst:dat"],0);
  57 + ("ppron3:gen",0),(["subst:acc"],0);
  58 + ("ppron3:gen",0),(["subst:inst"],0);
  59 + ("ppron3:gen",0),(["subst:loc"],0);
  60 + ("qub",0),(["fin";"praet"],0);
  61 + ("qub",0),(["subst"],0);
  62 + ("qub",0),(["adj"],0);
  63 + ("pact",0),(["subst"],0);
  64 + ("ppas",0),(["subst"],0)
  65 + ] in
  66 +
  67 + let find_dependents sons =
  68 +
  69 + let is (i,id,super,label) pattern = match Xstring.split ":" pattern with
  70 + ["prep";case] -> if_cat ["prep"] (ExtArray.get tokens id).token &&
  71 + if_interps [0,case] (ExtArray.get tokens id).token
  72 + | [cat;case] -> if_cat [cat] (ExtArray.get tokens id).token &&
  73 + if_interps [1,case] (ExtArray.get tokens id).token
  74 + | [cat] -> if_cat [cat] (ExtArray.get tokens id).token
  75 + | _ -> failwith "is (in correct_coordination1)" in
  76 +
  77 + let incr_representative acc son = Xlist.map acc (fun ((one,a),(rest,b)) ->
  78 + if is son one
  79 + then (one,a + 1), (rest,b)
  80 + else if List.exists (is son) rest
  81 + then (one,a), (rest,b + 1)
  82 + else (one,a), (rest,b)) in
  83 +
  84 + let get_from sons pattern = List.find (fun x -> is x pattern) sons in
  85 +
  86 + let l = Xlist.fold sons l incr_representative in
  87 + let results = List.filter (fun ((_,a),(_,b)) -> a = 1 && b > 1) l in
  88 + Xlist.map results (fun result ->
  89 + get_from sons @@ fst @@ fst result,
  90 + List.filter (fun son ->
  91 + List.exists (fun one -> is son one) (fst (snd result))) sons) in
  92 +
  93 + let establish_neighbour super ((i_d,id_d,super_d,label_d),sons) =
  94 + let not_between (i_s,_,_,_) =
  95 + (super < i_d && super < i_s) ||
  96 + (super > i_d && super > i_s) in
  97 + let (i_n,id_n,super_n,label_n) = List.find (fun son ->
  98 + not_between son) sons in
  99 + paths.(i_d) <- (id_d, i_n, label_d) in
  100 +
  101 + let examine_coords (i,id,super,label) sons =
  102 + try
  103 + let dependents = find_dependents sons in
  104 + Xlist.iter dependents (establish_neighbour super)
  105 + with
  106 + | _ -> () in
  107 +
  108 + Array.iteri (fun i (id,super,label) ->
  109 + if if_cat ["conj"] (ExtArray.get tokens id).token
  110 + then (let sons = List.filter (fun (_,_,super,_) -> super = i) paths_ls in
  111 + if (List.length sons > 2)
  112 + then examine_coords (i,id,super,label) sons)) paths;
  113 + paths
  114 +
  115 +let correct_coordination2 paths tokens =
  116 + let paths_c = Array.copy paths in
  117 + let paths_ls () = List.mapi (fun i (id,super,label) ->
  118 + (i,id,super,label)) (Array.to_list paths_c) in
  119 +
  120 + (* let ps a sons =
  121 + print_endline a;
  122 + List.iter (fun (i,_,_,_) -> print_endline (ExtArray.get tokens i).orth) sons;
  123 + print_endline "" in *)
  124 +
  125 + let rec correct_rec (i,id,super,label) sons =
  126 + let left_s, right_s = List.partition (fun (a,b,c,d) -> a < i) sons in
  127 + (* ps "left:" (List.rev left_s);
  128 + ps "right:" right_s; *)
  129 + find_father i (List.rev left_s);
  130 + find_father i right_s
  131 +
  132 + and find_father i0 = function
  133 + [(i,id,super,label)] -> paths_c.(i) <- (id,i0,label)
  134 + | (a,b,c,d) :: (i,id,super,label) :: t ->
  135 + paths_c.(i) <- (id,i0,label);
  136 + if not (if_cat ["conj"] (ExtArray.get tokens i).token ||
  137 + (ExtArray.get tokens i).orth = ",")
  138 + then failwith "find_father";
  139 + correct_rec (i,id,super,label) (if a < i
  140 + then (a,b,c,d) :: t
  141 + else List.rev @@ (a,b,c,d) :: t)
  142 + | _ -> failwith "find_father" in
  143 +
  144 + let check_previous_for_interp i =
  145 + if i >= 0 && (ExtArray.get tokens i).orth = "," &&
  146 + not (List.exists (fun (_,super,_) -> super = i) (Array.to_list paths_c))
  147 + then paths_c.(i) <- (0,-1,"") in
  148 +
  149 + Array.iteri (fun i (id,super,label) ->
  150 + if if_cat ["conj"] (ExtArray.get tokens i).token ||
  151 + (ExtArray.get tokens i).orth = ","
  152 + then
  153 + (check_previous_for_interp (i-1);
  154 + let sons = List.filter (fun (_,_,super,_) -> super = i) (paths_ls ()) in
  155 + if (List.length sons > 2)
  156 + then correct_rec (i,id,super,label) sons)) paths_c;
  157 + paths_c
  158 +
  159 +let praet_qub_aglt paths tokens =
  160 + Array.iteri (fun i (id,super,label) ->
  161 + if super >= 0 then
  162 + (let id_s, super_s, label_s = paths.(super) in
  163 + if if_cat ["aglt"] (ExtArray.get tokens id).token &&
  164 + (ExtArray.get tokens id_s).orth = "by"
  165 + then let id_gf,super_gf,label_gf = paths.(super_s) in
  166 + if if_cat ["praet"] (ExtArray.get tokens id_gf).token
  167 + then paths.(i) <- (id,super_s,label))) paths;
  168 + paths
  169 +
  170 +let replace_tokens paths tokens =
  171 +(* for i = 0 to ExtArray.size tokens - 1 do
  172 + print_endline (string_of_int i ^ ": "^ (ExtArray.get tokens i).orth)
  173 +done; *)
  174 + let find_token orth = Int.fold 0 (ExtArray.size tokens - 1) 0 (fun acc i ->
  175 + if (ExtArray.get tokens i).orth = orth then i else acc) in
  176 +
  177 + let multidot i id0 super0 label0 =
  178 + let id1, super1, label1 = paths.(super0) in
  179 + if super1 >= 0 then
  180 + let id2, super2, label2 = paths.(super1) in
  181 + if (ExtArray.get tokens id1).orth = "." &&
  182 + (ExtArray.get tokens id2).orth = "."
  183 + then
  184 + (paths.(super1) <- (find_token "..." ,super2, label2);
  185 + paths.(super0) <- (0,-1,"");
  186 + paths.(i) <- (0,-1,"")) in
  187 +
  188 + let brev i id super label =
  189 + let if_the_last_dot () =
  190 + let (id_dot, s_dot, l_dot) = List.find (fun (i2,s,l) ->
  191 + s = i && ((ExtArray.get tokens i2).orth = "." || (ExtArray.get tokens i2).orth = "...")) (Array.to_list paths) in
  192 + Array.fold_left (fun acc (i2,s,l) ->
  193 + acc && (ExtArray.get tokens i2).beg <= (ExtArray.get tokens id_dot).beg) true paths in
  194 +
  195 + let dot = if if_interps [0,"npun"] (ExtArray.get tokens id).token || if_the_last_dot ()
  196 + then ""
  197 + else "." in
  198 +
  199 + let n_orth = (ExtArray.get tokens id).orth ^ dot in
  200 + paths.(i) <- (find_token n_orth,super,label) in
  201 +
  202 + Array.iteri (fun i (id,super,label) ->
  203 + if (ExtArray.get tokens id).orth = "."
  204 + then multidot i id super label;
  205 + if if_cat ["brev"] (ExtArray.get tokens id).token
  206 + then brev i id super label)
  207 + paths;
  208 + paths
  209 +
  210 +let replace_hyphens paths tokens =
  211 + let ref_paths = ref paths in
  212 + let find_token token = Int.fold 0 (ExtArray.size tokens - 1) 0 (fun acc i ->
  213 + if (ExtArray.get tokens i).token = token then i else acc) in
  214 + let find_specific_token token beg next = Int.fold 0 (ExtArray.size tokens - 1) 0 (fun acc i ->
  215 + if (ExtArray.get tokens i).token = token &&
  216 + beg <= (ExtArray.get tokens i).beg &&
  217 + (ExtArray.get tokens i).next <= next
  218 + then i else acc) in
  219 +
  220 + let correct_last sons_of_zero = (* TODO: synowie zamiast syna *)
  221 + let i1,s1,l1 = !ref_paths.(Array.length !ref_paths - 1) in
  222 + if (ExtArray.get tokens i1).orth = "."
  223 + then
  224 + !ref_paths.(Array.length !ref_paths - 1) <- (find_token (Interp "</sentence>"),1,l1)
  225 + else
  226 + (ref_paths := Array.append !ref_paths [| (find_token (Interp "</sentence>"),1,"-") |];
  227 + !ref_paths.(Array.length !ref_paths - 2) <- (i1,Array.length !ref_paths - 1,l1));
  228 + Xlist.iter sons_of_zero (fun son_of_zero ->
  229 + let i2,s2,l2 = !ref_paths.(son_of_zero) in
  230 + !ref_paths.(son_of_zero) <- (i2,Array.length !ref_paths - 1,l2)) in
  231 +
  232 + let one_hyphen sons_of_zero =
  233 + let i2,s2,l2 = !ref_paths.(1) in
  234 + Xlist.iter sons_of_zero (fun son_of_zero ->
  235 + let i1,s1,l1 = !ref_paths.(son_of_zero) in
  236 + !ref_paths.(son_of_zero) <- (i1,1,l1));
  237 + !ref_paths.(1) <- (find_token (Interp "<or-sentence>"),0,l2);
  238 + correct_last sons_of_zero in
  239 +
  240 + let two_hyphens first second son parent =
  241 + let i1,s1,l1 = !ref_paths.(first) in
  242 + let i2,s2,l2 = !ref_paths.(second) in
  243 + let beg, next = (ExtArray.get tokens i2).beg, (ExtArray.get tokens i2).next in
  244 + let i3,s3,l3 = !ref_paths.(son) in
  245 + let i4,s4,l4 = !ref_paths.(parent) in
  246 + ref_paths := Array.append !ref_paths [| (find_token (Interp "</sentence>"),first,"-") |];
  247 + !ref_paths.(first) <- (find_token (Interp "<or-sentence>"),0,l1);
  248 + !ref_paths.(second) <- (find_specific_token (Interp "</or-sentence>") beg next,first,l2);
  249 + !ref_paths.(son) <- (i3,second,l3);
  250 + !ref_paths.(parent) <- (i4,first,l4) in
  251 +
  252 + let rec is_dep_correct a b out zero res i (id,super,label) = (* out = how many words in (a,b) have parent outside [a,b]*)
  253 + (* print_endline ((string_of_int a) ^ " " ^ (string_of_int b) ^ " " ^ (string_of_int out) ^ " " ^ (string_of_int zero) ^ " " ^ (string_of_int i)); *)
  254 + if out > 1 || zero > 1 || (* zero = how many words (not interps) have parent 0 *)
  255 + (a < i && i < b && super < a && label <> "interp") ||
  256 + (a < super && super < b && (i < a || b < i))
  257 + then false, res
  258 + else
  259 + if i+1 = Array.length !ref_paths
  260 + then out = 1 && zero = 1, res
  261 + else
  262 + if a < i && i < b && b < super
  263 + then is_dep_correct a b (out+1) zero (i,super) (i+1) !ref_paths.(i+1)
  264 + else
  265 + if super = 0 && not (if_cat ["interp"] (ExtArray.get tokens id).token)
  266 + then is_dep_correct a b out (zero+1) res (i+1) !ref_paths.(i+1)
  267 + else is_dep_correct a b out zero res (i+1) !ref_paths.(i+1) in
  268 +
  269 + let hyphens = snd @@ Array.fold_left (fun (i,acc) (id,super,label) ->
  270 + if (ExtArray.get tokens id).orth = "-"
  271 + then i+1, i :: acc
  272 + else i+1, acc) (0,[]) !ref_paths in
  273 +
  274 + let sons_of_zero = snd @@ Array.fold_left (fun (i,acc) (id,super,label) ->
  275 + if super = 0 && not (if_cat ["interp"] (ExtArray.get tokens id).token)
  276 + then i+1, i :: acc
  277 + else i+1, acc) (0,[]) !ref_paths in
  278 +
  279 + (if List.length sons_of_zero = 1
  280 + then
  281 + if List.length hyphens = 1 && hyphens = [1]
  282 + then one_hyphen sons_of_zero
  283 + else
  284 + if List.length hyphens = 2
  285 + then let a, b = List.nth hyphens 1, List.nth hyphens 0 in
  286 + let is_good, (son,parent) = is_dep_correct a b 0 0 (0,0) 1 !ref_paths.(1) in
  287 + if a = 1 && is_good
  288 + then two_hyphens a b son parent);
  289 + !ref_paths
  290 +
  291 +let correct_interp_with_father_0 paths tokens =
  292 + Array.iteri (fun i (id,super,label) ->
  293 + if (super = 0 ||
  294 + (ExtArray.get tokens id).token = Interp "<or-sentence>" ||
  295 + (ExtArray.get tokens id).token = Interp "</or-sentence>") && (ExtArray.get tokens id).orth = ","
  296 + then Array.iteri (fun i1 (id1,super1,label1) ->
  297 + if super1 = i
  298 + then paths.(i1) <- (id1,0,label1)) paths) paths;
  299 + paths
  300 +
  301 +let remove_interps interp paths tokens =
  302 + let paths_ls = Array.to_list paths in
  303 + Array.iteri (fun i (id,super,label) ->
  304 + if (ExtArray.get tokens id).orth = interp &&
  305 + not (List.exists (fun (_,super,_) -> super = i) paths_ls)
  306 + then paths.(i) <- (0,-1,"")) paths;
  307 + paths
  308 +
(* [correct_passive_voice paths tokens] inverts every dependency in which a
   "praet" node hangs under a "ppas" node: the "praet" node takes over the
   head of the "ppas" node, the "ppas" node becomes its dependent, and every
   other dependent of the "ppas" node is moved under the "praet" node.
   [paths] is modified in place and returned. *)
let correct_passive_voice paths tokens =
  let last = Array.length paths - 1 in
  let has_cat cats tok_id = if_cat cats (ExtArray.get tokens tok_id).token in
  for i = 0 to last do
    let (id, super, label) = paths.(i) in
    (* Nodes with a negative head have no entry to look up. *)
    if super >= 0 then begin
      let (id_s, super_s, label_s) = paths.(super) in
      if has_cat ["praet"] id && has_cat ["ppas"] id_s then begin
        paths.(i) <- (id, super_s, label);
        paths.(super) <- (id_s, i, label_s);
        (* Whatever still points at the old head now belongs to node [i]. *)
        for k = 0 to last do
          let (id_k, super_k, label_k) = paths.(k) in
          if super_k = super then paths.(k) <- (id_k, i, label_k)
        done
      end
    end
  done;
  paths
  321 +
(* [swap_dep paths tokens] inverts selected head/dependent pairs: the node
   takes over its head's head and the former head becomes its dependent.
   A node [i] is swapped with its head when one of the following holds:
   - it is a "comp" and its head is verbal (fin/praet/winien/pred/imps/ppas);
   - it is a "conj" without dependents and its head is verbal;
   - it is a "ppron3" for which [if_interps [5,"praep"]] holds
     (presumably the post-prepositional form -- confirm against [if_interps]);
   - its lemma is one of [adv_relators] and its head is verbal or "subst".
   For an [adv_relators] node whose old head was "subst" or "pred" the check
   is repeated against the new head.
   [paths] is modified in place and returned.

   The [super] component of an entry is an index into [paths], not a token
   id, so the head's token is read through its own entry and the old head is
   re-attached to the index [i]. Nodes whose head is outside [paths]
   (e.g. -1) are left untouched. *)
let swap_dep paths tokens =
  let verbal = ["fin"; "praet"; "winien"; "pred"; "imps"; "ppas"] in
  let adv_relators = ["kto";"co";"ile";"czyj";"jaki";"który";
    "jak";"skąd";"dokąd";"gdzie";"którędy";"kiedy";"odkąd";"dlaczego";"czemu";"gdy"] in
  let size = Array.length paths in
  (* Token of the node stored at index [idx] of [paths]. *)
  let token_at idx =
    let (tok_id, _, _) = paths.(idx) in
    (ExtArray.get tokens tok_id).token in
  (* True when some node currently has [idx] as its head. *)
  let has_dependent idx =
    let rec scan k =
      k < size && (let (_, head, _) = paths.(k) in head = idx || scan (k + 1)) in
    scan 0 in
  let change_dep i (id, super, label) =
    let (id_S, super_S, label_S) = paths.(super) in
    paths.(i) <- (id, super_S, label);
    paths.(super) <- (id_S, i, label_S) in
  let rec correct_dep i (id, super, label) =
    if super >= 0 && super < size then begin
      let tok = (ExtArray.get tokens id).token in
      let head_is cats = if_cat cats (token_at super) in
      let is_relator = if_lemma adv_relators tok in
      if (if_cat ["comp"] tok && head_is verbal)
         || (if_cat ["conj"] tok && head_is verbal && not (has_dependent i))
         || (if_cat ["ppron3"] tok && if_interps [5,"praep"] tok)
         || (is_relator && head_is ("subst" :: verbal))
      then begin
        change_dep i (id, super, label);
        (* [super] still names the old head here; its token is unchanged. *)
        if is_relator && head_is ["subst"; "pred"]
        then correct_dep i paths.(i)
      end
    end in
  Array.iteri correct_dep paths;
  paths
  345 +
(*
correct_coordination1 -> the neighbour is the nearest word to the right; if a conjunction stands between it and the current word, the nearest word to the left is used instead
coordination of the passive voice is not handled yet - neither auxiliary verbs nor participles
coordination of the dependents of subordinating conjunctions is not handled yet *)
  350 +
(* [convert_dep_tree id first_try paths tokens] returns a corrected copy of
   the dependency tree [paths]; the argument array itself is not touched.
   On the first try the full chain of tree corrections is applied in a fixed
   order; on a retry only [swap_dep] is applied.
   [id] is only used by the disabled debugging dump below. *)
let convert_dep_tree id first_try paths tokens =
  let run step tree = step tree tokens in
  let tree = Array.copy paths in
  let converted =
    if first_try then
      tree
      |> run replace_tokens
      |> run (remove_interps ".")
      |> run replace_hyphens
      |> run correct_coordination1
      |> run correct_interp_with_father_0
      |> run correct_coordination2
      |> run (remove_interps ",")
      |> run correct_passive_voice
      |> run praet_qub_aglt
    else
      swap_dep tree tokens in
  (* File.file_out ("results/" ^ id ^ "/pre_text_modified_" ^ (string_of_bool first_try) ^ ".html") (fun file ->
    Printf.fprintf file "%s\n" Visualization.html_header;
    Printf.fprintf file "%s\n" (Visualization.html_of_dep_sentence tokens converted);
    Printf.fprintf file "%s\n" Visualization.html_trailer); *)
  converted
diagnostics/LCGfields.ml renamed to corpora/LCGfields.ml
@@ -83,7 +83,7 @@ let field_of_dependency_tree str_node fields dep_tree = @@ -83,7 +83,7 @@ let field_of_dependency_tree str_node fields dep_tree =
83 Array.fold_left (fun acc x -> 83 Array.fold_left (fun acc x ->
84 acc ^ (field_of_linear_term str_node field x) ^ "\n\t\t" ) "" dep_tree)) 84 acc ^ (field_of_linear_term str_node field x) ^ "\n\t\t" ) "" dep_tree))
85 85
86 -let field_of_eniam_sentence fields tokens (result : eniam_parse_result) = 86 +let field_of_eniam_sentence fields (result : eniam_parse_result) =
87 match result.status with 87 match result.status with
88 Idle -> "Idle" 88 Idle -> "Idle"
89 (* | PreprocessingError -> "PreprocessingError" *) 89 (* | PreprocessingError -> "PreprocessingError" *)
@@ -99,7 +99,7 @@ let field_of_eniam_sentence fields tokens (result : eniam_parse_result) = @@ -99,7 +99,7 @@ let field_of_eniam_sentence fields tokens (result : eniam_parse_result) =
99 | Parsed -> ignore ("Parsed\n\t\t" ^ (field_of_dependency_tree eniam fields result.dependency_tree)); "Parsed\n" 99 | Parsed -> ignore ("Parsed\n\t\t" ^ (field_of_dependency_tree eniam fields result.dependency_tree)); "Parsed\n"
100 | _ -> failwith "field_of_eniam_sentence" 100 | _ -> failwith "field_of_eniam_sentence"
101 101
102 -let field_of_conll_sentence fields tokens (result : conll_parse_result) = 102 +let field_of_conll_sentence fields (result : conll_parse_result) =
103 stat_map := StatMap.add !stat_map result.status; 103 stat_map := StatMap.add !stat_map result.status;
104 match result.status with 104 match result.status with
105 Idle -> "Idle" 105 Idle -> "Idle"
@@ -117,33 +117,36 @@ let field_of_conll_sentence fields tokens (result : conll_parse_result) = @@ -117,33 +117,36 @@ let field_of_conll_sentence fields tokens (result : conll_parse_result) =
117 | _ -> failwith "field_of_conll_sentence" 117 | _ -> failwith "field_of_conll_sentence"
118 118
119 119
(* [field_of_sentence fields sentence] renders [sentence] as text.
   Raw sentences are returned verbatim, unparsed structures are shown by
   their constructor name, parser results are delegated to
   [field_of_eniam_sentence] / [field_of_conll_sentence], and alternatives
   are listed one per line, each prefixed with its mode.
   Fails with "field_of_sentence: ni" for any other kind of sentence. *)
let rec field_of_sentence fields sentence =
  match sentence with
  | RawSentence raw -> raw
  | StructSentence _ -> "StructSentence"
  | DepSentence _ -> "DepSentence"
  | ENIAMSentence parsed -> field_of_eniam_sentence fields parsed
  | CONLLSentence parsed -> field_of_conll_sentence fields parsed
  | QuotedSentences _ -> "QuotedSentences"
  | AltSentence variants ->
      let rendered = Xlist.map variants (fun (mode, variant) ->
        Visualization.string_of_mode mode ^ "\t" ^ field_of_sentence fields variant) in
      String.concat "\n\t" rendered
  | _ -> failwith "field_of_sentence: ni"
130 130
(* [field_of_paragraph fields paragraph] renders [paragraph] as text.
   A raw paragraph is returned verbatim after a notice is printed on stdout;
   a structured paragraph lists the rendering of each of its sentences;
   among alternatives only the CONLL variants are rendered, each under a
   header line holding its mode. *)
let rec field_of_paragraph fields paragraph =
  match paragraph with
  | RawParagraph raw ->
      print_endline "no fields detected: only raw paragraph";
      raw
  | StructParagraph sentences ->
      let rendered =
        Xlist.map sentences (fun p -> field_of_sentence fields p.psentence) in
      String.concat "\n\t" rendered
  | AltParagraph variants ->
      (* ENIAM variants are deliberately left out for now. *)
      let conll_only = List.filter (fun (mode, _) -> mode = CONLL) variants in
      let rendered = Xlist.map conll_only (fun (mode, variant) ->
        Visualization.string_of_mode mode ^ "\n\t" ^ field_of_paragraph fields variant) in
      String.concat "\n" rendered
139 139
(* [print_fields_rec fields text] builds the textual report for [text]
   without printing it. Raw text is returned verbatim; structured text is the
   blank-line separated rendering of its paragraphs followed by a newline;
   among alternatives only the Struct and CONLL variants are rendered, each
   under a header line holding its mode. *)
let rec print_fields_rec fields text =
  match text with
  | RawText raw -> raw
  | StructText paragraphs ->
      let rendered = Xlist.map paragraphs (field_of_paragraph fields) in
      String.concat "\n\n" rendered ^ "\n"
  | AltText variants ->
      let kept =
        List.filter (fun (mode, _) -> mode = Struct || mode = CONLL) variants in
      let rendered = Xlist.map kept (fun (mode, variant) ->
        Visualization.string_of_mode mode ^ "\n\t" ^ print_fields_rec fields variant) in
      String.concat "\n" rendered
146 149
(* [print_fields fields text] writes the report built by [print_fields_rec]
   to stdout, followed by a newline. *)
let print_fields fields text =
  let report = print_fields_rec fields text in
  print_endline report
  (* ; print_field_map () *)
corpora/makefile
@@ -16,9 +16,9 @@ lib: @@ -16,9 +16,9 @@ lib:
16 freq_test: 16 freq_test:
17 $(OCAMLOPT) -o freq_test $(OCAMLOPTFLAGS) $(MODS) freq_test.ml 17 $(OCAMLOPT) -o freq_test $(OCAMLOPTFLAGS) $(MODS) freq_test.ml
18 18
19 -test: CONLL.ml test_conll2.ml 19 +test: CONLL.ml CONLL_adapter.ml test_conll2.ml
20 mkdir -p results 20 mkdir -p results
21 - $(OCAMLOPT) -o test $(OCAMLOPTFLAGS) CONLL.ml test_conll2.ml 21 + $(OCAMLOPT) -o test $(OCAMLOPTFLAGS) $^
22 22
23 23
24 .SUFFIXES: .mll .mly .ml .mli .cmo .cmi .cmx 24 .SUFFIXES: .mll .mly .ml .mli .cmo .cmi .cmx
corpora/test_conll.ml
  1 +open Xstd
  2 +open ENIAMsubsyntaxTypes
  3 +open ENIAMtokenizerTypes
  4 +open LCGtypes
  5 +open ExecTypes
1 6
2 let empty_result = { 7 let empty_result = {
3 input_text=RawText ""; 8 input_text=RawText "";
@@ -146,7 +151,7 @@ let eniam_parse_sentence timeout test_only_flag paths last tokens lex_sems = @@ -146,7 +151,7 @@ let eniam_parse_sentence timeout test_only_flag paths last tokens lex_sems =
146 let rec conll_parse_sentence timeout test_only_flag id first_try paths tokens lex_sems = 151 let rec conll_parse_sentence timeout test_only_flag id first_try paths tokens lex_sems =
147 let result = empty_conll_parse_result in 152 let result = empty_conll_parse_result in
148 let time2 = time_fun () in 153 let time2 = time_fun () in
149 - let paths = CONLL_adapter.convert_dep_tree id first_try paths tokens lex_sems 154 + let paths = CONLL_adapter.convert_dep_tree id first_try paths tokens lex_sems in
150 try 155 try
151 let dep_chart = LCGlexicon.dep_create paths tokens lex_sems in 156 let dep_chart = LCGlexicon.dep_create paths tokens lex_sems in
152 let dep_chart,references = LCGchart.dep_lazify dep_chart in 157 let dep_chart,references = LCGchart.dep_lazify dep_chart in
@@ -193,7 +198,7 @@ let rec conll_parse_sentence timeout test_only_flag id first_try paths tokens le @@ -193,7 +198,7 @@ let rec conll_parse_sentence timeout test_only_flag id first_try paths tokens le
193 let time5 = time_fun () in 198 let time5 = time_fun () in
194 {result with status=ReductionError; msg=Printexc.to_string e; reduction_time=time5 -. time4} 199 {result with status=ReductionError; msg=Printexc.to_string e; reduction_time=time5 -. time4}
195 else if first_try 200 else if first_try
196 - then conll_parse_sentence timeout test_only_flag id false paths tokens 201 + then conll_parse_sentence timeout test_only_flag id false paths tokens lex_sems
197 else {result with status=NotParsed} 202 else {result with status=NotParsed}
198 with 203 with
199 Timeout t -> 204 Timeout t ->
@@ -201,7 +206,7 @@ let rec conll_parse_sentence timeout test_only_flag id first_try paths tokens le @@ -201,7 +206,7 @@ let rec conll_parse_sentence timeout test_only_flag id first_try paths tokens le
201 {result with status=ParseTimeout; msg=Printf.sprintf "%f" t; parse_time=time4 -. time3} 206 {result with status=ParseTimeout; msg=Printf.sprintf "%f" t; parse_time=time4 -. time3}
202 | NotDepParsed(id_ndp,left,l,right) -> 207 | NotDepParsed(id_ndp,left,l,right) ->
203 if first_try 208 if first_try
204 - then conll_parse_sentence timeout test_only_flag id false paths tokens 209 + then conll_parse_sentence timeout test_only_flag id false paths tokens lex_sems
205 else let time4 = time_fun () in 210 else let time4 = time_fun () in
206 {result with status=NotParsed; not_parsed_dep_chart=(id_ndp,left,l,right); parse_time=time4 -. time3} 211 {result with status=NotParsed; not_parsed_dep_chart=(id_ndp,left,l,right); parse_time=time4 -. time3}
207 | e -> 212 | e ->
@@ -210,7 +215,7 @@ let rec conll_parse_sentence timeout test_only_flag id first_try paths tokens le @@ -210,7 +215,7 @@ let rec conll_parse_sentence timeout test_only_flag id first_try paths tokens le
210 with e -> (*print_endline (Printexc.to_string e);*) 215 with e -> (*print_endline (Printexc.to_string e);*)
211 let time3 = time_fun () in 216 let time3 = time_fun () in
212 if first_try 217 if first_try
213 - then conll_parse_sentence timeout test_only_flag id false paths tokens 218 + then conll_parse_sentence timeout test_only_flag id false paths tokens lex_sems
214 else {result with status=LexiconError; msg=Printexc.to_string e; lex_time=time3 -. time2} 219 else {result with status=LexiconError; msg=Printexc.to_string e; lex_time=time3 -. time2}
215 220
216 221
@@ -243,11 +248,7 @@ let get_paths old_paths = function @@ -243,11 +248,7 @@ let get_paths old_paths = function
243 paths 248 paths
244 | _ -> failwith "get_paths" 249 | _ -> failwith "get_paths"
245 250
246 -<<<<<<< HEAD  
247 -let rec parse_sentence timeout test_only_flag mode file_prefix tokens lex_sems = function  
248 -=======  
249 -let rec parse_sentence timeout test_only_flag mode id file_prefix tokens = function  
250 ->>>>>>> dep_trees 251 +let rec parse_sentence timeout test_only_flag mode id file_prefix tokens lex_sems = function
251 RawSentence s -> 252 RawSentence s ->
252 (match mode with 253 (match mode with
253 Swigra -> 254 Swigra ->
@@ -259,23 +260,15 @@ let rec parse_sentence timeout test_only_flag mode id file_prefix tokens = funct @@ -259,23 +260,15 @@ let rec parse_sentence timeout test_only_flag mode id file_prefix tokens = funct
259 | StructSentence(paths,last) -> 260 | StructSentence(paths,last) ->
260 (match mode with 261 (match mode with
261 ENIAM -> 262 ENIAM ->
262 -<<<<<<< HEAD  
263 let result = eniam_parse_sentence timeout test_only_flag paths last tokens lex_sems in 263 let result = eniam_parse_sentence timeout test_only_flag paths last tokens lex_sems in
264 -=======  
265 - let result = empty_eniam_parse_result in  
266 - (* let result = print_endline "eniam_parse_sentence"; eniam_parse_sentence timeout test_only_flag paths last tokens in *)  
267 ->>>>>>> dep_trees 264 + (* let result = empty_eniam_parse_result in *)
268 let result = {result with file_prefix = file_prefix_of_mode mode ^ file_prefix} in 265 let result = {result with file_prefix = file_prefix_of_mode mode ^ file_prefix} in
269 ENIAMSentence result 266 ENIAMSentence result
270 | _ -> failwith "parse_sentence") 267 | _ -> failwith "parse_sentence")
271 | DepSentence(paths) -> 268 | DepSentence(paths) ->
272 (match mode with 269 (match mode with
273 CONLL -> 270 CONLL ->
274 -<<<<<<< HEAD  
275 - let result = conll_parse_sentence timeout test_only_flag paths tokens lex_sems in  
276 -=======  
277 - let result = (*print_endline "conll_parse_sentence";*) conll_parse_sentence timeout test_only_flag id true paths tokens in  
278 ->>>>>>> dep_trees 271 + let result = conll_parse_sentence timeout test_only_flag id true paths tokens lex_sems in
279 let result = {result with 272 let result = {result with
280 file_prefix = file_prefix_of_mode mode ^ file_prefix; 273 file_prefix = file_prefix_of_mode mode ^ file_prefix;
281 paths = paths} in 274 paths = paths} in
@@ -289,19 +282,15 @@ let rec parse_sentence timeout test_only_flag mode id file_prefix tokens = funct @@ -289,19 +282,15 @@ let rec parse_sentence timeout test_only_flag mode id file_prefix tokens = funct
289 if not Paths.config.Paths.mate_parser_enabled then DepSentence paths else ( 282 if not Paths.config.Paths.mate_parser_enabled then DepSentence paths else (
290 print_endline "parse_sentence 1"; 283 print_endline "parse_sentence 1";
291 (* print_endline (Visualization.html_of_dep_sentence tokens paths); *) 284 (* print_endline (Visualization.html_of_dep_sentence tokens paths); *)
292 - let conll = ENIAM_CONLL.string_of_paths ENIAMsubsyntaxTypes.Mate tokens paths in 285 + let conll = CONLL.string_of_paths ENIAMsubsyntaxTypes.Mate tokens paths in
293 print_endline "parse_sentence 2"; 286 print_endline "parse_sentence 2";
294 (* printf "|%s|\n" conll; *) 287 (* printf "|%s|\n" conll; *)
295 Printf.fprintf mate_out "%s%!" conll; 288 Printf.fprintf mate_out "%s%!" conll;
296 print_endline "parse_sentence 3"; 289 print_endline "parse_sentence 3";
297 - let new_paths = get_paths paths (ENIAM_CONLL.load_sentence mate_in) in 290 + let new_paths = get_paths paths (CONLL.load_sentence mate_in) in
298 print_endline "parse_sentence 4"; 291 print_endline "parse_sentence 4";
299 (* print_endline (Visualization.html_of_dep_sentence tokens new_paths); *) 292 (* print_endline (Visualization.html_of_dep_sentence tokens new_paths); *)
300 -<<<<<<< HEAD  
301 - let result = conll_parse_sentence timeout test_only_flag new_paths tokens lex_sems in  
302 -=======  
303 - let result = conll_parse_sentence timeout test_only_flag id true new_paths tokens in  
304 ->>>>>>> dep_trees 293 + let result = conll_parse_sentence timeout test_only_flag id true new_paths tokens lex_sems in
305 let result = {result with 294 let result = {result with
306 file_prefix = file_prefix_of_mode mode ^ file_prefix; 295 file_prefix = file_prefix_of_mode mode ^ file_prefix;
307 paths=new_paths} in 296 paths=new_paths} in
@@ -309,66 +298,94 @@ let rec parse_sentence timeout test_only_flag mode id file_prefix tokens = funct @@ -309,66 +298,94 @@ let rec parse_sentence timeout test_only_flag mode id file_prefix tokens = funct
309 | _ -> failwith "parse_sentence") 298 | _ -> failwith "parse_sentence")
310 | QuotedSentences sentences -> 299 | QuotedSentences sentences ->
311 let sentences = Xlist.rev_map sentences (fun p -> 300 let sentences = Xlist.rev_map sentences (fun p ->
312 -<<<<<<< HEAD  
313 - let sentence = parse_sentence timeout test_only_flag mode p.pfile_prefix tokens lex_sems p.psentence in  
314 -=======  
315 - let sentence = parse_sentence timeout test_only_flag mode id p.pfile_prefix tokens p.psentence in  
316 ->>>>>>> dep_trees 301 + let sentence = parse_sentence timeout test_only_flag mode id p.pfile_prefix tokens lex_sems p.psentence in
317 {p with psentence=sentence}) in 302 {p with psentence=sentence}) in
318 QuotedSentences(List.rev sentences) 303 QuotedSentences(List.rev sentences)
319 | AltSentence l -> 304 | AltSentence l ->
320 let l = Xlist.rev_map l (fun (mode,sentence) -> 305 let l = Xlist.rev_map l (fun (mode,sentence) ->
321 -<<<<<<< HEAD  
322 - mode, parse_sentence timeout test_only_flag mode file_prefix tokens lex_sems sentence) in 306 + mode, parse_sentence timeout test_only_flag mode id file_prefix tokens lex_sems sentence) in
323 AltSentence(List.rev l) 307 AltSentence(List.rev l)
324 | _ -> failwith "parse_sentence" 308 | _ -> failwith "parse_sentence"
325 309
(* [parse_paragraph timeout test_only_flag mode id tokens lex_sems paragraph]
   parses every sentence of [paragraph] with [parse_sentence], keeping the
   sentences in their original order. Raw paragraphs are returned unchanged;
   each alternative is parsed under its own mode. *)
let rec parse_paragraph timeout test_only_flag mode id tokens lex_sems paragraph =
  match paragraph with
  | RawParagraph s -> RawParagraph s
  | StructParagraph sentences ->
      (* rev_map followed by rev: sentences are processed front to back. *)
      let parsed_rev = Xlist.rev_map sentences (fun p ->
        {p with psentence =
          parse_sentence timeout test_only_flag mode id p.pfile_prefix tokens lex_sems p.psentence}) in
      StructParagraph (List.rev parsed_rev)
  | AltParagraph variants ->
      let parsed_rev = Xlist.rev_map variants (fun (variant_mode, variant) ->
        variant_mode,
        parse_paragraph timeout test_only_flag variant_mode id tokens lex_sems variant) in
      AltParagraph (List.rev parsed_rev)
  321 +
(* [parse_text timeout test_only_flag mode id tokens lex_sems text] parses
   every paragraph of [text] with [parse_paragraph], keeping the paragraphs
   in their original order. Raw text is returned unchanged; each alternative
   is parsed under its own mode. *)
let rec parse_text timeout test_only_flag mode id tokens lex_sems text =
  match text with
  | RawText s -> RawText s
  | StructText paragraphs ->
      let parsed_rev = Xlist.rev_map paragraphs (fun paragraph ->
        parse_paragraph timeout test_only_flag mode id tokens lex_sems paragraph) in
      StructText (List.rev parsed_rev)
  | AltText variants ->
      let parsed = Xlist.map variants (fun (variant_mode, variant) ->
        variant_mode,
        parse_text timeout test_only_flag variant_mode id tokens lex_sems variant) in
      AltText parsed
  330 +
(* [select_mode (left, right)] chooses between two (mode, value) candidates.
   Raw and Struct candidates are not allowed: fails with "select_mode" if
   either side has one of these modes. Otherwise the candidate with the
   preferred mode wins, the preference being CONLL, ENIAM, Swigra, Mate;
   when both sides share the winning mode the left one is returned.
   Fails with "select_mode: ni" when neither side has a known mode. *)
let select_mode (left, right) =
  let has_mode m (mode, _) = mode = m in
  let either m = has_mode m left || has_mode m right in
  if either Raw || either Struct then failwith "select_mode"
  else
    let rec pick = function
      | [] -> failwith "select_mode: ni"
      | m :: rest ->
          if has_mode m left then left
          else if has_mode m right then right
          else pick rest in
    pick [CONLL; ENIAM; Swigra; Mate]
335 345
(* [select_sentences_sentence sentence] returns [sentence] with every group
   of alternatives reduced to its Raw variants plus at most one parsed
   variant, together with a status.
   - Parser results are returned unchanged with their own status.
   - Quoted sentences are processed element-wise; the status is [Parsed].
   - Among alternatives, Raw variants are always kept; the remaining ones are
     kept only when their status is [Parsed] or [NotTranslated], and
     [select_mode] decides between two such candidates. The status is [Parsed].
   Fails with "select_sentences_sentence" on unparsed sentences
   (raw, struct, dep). *)
let rec select_sentences_sentence = function
  | RawSentence _ | StructSentence _ | DepSentence _ ->
      failwith "select_sentences_sentence"
  | QuotedSentences sentences ->
      let chosen_rev = Xlist.rev_map sentences (fun p ->
        let chosen, _ = select_sentences_sentence p.psentence in
        {p with psentence = chosen}) in
      QuotedSentences (List.rev chosen_rev), Parsed
  | AltSentence variants ->
      let raw, selected = Xlist.fold variants ([], []) (fun (raw, selected) (mode, sentence) ->
        if mode = Raw then (mode, sentence) :: raw, selected
        else
          let chosen, status = select_sentences_sentence sentence in
          if status <> Parsed && status <> NotTranslated then raw, selected
          else
            match selected with
            | [] -> raw, [mode, chosen]
            | [best] -> raw, [select_mode ((mode, chosen), best)]
            | _ -> failwith "select_sentences_sentence") in
      AltSentence (raw @ selected), Parsed
  | ENIAMSentence result -> ENIAMSentence result, result.status
  | CONLLSentence result -> CONLLSentence result, result.status
  | SemSentence result -> SemSentence result, result.status
  368 +
(* [select_sentences_paragraph paragraph] applies
   [select_sentences_sentence] to every sentence of [paragraph], keeping the
   sentences in their original order and dropping the returned statuses.
   Raw paragraphs are returned unchanged; alternatives are processed one by
   one. *)
let rec select_sentences_paragraph paragraph =
  match paragraph with
  | RawParagraph s -> RawParagraph s
  | StructParagraph sentences ->
      let chosen_rev = Xlist.rev_map sentences (fun p ->
        let chosen, _ = select_sentences_sentence p.psentence in
        {p with psentence = chosen}) in
      StructParagraph (List.rev chosen_rev)
  | AltParagraph variants ->
      let chosen_rev = Xlist.rev_map variants (fun (mode, variant) ->
        mode, select_sentences_paragraph variant) in
      AltParagraph (List.rev chosen_rev)
354 380
355 -let rec parse_text timeout test_only_flag mode id = function  
(* [select_sentences_text text] applies [select_sentences_paragraph] to every
   paragraph of [text], keeping the paragraphs in their original order.
   Raw text is returned unchanged; alternatives are processed one by one. *)
let rec select_sentences_text text =
  match text with
  | RawText s -> RawText s
  | StructText paragraphs ->
      let chosen_rev = Xlist.rev_map paragraphs (fun paragraph ->
        select_sentences_paragraph paragraph) in
      StructText (List.rev chosen_rev)
  | AltText variants ->
      let chosen = Xlist.map variants (fun (mode, variant) ->
        mode, select_sentences_text variant) in
      AltText chosen
372 389
373 let rec extract_query_text = function 390 let rec extract_query_text = function
374 RawText s -> s 391 RawText s -> s
@@ -392,11 +409,7 @@ let process_query pre_in pre_out timeout test_only_flag id full_query max_n = @@ -392,11 +409,7 @@ let process_query pre_in pre_out timeout test_only_flag id full_query max_n =
392 let result = {result with pre_time1=pre_time1; pre_time2=time2 -. time1} in 409 let result = {result with pre_time1=pre_time1; pre_time2=time2 -. time1} in
393 if msg <> "" then {result with status=PreprocessingError; msg=msg} else ( 410 if msg <> "" then {result with status=PreprocessingError; msg=msg} else (
394 (* print_endline "process_query 3"; *) 411 (* print_endline "process_query 3"; *)
395 -<<<<<<< HEAD  
396 - let parsed_text = parse_text timeout test_only_flag Struct tokens lex_sems (translate_text pre_text) in  
397 -=======  
398 - let parsed_text = parse_text timeout test_only_flag Struct id (translate_text pre_text) in  
399 ->>>>>>> dep_trees 412 + let parsed_text = parse_text timeout test_only_flag Struct id tokens lex_sems (translate_text pre_text) in
400 (* print_endline "process_query 4"; *) 413 (* print_endline "process_query 4"; *)
401 let time3 = time_fun () in 414 let time3 = time_fun () in
402 let result = if test_only_flag then result else {result with status=Parsed; parsed_text=parsed_text} in 415 let result = if test_only_flag then result else {result with status=Parsed; parsed_text=parsed_text} in
@@ -421,23 +434,50 @@ let process_query pre_in pre_out timeout test_only_flag id full_query max_n = @@ -421,23 +434,50 @@ let process_query pre_in pre_out timeout test_only_flag id full_query max_n =
421 let result = {result with semantic_time=time4 -. time3} in 434 let result = {result with semantic_time=time4 -. time3} in
422 result) 435 result)
423 436
(* [get_sock_addr host_name port] resolves [host_name] with
   [Unix.gethostbyname] and returns an internet socket address built from the
   first address of the host entry and [port]. *)
let get_sock_addr host_name port =
  let { Unix.h_addr_list = addresses; _ } = Unix.gethostbyname host_name in
  Unix.ADDR_INET (addresses.(0), port)
  441 +
(* Number of identifiers handed out so far by [get_id]. *)
let id_counter = ref 0

(* [get_id ()] returns a fresh identifier "ID_1", "ID_2", ... *)
let get_id () =
  id_counter := !id_counter + 1;
  Printf.sprintf "ID_%d" !id_counter
  447 +
(* [get_query_id query] returns the identifier of a single-sentence CONLL
   query: the [pid] of its only sentence or, when that is empty, a fresh
   identifier from [get_id]. The query must be an [AltText] whose only or
   second (of exactly two) alternative is a CONLL text made of one paragraph
   with one sentence; otherwise fails with "get_query_id". *)
let get_query_id query =
  let module T = ENIAMsubsyntaxTypes in
  match query with
  | T.AltText [_; (T.CONLL, T.StructText [T.StructParagraph [p]])]
  | T.AltText [(T.CONLL, T.StructText [T.StructParagraph [p]])] ->
      let pid = p.T.pid in
      if pid = "" then get_id () else pid
  | _ -> failwith "get_query_id"
  452 +
(* [process_id s] normalises a sentence identifier.
   Identifiers starting with "ID_" are returned unchanged. Any other
   identifier must have the shape "NKJP_1M_<a>/morph_<b>-p/morph_<c>-s" and
   is turned into "<a>.<c>"; anything else fails with "process_id: <s>". *)
let process_id s =
  if Xstring.check_prefix "ID_" s then s
  else
    match Xstring.split_delim "/" s with
    | [corpus; paragraph; sentence]
      when Xstring.check_prefix "NKJP_1M_" corpus
        && Xstring.check_prefix "morph_" paragraph
        && Xstring.check_sufix "-p" paragraph
        && Xstring.check_prefix "morph_" sentence
        && Xstring.check_sufix "-s" sentence ->
        let sentence_no = Xstring.cut_sufix "-s" (Xstring.cut_prefix "morph_" sentence) in
        Xstring.cut_prefix "NKJP_1M_" corpus ^ "." ^ sentence_no
    | _ -> failwith ("process_id: " ^ s)
424 462
425 let process_conll_corpus filename = 463 let process_conll_corpus filename =
  464 + print_endline "process_conll_corpus: START";
426 let corpus = File.file_in filename (fun file -> CONLL.match_corpus (CONLL.load_corpus file)) in 465 let corpus = File.file_in filename (fun file -> CONLL.match_corpus (CONLL.load_corpus file)) in
427 - print_endline "process_conll_corpus";  
428 - let corpus = [List.hd corpus] in 466 + print_endline "process_conll_corpus: DONE";
  467 + (* let corpus = [List.hd corpus] in *)
429 let ic,oc = Unix.open_connection (get_sock_addr Paths.pre_host Paths.pre_port) in 468 let ic,oc = Unix.open_connection (get_sock_addr Paths.pre_host Paths.pre_port) in
430 - Xlist.iter corpus (fun query -> 469 + print_endline "connection_opened";
  470 + Xlist.iter corpus (fun (query,tokens) ->
431 let id = process_id (get_query_id query) in 471 let id = process_id (get_query_id query) in
432 let path = "results/" ^ id ^ "/" in 472 let path = "results/" ^ id ^ "/" in
433 ignore (Sys.command ("mkdir -p " ^ path)); 473 ignore (Sys.command ("mkdir -p " ^ path));
434 - let result = process_query ic oc 30. false "x" query 10 in  
435 - Visualization.print_html_text path "input_text" result.input_text;  
436 - Visualization.print_html_text path "pre_text" result.pre_text;  
437 - Visualization.print_html_text path "parsed_text" result.parsed_text;  
438 - Visualization.print_html_text path "selected_sent_text" result.selected_sent_text;  
439 - Visualization.print_html_text path "semantic_text" result.semantic_text;  
440 - Visualization.print_html_text path "selected_semantic_text" result.selected_semantic_text; 474 + let result = process_query ic oc 30. false "x" (query,tokens) 10 in
  475 + (* Visualization.print_html_text path "input_text" result.input_text tokens;
  476 + Visualization.print_html_text path "pre_text" result.pre_text tokens;
  477 + Visualization.print_html_text path "parsed_text" result.parsed_text tokens;
  478 + Visualization.print_html_text path "selected_sent_text" result.selected_sent_text tokens;
  479 + Visualization.print_html_text path "semantic_text" result.semantic_text tokens;
  480 + Visualization.print_html_text path "selected_semantic_text" result.selected_semantic_text tokens; *)
441 (* printf "input_text:\n%s\n" (Visualization.string_of_text result.input_text); 481 (* printf "input_text:\n%s\n" (Visualization.string_of_text result.input_text);
442 printf "pre_text:\n%s\n" (Visualization.string_of_text result.pre_text); *) 482 printf "pre_text:\n%s\n" (Visualization.string_of_text result.pre_text); *)
443 (* Exec.print_result stdout result; *) 483 (* Exec.print_result stdout result; *)
@@ -445,13 +485,15 @@ let process_conll_corpus filename = @@ -445,13 +485,15 @@ let process_conll_corpus filename =
445 (* CompTrees.compare_results result.parsed_text; *) 485 (* CompTrees.compare_results result.parsed_text; *)
446 (* Visualization.print_paths "results/" "paths" result.paths; *) 486 (* Visualization.print_paths "results/" "paths" result.paths; *)
447 ()); 487 ());
448 - Marshal.to_channel oc (PreTypes.RawText "",ExtArray.make 1 ENIAMtokenizerTypes.empty_token) []; 488 + Marshal.to_channel oc (ENIAMsubsyntaxTypes.RawText "",ExtArray.make 1 ENIAMtokenizerTypes.empty_token) [];
449 flush oc; 489 flush oc;
450 let _ = Unix.shutdown_connection ic in 490 let _ = Unix.shutdown_connection ic in
451 () 491 ()
452 492
453 let _ = 493 let _ =
  494 + LCGfields.reset();
454 (* process_conll_corpus "../../NLP resources/Skladnica-zaleznosciowa-mod_130121.conll"; *) 495 (* process_conll_corpus "../../NLP resources/Skladnica-zaleznosciowa-mod_130121.conll"; *)
455 - (* process_conll_corpus "../../NLP resources/skladnica_zaleznosciowa.conll"; *)  
456 - process_conll_corpus "../testy/skladnica-test1.conll"; 496 + process_conll_corpus "../../NLP resources/skladnica_zaleznosciowa.conll";
  497 + (* process_conll_corpus "../testy/skladnica-test1.conll"; *)
  498 + LCGfields.print_results();
457 () 499 ()
corpora/test_conll2.ml
@@ -116,7 +116,7 @@ let test_example path id tokens lex_sems paths last = @@ -116,7 +116,7 @@ let test_example path id tokens lex_sems paths last =
116 let test_dep_example path id tokens lex_sems paths = 116 let test_dep_example path id tokens lex_sems paths =
117 try 117 try
118 ENIAM_LCGreductions.reset_variant_label (); 118 ENIAM_LCGreductions.reset_variant_label ();
119 - (* let paths = CONLL_adapter.convert_dep_tree id first_try paths tokens lex_sems in *) 119 + let paths = CONLL_adapter.convert_dep_tree id (*first_try*) true paths tokens in
120 ENIAMsubsyntaxHTMLof.print_dep_sentence path (id^"1_paths") tokens paths; 120 ENIAMsubsyntaxHTMLof.print_dep_sentence path (id^"1_paths") tokens paths;
121 let chart = create_dep_chart tokens lex_sems paths in 121 let chart = create_dep_chart tokens lex_sems paths in
122 ENIAM_LCGlatexOf.print_dep_chart path (id^"1_chart") "a1" chart; 122 ENIAM_LCGlatexOf.print_dep_chart path (id^"1_chart") "a1" chart;
@@ -150,7 +150,7 @@ let test_dep_example path id tokens lex_sems paths = @@ -150,7 +150,7 @@ let test_dep_example path id tokens lex_sems paths =
150 let rec parse_sentence name id tokens lex_sems = function 150 let rec parse_sentence name id tokens lex_sems = function
151 RawSentence s -> id 151 RawSentence s -> id
152 | StructSentence(paths,last) -> 152 | StructSentence(paths,last) ->
153 - test_example ("results/" ^ name^"/") (string_of_int id ^ "_") tokens lex_sems paths last; 153 + (* test_example ("results/" ^ name^"/") (string_of_int id ^ "_") tokens lex_sems paths last; *)
154 id + 1 154 id + 1
155 | DepSentence(paths) -> 155 | DepSentence(paths) ->
156 test_dep_example ("results/" ^ name ^ "/") (string_of_int id ^ "_") tokens lex_sems paths; 156 test_dep_example ("results/" ^ name ^ "/") (string_of_int id ^ "_") tokens lex_sems paths;
diagnostics/treeChange.ml deleted
1 -open Xstd  
2 -open PreTypes  
3 -  
4 -let if_lemma lemmas = function  
5 - Lemma(l,_,_) -> List.exists (fun x -> x = l) lemmas  
6 - | _ -> false  
7 -  
8 -let if_cat cats = function  
9 - Lemma(_,cat,_) -> List.exists (fun x -> x = cat) cats  
10 - | _ -> false  
11 -  
12 -let if_interps interps token =  
13 - let interp = match token with  
14 - Lemma(_,_,i) -> i  
15 - | _ -> [[[]]] in  
16 - let if_interp nr value =  
17 - List.exists (fun x ->  
18 - try  
19 - List.exists (fun y ->  
20 - y = value) (List.nth x nr)  
21 - with _ -> false  
22 - ) interp in  
23 - Xlist.fold interps true (fun acc (nr,value) -> acc && (if_interp nr value))  
24 -  
25 -let correct_coordination1 paths tokens =  
26 - let paths_ls = List.mapi (fun i (id,super,label) ->  
27 - (i,id,super,label)) (Array.to_list paths) in  
28 -  
29 - let l = [("subst:nom",0),(["fin";"praet"],0);  
30 - ("subst:acc",0),(["inf"],0);  
31 - ("ppron3:nom",0),(["fin";"praet"],0);  
32 - ("ppron3:acc",0),(["fin";"praet"],0);  
33 - ("adv",0),(["fin";"praet"],0);  
34 - ("adv",0),(["inf"],0);  
35 - ("adv",0),(["adj"],0);  
36 - ("prep",0),(["fin";"praet"],0);  
37 - ("prep",0),(["inf"],0);  
38 - ("prep",0),(["ppas"],0);  
39 - ("prep",0),(["subst"],0);  
40 - ("prep:gen",0),(["subst:gen"],0);  
41 - ("adj:nom",0),(["fin";"praet"],0);  
42 - ("adj:nom",0),(["subst:nom"],0);  
43 - ("adj:gen",0),(["subst:gen"],0);  
44 - ("adj:dat",0),(["subst:dat"],0);  
45 - ("adj:acc",0),(["subst:acc"],0);  
46 - ("adj:inst",0),(["subst:inst"],0);  
47 - ("adj:loc",0),(["subst:loc"],0);  
48 - ("subst:gen",0),(["subst:nom"],0);  
49 - (* ("subst:gen",0),(["subst:gen"],0); *)  
50 - ("subst:gen",0),(["subst:dat"],0);  
51 - ("subst:gen",0),(["subst:acc"],0);  
52 - ("subst:gen",0),(["subst:inst"],0);  
53 - ("subst:gen",0),(["subst:loc"],0);  
54 - ("ppron3:gen",0),(["subst:nom"],0);  
55 - ("ppron3:gen",0),(["subst:dat"],0);  
56 - ("ppron3:gen",0),(["subst:acc"],0);  
57 - ("ppron3:gen",0),(["subst:inst"],0);  
58 - ("ppron3:gen",0),(["subst:loc"],0);  
59 - ("qub",0),(["fin";"praet"],0);  
60 - ("qub",0),(["subst"],0);  
61 - ("qub",0),(["adj"],0);  
62 - ("pact",0),(["subst"],0);  
63 - ("ppas",0),(["subst"],0)  
64 - ] in  
65 -  
66 - let find_dependents sons =  
67 -  
68 - let is (i,id,super,label) pattern = match Xstring.split ":" pattern with  
69 - ["prep";case] -> if_cat ["prep"] (ExtArray.get tokens id).token &&  
70 - if_interps [0,case] (ExtArray.get tokens id).token  
71 - | [cat;case] -> if_cat [cat] (ExtArray.get tokens id).token &&  
72 - if_interps [1,case] (ExtArray.get tokens id).token  
73 - | [cat] -> if_cat [cat] (ExtArray.get tokens id).token  
74 - | _ -> failwith "is (in correct_coordination1)" in  
75 -  
76 - let incr_representative acc son = Xlist.map acc (fun ((one,a),(rest,b)) ->  
77 - if is son one  
78 - then (one,a + 1), (rest,b)  
79 - else if List.exists (is son) rest  
80 - then (one,a), (rest,b + 1)  
81 - else (one,a), (rest,b)) in  
82 -  
83 - let get_from sons pattern = List.find (fun x -> is x pattern) sons in  
84 -  
85 - let l = Xlist.fold sons l incr_representative in  
86 - let results = List.filter (fun ((_,a),(_,b)) -> a = 1 && b > 1) l in  
87 - Xlist.map results (fun result ->  
88 - get_from sons @@ fst @@ fst result,  
89 - List.filter (fun son ->  
90 - List.exists (fun one -> is son one) (fst (snd result))) sons) in  
91 -  
92 - let establish_neighbour super ((i_d,id_d,super_d,label_d),sons) =  
93 - let not_between (i_s,_,_,_) =  
94 - (super < i_d && super < i_s) ||  
95 - (super > i_d && super > i_s) in  
96 - let (i_n,id_n,super_n,label_n) = List.find (fun son ->  
97 - not_between son) sons in  
98 - paths.(i_d) <- (id_d, i_n, label_d) in  
99 -  
100 - let examine_coords (i,id,super,label) sons =  
101 - try  
102 - let dependents = find_dependents sons in  
103 - Xlist.iter dependents (establish_neighbour super)  
104 - with  
105 - | _ -> () in  
106 -  
107 - Array.iteri (fun i (id,super,label) ->  
108 - if if_cat ["conj"] (ExtArray.get tokens id).token  
109 - then (let sons = List.filter (fun (_,_,super,_) -> super = i) paths_ls in  
110 - if (List.length sons > 2)  
111 - then examine_coords (i,id,super,label) sons)) paths;  
112 - paths  
113 -  
114 -let correct_coordination2 paths tokens =  
115 - let paths_c = Array.copy paths in  
116 - let paths_ls () = List.mapi (fun i (id,super,label) ->  
117 - (i,id,super,label)) (Array.to_list paths_c) in  
118 -  
119 - (* let ps a sons =  
120 - print_endline a;  
121 - List.iter (fun (i,_,_,_) -> print_endline (ExtArray.get tokens i).orth) sons;  
122 - print_endline "" in *)  
123 -  
124 - let rec correct_rec (i,id,super,label) sons =  
125 - let left_s, right_s = List.partition (fun (a,b,c,d) -> a < i) sons in  
126 - (* ps "left:" (List.rev left_s);  
127 - ps "right:" right_s; *)  
128 - find_father i (List.rev left_s);  
129 - find_father i right_s  
130 -  
131 - and find_father i0 = function  
132 - [(i,id,super,label)] -> paths_c.(i) <- (id,i0,label)  
133 - | (a,b,c,d) :: (i,id,super,label) :: t ->  
134 - paths_c.(i) <- (id,i0,label);  
135 - if not (if_cat ["conj"] (ExtArray.get tokens i).token ||  
136 - (ExtArray.get tokens i).orth = ",")  
137 - then failwith "find_father";  
138 - correct_rec (i,id,super,label) (if a < i  
139 - then (a,b,c,d) :: t  
140 - else List.rev @@ (a,b,c,d) :: t)  
141 - | _ -> failwith "find_father" in  
142 -  
143 - let check_previous_for_interp i =  
144 - if i >= 0 && (ExtArray.get tokens i).orth = "," &&  
145 - not (List.exists (fun (_,super,_) -> super = i) (Array.to_list paths_c))  
146 - then paths_c.(i) <- (0,-1,"") in  
147 -  
148 - Array.iteri (fun i (id,super,label) ->  
149 - if if_cat ["conj"] (ExtArray.get tokens i).token ||  
150 - (ExtArray.get tokens i).orth = ","  
151 - then  
152 - (check_previous_for_interp (i-1);  
153 - let sons = List.filter (fun (_,_,super,_) -> super = i) (paths_ls ()) in  
154 - if (List.length sons > 2)  
155 - then correct_rec (i,id,super,label) sons)) paths_c;  
156 - paths_c  
157 -  
158 -let praet_qub_aglt paths tokens =  
159 - Array.iteri (fun i (id,super,label) ->  
160 - if super >= 0 then  
161 - (let id_s, super_s, label_s = paths.(super) in  
162 - if if_cat ["aglt"] (ExtArray.get tokens id).token &&  
163 - (ExtArray.get tokens id_s).orth = "by"  
164 - then let id_gf,super_gf,label_gf = paths.(super_s) in  
165 - if if_cat ["praet"] (ExtArray.get tokens id_gf).token  
166 - then paths.(i) <- (id,super_s,label))) paths;  
167 - paths  
168 -  
169 -let replace_tokens paths tokens =  
170 -(* for i = 0 to ExtArray.size tokens - 1 do  
171 - print_endline (string_of_int i ^ ": "^ (ExtArray.get tokens i).orth)  
172 -done; *)  
173 - let find_token orth = Int.fold 0 (ExtArray.size tokens - 1) 0 (fun acc i ->  
174 - if (ExtArray.get tokens i).orth = orth then i else acc) in  
175 -  
176 - let multidot i id0 super0 label0 =  
177 - let id1, super1, label1 = paths.(super0) in  
178 - if super1 >= 0 then  
179 - let id2, super2, label2 = paths.(super1) in  
180 - if (ExtArray.get tokens id1).orth = "." &&  
181 - (ExtArray.get tokens id2).orth = "."  
182 - then  
183 - (paths.(super1) <- (find_token "..." ,super2, label2);  
184 - paths.(super0) <- (0,-1,"");  
185 - paths.(i) <- (0,-1,"")) in  
186 -  
187 - let brev i id super label =  
188 - let if_the_last_dot () =  
189 - let (id_dot, s_dot, l_dot) = List.find (fun (i2,s,l) ->  
190 - s = i && ((ExtArray.get tokens i2).orth = "." || (ExtArray.get tokens i2).orth = "...")) (Array.to_list paths) in  
191 - Array.fold_left (fun acc (i2,s,l) ->  
192 - acc && (ExtArray.get tokens i2).beg <= (ExtArray.get tokens id_dot).beg) true paths in  
193 -  
194 - let dot = if if_interps [0,"npun"] (ExtArray.get tokens id).token || if_the_last_dot ()  
195 - then ""  
196 - else "." in  
197 -  
198 - let n_orth = (ExtArray.get tokens id).orth ^ dot in  
199 - paths.(i) <- (find_token n_orth,super,label) in  
200 -  
201 - Array.iteri (fun i (id,super,label) ->  
202 - if (ExtArray.get tokens id).orth = "."  
203 - then multidot i id super label;  
204 - if if_cat ["brev"] (ExtArray.get tokens id).token  
205 - then brev i id super label)  
206 - paths;  
207 - paths  
208 -  
209 -let replace_hyphens paths tokens =  
210 - let ref_paths = ref paths in  
211 - let find_token token = Int.fold 0 (ExtArray.size tokens - 1) 0 (fun acc i ->  
212 - if (ExtArray.get tokens i).token = token then i else acc) in  
213 - let find_specific_token token beg next = Int.fold 0 (ExtArray.size tokens - 1) 0 (fun acc i ->  
214 - if (ExtArray.get tokens i).token = token &&  
215 - beg <= (ExtArray.get tokens i).beg &&  
216 - (ExtArray.get tokens i).next <= next  
217 - then i else acc) in  
218 -  
219 - let correct_last son_of_zero =  
220 - let i1,s1,l1 = !ref_paths.(Array.length !ref_paths - 1) in  
221 - let i2,s2,l2 = !ref_paths.(son_of_zero) in  
222 - if (ExtArray.get tokens i1).orth = "."  
223 - then  
224 - (!ref_paths.(Array.length !ref_paths - 1) <- (find_token (Interp "</sentence>"),1,l1);  
225 - !ref_paths.(son_of_zero) <- (i2,Array.length !ref_paths - 1,l2))  
226 - else  
227 - (ref_paths := Array.append !ref_paths [| (find_token (Interp "</sentence>"),1,"-") |];  
228 - !ref_paths.(Array.length !ref_paths - 2) <- (i1,Array.length !ref_paths - 1,l1);  
229 - !ref_paths.(son_of_zero) <- (i2,Array.length !ref_paths - 1,l2)) in  
230 -  
231 - let one_hyphen sons_of_zero =  
232 - let i2,s2,l2 = !ref_paths.(1) in  
233 - Xlist.iter sons_of_zero (fun son_of_zero ->  
234 - let i1,s1,l1 = !ref_paths.(son_of_zero) in  
235 - !ref_paths.(son_of_zero) <- (i1,1,l1));  
236 - !ref_paths.(1) <- (find_token (Interp "<or-sentence>"),0,l2);  
237 - correct_last son_of_zero in  
238 -  
239 - let two_hyphens first second son parent =  
240 - let i1,s1,l1 = !ref_paths.(first) in  
241 - let i2,s2,l2 = !ref_paths.(second) in  
242 - let beg, next = (ExtArray.get tokens i2).beg, (ExtArray.get tokens i2).next in  
243 - let i3,s3,l3 = !ref_paths.(son) in  
244 - let i4,s4,l4 = !ref_paths.(parent) in  
245 - ref_paths := Array.append !ref_paths [| (find_token (Interp "</sentence>"),first,"-") |];  
246 - !ref_paths.(first) <- (find_token (Interp "<or-sentence>"),0,l1);  
247 - !ref_paths.(second) <- (find_specific_token (Interp "</or-sentence>") beg next,first,l2);  
248 - !ref_paths.(son) <- (i3,second,l3);  
249 - !ref_paths.(parent) <- (i4,first,l4) in  
250 -  
251 - let rec is_dep_correct a b out zero res i (id,super,label) = (* out = how many words in (a,b) have parent outside [a,b]*)  
252 - (* print_endline ((string_of_int a) ^ " " ^ (string_of_int b) ^ " " ^ (string_of_int out) ^ " " ^ (string_of_int zero) ^ " " ^ (string_of_int i)); *)  
253 - if out > 1 || zero > 1 || (* zero = how many words (not interps) have parent 0 *)  
254 - (a < i && i < b && super < a && label <> "interp") ||  
255 - (a < super && super < b && (i < a || b < i))  
256 - then false, res  
257 - else  
258 - if i+1 = Array.length !ref_paths  
259 - then out = 1 && zero = 1, res  
260 - else  
261 - if a < i && i < b && b < super  
262 - then is_dep_correct a b (out+1) zero (i,super) (i+1) !ref_paths.(i+1)  
263 - else  
264 - if super = 0 && not (if_cat ["interp"] (ExtArray.get tokens id).token)  
265 - then is_dep_correct a b out (zero+1) res (i+1) !ref_paths.(i+1)  
266 - else is_dep_correct a b out zero res (i+1) !ref_paths.(i+1) in  
267 -  
268 - let hyphens = snd @@ Array.fold_left (fun (i,acc) (id,super,label) ->  
269 - if (ExtArray.get tokens id).orth = "-"  
270 - then i+1, i :: acc  
271 - else i+1, acc) (0,[]) !ref_paths in  
272 -  
273 - let sons_of_zero = snd @@ Array.fold_left (fun (i,acc) (id,super,label) ->  
274 - if super = 0 && not (if_cat ["interp"] (ExtArray.get tokens id).token)  
275 - then i+1, i :: acc  
276 - else i+1, acc) (0,[]) !ref_paths in  
277 -  
278 - (if List.length sons_of_zero = 1  
279 - then  
280 - if List.length hyphens = 1 && hyphens = [1]  
281 - then one_hyphen sons_of_zero  
282 - else  
283 - if List.length hyphens = 2  
284 - then let a, b = List.nth hyphens 1, List.nth hyphens 0 in  
285 - let is_good, (son,parent) = is_dep_correct a b 0 0 (0,0) 1 !ref_paths.(1) in  
286 - if a = 1 && is_good  
287 - then two_hyphens a b son parent);  
288 - !ref_paths  
289 -  
290 -let correct_interp_with_father_0 paths tokens =  
291 - Array.iteri (fun i (id,super,label) ->  
292 - if (super = 0 ||  
293 - (ExtArray.get tokens id).token = Interp "<or-sentence>" ||  
294 - (ExtArray.get tokens id).token = Interp "</or-sentence>") && (ExtArray.get tokens id).orth = ","  
295 - then Array.iteri (fun i1 (id1,super1,label1) ->  
296 - if super1 = i  
297 - then paths.(i1) <- (id1,0,label1)) paths) paths;  
298 - paths  
299 -  
300 -let remove_interps interp paths tokens =  
301 - let paths_ls = Array.to_list paths in  
302 - Array.iteri (fun i (id,super,label) ->  
303 - if (ExtArray.get tokens id).orth = interp &&  
304 - not (List.exists (fun (_,super,_) -> super = i) paths_ls)  
305 - then paths.(i) <- (0,-1,"")) paths;  
306 - paths  
307 -  
308 -let correct_passive_voice paths tokens =  
309 - Array.iteri (fun i (id,super,label) ->  
310 - if super >= 0 then  
311 - (let id_s, super_s, label_s = paths.(super) in  
312 - if (if_cat ["praet"] (ExtArray.get tokens id).token &&  
313 - if_cat ["ppas"] (ExtArray.get tokens id_s).token)  
314 - then (paths.(i) <- (id,super_s,label);  
315 - paths.(super) <- (id_s,i,label_s);  
316 - Array.iteri (fun i_p (id_p,super_p,label_p) ->  
317 - if super_p = super  
318 - then paths.(i_p) <- (id_p,i,label_p)) paths))) paths;  
319 - paths  
320 -  
321 -let swap_dep paths tokens =  
322 - let change_dep i (id,super,label) =  
323 - let id_S, super_S, label_S = paths.(super) in  
324 - paths.(i) <- (id,super_S,label);  
325 - paths.(super) <- (id_S, id, label_S) in  
326 - let rec correct_dep i (id,super,label) =  
327 - let adv_relators = ["kto";"co";"ile";"czyj";"jaki";"który";  
328 - "jak";"skąd";"dokąd";"gdzie";"którędy";"kiedy";"odkąd";"dlaczego";"czemu";"gdy"] in  
329 - if (if_cat ["comp"] (ExtArray.get tokens id).token &&  
330 - if_cat ["fin"; "praet"; "winien"; "pred"; "imps"; "ppas"] (ExtArray.get tokens super).token) ||  
331 - (if_cat ["conj"] (ExtArray.get tokens id).token &&  
332 - if_cat ["fin"; "praet"; "winien"; "pred"; "imps"; "ppas"] (ExtArray.get tokens super).token &&  
333 - not (List.exists (fun (_,super,_) -> super = i) (Array.to_list paths))) ||  
334 - (if_cat ["ppron3"] (ExtArray.get tokens id).token &&  
335 - if_interps [5,"praep"] (ExtArray.get tokens id).token) ||  
336 - (if_lemma adv_relators (ExtArray.get tokens id).token &&  
337 - if_cat ["fin"; "praet"; "winien"; "pred"; "imps"; "ppas"; "subst"] (ExtArray.get tokens super).token)  
338 - then  
339 - change_dep i (id,super,label);  
340 - if (if_lemma adv_relators (ExtArray.get tokens id).token &&  
341 - if_cat ["subst"; "pred"] (ExtArray.get tokens super).token)  
342 - then correct_dep i paths.(i) in  
343 - Array.iteri correct_dep paths; paths  
344 -  
345 - (*  
346 - correct_coordination1 -> sąsiad słowem najbliższym po prawej, jeśli pomiędzy nim a mną spójnik, to najbliższym po lewej  
347 - nieobsługiwana na razie koordynacja strony biernej - zarówno czasowniki posiłkowe, jak i imiesłowy  
348 - nieobsługiwana na razie koordynacja podrzędników spójników podrzędnych *)  
parser/visualization.ml
@@ -916,7 +916,7 @@ let rec html_of_text path tokens = function @@ -916,7 +916,7 @@ let rec html_of_text path tokens = function
916 sprintf "<tr><td>%s</td><td>%s</td></tr>" (string_of_mode mode) (html_of_text path tokens text))) ^ 916 sprintf "<tr><td>%s</td><td>%s</td></tr>" (string_of_mode mode) (html_of_text path tokens text))) ^
917 "</table>" 917 "</table>"
918 918
919 -let print_html_text path name text tokens lex_sems = 919 +let print_html_text path name text tokens (*lex_sems*) =
920 File.file_out (path ^ name ^ ".html") (fun file -> 920 File.file_out (path ^ name ^ ".html") (fun file ->
921 fprintf file "%s\n" html_header; 921 fprintf file "%s\n" html_header;
922 fprintf file "%s<BR>\n" (html_of_text path tokens text); 922 fprintf file "%s<BR>\n" (html_of_text path tokens text);
pre/makefile
@@ -3,7 +3,7 @@ OCAMLOPT=ocamlopt @@ -3,7 +3,7 @@ OCAMLOPT=ocamlopt
3 OCAMLDEP=ocamldep 3 OCAMLDEP=ocamldep
4 INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I +eniam 4 INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I +eniam
5 OCAMLFLAGS=$(INCLUDES) -g 5 OCAMLFLAGS=$(INCLUDES) -g
6 -OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa eniam-plWordnet.cmxa eniam-walenty.cmxa eniam-integration.cmxa eniam-lexSemantics.cmxa 6 +OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa eniam-plWordnet.cmxa eniam-lcg-parser.cmxa eniam-lcg-lexicon.cmxa eniam-integration.cmxa eniam-lexSemantics.cmxa
7 INSTALLDIR=`ocamlc -where` 7 INSTALLDIR=`ocamlc -where`
8 8
9 WAL= paths.ml 9 WAL= paths.ml
pre/preProcessing.ml
@@ -121,9 +121,9 @@ let parse_text = function @@ -121,9 +121,9 @@ let parse_text = function
121 let lex_sems = ENIAMlexSemantics.assign tokens text in 121 let lex_sems = ENIAMlexSemantics.assign tokens text in
122 text,tokens,lex_sems 122 text,tokens,lex_sems
123 | AltText[Raw,RawText query;CONLL,StructText[ 123 | AltText[Raw,RawText query;CONLL,StructText[
124 - StructParagraph[{psentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence dep_paths]} as p]]],tokens -> 124 + StructParagraph[{sentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence dep_paths]} as p]]],tokens ->
125 let m_dep_paths = Array.map (fun (id,_,_) -> id,-1,"") dep_paths in 125 let m_dep_paths = Array.map (fun (id,_,_) -> id,-1,"") dep_paths in
126 - let conll = StructParagraph[{p with psentence = AltSentence([Raw, RawSentence text; CONLL, DepSentence dep_paths] 126 + let conll = StructParagraph[{p with sentence = AltSentence([Raw, RawSentence text; CONLL, DepSentence dep_paths]
127 @ if Paths.config.Paths.mate_parser_enabled then [Mate, DepSentence m_dep_paths] else [])}] in 127 @ if Paths.config.Paths.mate_parser_enabled then [Mate, DepSentence m_dep_paths] else [])}] in
128 let paths = ENIAMsubsyntax.parse query in 128 let paths = ENIAMsubsyntax.parse query in
129 let sentences = ENIAMsentences.split_into_sentences "" query tokens paths in 129 let sentences = ENIAMsentences.split_into_sentences "" query tokens paths in
@@ -135,7 +135,7 @@ let parse_text = function @@ -135,7 +135,7 @@ let parse_text = function
135 135
136 let rec main_loop in_chan out_chan = 136 let rec main_loop in_chan out_chan =
137 (* print_endline "main_loop 1"; *) 137 (* print_endline "main_loop 1"; *)
138 - let query = (Marshal.from_channel in_chan : text * ENIAMtokenizerTypes.token_record ExtArray.t) in 138 + let query = (Marshal.from_channel in_chan : text * ENIAMtokenizerTypes.token_env ExtArray.t) in
139 (* print_endline "main_loop 2"; *) 139 (* print_endline "main_loop 2"; *)
140 if fst query = RawText "" then () else ( 140 if fst query = RawText "" then () else (
141 (try 141 (try
@@ -154,7 +154,7 @@ let rec main_loop in_chan out_chan = @@ -154,7 +154,7 @@ let rec main_loop in_chan out_chan =
154 (* print_endline "main_loop 7"; *) 154 (* print_endline "main_loop 7"; *)
155 Marshal.to_channel out_chan ( 155 Marshal.to_channel out_chan (
156 RawText "", 156 RawText "",
157 - ExtArray.make 1 ENIAMtokenizerTypes.empty_token, 157 + ExtArray.make 1 ENIAMtokenizerTypes.empty_token_env,
158 ExtArray.make 1 ENIAMlexSemanticsTypes.empty_lex_sem, 158 ExtArray.make 1 ENIAMlexSemanticsTypes.empty_lex_sem,
159 Printexc.to_string e, 159 Printexc.to_string e,
160 0.) [])); 160 0.) []));
testy/skladnica-test2.conll
@@ -11,7 +11,7 @@ @@ -11,7 +11,7 @@
11 5 szanse szansa subst subst pl|acc|f 4 obj_th _ _ 11 5 szanse szansa subst subst pl|acc|f 4 obj_th _ _
12 6 ? ? interp interp _ 4 punct _ _ 12 6 ? ? interp interp _ 4 punct _ _
13 13
14 -# trees/NKJP_1M_1202900095/morph_3-p/morph_3.46-s.xml.tree 14 +# trees/NKJP_1M_1202900095/morph_3-p/morph_3.46-s.xml.trees
15 1 - - interp interp 0 _ _ _ 15 1 - - interp interp 0 _ _ _
16 2 Słoń słoń subst subst sg|nom|m2 4 _ _ _ 16 2 Słoń słoń subst subst sg|nom|m2 4 _ _ _
17 3 - - interp interp 0 _ _ _ 17 3 - - interp interp 0 _ _ _
@@ -19,7 +19,7 @@ @@ -19,7 +19,7 @@
19 5 Pinio Pinio subst subst sg|nom|m1 4 _ _ _ 19 5 Pinio Pinio subst subst sg|nom|m1 4 _ _ _
20 6 . . interp interp 0 _ _ _ 20 6 . . interp interp 0 _ _ _
21 21
22 -# trees/NKJP_1M_2002000114/morph_2-p/morph_2.72-s.xml.tree 22 +# trees/NKJP_1M_2002000114/morph_2-p/morph_2.72-s.xml.trees
23 1 - - interp interp 0 _ _ _ 23 1 - - interp interp 0 _ _ _
24 2 Nie nie qub qub 3 _ _ _ 24 2 Nie nie qub qub 3 _ _ _
25 3 mogę móc fin fin sg|pri|imperf 7 _ _ _ 25 3 mogę móc fin fin sg|pri|imperf 7 _ _ _
@@ -29,7 +29,7 @@ @@ -29,7 +29,7 @@
29 7 zachrypiał zachrypieć praet praet sg|m1|perf 0 _ _ _ 29 7 zachrypiał zachrypieć praet praet sg|m1|perf 0 _ _ _
30 8 . . interp interp 0 _ _ _ 30 8 . . interp interp 0 _ _ _
31 31
32 -# trees/NKJP_1M_2002000028/morph_5-p/morph_5.40-s.xml.tree 32 +# trees/NKJP_1M_2002000028/morph_5-p/morph_5.40-s.xml.trees
33 1 - - interp interp 0 _ _ _ 33 1 - - interp interp 0 _ _ _
34 2 Właśnie właśnie qub qub 4 _ _ _ 34 2 Właśnie właśnie qub qub 4 _ _ _
35 3 to to subst subst sg|acc|n 4 _ _ _ 35 3 to to subst subst sg|acc|n 4 _ _ _
@@ -39,7 +39,7 @@ @@ -39,7 +39,7 @@
39 7 twardo twardo adv adv pos 6 _ _ _ 39 7 twardo twardo adv adv pos 6 _ _ _
40 8 . . interp interp 0 _ _ _ 40 8 . . interp interp 0 _ _ _
41 41
42 -# trees/NKJP_1M_1202000001/morph_3-p/morph_3.9-s.xml.tree 42 +# trees/NKJP_1M_1202000001/morph_3-p/morph_3.9-s.xml.trees
43 1 CKM CKM subst subst sg|nom|n 0 _ _ _ 43 1 CKM CKM subst subst sg|nom|n 0 _ _ _
44 2 : interp 0 _ _ _ 44 2 : interp 0 _ _ _
45 3 Jak jak adv adv pos 5 _ _ _ 45 3 Jak jak adv adv pos 5 _ _ _
@@ -50,7 +50,7 @@ @@ -50,7 +50,7 @@
50 8 patrzeć patrzeć inf inf imperf 5 _ _ _ 50 8 patrzeć patrzeć inf inf imperf 5 _ _ _
51 9 ? ? interp interp 0 _ _ _ 51 9 ? ? interp interp 0 _ _ _
52 52
53 -# trees/NKJP_1M_2001000023/morph_1-p/morph_1.61-s.xml.tree 53 +# trees/NKJP_1M_2001000023/morph_1-p/morph_1.61-s.xml.trees
54 1 Pochylił pochylić praet praet sg|m1|perf 0 _ _ _ 54 1 Pochylił pochylić praet praet sg|m1|perf 0 _ _ _
55 2 em być aglt aglt sg|pri|imperf|wok 1 _ _ _ 55 2 em być aglt aglt sg|pri|imperf|wok 1 _ _ _
56 3 się się qub qub 1 _ _ _ 56 3 się się qub qub 1 _ _ _