Commit 0b0d4af3a9070341c24d391ddcd082f4cf5e15c9

Authored by Wojciech Jaworski
2 parents d06dc00b 535b2498

Merge branch 'dep_trees' into integration

LCGlexicon/resources/lexicon-pl.dic
... ... @@ -7,7 +7,7 @@
7 7 month-lex month-interval year-interval roman roman-interval
8 8 hour-minute-interval hour-interval obj-id match-result
9 9 url email day-month day year date hour hour-minute
10   - się nie by s <root> or or2 <colon> <speaker> <speaker-end> <squery>
  10 + się nie by s <root> <conll_root> or or2 <colon> <speaker> <speaker-end> <squery>
11 11  
12 12 @WEIGHTS
13 13 symbol_weight=1
... ... @@ -272,6 +272,8 @@ pos=unk: np*number*case*gender*person;
272 272 # [LCGrenderer.make_frame false tokens lex_sems [] schema_list ["<conll_root>"] d batrs]
273 273 # | lemma,c,l -> failwith ("process_interp: " ^ lemma ^ ":" ^ c ^ ":" ^ (String.concat ":" (Xlist.map l (String.concat ".")))) in
274 274  
  275 +lemma=<conll_root>,pos=interp: <conll_root>/(ip*T*T*T+cp*int*T+np*sg*voc*T*T+interj);
  276 +
275 277 pos=sinterj: BRACKET interj;
276 278  
277 279 lemma=</sentence>,pos=interp: BRACKET s\?(ip*T*T*T+cp*int*T+np*sg*voc*T*T+interj);
... ...
LCGparser/ENIAM_LCGrules.ml
... ... @@ -446,8 +446,8 @@ let backward_cross_composition references args functs =
446 446 let rules = [
447 447 backward_application;
448 448 forward_application;
449   - backward_cross_composition;
450   - forward_cross_composition;
  449 + (* backward_cross_composition; *)
  450 + (* forward_cross_composition; *)
451 451 ]
452 452  
453 453 let rec flatten_functor2 l seml = function
... ...
corpora/CONLL.ml
... ... @@ -3,133 +3,55 @@ open ENIAMsubsyntaxTypes
3 3 open ENIAMtokenizerTypes
4 4  
5 5 let alternative_string f mode alts = if List.exists (fun (m,_) -> mode = m) alts
6   - then f (snd @@ List.find (fun (m,_) -> m = mode) alts)
7   - else f (snd @@ List.find (fun (m,_) -> m = Struct) alts)
8   -
9   -let string_of_token mode token conll_id super label =
10   - let decompose_lemma = function
11   - | Lemma(a,b,c) -> a,b,if c = [[]]
12   - then "_"
13   - else String.concat "][" @@ Xlist.map c (fun x ->
14   - String.concat "|" @@ Xlist.map x ( fun y ->
15   - String.concat "." y))
16   - | t -> failwith ("string_of_token: not Lemma") in
17   - match mode with
18   - | Raw -> token.orth
19   - | Struct -> failwith ("function string_of_token for mode Struct is not defined")
20   - | CONLL -> let lemma,cat,interp = decompose_lemma token.token in
21   - String.concat "\t" [string_of_int conll_id;
22   - token.orth; lemma; cat; cat; interp; "_"; "_";
23   - string_of_int token.beg; string_of_int token.len]
24   - | Mate -> let lemma,cat,interp = decompose_lemma token.token in
25   - String.concat "\t" [string_of_int conll_id;
26   - token.orth; lemma; lemma; cat; cat; interp; interp; "_"; "_"; "_"; "_"; "_"; "_"]
27   - | _ -> failwith "string_of_token: ni"
28   -
29   -let string_of_paths mode tokens paths =
30   - let l = Int.fold 1 (Array.length paths - 1) [] (fun l conll_id ->
31   - let id,super,label = paths.(conll_id) in
32   - (string_of_token mode (ExtArray.get tokens id) conll_id super label) :: l) in
33   - String.concat "\n" (List.rev l) ^ "\n\n"
34   -
35   -let rec string_of_sentence mode tokens = function
36   - RawSentence s -> if mode = Raw then s else ""
37   - | StructSentence (tokens, _) -> failwith ("string_of_sentence: StructSentence") (*String.concat "\n" @@ Xlist.map tokens (fun x -> string_of_token mode x)*)
38   - | DepSentence (paths) -> string_of_paths mode tokens paths
39   - | QuotedSentences _ -> failwith ("string_of_sentence: QuotedSentences")
40   - | AltSentence alts -> alternative_string (string_of_sentence mode tokens) mode alts
41   -
42   -let string_of_p_record mode tokens p_record =
43   - (if p_record.id = "" then "" else p_record.id ^ "\n") ^
44   - string_of_sentence mode tokens p_record.sentence
45   -
46   -(*let rec string_of_paragraph mode tokens = function
47   - RawParagraph s -> if mode = Raw then s else ""
48   - | StructParagraph p_records -> String.concat "\n\n" @@ Xlist.map p_records (string_of_p_record mode tokens)
49   - | AltParagraph alts -> alternative_string (string_of_paragraph mode) mode alts
50   -
51   -let rec string_of_text mode tokens = function
52   - RawText s -> if mode = Raw then s else ""
53   - | StructText (paragraphs,_) -> String.concat "\n\n" @@ Xlist.map paragraphs (string_of_paragraph mode tokens)
54   - | AltText alts -> alternative_string (string_of_text mode) mode alts*)
55   -
56   -
57   -(******************)
58   -(***
59   -let establish_next tokens paths =
60   - let n = ExtArray.size tokens in
61   - Int.iter 1 (n - 2) (fun i ->
62   - let f = ExtArray.get tokens i in
63   - let s = ExtArray.get tokens (i+1) in
64   - ExtArray.set tokens i {f with next = s.beg});
65   - let last = ExtArray.get tokens (n-1) in
66   - ExtArray.set tokens (n-1) {last with next = last.beg + last.len}
67   -
68   -
69   - (*let rec pom res = function
70   - h :: t -> let next = if res = []
71   - then h.beg+h.len
72   - else (List.hd res).beg in
73   - pom ({h with next = next} :: res) t
74   - | [] -> res in
75   - pom [] rev_tokens*)
76   -
77   -let rec establish_for_token i text tokens = function
78   - (id,_,_) :: t as l->
79   - let h = ExtArray.get tokens id in
80   - if Xstring.check_prefix " " text
81   - then establish_for_token (i+100) (Xstring.cut_prefix " " text) tokens l
82   - else if Xstring.check_prefix h.orth text
83   - then
84   - let n = (List.length @@ Xunicode.utf8_chars_of_utf8_string h.orth) * 100 in
85   - let n_h = {h with beg = i ; len = n} in
86   - ExtArray.set tokens id n_h;
87   - establish_for_token (i+n) (Xstring.cut_prefix h.orth text) tokens t
88   - else failwith ("establish_for_token :" ^ h.orth ^ " " ^ text)
89   - | [] -> 100, i
90   -
91   -let rec establish_lengths text paths tokens =
92   - let pbeg, plen = establish_for_token 100 text tokens (List.tl (Array.to_list paths)) in
93   - establish_next tokens paths;
94   - pbeg, plen-100
95   -
96   -(******************)
97   -
98   -exception ErrorInfoFile of string
99   -
100   -let info_file = "../corpora/info_sentences.txt"
101   -
102   -let info = Xstring.split "\n\n" @@ File.load_file_gen info_file
103   -
104   -let add_to_map map info_str =
105   - match Xstring.split "\n" info_str with
106   - [id; text; info_token] -> StringMap.add map info_token (id, text)
107   - | _ -> raise (ErrorInfoFile info_str)
108   -
109   -let info_map =
110   - Xlist.fold info StringMap.empty add_to_map
111   -
112   -let match_sentence (p_record,tokens) =
113   - let rec info_token s = match s with
114   - RawSentence text -> failwith ("match_sentence: " ^ text)
115   - | StructSentence (tokens, n) -> failwith ("match_sentence: StructSentence") (*String.concat " " @@ List.map (fun x -> x.orth) tokens*)
116   - | DepSentence (paths) -> String.concat " " @@ List.map (fun (id,_,_) -> (ExtArray.get tokens id).orth) (List.tl (Array.to_list paths)), paths
117   - | QuotedSentences _ -> failwith ("match_sentence: QuotedSentences")
118   - | AltSentence alts -> failwith ("match_sentence: AltSentence")
119   - (*if List.exists (fun (mode, s) -> mode = CONLL) alts
120   - then info_token (snd (List.find (fun (mode, s) -> mode = CONLL) alts))
121   - else failwith ("match_sentence: no CONLL mode in AltSentence")*) in
122   - let info_token, paths = info_token p_record.psentence in
123   - try
124   - let id, text = StringMap.find info_map info_token in
125   - let beg, len = establish_lengths text paths tokens (* -1, -1, p_record.psentence *) in
126   - AltText[Raw,RawText text;CONLL,StructText([StructParagraph[{pid = id; pbeg = beg; plen = len; pnext = beg+len; pfile_prefix="";
127   - psentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence paths]}]],tokens)]
128   -(* {s_id = id; s_text = text; s_tokens = sentence.s_tokens} *)
129   - with _ -> AltText[CONLL,StructText([StructParagraph[p_record]],tokens)]
130   -
131   -let match_corpus corpus =
132   - Xlist.map corpus match_sentence***)
  6 + then f (snd @@ List.find (fun (m,_) -> m = mode) alts)
  7 + else f (snd @@ List.find (fun (m,_) -> m = Struct) alts)
  8 +
  9 +let string_of_token mode token conll_id super label =
  10 + let decompose_lemma = function
  11 + | Lemma(a,b,c) -> a,b,if c = [[]]
  12 + then "_"
  13 + else String.concat "][" @@ Xlist.map c (fun x ->
  14 + String.concat "|" @@ Xlist.map x ( fun y ->
  15 + String.concat "." y))
  16 + | t -> failwith ("string_of_token: not Lemma") in
  17 + match mode with
  18 + | Raw -> token.orth
  19 + | Struct -> failwith ("function string_of_token for mode Struct is not defined")
  20 + | CONLL -> let lemma,cat,interp = decompose_lemma token.token in
  21 + String.concat "\t" [string_of_int conll_id;
  22 + token.orth; lemma; cat; cat; interp; "_"; "_";
  23 + string_of_int token.beg; string_of_int token.len]
  24 + | Mate -> let lemma,cat,interp = decompose_lemma token.token in
  25 + String.concat "\t" [string_of_int conll_id;
  26 + token.orth; lemma; lemma; cat; cat; interp; interp; "_"; "_"; "_"; "_"; "_"; "_"]
  27 + | _ -> failwith "string_of_token: ni"
  28 +
  29 +let string_of_paths mode tokens paths =
  30 + let l = Int.fold 1 (Array.length paths - 1) [] (fun l conll_id ->
  31 + let id,super,label = paths.(conll_id) in
  32 + (string_of_token mode (ExtArray.get tokens id) conll_id super label) :: l) in
  33 + String.concat "\n" (List.rev l) ^ "\n\n"
  34 +
  35 +let rec string_of_sentence mode tokens = function
  36 + RawSentence s -> if mode = Raw then s else ""
  37 + | StructSentence (tokens, _) -> failwith ("string_of_sentence: StructSentence") (*String.concat "\n" @@ Xlist.map tokens (fun x -> string_of_token mode x)*)
  38 + | DepSentence (paths) -> string_of_paths mode tokens paths
  39 + | QuotedSentences _ -> failwith ("string_of_sentence: QuotedSentences")
  40 + | AltSentence alts -> alternative_string (string_of_sentence mode tokens) mode alts
  41 +
  42 +let string_of_p_record mode tokens p_record =
  43 + (if p_record.id = "" then "" else p_record.id ^ "\n") ^
  44 + string_of_sentence mode tokens p_record.sentence
  45 +
  46 +(*let rec string_of_paragraph mode tokens = function
  47 + RawParagraph s -> if mode = Raw then s else ""
  48 + | StructParagraph p_records -> String.concat "\n\n" @@ Xlist.map p_records (string_of_p_record mode tokens)
  49 + | AltParagraph alts -> alternative_string (string_of_paragraph mode) mode alts
  50 +
  51 +let rec string_of_text mode tokens = function
  52 + RawText s -> if mode = Raw then s else ""
  53 + | StructText (paragraphs,_) -> String.concat "\n\n" @@ Xlist.map paragraphs (string_of_paragraph mode tokens)
  54 + | AltText alts -> alternative_string (string_of_text mode) mode alts*)
133 55  
134 56 (******************)
135 57  
... ... @@ -207,15 +129,6 @@ let establish_next tokens paths =
207 129 let last = ExtArray.get tokens (n-1) in
208 130 ExtArray.set tokens (n-1) {last with next = last.beg + last.len}
209 131  
210   -
211   - (*let rec pom res = function
212   - h :: t -> let next = if res = []
213   - then h.beg+h.len
214   - else (List.hd res).beg in
215   - pom ({h with next = next} :: res) t
216   - | [] -> res in
217   - pom [] rev_tokens*)
218   -
219 132 let rec establish_for_token i text tokens = function
220 133 (id,_,_) :: t as l->
221 134 let h = ExtArray.get tokens id in
... ... @@ -245,15 +158,15 @@ exception ErrorInfoFile of string
245 158  
246 159 let info_file = "../corpora/info_sentences2.txt"
247 160  
248   -let info = Xstring.split "\n\n" @@ File.load_file_gen info_file
  161 +let info () = Xstring.split "\n\n" @@ File.load_file_gen info_file
249 162  
250 163 let add_to_map map info_str =
251 164 match Xstring.split "\n" info_str with
252 165 [id; text; info_token] -> StringMap.add map info_token (id, text)
253 166 | _ -> raise (ErrorInfoFile info_str)
254 167  
255   -let info_map =
256   - Xlist.fold (List.tl info) StringMap.empty add_to_map
  168 +let info_map () =
  169 + Xlist.fold (List.tl (info ())) StringMap.empty add_to_map
257 170  
258 171 let match_sentence (p_record,tokens) =
259 172 let rec info_token s = match s with
... ... @@ -268,7 +181,7 @@ let match_sentence (p_record,tokens) =
268 181 let info_token, paths = info_token p_record.sentence in
269 182 (* try *)
270 183 let id, text = try
271   - StringMap.find info_map info_token
  184 + StringMap.find (info_map ()) info_token
272 185 with
273 186 | _ -> p_record.id, get_text tokens in
274 187 let beg, len = establish_lengths text paths tokens (* -1, -1, p_record.psentence *) in
... ... @@ -282,7 +195,7 @@ let match_corpus corpus =
282 195 [] -> []
283 196 | a::l -> try
284 197 let r = f a in r :: pom f l
285   - with e -> (*print_endline (Printexc.to_string e);*) pom f l in
  198 + with e -> pom f l in
286 199 pom match_sentence corpus
287 200  
288 201 (******************)
... ... @@ -304,7 +217,6 @@ let load_token in_channel =
304 217 else [Xlist.map (Xstring.split_delim "|" interp) (fun tag -> [tag])] in
305 218 {empty_token_env with orth = orth; token = Lemma(lemma,cat,interp);}, int_of_string id, int_of_super super, label in
306 219 let line = input_line in_channel in
307   - (* print_endline ("load_token: " ^ line); *)
308 220 if line = ""
309 221 then raise Empty_line
310 222 else if line.[0] = '#'
... ... @@ -329,30 +241,19 @@ let load_token in_channel =
329 241 let label = Xstring.cut_sufix "_" label_err in
330 242 n_token id orth lemma cat interp super label)
331 243 | _ -> failwith ("load_token: " ^ line)
332   -(* {c_id = List.nth pom 1;
333   - c_lemma = List.nth pom 2;
334   - c_cat = List.nth pom 3;
335   - c_interp = (let interp = List.nth pom 5 in
336   - if interp = "_"
337   - then []
338   - else Str.split (Str.regexp "|") interp);
339   - c_super = -1; c_label = ""; c_beg = -1; c_len = -1} *)
340 244  
341 245 let load_sentence in_channel =
342 246 let tokens = ExtArray.make 100 empty_token_env in
343 247 let _ = ExtArray.add tokens {empty_token_env with token = Interp "<conll_root>"} in
344 248 let rec pom rev_paths id =
345   - (* print_endline "pom 1"; *)
346 249 try
347   - (* print_endline "pom 2"; *)
348 250 let token, conll_id, super, label = load_token in_channel in
349 251 let id_a = ExtArray.add tokens token in
350 252 if id_a <> conll_id then failwith "load_sentence: different ids" else
351   - (* print_endline "pom 3"; *)
352 253 pom ((id_a,super,label) :: rev_paths) id
353   - with Id_line new_id -> (*print_endline "pom 4";*)pom rev_paths new_id
354   - | Empty_line -> (*print_endline "pom 5";*)rev_paths, id
355   - | End_of_file -> (*print_endline "pom 6";*)if rev_paths = []
  254 + with Id_line new_id -> pom rev_paths new_id
  255 + | Empty_line -> rev_paths, id
  256 + | End_of_file -> if rev_paths = []
356 257 then raise End_of_file
357 258 else rev_paths, id in
358 259 let rev_paths, id = pom [] "" in
... ... @@ -366,4 +267,4 @@ let load_corpus in_channel =
366 267 pom ((conll_sentence, tokens) :: res)
367 268 with End_of_file -> res
368 269 | e -> prerr_endline (Printexc.to_string e); res in
369   - (* match_corpus @@ *) List.rev @@ pom []
  270 + List.rev @@ pom []
... ...
corpora/CONLL_adapter.ml
  1 +open Xstd
  2 +open ENIAMsubsyntaxTypes
  3 +open ENIAMtokenizerTypes
1 4  
2   -let convert_dep_tree id first_try paths tokens lex_sems =
3   - let do_if cond f paths = if cond then f paths tokens else paths in
  5 +let if_lemma lemmas = function
  6 + Lemma(l,_,_) -> List.exists (fun x -> x = l) lemmas
  7 + | _ -> false
  8 +
  9 +let if_cat cats = function
  10 + Lemma(_,cat,_) -> List.exists (fun x -> x = cat) cats
  11 + | _ -> false
  12 +
  13 +let if_interps interps token =
  14 + let interp = match token with
  15 + Lemma(_,_,i) -> i
  16 + | _ -> [[[]]] in
  17 + let if_interp nr value =
  18 + List.exists (fun x ->
  19 + try
  20 + List.exists (fun y ->
  21 + y = value) (List.nth x nr)
  22 + with _ -> false
  23 + ) interp in
  24 + Xlist.fold interps true (fun acc (nr,value) -> acc && (if_interp nr value))
  25 +
  26 +let correct_coordination1 paths tokens =
  27 + let paths_ls = List.mapi (fun i (id,super,label) ->
  28 + (i,id,super,label)) (Array.to_list paths) in
  29 +
  30 + let l = [("subst:nom",0),(["fin";"praet"],0);
  31 + ("subst:acc",0),(["inf"],0);
  32 + ("ppron3:nom",0),(["fin";"praet"],0);
  33 + ("ppron3:acc",0),(["fin";"praet"],0);
  34 + ("adv",0),(["fin";"praet"],0);
  35 + ("adv",0),(["inf"],0);
  36 + ("adv",0),(["adj"],0);
  37 + ("prep",0),(["fin";"praet"],0);
  38 + ("prep",0),(["inf"],0);
  39 + ("prep",0),(["ppas"],0);
  40 + ("prep",0),(["subst"],0);
  41 + ("prep:gen",0),(["subst:gen"],0);
  42 + ("adj:nom",0),(["fin";"praet"],0);
  43 + ("adj:nom",0),(["subst:nom"],0);
  44 + ("adj:gen",0),(["subst:gen"],0);
  45 + ("adj:dat",0),(["subst:dat"],0);
  46 + ("adj:acc",0),(["subst:acc"],0);
  47 + ("adj:inst",0),(["subst:inst"],0);
  48 + ("adj:loc",0),(["subst:loc"],0);
  49 + ("subst:gen",0),(["subst:nom"],0);
  50 + (* ("subst:gen",0),(["subst:gen"],0); *)
  51 + ("subst:gen",0),(["subst:dat"],0);
  52 + ("subst:gen",0),(["subst:acc"],0);
  53 + ("subst:gen",0),(["subst:inst"],0);
  54 + ("subst:gen",0),(["subst:loc"],0);
  55 + ("ppron3:gen",0),(["subst:nom"],0);
  56 + ("ppron3:gen",0),(["subst:dat"],0);
  57 + ("ppron3:gen",0),(["subst:acc"],0);
  58 + ("ppron3:gen",0),(["subst:inst"],0);
  59 + ("ppron3:gen",0),(["subst:loc"],0);
  60 + ("qub",0),(["fin";"praet"],0);
  61 + ("qub",0),(["subst"],0);
  62 + ("qub",0),(["adj"],0);
  63 + ("pact",0),(["subst"],0);
  64 + ("ppas",0),(["subst"],0)
  65 + ] in
  66 +
  67 + let find_dependents sons =
  68 +
  69 + let is (i,id,super,label) pattern = match Xstring.split ":" pattern with
  70 + ["prep";case] -> if_cat ["prep"] (ExtArray.get tokens id).token &&
  71 + if_interps [0,case] (ExtArray.get tokens id).token
  72 + | [cat;case] -> if_cat [cat] (ExtArray.get tokens id).token &&
  73 + if_interps [1,case] (ExtArray.get tokens id).token
  74 + | [cat] -> if_cat [cat] (ExtArray.get tokens id).token
  75 + | _ -> failwith "is (in correct_coordination1)" in
  76 +
  77 + let incr_representative acc son = Xlist.map acc (fun ((one,a),(rest,b)) ->
  78 + if is son one
  79 + then (one,a + 1), (rest,b)
  80 + else if List.exists (is son) rest
  81 + then (one,a), (rest,b + 1)
  82 + else (one,a), (rest,b)) in
  83 +
  84 + let get_from sons pattern = List.find (fun x -> is x pattern) sons in
  85 +
  86 + let l = Xlist.fold sons l incr_representative in
  87 + let results = List.filter (fun ((_,a),(_,b)) -> a = 1 && b > 1) l in
  88 + Xlist.map results (fun result ->
  89 + get_from sons @@ fst @@ fst result,
  90 + List.filter (fun son ->
  91 + List.exists (fun one -> is son one) (fst (snd result))) sons) in
  92 +
  93 + let establish_neighbour super ((i_d,id_d,super_d,label_d),sons) =
  94 + let not_between (i_s,_,_,_) =
  95 + (super < i_d && super < i_s) ||
  96 + (super > i_d && super > i_s) in
  97 + let (i_n,id_n,super_n,label_n) = List.find (fun son ->
  98 + not_between son) sons in
  99 + paths.(i_d) <- (id_d, i_n, label_d) in
  100 +
  101 + let examine_coords (i,id,super,label) sons =
  102 + try
  103 + let dependents = find_dependents sons in
  104 + Xlist.iter dependents (establish_neighbour super)
  105 + with
  106 + | _ -> () in
  107 +
  108 + Array.iteri (fun i (id,super,label) ->
  109 + if if_cat ["conj"] (ExtArray.get tokens id).token
  110 + then (let sons = List.filter (fun (_,_,super,_) -> super = i) paths_ls in
  111 + if (List.length sons > 2)
  112 + then examine_coords (i,id,super,label) sons)) paths;
  113 + paths
  114 +
  115 +let correct_coordination2 paths tokens =
  116 + let paths_c = Array.copy paths in
  117 + let paths_ls () = List.mapi (fun i (id,super,label) ->
  118 + (i,id,super,label)) (Array.to_list paths_c) in
  119 +
  120 + (* let ps a sons =
  121 + print_endline a;
  122 + List.iter (fun (i,_,_,_) -> print_endline (ExtArray.get tokens i).orth) sons;
  123 + print_endline "" in *)
  124 +
  125 + let rec correct_rec (i,id,super,label) sons =
  126 + let left_s, right_s = List.partition (fun (a,b,c,d) -> a < i) sons in
  127 + (* ps "left:" (List.rev left_s);
  128 + ps "right:" right_s; *)
  129 + find_father i (List.rev left_s);
  130 + find_father i right_s
  131 +
  132 + and find_father i0 = function
  133 + [(i,id,super,label)] -> paths_c.(i) <- (id,i0,label)
  134 + | (a,b,c,d) :: (i,id,super,label) :: t ->
  135 + paths_c.(i) <- (id,i0,label);
  136 + if not (if_cat ["conj"] (ExtArray.get tokens i).token ||
  137 + (ExtArray.get tokens i).orth = ",")
  138 + then failwith "find_father";
  139 + correct_rec (i,id,super,label) (if a < i
  140 + then (a,b,c,d) :: t
  141 + else List.rev @@ (a,b,c,d) :: t)
  142 + | _ -> failwith "find_father" in
  143 +
  144 + let check_previous_for_interp i =
  145 + if i >= 0 && (ExtArray.get tokens i).orth = "," &&
  146 + not (List.exists (fun (_,super,_) -> super = i) (Array.to_list paths_c))
  147 + then paths_c.(i) <- (0,-1,"") in
  148 +
  149 + Array.iteri (fun i (id,super,label) ->
  150 + if if_cat ["conj"] (ExtArray.get tokens i).token ||
  151 + (ExtArray.get tokens i).orth = ","
  152 + then
  153 + (check_previous_for_interp (i-1);
  154 + let sons = List.filter (fun (_,_,super,_) -> super = i) (paths_ls ()) in
  155 + if (List.length sons > 2)
  156 + then correct_rec (i,id,super,label) sons)) paths_c;
  157 + paths_c
  158 +
  159 +let praet_qub_aglt paths tokens =
  160 + Array.iteri (fun i (id,super,label) ->
  161 + if super >= 0 then
  162 + (let id_s, super_s, label_s = paths.(super) in
  163 + if if_cat ["aglt"] (ExtArray.get tokens id).token &&
  164 + (ExtArray.get tokens id_s).orth = "by"
  165 + then let id_gf,super_gf,label_gf = paths.(super_s) in
  166 + if if_cat ["praet"] (ExtArray.get tokens id_gf).token
  167 + then paths.(i) <- (id,super_s,label))) paths;
  168 + paths
  169 +
  170 +let replace_tokens paths tokens =
  171 +(* for i = 0 to ExtArray.size tokens - 1 do
  172 + print_endline (string_of_int i ^ ": "^ (ExtArray.get tokens i).orth)
  173 +done; *)
  174 + let find_token orth = Int.fold 0 (ExtArray.size tokens - 1) 0 (fun acc i ->
  175 + if (ExtArray.get tokens i).orth = orth then i else acc) in
  176 +
  177 + let multidot i id0 super0 label0 =
  178 + let id1, super1, label1 = paths.(super0) in
  179 + if super1 >= 0 then
  180 + let id2, super2, label2 = paths.(super1) in
  181 + if (ExtArray.get tokens id1).orth = "." &&
  182 + (ExtArray.get tokens id2).orth = "."
  183 + then
  184 + (paths.(super1) <- (find_token "..." ,super2, label2);
  185 + paths.(super0) <- (0,-1,"");
  186 + paths.(i) <- (0,-1,"")) in
  187 +
  188 + let brev i id super label =
  189 + let if_the_last_dot () =
  190 + let (id_dot, s_dot, l_dot) = List.find (fun (i2,s,l) ->
  191 + s = i && ((ExtArray.get tokens i2).orth = "." || (ExtArray.get tokens i2).orth = "...")) (Array.to_list paths) in
  192 + Array.fold_left (fun acc (i2,s,l) ->
  193 + acc && (ExtArray.get tokens i2).beg <= (ExtArray.get tokens id_dot).beg) true paths in
  194 +
  195 + let dot = if if_interps [0,"npun"] (ExtArray.get tokens id).token || if_the_last_dot ()
  196 + then ""
  197 + else "." in
  198 +
  199 + let n_orth = (ExtArray.get tokens id).orth ^ dot in
  200 + paths.(i) <- (find_token n_orth,super,label) in
  201 +
  202 + Array.iteri (fun i (id,super,label) ->
  203 + if (ExtArray.get tokens id).orth = "."
  204 + then multidot i id super label;
  205 + if if_cat ["brev"] (ExtArray.get tokens id).token
  206 + then brev i id super label)
  207 + paths;
  208 + paths
  209 +
  210 +let replace_hyphens paths tokens =
  211 + let ref_paths = ref paths in
  212 + let find_token token = Int.fold 0 (ExtArray.size tokens - 1) 0 (fun acc i ->
  213 + if (ExtArray.get tokens i).token = token then i else acc) in
  214 + let find_specific_token token beg next = Int.fold 0 (ExtArray.size tokens - 1) 0 (fun acc i ->
  215 + if (ExtArray.get tokens i).token = token &&
  216 + beg <= (ExtArray.get tokens i).beg &&
  217 + (ExtArray.get tokens i).next <= next
  218 + then i else acc) in
  219 +
  220 + let correct_last sons_of_zero = (* TODO: handle multiple sons, not just a single son *)
  221 + let i1,s1,l1 = !ref_paths.(Array.length !ref_paths - 1) in
  222 + if (ExtArray.get tokens i1).orth = "."
  223 + then
  224 + !ref_paths.(Array.length !ref_paths - 1) <- (find_token (Interp "</sentence>"),1,l1)
  225 + else
  226 + (ref_paths := Array.append !ref_paths [| (find_token (Interp "</sentence>"),1,"-") |];
  227 + !ref_paths.(Array.length !ref_paths - 2) <- (i1,Array.length !ref_paths - 1,l1));
  228 + Xlist.iter sons_of_zero (fun son_of_zero ->
  229 + let i2,s2,l2 = !ref_paths.(son_of_zero) in
  230 + !ref_paths.(son_of_zero) <- (i2,Array.length !ref_paths - 1,l2)) in
  231 +
  232 + let one_hyphen sons_of_zero =
  233 + let i2,s2,l2 = !ref_paths.(1) in
  234 + Xlist.iter sons_of_zero (fun son_of_zero ->
  235 + let i1,s1,l1 = !ref_paths.(son_of_zero) in
  236 + !ref_paths.(son_of_zero) <- (i1,1,l1));
  237 + !ref_paths.(1) <- (find_token (Interp "<or-sentence>"),0,l2);
  238 + correct_last sons_of_zero in
  239 +
  240 + let two_hyphens first second son parent =
  241 + let i1,s1,l1 = !ref_paths.(first) in
  242 + let i2,s2,l2 = !ref_paths.(second) in
  243 + let beg, next = (ExtArray.get tokens i2).beg, (ExtArray.get tokens i2).next in
  244 + let i3,s3,l3 = !ref_paths.(son) in
  245 + let i4,s4,l4 = !ref_paths.(parent) in
  246 + ref_paths := Array.append !ref_paths [| (find_token (Interp "</sentence>"),first,"-") |];
  247 + !ref_paths.(first) <- (find_token (Interp "<or-sentence>"),0,l1);
  248 + !ref_paths.(second) <- (find_specific_token (Interp "</or-sentence>") beg next,first,l2);
  249 + !ref_paths.(son) <- (i3,second,l3);
  250 + !ref_paths.(parent) <- (i4,first,l4) in
  251 +
  252 + let rec is_dep_correct a b out zero res i (id,super,label) = (* out = how many words in (a,b) have parent outside [a,b]*)
  253 + (* print_endline ((string_of_int a) ^ " " ^ (string_of_int b) ^ " " ^ (string_of_int out) ^ " " ^ (string_of_int zero) ^ " " ^ (string_of_int i)); *)
  254 + if out > 1 || zero > 1 || (* zero = how many words (not interps) have parent 0 *)
  255 + (a < i && i < b && super < a && label <> "interp") ||
  256 + (a < super && super < b && (i < a || b < i))
  257 + then false, res
  258 + else
  259 + if i+1 = Array.length !ref_paths
  260 + then out = 1 && zero = 1, res
  261 + else
  262 + if a < i && i < b && b < super
  263 + then is_dep_correct a b (out+1) zero (i,super) (i+1) !ref_paths.(i+1)
  264 + else
  265 + if super = 0 && not (if_cat ["interp"] (ExtArray.get tokens id).token)
  266 + then is_dep_correct a b out (zero+1) res (i+1) !ref_paths.(i+1)
  267 + else is_dep_correct a b out zero res (i+1) !ref_paths.(i+1) in
  268 +
  269 + let hyphens = snd @@ Array.fold_left (fun (i,acc) (id,super,label) ->
  270 + if (ExtArray.get tokens id).orth = "-"
  271 + then i+1, i :: acc
  272 + else i+1, acc) (0,[]) !ref_paths in
  273 +
  274 + let sons_of_zero = snd @@ Array.fold_left (fun (i,acc) (id,super,label) ->
  275 + if super = 0 && not (if_cat ["interp"] (ExtArray.get tokens id).token)
  276 + then i+1, i :: acc
  277 + else i+1, acc) (0,[]) !ref_paths in
  278 +
  279 + (if List.length sons_of_zero = 1
  280 + then
  281 + if List.length hyphens = 1 && hyphens = [1]
  282 + then one_hyphen sons_of_zero
  283 + else
  284 + if List.length hyphens = 2
  285 + then let a, b = List.nth hyphens 1, List.nth hyphens 0 in
  286 + let is_good, (son,parent) = is_dep_correct a b 0 0 (0,0) 1 !ref_paths.(1) in
  287 + if a = 1 && is_good
  288 + then two_hyphens a b son parent);
  289 + !ref_paths
  290 +
  291 +let correct_interp_with_father_0 paths tokens =
  292 + Array.iteri (fun i (id,super,label) ->
  293 + if (super = 0 ||
  294 + (ExtArray.get tokens id).token = Interp "<or-sentence>" ||
  295 + (ExtArray.get tokens id).token = Interp "</or-sentence>") && (ExtArray.get tokens id).orth = ","
  296 + then Array.iteri (fun i1 (id1,super1,label1) ->
  297 + if super1 = i
  298 + then paths.(i1) <- (id1,0,label1)) paths) paths;
  299 + paths
  300 +
  301 +let remove_interps interp paths tokens =
  302 + let paths_ls = Array.to_list paths in
  303 + Array.iteri (fun i (id,super,label) ->
  304 + if (ExtArray.get tokens id).orth = interp &&
  305 + not (List.exists (fun (_,super,_) -> super = i) paths_ls)
  306 + then paths.(i) <- (0,-1,"")) paths;
  307 + paths
  308 +
  309 +let correct_passive_voice paths tokens =
  310 + Array.iteri (fun i (id,super,label) ->
  311 + if super >= 0 then
  312 + (let id_s, super_s, label_s = paths.(super) in
  313 + if (if_cat ["praet"] (ExtArray.get tokens id).token &&
  314 + if_cat ["ppas"] (ExtArray.get tokens id_s).token)
  315 + then (paths.(i) <- (id,super_s,label);
  316 + paths.(super) <- (id_s,i,label_s);
  317 + Array.iteri (fun i_p (id_p,super_p,label_p) ->
  318 + if super_p = super
  319 + then paths.(i_p) <- (id_p,i,label_p)) paths))) paths;
  320 + paths
  321 +
  322 +let swap_dep paths tokens =
  323 + let change_dep i (id,super,label) =
  324 + let id_S, super_S, label_S = paths.(super) in
  325 + paths.(i) <- (id,super_S,label);
  326 + paths.(super) <- (id_S, id, label_S) in
  327 + let rec correct_dep i (id,super,label) =
  328 + let adv_relators = ["kto";"co";"ile";"czyj";"jaki";"który";
  329 + "jak";"skąd";"dokąd";"gdzie";"którędy";"kiedy";"odkąd";"dlaczego";"czemu";"gdy"] in
  330 + if (if_cat ["comp"] (ExtArray.get tokens id).token &&
  331 + if_cat ["fin"; "praet"; "winien"; "pred"; "imps"; "ppas"] (ExtArray.get tokens super).token) ||
  332 + (if_cat ["conj"] (ExtArray.get tokens id).token &&
  333 + if_cat ["fin"; "praet"; "winien"; "pred"; "imps"; "ppas"] (ExtArray.get tokens super).token &&
  334 + not (List.exists (fun (_,super,_) -> super = i) (Array.to_list paths))) ||
  335 + (if_cat ["ppron3"] (ExtArray.get tokens id).token &&
  336 + if_interps [5,"praep"] (ExtArray.get tokens id).token) ||
  337 + (if_lemma adv_relators (ExtArray.get tokens id).token &&
  338 + if_cat ["fin"; "praet"; "winien"; "pred"; "imps"; "ppas"; "subst"] (ExtArray.get tokens super).token)
  339 + then
  340 + change_dep i (id,super,label);
  341 + if (if_lemma adv_relators (ExtArray.get tokens id).token &&
  342 + if_cat ["subst"; "pred"] (ExtArray.get tokens super).token)
  343 + then correct_dep i paths.(i) in
  344 + Array.iteri correct_dep paths; paths
  345 +
  346 + (*
  347 + correct_coordination1 -> the neighbour is the nearest word to the right; if a conjunction stands between it and me, then the nearest to the left
  348 + passive-voice coordination is not handled for now - both auxiliary verbs and participles
  349 + coordination of dependents of subordinating conjunctions is not handled for now *)
  350 +
  351 +let convert_dep_tree id first_try paths tokens =
4 352 let paths = Array.copy paths in
5   - let paths = do_if first_try TreeChange.replace_tokens paths in
6   - let paths = do_if first_try (TreeChange.remove_interps ".") paths in
7   - let paths = do_if first_try TreeChange.replace_hyphens paths in
8   - let paths = do_if first_try TreeChange.correct_coordination1 paths in
9   - let paths = do_if first_try TreeChange.correct_interp_with_father_0 paths in
10   - let paths = do_if first_try TreeChange.correct_coordination2 paths in
11   - let paths = do_if first_try (TreeChange.remove_interps ",") paths in
12   - let paths = do_if first_try TreeChange.correct_passive_voice paths in
13   - let paths = do_if first_try TreeChange.praet_qub_aglt paths in
14   - let paths = do_if (not first_try) TreeChange.swap_dep paths in
15   - File.file_out ("results/" ^ id ^ "/pre_text_modified_" ^ (string_of_bool first_try) ^ ".html") (fun file ->
16   - fprintf file "%s\n" Visualization.html_header;
17   - fprintf file "%s\n" (Visualization.html_of_dep_sentence tokens paths);
18   - fprintf file "%s\n" Visualization.html_trailer);
19   - (* let paths = do_if first_try TreeChange.replace_tokens paths in
20   - let paths = do_if first_try TreeChange.replace_hyphens paths in
21   - let paths = do_if first_try (TreeChange.remove_interps ".") paths in
22   - let paths = do_if (not first_try) TreeChange.swap_dep paths in
23   - let paths = do_if first_try TreeChange.correct_coordination1 paths in
24   - let paths = try
25   - do_if first_try TreeChange.correct_coordination2 paths
26   - with
27   - | _ -> (
28   - File.file_out ("results/" ^ id ^ "/pre_text_modified_" ^ (string_of_bool first_try) ^ ".html") (fun file ->
29   - fprintf file "%s\n" Visualization.html_header;
30   - fprintf file "%s\n" (Visualization.html_of_dep_sentence tokens paths);
31   - fprintf file "%s\n" Visualization.html_trailer);
32   - do_if first_try TreeChange.correct_interp_with_father_0 paths;
33   - do_if first_try (TreeChange.remove_interps ",") paths;
34   - File.file_out ("results/" ^ id ^ "/pre_text_modified_" ^ (string_of_bool first_try) ^ "2.html") (fun file ->
35   - fprintf file "%s\n" Visualization.html_header;
36   - fprintf file "%s\n" (Visualization.html_of_dep_sentence tokens paths);
37   - fprintf file "%s\n" Visualization.html_trailer);
38   - do_if first_try TreeChange.correct_coordination2 paths) in
39   - let paths = do_if first_try TreeChange.praet_qub_aglt paths in
40   - let paths = do_if first_try TreeChange.correct_interp_with_father_0 paths in
41   - let paths = do_if first_try (TreeChange.remove_interps ",") paths in
42   - let paths = do_if first_try (TreeChange.remove_interps "-") paths in
43   - let paths = do_if first_try TreeChange.correct_passive_voice paths in
44   - File.file_out ("results/" ^ id ^ "/pre_text_modified_" ^ (string_of_bool first_try) ^ ".html") (fun file ->
45   - fprintf file "%s\n" Visualization.html_header;
46   - fprintf file "%s\n" (Visualization.html_of_dep_sentence tokens paths);
47   - fprintf file "%s\n" Visualization.html_trailer); *)
  353 + let paths =
  354 + if first_try
  355 + then
  356 + let pom = replace_tokens paths tokens in
  357 + let pom = (remove_interps ".") pom tokens in
  358 + let pom = replace_hyphens pom tokens in
  359 + let pom = correct_coordination1 pom tokens in
  360 + let pom = correct_interp_with_father_0 pom tokens in
  361 + let pom = correct_coordination2 pom tokens in
  362 + let pom = remove_interps "," pom tokens in
  363 + let pom = correct_passive_voice pom tokens in
  364 + praet_qub_aglt pom tokens
  365 + else
  366 + swap_dep paths tokens in
  367 + (* File.file_out ("results/" ^ id ^ "/pre_text_modified_" ^ (string_of_bool first_try) ^ ".html") (fun file ->
  368 + Printf.fprintf file "%s\n" Visualization.html_header;
  369 + Printf.fprintf file "%s\n" (Visualization.html_of_dep_sentence tokens paths);
  370 + Printf.fprintf file "%s\n" Visualization.html_trailer); *)
  371 + paths
... ...
diagnostics/LCGfields.ml renamed to corpora/LCGfields.ml
... ... @@ -83,7 +83,7 @@ let field_of_dependency_tree str_node fields dep_tree =
83 83 Array.fold_left (fun acc x ->
84 84 acc ^ (field_of_linear_term str_node field x) ^ "\n\t\t" ) "" dep_tree))
85 85  
86   -let field_of_eniam_sentence fields tokens (result : eniam_parse_result) =
  86 +let field_of_eniam_sentence fields (result : eniam_parse_result) =
87 87 match result.status with
88 88 Idle -> "Idle"
89 89 (* | PreprocessingError -> "PreprocessingError" *)
... ... @@ -99,7 +99,7 @@ let field_of_eniam_sentence fields tokens (result : eniam_parse_result) =
99 99 | Parsed -> ignore ("Parsed\n\t\t" ^ (field_of_dependency_tree eniam fields result.dependency_tree)); "Parsed\n"
100 100 | _ -> failwith "field_of_eniam_sentence"
101 101  
102   -let field_of_conll_sentence fields tokens (result : conll_parse_result) =
  102 +let field_of_conll_sentence fields (result : conll_parse_result) =
103 103 stat_map := StatMap.add !stat_map result.status;
104 104 match result.status with
105 105 Idle -> "Idle"
... ... @@ -117,33 +117,36 @@ let field_of_conll_sentence fields tokens (result : conll_parse_result) =
117 117 | _ -> failwith "field_of_conll_sentence"
118 118  
119 119  
120   -let rec field_of_sentence fields tokens = function
  120 +let rec field_of_sentence fields = function
121 121 RawSentence s -> s
122 122 | StructSentence _ -> "StructSentence"
123 123 | DepSentence _ -> "DepSentence"
124   - | ENIAMSentence result -> field_of_eniam_sentence fields tokens result
125   - | CONLLSentence result -> field_of_conll_sentence fields tokens result
  124 + | ENIAMSentence result -> field_of_eniam_sentence fields result
  125 + | CONLLSentence result -> field_of_conll_sentence fields result
126 126 | QuotedSentences sentences -> "QuotedSentences"
127 127 | AltSentence l -> String.concat "\n\t" (Xlist.map l (fun (m, s) ->
128   - Visualization.string_of_mode m ^ "\t" ^ (field_of_sentence fields tokens s)))
  128 + Visualization.string_of_mode m ^ "\t" ^ (field_of_sentence fields s)))
129 129 | _ -> failwith "field_of_sentence: ni"
130 130  
131   -let rec field_of_paragraph fields tokens = function
  131 +let rec field_of_paragraph fields = function
132 132 RawParagraph s -> print_endline "no fields detected: only raw paragraph"; s
133 133 | StructParagraph sentences ->
134   - String.concat "\n\t" (Xlist.map sentences (fun p -> field_of_sentence fields tokens p.psentence))
  134 + String.concat "\n\t" (Xlist.map sentences (fun p -> field_of_sentence fields p.psentence))
135 135 | AltParagraph l ->
136 136 String.concat "\n" (Xlist.map (List.filter (fun (m,t) -> (*m = ENIAM ||*) m = CONLL) l) (fun (m,t) ->
137   - Visualization.string_of_mode m ^ "\n\t" ^ (field_of_paragraph fields tokens t)))
138   - (* field_of_paragraph fields tokens (snd @@ List.find (fun (mode,text) -> mode = ENIAM || mode = CONLL) l) *)
  137 + Visualization.string_of_mode m ^ "\n\t" ^ (field_of_paragraph fields t)))
  138 + (* field_of_paragraph fields (snd @@ List.find (fun (mode,text) -> mode = ENIAM || mode = CONLL) l) *)
139 139  
140 140 let rec print_fields_rec fields = function
141   - RawText s -> print_endline "no fields detected: only raw text";
142   -| StructText(paragraphs,tokens) ->
143   - print_endline (String.concat "\n\n" (Xlist.map paragraphs (field_of_paragraph fields tokens)) ^ "\n")
  141 + RawText s -> s
  142 + (* print_endline "no fields detected: only raw text"; *)
  143 +| StructText(paragraphs) ->
  144 + String.concat "\n\n" (Xlist.map paragraphs (field_of_paragraph fields)) ^ "\n"
144 145 | AltText l ->
145   - print_fields_rec fields (snd @@ List.find (fun (m,t) -> m = Struct (*|| m = ENIAM*) || m = CONLL) l)
  146 + String.concat "\n" (Xlist.map (List.filter (fun (m,t) -> m = Struct || m = CONLL) l) (fun (m,t) ->
  147 + Visualization.string_of_mode m ^ "\n\t" ^ (print_fields_rec fields t)))
  148 + (* print_fields_rec fields (snd @@ List.find (fun (m,t) -> m = Struct (*|| m = ENIAM*) || m = CONLL) l) *)
146 149  
147 150 let print_fields fields text =
148   - print_fields_rec fields text
  151 + print_endline @@ print_fields_rec fields text
149 152 (* ; print_field_map () *)
... ...
corpora/makefile
... ... @@ -16,9 +16,9 @@ lib:
16 16 freq_test:
17 17 $(OCAMLOPT) -o freq_test $(OCAMLOPTFLAGS) $(MODS) freq_test.ml
18 18  
19   -test: CONLL.ml test_conll2.ml
  19 +test: CONLL.ml CONLL_adapter.ml test_conll2.ml
20 20 mkdir -p results
21   - $(OCAMLOPT) -o test $(OCAMLOPTFLAGS) CONLL.ml test_conll2.ml
  21 + $(OCAMLOPT) -o test $(OCAMLOPTFLAGS) $^
22 22  
23 23  
24 24 .SUFFIXES: .mll .mly .ml .mli .cmo .cmi .cmx
... ...
corpora/test_conll.ml
  1 +open Xstd
  2 +open ENIAMsubsyntaxTypes
  3 +open ENIAMtokenizerTypes
  4 +open LCGtypes
  5 +open ExecTypes
1 6  
2 7 let empty_result = {
3 8 input_text=RawText "";
... ... @@ -146,7 +151,7 @@ let eniam_parse_sentence timeout test_only_flag paths last tokens lex_sems =
146 151 let rec conll_parse_sentence timeout test_only_flag id first_try paths tokens lex_sems =
147 152 let result = empty_conll_parse_result in
148 153 let time2 = time_fun () in
149   - let paths = CONLL_adapter.convert_dep_tree id first_try paths tokens lex_sems
  154 + let paths = CONLL_adapter.convert_dep_tree id first_try paths tokens lex_sems in
150 155 try
151 156 let dep_chart = LCGlexicon.dep_create paths tokens lex_sems in
152 157 let dep_chart,references = LCGchart.dep_lazify dep_chart in
... ... @@ -193,7 +198,7 @@ let rec conll_parse_sentence timeout test_only_flag id first_try paths tokens le
193 198 let time5 = time_fun () in
194 199 {result with status=ReductionError; msg=Printexc.to_string e; reduction_time=time5 -. time4}
195 200 else if first_try
196   - then conll_parse_sentence timeout test_only_flag id false paths tokens
  201 + then conll_parse_sentence timeout test_only_flag id false paths tokens lex_sems
197 202 else {result with status=NotParsed}
198 203 with
199 204 Timeout t ->
... ... @@ -201,7 +206,7 @@ let rec conll_parse_sentence timeout test_only_flag id first_try paths tokens le
201 206 {result with status=ParseTimeout; msg=Printf.sprintf "%f" t; parse_time=time4 -. time3}
202 207 | NotDepParsed(id_ndp,left,l,right) ->
203 208 if first_try
204   - then conll_parse_sentence timeout test_only_flag id false paths tokens
  209 + then conll_parse_sentence timeout test_only_flag id false paths tokens lex_sems
205 210 else let time4 = time_fun () in
206 211 {result with status=NotParsed; not_parsed_dep_chart=(id_ndp,left,l,right); parse_time=time4 -. time3}
207 212 | e ->
... ... @@ -210,7 +215,7 @@ let rec conll_parse_sentence timeout test_only_flag id first_try paths tokens le
210 215 with e -> (*print_endline (Printexc.to_string e);*)
211 216 let time3 = time_fun () in
212 217 if first_try
213   - then conll_parse_sentence timeout test_only_flag id false paths tokens
  218 + then conll_parse_sentence timeout test_only_flag id false paths tokens lex_sems
214 219 else {result with status=LexiconError; msg=Printexc.to_string e; lex_time=time3 -. time2}
215 220  
216 221  
... ... @@ -243,11 +248,7 @@ let get_paths old_paths = function
243 248 paths
244 249 | _ -> failwith "get_paths"
245 250  
246   -<<<<<<< HEAD
247   -let rec parse_sentence timeout test_only_flag mode file_prefix tokens lex_sems = function
248   -=======
249   -let rec parse_sentence timeout test_only_flag mode id file_prefix tokens = function
250   ->>>>>>> dep_trees
  251 +let rec parse_sentence timeout test_only_flag mode id file_prefix tokens lex_sems = function
251 252 RawSentence s ->
252 253 (match mode with
253 254 Swigra ->
... ... @@ -259,23 +260,15 @@ let rec parse_sentence timeout test_only_flag mode id file_prefix tokens = funct
259 260 | StructSentence(paths,last) ->
260 261 (match mode with
261 262 ENIAM ->
262   -<<<<<<< HEAD
263 263 let result = eniam_parse_sentence timeout test_only_flag paths last tokens lex_sems in
264   -=======
265   - let result = empty_eniam_parse_result in
266   - (* let result = print_endline "eniam_parse_sentence"; eniam_parse_sentence timeout test_only_flag paths last tokens in *)
267   ->>>>>>> dep_trees
  264 + (* let result = empty_eniam_parse_result in *)
268 265 let result = {result with file_prefix = file_prefix_of_mode mode ^ file_prefix} in
269 266 ENIAMSentence result
270 267 | _ -> failwith "parse_sentence")
271 268 | DepSentence(paths) ->
272 269 (match mode with
273 270 CONLL ->
274   -<<<<<<< HEAD
275   - let result = conll_parse_sentence timeout test_only_flag paths tokens lex_sems in
276   -=======
277   - let result = (*print_endline "conll_parse_sentence";*) conll_parse_sentence timeout test_only_flag id true paths tokens in
278   ->>>>>>> dep_trees
  271 + let result = conll_parse_sentence timeout test_only_flag id true paths tokens lex_sems in
279 272 let result = {result with
280 273 file_prefix = file_prefix_of_mode mode ^ file_prefix;
281 274 paths = paths} in
... ... @@ -289,19 +282,15 @@ let rec parse_sentence timeout test_only_flag mode id file_prefix tokens = funct
289 282 if not Paths.config.Paths.mate_parser_enabled then DepSentence paths else (
290 283 print_endline "parse_sentence 1";
291 284 (* print_endline (Visualization.html_of_dep_sentence tokens paths); *)
292   - let conll = ENIAM_CONLL.string_of_paths ENIAMsubsyntaxTypes.Mate tokens paths in
  285 + let conll = CONLL.string_of_paths ENIAMsubsyntaxTypes.Mate tokens paths in
293 286 print_endline "parse_sentence 2";
294 287 (* printf "|%s|\n" conll; *)
295 288 Printf.fprintf mate_out "%s%!" conll;
296 289 print_endline "parse_sentence 3";
297   - let new_paths = get_paths paths (ENIAM_CONLL.load_sentence mate_in) in
  290 + let new_paths = get_paths paths (CONLL.load_sentence mate_in) in
298 291 print_endline "parse_sentence 4";
299 292 (* print_endline (Visualization.html_of_dep_sentence tokens new_paths); *)
300   -<<<<<<< HEAD
301   - let result = conll_parse_sentence timeout test_only_flag new_paths tokens lex_sems in
302   -=======
303   - let result = conll_parse_sentence timeout test_only_flag id true new_paths tokens in
304   ->>>>>>> dep_trees
  293 + let result = conll_parse_sentence timeout test_only_flag id true new_paths tokens lex_sems in
305 294 let result = {result with
306 295 file_prefix = file_prefix_of_mode mode ^ file_prefix;
307 296 paths=new_paths} in
... ... @@ -309,66 +298,94 @@ let rec parse_sentence timeout test_only_flag mode id file_prefix tokens = funct
309 298 | _ -> failwith "parse_sentence")
310 299 | QuotedSentences sentences ->
311 300 let sentences = Xlist.rev_map sentences (fun p ->
312   -<<<<<<< HEAD
313   - let sentence = parse_sentence timeout test_only_flag mode p.pfile_prefix tokens lex_sems p.psentence in
314   -=======
315   - let sentence = parse_sentence timeout test_only_flag mode id p.pfile_prefix tokens p.psentence in
316   ->>>>>>> dep_trees
  301 + let sentence = parse_sentence timeout test_only_flag mode id p.pfile_prefix tokens lex_sems p.psentence in
317 302 {p with psentence=sentence}) in
318 303 QuotedSentences(List.rev sentences)
319 304 | AltSentence l ->
320 305 let l = Xlist.rev_map l (fun (mode,sentence) ->
321   -<<<<<<< HEAD
322   - mode, parse_sentence timeout test_only_flag mode file_prefix tokens lex_sems sentence) in
  306 + mode, parse_sentence timeout test_only_flag mode id file_prefix tokens lex_sems sentence) in
323 307 AltSentence(List.rev l)
324 308 | _ -> failwith "parse_sentence"
325 309  
326   -let rec parse_paragraph timeout test_only_flag mode tokens lex_sems = function
  310 +let rec parse_paragraph timeout test_only_flag mode id tokens lex_sems = function
327 311 RawParagraph s -> RawParagraph s
328 312 | StructParagraph sentences ->
329 313 let sentences = Xlist.rev_map sentences (fun p ->
330   - let sentence = parse_sentence timeout test_only_flag mode p.pfile_prefix tokens lex_sems p.psentence in
331   -=======
332   - mode, parse_sentence timeout test_only_flag mode id file_prefix tokens sentence) in
333   - AltSentence(List.rev l)
334   - | _ -> failwith "parse_sentence"
  314 + let sentence = parse_sentence timeout test_only_flag mode id p.pfile_prefix tokens lex_sems p.psentence in
  315 + {p with psentence=sentence}) in
  316 + StructParagraph(List.rev sentences)
  317 + | AltParagraph l ->
  318 + let l = Xlist.rev_map l (fun (mode,paragraph) ->
  319 + mode, parse_paragraph timeout test_only_flag mode id tokens lex_sems paragraph) in
  320 + AltParagraph(List.rev l)
  321 +
  322 +let rec parse_text timeout test_only_flag mode id tokens lex_sems = function
  323 + RawText s -> RawText s
  324 + | StructText paragraphs ->
  325 + let paragraphs = Xlist.rev_map paragraphs (fun paragraph ->
  326 + parse_paragraph timeout test_only_flag mode id tokens lex_sems paragraph) in
  327 + StructText(List.rev paragraphs)
  328 + | AltText l -> AltText(Xlist.map l (fun (mode,text) ->
  329 + mode, parse_text timeout test_only_flag mode id tokens lex_sems text))
  330 +
  331 +let select_mode = function
  332 + (Raw,_),_ -> failwith "select_mode"
  333 + | _,(Raw,_) -> failwith "select_mode"
  334 + | (Struct,_),_ -> failwith "select_mode"
  335 + | _,(Struct,_) -> failwith "select_mode"
  336 + | (CONLL,s),_ -> CONLL,s
  337 + | _,(CONLL,s) -> CONLL,s
  338 + | (ENIAM,s),_ -> ENIAM,s
  339 + | _,(ENIAM,s) -> ENIAM,s
  340 + | (Swigra,s),_ -> Swigra,s
  341 + | _,(Swigra,s) -> Swigra,s
  342 + | (Mate,s),_ -> Mate,s
  343 + | _,(Mate,s) -> Mate,s
  344 + | _ -> failwith "select_mode: ni"
335 345  
336   -let rec parse_paragraph timeout test_only_flag mode id tokens = function
  346 +let rec select_sentences_sentence = function
  347 + RawSentence s -> failwith "select_sentences_sentence"
  348 + | StructSentence(paths,last) -> failwith "select_sentences_sentence"
  349 + | DepSentence paths -> failwith "select_sentences_sentence"
  350 + | QuotedSentences sentences ->
  351 + let sentences = Xlist.rev_map sentences (fun p ->
  352 + let sentence,_ = select_sentences_sentence p.psentence in
  353 + {p with psentence=sentence}) in
  354 + QuotedSentences(List.rev sentences), Parsed
  355 + | AltSentence l ->
  356 + let raw,selected = Xlist.fold l ([],[]) (fun (raw,selected) (mode,sentence) ->
  357 + if mode = Raw then (mode,sentence) :: raw, selected else
  358 + let sentence,status = select_sentences_sentence sentence in
  359 + if status <> Parsed && status <> NotTranslated then raw,selected else
  360 + match selected with
  361 + [] -> raw,[mode,sentence]
  362 + | [mode2,sentence2] -> raw,[select_mode ((mode,sentence),(mode2,sentence2))]
  363 + | _ -> failwith "select_sentences_sentence") in
  364 + AltSentence(raw @ selected), Parsed
  365 + | ENIAMSentence result -> ENIAMSentence result, result.status
  366 + | CONLLSentence result -> CONLLSentence result, result.status
  367 + | SemSentence result -> SemSentence result, result.status
  368 +
  369 +let rec select_sentences_paragraph = function
337 370 RawParagraph s -> RawParagraph s
338 371 | StructParagraph sentences ->
339 372 let sentences = Xlist.rev_map sentences (fun p ->
340   - let sentence = parse_sentence timeout test_only_flag mode id p.pfile_prefix tokens p.psentence in
341   ->>>>>>> dep_trees
  373 + let sentence,_ = select_sentences_sentence p.psentence in
342 374 {p with psentence=sentence}) in
343 375 StructParagraph(List.rev sentences)
344 376 | AltParagraph l ->
345 377 let l = Xlist.rev_map l (fun (mode,paragraph) ->
346   -<<<<<<< HEAD
347   - mode, parse_paragraph timeout test_only_flag mode tokens lex_sems paragraph) in
348   - AltParagraph(List.rev l)
349   -
350   -let rec parse_text timeout test_only_flag mode tokens lex_sems = function
351   -=======
352   - mode, parse_paragraph timeout test_only_flag mode id tokens paragraph) in
  378 + mode, select_sentences_paragraph paragraph) in
353 379 AltParagraph(List.rev l)
354 380  
355   -let rec parse_text timeout test_only_flag mode id = function
356   ->>>>>>> dep_trees
  381 +let rec select_sentences_text = function
357 382 RawText s -> RawText s
358 383 | StructText paragraphs ->
359 384 let paragraphs = Xlist.rev_map paragraphs (fun paragraph ->
360   -<<<<<<< HEAD
361   - parse_paragraph timeout test_only_flag mode tokens lex_sems paragraph) in
  385 + select_sentences_paragraph paragraph) in
362 386 StructText(List.rev paragraphs)
363 387 | AltText l -> AltText(Xlist.map l (fun (mode,text) ->
364   - mode, parse_text timeout test_only_flag mode tokens lex_sems text))
365   -=======
366   - parse_paragraph timeout test_only_flag mode id tokens paragraph) in
367   - StructText(List.rev paragraphs, tokens)
368   - | AltText l -> AltText(Xlist.map l (fun (mode,text) ->
369   - mode, parse_text timeout test_only_flag mode id text))
370   ->>>>>>> dep_trees
371   -
  388 + mode, select_sentences_text text))
372 389  
373 390 let rec extract_query_text = function
374 391 RawText s -> s
... ... @@ -392,11 +409,7 @@ let process_query pre_in pre_out timeout test_only_flag id full_query max_n =
392 409 let result = {result with pre_time1=pre_time1; pre_time2=time2 -. time1} in
393 410 if msg <> "" then {result with status=PreprocessingError; msg=msg} else (
394 411 (* print_endline "process_query 3"; *)
395   -<<<<<<< HEAD
396   - let parsed_text = parse_text timeout test_only_flag Struct tokens lex_sems (translate_text pre_text) in
397   -=======
398   - let parsed_text = parse_text timeout test_only_flag Struct id (translate_text pre_text) in
399   ->>>>>>> dep_trees
  412 + let parsed_text = parse_text timeout test_only_flag Struct id tokens lex_sems (translate_text pre_text) in
400 413 (* print_endline "process_query 4"; *)
401 414 let time3 = time_fun () in
402 415 let result = if test_only_flag then result else {result with status=Parsed; parsed_text=parsed_text} in
... ... @@ -421,23 +434,50 @@ let process_query pre_in pre_out timeout test_only_flag id full_query max_n =
421 434 let result = {result with semantic_time=time4 -. time3} in
422 435 result)
423 436  
  437 +let get_sock_addr host_name port =
  438 + let he = Unix.gethostbyname host_name in
  439 + let addr = he.Unix.h_addr_list in
  440 + Unix.ADDR_INET(addr.(0),port)
  441 +
  442 +let id_counter = ref 0
  443 +
  444 +let get_id () =
  445 + incr id_counter;
  446 + "ID_" ^ (string_of_int !id_counter)
  447 +
  448 +let get_query_id = function
  449 + ENIAMsubsyntaxTypes.AltText[_;ENIAMsubsyntaxTypes.CONLL,ENIAMsubsyntaxTypes.StructText([ENIAMsubsyntaxTypes.StructParagraph[p]])] -> if p.ENIAMsubsyntaxTypes.pid = "" then get_id () else p.ENIAMsubsyntaxTypes.pid
  450 + | ENIAMsubsyntaxTypes.AltText[ENIAMsubsyntaxTypes.CONLL,ENIAMsubsyntaxTypes.StructText([ENIAMsubsyntaxTypes.StructParagraph[p]])] -> if p.ENIAMsubsyntaxTypes.pid = "" then get_id () else p.ENIAMsubsyntaxTypes.pid
  451 + | _ -> failwith "get_query_id"
  452 +
  453 +let process_id s =
  454 + if Xstring.check_prefix "ID_" s then s else
  455 + let a,b,c = match Xstring.split_delim "/" s with
  456 + [a;b;c] -> a,b,c
  457 + | _ -> failwith ("process_id: " ^ s) in
  458 + if Xstring.check_prefix "NKJP_1M_" a && Xstring.check_prefix "morph_" b && Xstring.check_sufix "-p" b &&
  459 + Xstring.check_prefix "morph_" c && Xstring.check_sufix "-s" c then
  460 + Xstring.cut_prefix "NKJP_1M_" a ^ "." ^ Xstring.cut_sufix "-s" (Xstring.cut_prefix "morph_" c)
  461 + else failwith ("process_id: " ^ s)
424 462  
425 463 let process_conll_corpus filename =
  464 + print_endline "process_conll_corpus: START";
426 465 let corpus = File.file_in filename (fun file -> CONLL.match_corpus (CONLL.load_corpus file)) in
427   - print_endline "process_conll_corpus";
428   - let corpus = [List.hd corpus] in
  466 + print_endline "process_conll_corpus: DONE";
  467 + (* let corpus = [List.hd corpus] in *)
429 468 let ic,oc = Unix.open_connection (get_sock_addr Paths.pre_host Paths.pre_port) in
430   - Xlist.iter corpus (fun query ->
  469 + print_endline "connection_opened";
  470 + Xlist.iter corpus (fun (query,tokens) ->
431 471 let id = process_id (get_query_id query) in
432 472 let path = "results/" ^ id ^ "/" in
433 473 ignore (Sys.command ("mkdir -p " ^ path));
434   - let result = process_query ic oc 30. false "x" query 10 in
435   - Visualization.print_html_text path "input_text" result.input_text;
436   - Visualization.print_html_text path "pre_text" result.pre_text;
437   - Visualization.print_html_text path "parsed_text" result.parsed_text;
438   - Visualization.print_html_text path "selected_sent_text" result.selected_sent_text;
439   - Visualization.print_html_text path "semantic_text" result.semantic_text;
440   - Visualization.print_html_text path "selected_semantic_text" result.selected_semantic_text;
  474 + let result = process_query ic oc 30. false "x" (query,tokens) 10 in
  475 + (* Visualization.print_html_text path "input_text" result.input_text tokens;
  476 + Visualization.print_html_text path "pre_text" result.pre_text tokens;
  477 + Visualization.print_html_text path "parsed_text" result.parsed_text tokens;
  478 + Visualization.print_html_text path "selected_sent_text" result.selected_sent_text tokens;
  479 + Visualization.print_html_text path "semantic_text" result.semantic_text tokens;
  480 + Visualization.print_html_text path "selected_semantic_text" result.selected_semantic_text tokens; *)
441 481 (* printf "input_text:\n%s\n" (Visualization.string_of_text result.input_text);
442 482 printf "pre_text:\n%s\n" (Visualization.string_of_text result.pre_text); *)
443 483 (* Exec.print_result stdout result; *)
... ... @@ -445,13 +485,15 @@ let process_conll_corpus filename =
445 485 (* CompTrees.compare_results result.parsed_text; *)
446 486 (* Visualization.print_paths "results/" "paths" result.paths; *)
447 487 ());
448   - Marshal.to_channel oc (PreTypes.RawText "",ExtArray.make 1 ENIAMtokenizerTypes.empty_token) [];
  488 + Marshal.to_channel oc (ENIAMsubsyntaxTypes.RawText "",ExtArray.make 1 ENIAMtokenizerTypes.empty_token) [];
449 489 flush oc;
450 490 let _ = Unix.shutdown_connection ic in
451 491 ()
452 492  
453 493 let _ =
  494 + LCGfields.reset();
454 495 (* process_conll_corpus "../../NLP resources/Skladnica-zaleznosciowa-mod_130121.conll"; *)
455   - (* process_conll_corpus "../../NLP resources/skladnica_zaleznosciowa.conll"; *)
456   - process_conll_corpus "../testy/skladnica-test1.conll";
  496 + process_conll_corpus "../../NLP resources/skladnica_zaleznosciowa.conll";
  497 + (* process_conll_corpus "../testy/skladnica-test1.conll"; *)
  498 + LCGfields.print_results();
457 499 ()
... ...
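The new process_id above turns an NKJP-style identifier of the shape NKJP_1M_<text>/morph_<n>-p/morph_<n>.<m>-s into <text>.<n>.<m> before it is used as a results directory name. Below is a minimal stand-alone sketch of that normalisation, written against the plain standard library rather than the Xstring helpers; the sample identifier is modelled on the skladnica test headers further below and is only an assumption of the example.

(* Sketch only: mirrors the shape of process_id without the full
   prefix/suffix validation performed by the real function. *)
let normalise_id s =
  if String.length s >= 3 && String.sub s 0 3 = "ID_" then s
  else match String.split_on_char '/' s with
    | [a; _b; c] when String.length a > 8 && String.sub a 0 8 = "NKJP_1M_" ->
        let text = String.sub a 8 (String.length a - 8) in
        (* drop the "morph_" prefix and the "-s" suffix of the sentence part *)
        let sent = String.sub c 6 (String.length c - 6 - 2) in
        text ^ "." ^ sent
    | _ -> failwith ("normalise_id: " ^ s)

let () =
  (* prints 1202900095.3.46 *)
  print_endline (normalise_id "NKJP_1M_1202900095/morph_3-p/morph_3.46-s")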
corpora/test_conll2.ml
... ... @@ -116,7 +116,7 @@ let test_example path id tokens lex_sems paths last =
116 116 let test_dep_example path id tokens lex_sems paths =
117 117 try
118 118 ENIAM_LCGreductions.reset_variant_label ();
119   - (* let paths = CONLL_adapter.convert_dep_tree id first_try paths tokens lex_sems in *)
  119 + let paths = CONLL_adapter.convert_dep_tree id (*first_try*) true paths tokens in
120 120 ENIAMsubsyntaxHTMLof.print_dep_sentence path (id^"1_paths") tokens paths;
121 121 let chart = create_dep_chart tokens lex_sems paths in
122 122 ENIAM_LCGlatexOf.print_dep_chart path (id^"1_chart") "a1" chart;
... ... @@ -150,7 +150,7 @@ let test_dep_example path id tokens lex_sems paths =
150 150 let rec parse_sentence name id tokens lex_sems = function
151 151 RawSentence s -> id
152 152 | StructSentence(paths,last) ->
153   - test_example ("results/" ^ name^"/") (string_of_int id ^ "_") tokens lex_sems paths last;
  153 + (* test_example ("results/" ^ name^"/") (string_of_int id ^ "_") tokens lex_sems paths last; *)
154 154 id + 1
155 155 | DepSentence(paths) ->
156 156 test_dep_example ("results/" ^ name ^ "/") (string_of_int id ^ "_") tokens lex_sems paths;
... ...
diagnostics/treeChange.ml deleted
1   -open Xstd
2   -open PreTypes
3   -
4   -let if_lemma lemmas = function
5   - Lemma(l,_,_) -> List.exists (fun x -> x = l) lemmas
6   - | _ -> false
7   -
8   -let if_cat cats = function
9   - Lemma(_,cat,_) -> List.exists (fun x -> x = cat) cats
10   - | _ -> false
11   -
12   -let if_interps interps token =
13   - let interp = match token with
14   - Lemma(_,_,i) -> i
15   - | _ -> [[[]]] in
16   - let if_interp nr value =
17   - List.exists (fun x ->
18   - try
19   - List.exists (fun y ->
20   - y = value) (List.nth x nr)
21   - with _ -> false
22   - ) interp in
23   - Xlist.fold interps true (fun acc (nr,value) -> acc && (if_interp nr value))
24   -
25   -let correct_coordination1 paths tokens =
26   - let paths_ls = List.mapi (fun i (id,super,label) ->
27   - (i,id,super,label)) (Array.to_list paths) in
28   -
29   - let l = [("subst:nom",0),(["fin";"praet"],0);
30   - ("subst:acc",0),(["inf"],0);
31   - ("ppron3:nom",0),(["fin";"praet"],0);
32   - ("ppron3:acc",0),(["fin";"praet"],0);
33   - ("adv",0),(["fin";"praet"],0);
34   - ("adv",0),(["inf"],0);
35   - ("adv",0),(["adj"],0);
36   - ("prep",0),(["fin";"praet"],0);
37   - ("prep",0),(["inf"],0);
38   - ("prep",0),(["ppas"],0);
39   - ("prep",0),(["subst"],0);
40   - ("prep:gen",0),(["subst:gen"],0);
41   - ("adj:nom",0),(["fin";"praet"],0);
42   - ("adj:nom",0),(["subst:nom"],0);
43   - ("adj:gen",0),(["subst:gen"],0);
44   - ("adj:dat",0),(["subst:dat"],0);
45   - ("adj:acc",0),(["subst:acc"],0);
46   - ("adj:inst",0),(["subst:inst"],0);
47   - ("adj:loc",0),(["subst:loc"],0);
48   - ("subst:gen",0),(["subst:nom"],0);
49   - (* ("subst:gen",0),(["subst:gen"],0); *)
50   - ("subst:gen",0),(["subst:dat"],0);
51   - ("subst:gen",0),(["subst:acc"],0);
52   - ("subst:gen",0),(["subst:inst"],0);
53   - ("subst:gen",0),(["subst:loc"],0);
54   - ("ppron3:gen",0),(["subst:nom"],0);
55   - ("ppron3:gen",0),(["subst:dat"],0);
56   - ("ppron3:gen",0),(["subst:acc"],0);
57   - ("ppron3:gen",0),(["subst:inst"],0);
58   - ("ppron3:gen",0),(["subst:loc"],0);
59   - ("qub",0),(["fin";"praet"],0);
60   - ("qub",0),(["subst"],0);
61   - ("qub",0),(["adj"],0);
62   - ("pact",0),(["subst"],0);
63   - ("ppas",0),(["subst"],0)
64   - ] in
65   -
66   - let find_dependents sons =
67   -
68   - let is (i,id,super,label) pattern = match Xstring.split ":" pattern with
69   - ["prep";case] -> if_cat ["prep"] (ExtArray.get tokens id).token &&
70   - if_interps [0,case] (ExtArray.get tokens id).token
71   - | [cat;case] -> if_cat [cat] (ExtArray.get tokens id).token &&
72   - if_interps [1,case] (ExtArray.get tokens id).token
73   - | [cat] -> if_cat [cat] (ExtArray.get tokens id).token
74   - | _ -> failwith "is (in correct_coordination1)" in
75   -
76   - let incr_representative acc son = Xlist.map acc (fun ((one,a),(rest,b)) ->
77   - if is son one
78   - then (one,a + 1), (rest,b)
79   - else if List.exists (is son) rest
80   - then (one,a), (rest,b + 1)
81   - else (one,a), (rest,b)) in
82   -
83   - let get_from sons pattern = List.find (fun x -> is x pattern) sons in
84   -
85   - let l = Xlist.fold sons l incr_representative in
86   - let results = List.filter (fun ((_,a),(_,b)) -> a = 1 && b > 1) l in
87   - Xlist.map results (fun result ->
88   - get_from sons @@ fst @@ fst result,
89   - List.filter (fun son ->
90   - List.exists (fun one -> is son one) (fst (snd result))) sons) in
91   -
92   - let establish_neighbour super ((i_d,id_d,super_d,label_d),sons) =
93   - let not_between (i_s,_,_,_) =
94   - (super < i_d && super < i_s) ||
95   - (super > i_d && super > i_s) in
96   - let (i_n,id_n,super_n,label_n) = List.find (fun son ->
97   - not_between son) sons in
98   - paths.(i_d) <- (id_d, i_n, label_d) in
99   -
100   - let examine_coords (i,id,super,label) sons =
101   - try
102   - let dependents = find_dependents sons in
103   - Xlist.iter dependents (establish_neighbour super)
104   - with
105   - | _ -> () in
106   -
107   - Array.iteri (fun i (id,super,label) ->
108   - if if_cat ["conj"] (ExtArray.get tokens id).token
109   - then (let sons = List.filter (fun (_,_,super,_) -> super = i) paths_ls in
110   - if (List.length sons > 2)
111   - then examine_coords (i,id,super,label) sons)) paths;
112   - paths
113   -
114   -let correct_coordination2 paths tokens =
115   - let paths_c = Array.copy paths in
116   - let paths_ls () = List.mapi (fun i (id,super,label) ->
117   - (i,id,super,label)) (Array.to_list paths_c) in
118   -
119   - (* let ps a sons =
120   - print_endline a;
121   - List.iter (fun (i,_,_,_) -> print_endline (ExtArray.get tokens i).orth) sons;
122   - print_endline "" in *)
123   -
124   - let rec correct_rec (i,id,super,label) sons =
125   - let left_s, right_s = List.partition (fun (a,b,c,d) -> a < i) sons in
126   - (* ps "left:" (List.rev left_s);
127   - ps "right:" right_s; *)
128   - find_father i (List.rev left_s);
129   - find_father i right_s
130   -
131   - and find_father i0 = function
132   - [(i,id,super,label)] -> paths_c.(i) <- (id,i0,label)
133   - | (a,b,c,d) :: (i,id,super,label) :: t ->
134   - paths_c.(i) <- (id,i0,label);
135   - if not (if_cat ["conj"] (ExtArray.get tokens i).token ||
136   - (ExtArray.get tokens i).orth = ",")
137   - then failwith "find_father";
138   - correct_rec (i,id,super,label) (if a < i
139   - then (a,b,c,d) :: t
140   - else List.rev @@ (a,b,c,d) :: t)
141   - | _ -> failwith "find_father" in
142   -
143   - let check_previous_for_interp i =
144   - if i >= 0 && (ExtArray.get tokens i).orth = "," &&
145   - not (List.exists (fun (_,super,_) -> super = i) (Array.to_list paths_c))
146   - then paths_c.(i) <- (0,-1,"") in
147   -
148   - Array.iteri (fun i (id,super,label) ->
149   - if if_cat ["conj"] (ExtArray.get tokens i).token ||
150   - (ExtArray.get tokens i).orth = ","
151   - then
152   - (check_previous_for_interp (i-1);
153   - let sons = List.filter (fun (_,_,super,_) -> super = i) (paths_ls ()) in
154   - if (List.length sons > 2)
155   - then correct_rec (i,id,super,label) sons)) paths_c;
156   - paths_c
157   -
158   -let praet_qub_aglt paths tokens =
159   - Array.iteri (fun i (id,super,label) ->
160   - if super >= 0 then
161   - (let id_s, super_s, label_s = paths.(super) in
162   - if if_cat ["aglt"] (ExtArray.get tokens id).token &&
163   - (ExtArray.get tokens id_s).orth = "by"
164   - then let id_gf,super_gf,label_gf = paths.(super_s) in
165   - if if_cat ["praet"] (ExtArray.get tokens id_gf).token
166   - then paths.(i) <- (id,super_s,label))) paths;
167   - paths
168   -
169   -let replace_tokens paths tokens =
170   -(* for i = 0 to ExtArray.size tokens - 1 do
171   - print_endline (string_of_int i ^ ": "^ (ExtArray.get tokens i).orth)
172   -done; *)
173   - let find_token orth = Int.fold 0 (ExtArray.size tokens - 1) 0 (fun acc i ->
174   - if (ExtArray.get tokens i).orth = orth then i else acc) in
175   -
176   - let multidot i id0 super0 label0 =
177   - let id1, super1, label1 = paths.(super0) in
178   - if super1 >= 0 then
179   - let id2, super2, label2 = paths.(super1) in
180   - if (ExtArray.get tokens id1).orth = "." &&
181   - (ExtArray.get tokens id2).orth = "."
182   - then
183   - (paths.(super1) <- (find_token "..." ,super2, label2);
184   - paths.(super0) <- (0,-1,"");
185   - paths.(i) <- (0,-1,"")) in
186   -
187   - let brev i id super label =
188   - let if_the_last_dot () =
189   - let (id_dot, s_dot, l_dot) = List.find (fun (i2,s,l) ->
190   - s = i && ((ExtArray.get tokens i2).orth = "." || (ExtArray.get tokens i2).orth = "...")) (Array.to_list paths) in
191   - Array.fold_left (fun acc (i2,s,l) ->
192   - acc && (ExtArray.get tokens i2).beg <= (ExtArray.get tokens id_dot).beg) true paths in
193   -
194   - let dot = if if_interps [0,"npun"] (ExtArray.get tokens id).token || if_the_last_dot ()
195   - then ""
196   - else "." in
197   -
198   - let n_orth = (ExtArray.get tokens id).orth ^ dot in
199   - paths.(i) <- (find_token n_orth,super,label) in
200   -
201   - Array.iteri (fun i (id,super,label) ->
202   - if (ExtArray.get tokens id).orth = "."
203   - then multidot i id super label;
204   - if if_cat ["brev"] (ExtArray.get tokens id).token
205   - then brev i id super label)
206   - paths;
207   - paths
208   -
209   -let replace_hyphens paths tokens =
210   - let ref_paths = ref paths in
211   - let find_token token = Int.fold 0 (ExtArray.size tokens - 1) 0 (fun acc i ->
212   - if (ExtArray.get tokens i).token = token then i else acc) in
213   - let find_specific_token token beg next = Int.fold 0 (ExtArray.size tokens - 1) 0 (fun acc i ->
214   - if (ExtArray.get tokens i).token = token &&
215   - beg <= (ExtArray.get tokens i).beg &&
216   - (ExtArray.get tokens i).next <= next
217   - then i else acc) in
218   -
219   - let correct_last son_of_zero =
220   - let i1,s1,l1 = !ref_paths.(Array.length !ref_paths - 1) in
221   - let i2,s2,l2 = !ref_paths.(son_of_zero) in
222   - if (ExtArray.get tokens i1).orth = "."
223   - then
224   - (!ref_paths.(Array.length !ref_paths - 1) <- (find_token (Interp "</sentence>"),1,l1);
225   - !ref_paths.(son_of_zero) <- (i2,Array.length !ref_paths - 1,l2))
226   - else
227   - (ref_paths := Array.append !ref_paths [| (find_token (Interp "</sentence>"),1,"-") |];
228   - !ref_paths.(Array.length !ref_paths - 2) <- (i1,Array.length !ref_paths - 1,l1);
229   - !ref_paths.(son_of_zero) <- (i2,Array.length !ref_paths - 1,l2)) in
230   -
231   - let one_hyphen sons_of_zero =
232   - let i2,s2,l2 = !ref_paths.(1) in
233   - Xlist.iter sons_of_zero (fun son_of_zero ->
234   - let i1,s1,l1 = !ref_paths.(son_of_zero) in
235   - !ref_paths.(son_of_zero) <- (i1,1,l1));
236   - !ref_paths.(1) <- (find_token (Interp "<or-sentence>"),0,l2);
237   - correct_last son_of_zero in
238   -
239   - let two_hyphens first second son parent =
240   - let i1,s1,l1 = !ref_paths.(first) in
241   - let i2,s2,l2 = !ref_paths.(second) in
242   - let beg, next = (ExtArray.get tokens i2).beg, (ExtArray.get tokens i2).next in
243   - let i3,s3,l3 = !ref_paths.(son) in
244   - let i4,s4,l4 = !ref_paths.(parent) in
245   - ref_paths := Array.append !ref_paths [| (find_token (Interp "</sentence>"),first,"-") |];
246   - !ref_paths.(first) <- (find_token (Interp "<or-sentence>"),0,l1);
247   - !ref_paths.(second) <- (find_specific_token (Interp "</or-sentence>") beg next,first,l2);
248   - !ref_paths.(son) <- (i3,second,l3);
249   - !ref_paths.(parent) <- (i4,first,l4) in
250   -
251   - let rec is_dep_correct a b out zero res i (id,super,label) = (* out = how many words in (a,b) have parent outside [a,b]*)
252   - (* print_endline ((string_of_int a) ^ " " ^ (string_of_int b) ^ " " ^ (string_of_int out) ^ " " ^ (string_of_int zero) ^ " " ^ (string_of_int i)); *)
253   - if out > 1 || zero > 1 || (* zero = how many words (not interps) have parent 0 *)
254   - (a < i && i < b && super < a && label <> "interp") ||
255   - (a < super && super < b && (i < a || b < i))
256   - then false, res
257   - else
258   - if i+1 = Array.length !ref_paths
259   - then out = 1 && zero = 1, res
260   - else
261   - if a < i && i < b && b < super
262   - then is_dep_correct a b (out+1) zero (i,super) (i+1) !ref_paths.(i+1)
263   - else
264   - if super = 0 && not (if_cat ["interp"] (ExtArray.get tokens id).token)
265   - then is_dep_correct a b out (zero+1) res (i+1) !ref_paths.(i+1)
266   - else is_dep_correct a b out zero res (i+1) !ref_paths.(i+1) in
267   -
268   - let hyphens = snd @@ Array.fold_left (fun (i,acc) (id,super,label) ->
269   - if (ExtArray.get tokens id).orth = "-"
270   - then i+1, i :: acc
271   - else i+1, acc) (0,[]) !ref_paths in
272   -
273   - let sons_of_zero = snd @@ Array.fold_left (fun (i,acc) (id,super,label) ->
274   - if super = 0 && not (if_cat ["interp"] (ExtArray.get tokens id).token)
275   - then i+1, i :: acc
276   - else i+1, acc) (0,[]) !ref_paths in
277   -
278   - (if List.length sons_of_zero = 1
279   - then
280   - if List.length hyphens = 1 && hyphens = [1]
281   - then one_hyphen sons_of_zero
282   - else
283   - if List.length hyphens = 2
284   - then let a, b = List.nth hyphens 1, List.nth hyphens 0 in
285   - let is_good, (son,parent) = is_dep_correct a b 0 0 (0,0) 1 !ref_paths.(1) in
286   - if a = 1 && is_good
287   - then two_hyphens a b son parent);
288   - !ref_paths
289   -
290   -let correct_interp_with_father_0 paths tokens =
291   - Array.iteri (fun i (id,super,label) ->
292   - if (super = 0 ||
293   - (ExtArray.get tokens id).token = Interp "<or-sentence>" ||
294   - (ExtArray.get tokens id).token = Interp "</or-sentence>") && (ExtArray.get tokens id).orth = ","
295   - then Array.iteri (fun i1 (id1,super1,label1) ->
296   - if super1 = i
297   - then paths.(i1) <- (id1,0,label1)) paths) paths;
298   - paths
299   -
300   -let remove_interps interp paths tokens =
301   - let paths_ls = Array.to_list paths in
302   - Array.iteri (fun i (id,super,label) ->
303   - if (ExtArray.get tokens id).orth = interp &&
304   - not (List.exists (fun (_,super,_) -> super = i) paths_ls)
305   - then paths.(i) <- (0,-1,"")) paths;
306   - paths
307   -
308   -let correct_passive_voice paths tokens =
309   - Array.iteri (fun i (id,super,label) ->
310   - if super >= 0 then
311   - (let id_s, super_s, label_s = paths.(super) in
312   - if (if_cat ["praet"] (ExtArray.get tokens id).token &&
313   - if_cat ["ppas"] (ExtArray.get tokens id_s).token)
314   - then (paths.(i) <- (id,super_s,label);
315   - paths.(super) <- (id_s,i,label_s);
316   - Array.iteri (fun i_p (id_p,super_p,label_p) ->
317   - if super_p = super
318   - then paths.(i_p) <- (id_p,i,label_p)) paths))) paths;
319   - paths
320   -
321   -let swap_dep paths tokens =
322   - let change_dep i (id,super,label) =
323   - let id_S, super_S, label_S = paths.(super) in
324   - paths.(i) <- (id,super_S,label);
325   - paths.(super) <- (id_S, id, label_S) in
326   - let rec correct_dep i (id,super,label) =
327   - let adv_relators = ["kto";"co";"ile";"czyj";"jaki";"który";
328   - "jak";"skąd";"dokąd";"gdzie";"którędy";"kiedy";"odkąd";"dlaczego";"czemu";"gdy"] in
329   - if (if_cat ["comp"] (ExtArray.get tokens id).token &&
330   - if_cat ["fin"; "praet"; "winien"; "pred"; "imps"; "ppas"] (ExtArray.get tokens super).token) ||
331   - (if_cat ["conj"] (ExtArray.get tokens id).token &&
332   - if_cat ["fin"; "praet"; "winien"; "pred"; "imps"; "ppas"] (ExtArray.get tokens super).token &&
333   - not (List.exists (fun (_,super,_) -> super = i) (Array.to_list paths))) ||
334   - (if_cat ["ppron3"] (ExtArray.get tokens id).token &&
335   - if_interps [5,"praep"] (ExtArray.get tokens id).token) ||
336   - (if_lemma adv_relators (ExtArray.get tokens id).token &&
337   - if_cat ["fin"; "praet"; "winien"; "pred"; "imps"; "ppas"; "subst"] (ExtArray.get tokens super).token)
338   - then
339   - change_dep i (id,super,label);
340   - if (if_lemma adv_relators (ExtArray.get tokens id).token &&
341   - if_cat ["subst"; "pred"] (ExtArray.get tokens super).token)
342   - then correct_dep i paths.(i) in
343   - Array.iteri correct_dep paths; paths
344   -
345   - (*
346   - correct_coordination1 -> the neighbour is the word nearest to the right; if a conjunction stands between it and me, then the word nearest to the left
347   - coordination of the passive voice is not handled yet - this concerns both auxiliary verbs and participles
348   - coordination of the dependents of subordinating conjunctions is not handled yet *)
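For readers of the removed treeChange.ml, here is a minimal self-contained sketch of the neighbour-selection rule described in the comment above: attach to the nearest candidate to the right of the dependent, unless a coordinating conjunction lies between the two, in which case fall back to the nearest candidate to the left. All names are illustrative and do not come from the ENIAM sources.

(* choose_neighbour picks, among candidate positions [cands], the
   attachment point for the word at position [me]. *)
let choose_neighbour ~is_conj ~me cands =
  (* nearest candidate position to the right of [me] *)
  let nearest_right =
    match List.sort compare (List.filter (fun p -> p > me) cands) with
    | [] -> None
    | p :: _ -> Some p in
  (* nearest candidate position to the left of [me] *)
  let nearest_left =
    match List.sort (fun a b -> compare b a) (List.filter (fun p -> p < me) cands) with
    | [] -> None
    | p :: _ -> Some p in
  (* does a conjunction occur strictly between positions a and b? *)
  let conj_between a b =
    let lo, hi = min a b, max a b in
    let rec scan p = p < hi && (is_conj p || scan (p + 1)) in
    scan (lo + 1) in
  match nearest_right with
  | Some r when not (conj_between me r) -> Some r
  | _ -> nearest_left

(* e.g. choose_neighbour ~is_conj:(fun p -> p = 6) ~me:5 [3; 7] = Some 3,
   because the conjunction at position 6 blocks the right-hand candidate at 7. *)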
parser/visualization.ml
... ... @@ -916,7 +916,7 @@ let rec html_of_text path tokens = function
916 916 sprintf "<tr><td>%s</td><td>%s</td></tr>" (string_of_mode mode) (html_of_text path tokens text))) ^
917 917 "</table>"
918 918  
919   -let print_html_text path name text tokens lex_sems =
  919 +let print_html_text path name text tokens (*lex_sems*) =
920 920 File.file_out (path ^ name ^ ".html") (fun file ->
921 921 fprintf file "%s\n" html_header;
922 922 fprintf file "%s<BR>\n" (html_of_text path tokens text);
... ...
pre/makefile
... ... @@ -3,7 +3,7 @@ OCAMLOPT=ocamlopt
3 3 OCAMLDEP=ocamldep
4 4 INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I +eniam
5 5 OCAMLFLAGS=$(INCLUDES) -g
6   -OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa eniam-plWordnet.cmxa eniam-walenty.cmxa eniam-integration.cmxa eniam-lexSemantics.cmxa
  6 +OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa eniam-plWordnet.cmxa eniam-lcg-parser.cmxa eniam-lcg-lexicon.cmxa eniam-integration.cmxa eniam-lexSemantics.cmxa
7 7 INSTALLDIR=`ocamlc -where`
8 8  
9 9 WAL= paths.ml
... ...
pre/preProcessing.ml
... ... @@ -121,9 +121,9 @@ let parse_text = function
121 121 let lex_sems = ENIAMlexSemantics.assign tokens text in
122 122 text,tokens,lex_sems
123 123 | AltText[Raw,RawText query;CONLL,StructText[
124   - StructParagraph[{psentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence dep_paths]} as p]]],tokens ->
  124 + StructParagraph[{sentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence dep_paths]} as p]]],tokens ->
125 125 let m_dep_paths = Array.map (fun (id,_,_) -> id,-1,"") dep_paths in
126   - let conll = StructParagraph[{p with psentence = AltSentence([Raw, RawSentence text; CONLL, DepSentence dep_paths]
  126 + let conll = StructParagraph[{p with sentence = AltSentence([Raw, RawSentence text; CONLL, DepSentence dep_paths]
127 127 @ if Paths.config.Paths.mate_parser_enabled then [Mate, DepSentence m_dep_paths] else [])}] in
128 128 let paths = ENIAMsubsyntax.parse query in
129 129 let sentences = ENIAMsentences.split_into_sentences "" query tokens paths in
... ... @@ -135,7 +135,7 @@ let parse_text = function
135 135  
136 136 let rec main_loop in_chan out_chan =
137 137 (* print_endline "main_loop 1"; *)
138   - let query = (Marshal.from_channel in_chan : text * ENIAMtokenizerTypes.token_record ExtArray.t) in
  138 + let query = (Marshal.from_channel in_chan : text * ENIAMtokenizerTypes.token_env ExtArray.t) in
139 139 (* print_endline "main_loop 2"; *)
140 140 if fst query = RawText "" then () else (
141 141 (try
... ... @@ -154,7 +154,7 @@ let rec main_loop in_chan out_chan =
154 154 (* print_endline "main_loop 7"; *)
155 155 Marshal.to_channel out_chan (
156 156 RawText "",
157   - ExtArray.make 1 ENIAMtokenizerTypes.empty_token,
  157 + ExtArray.make 1 ENIAMtokenizerTypes.empty_token_env,
158 158 ExtArray.make 1 ENIAMlexSemanticsTypes.empty_lex_sem,
159 159 Printexc.to_string e,
160 160 0.) []));
... ...
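The corpus client above (corpora) and main_loop here share a simple wire protocol: whole values are exchanged with Marshal over a socket connection, and an empty RawText query is the sentinel that makes main_loop return. Below is a generic, self-contained sketch of the client side of that handshake; the msg type and the Unix-domain socket path are placeholders, not the ENIAM query and token types or the Paths configuration.

(* Sketch only: Query/Stop stand in for the (text, token array) values
   that the real client marshals; Stop plays the role of RawText "". *)
type msg = Query of string | Stop

let run_client socket_path queries =
  let ic, oc = Unix.open_connection (Unix.ADDR_UNIX socket_path) in
  List.iter (fun q -> Marshal.to_channel oc (Query q) []; flush oc) queries;
  Marshal.to_channel oc Stop [];
  flush oc;
  Unix.shutdown_connection ic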
testy/skladnica-test2.conll
... ... @@ -11,7 +11,7 @@
11 11 5 szanse szansa subst subst pl|acc|f 4 obj_th _ _
12 12 6 ? ? interp interp _ 4 punct _ _
13 13  
14   -# trees/NKJP_1M_1202900095/morph_3-p/morph_3.46-s.xml.tree
  14 +# trees/NKJP_1M_1202900095/morph_3-p/morph_3.46-s.xml.trees
15 15 1 - - interp interp 0 _ _ _
16 16 2 Słoń słoń subst subst sg|nom|m2 4 _ _ _
17 17 3 - - interp interp 0 _ _ _
... ... @@ -19,7 +19,7 @@
19 19 5 Pinio Pinio subst subst sg|nom|m1 4 _ _ _
20 20 6 . . interp interp 0 _ _ _
21 21  
22   -# trees/NKJP_1M_2002000114/morph_2-p/morph_2.72-s.xml.tree
  22 +# trees/NKJP_1M_2002000114/morph_2-p/morph_2.72-s.xml.trees
23 23 1 - - interp interp 0 _ _ _
24 24 2 Nie nie qub qub 3 _ _ _
25 25 3 mogę móc fin fin sg|pri|imperf 7 _ _ _
... ... @@ -29,7 +29,7 @@
29 29 7 zachrypiał zachrypieć praet praet sg|m1|perf 0 _ _ _
30 30 8 . . interp interp 0 _ _ _
31 31  
32   -# trees/NKJP_1M_2002000028/morph_5-p/morph_5.40-s.xml.tree
  32 +# trees/NKJP_1M_2002000028/morph_5-p/morph_5.40-s.xml.trees
33 33 1 - - interp interp 0 _ _ _
34 34 2 Właśnie właśnie qub qub 4 _ _ _
35 35 3 to to subst subst sg|acc|n 4 _ _ _
... ... @@ -39,7 +39,7 @@
39 39 7 twardo twardo adv adv pos 6 _ _ _
40 40 8 . . interp interp 0 _ _ _
41 41  
42   -# trees/NKJP_1M_1202000001/morph_3-p/morph_3.9-s.xml.tree
  42 +# trees/NKJP_1M_1202000001/morph_3-p/morph_3.9-s.xml.trees
43 43 1 CKM CKM subst subst sg|nom|n 0 _ _ _
44 44 2 : interp 0 _ _ _
45 45 3 Jak jak adv adv pos 5 _ _ _
... ... @@ -50,7 +50,7 @@
50 50 8 patrzeć patrzeć inf inf imperf 5 _ _ _
51 51 9 ? ? interp interp 0 _ _ _
52 52  
53   -# trees/NKJP_1M_2001000023/morph_1-p/morph_1.61-s.xml.tree
  53 +# trees/NKJP_1M_2001000023/morph_1-p/morph_1.61-s.xml.trees
54 54 1 Pochylił pochylić praet praet sg|m1|perf 0 _ _ _
55 55 2 em być aglt aglt sg|pri|imperf|wok 1 _ _ _
56 56 3 się się qub qub 1 _ _ _
... ...