Commit 7dc3851b214629035560071402796b957b9bf7de

Authored by Daniel Oklesiński
1 parent e2dcc521

poprawienie błędu Not_found

corpora/CONLL.ml
1 1 open Xstd
2 2 open PreTypes
3 3  
4   -let string_of_token token =
5   - let lemma,cat,interp = match token.token with
6   - Lemma(a,b,c) -> a,b,if c = [[]]
  4 +let alternative_string f mode alts = if List.exists (fun (m,_) -> mode = m) alts
  5 + then f mode (snd @@ List.find (fun (m,_) -> m = mode) alts)
  6 + else f mode (snd @@ List.find (fun (m,_) -> m = Struct) alts)
  7 +
  8 +let string_of_token mode token = match mode with
  9 + | Raw -> token.orth
  10 + | Struct -> failwith ("function string_of_token for mode Struct is not defined")
  11 + | CONLL -> let lemma,cat,interp = match token.token with
  12 + | Lemma(a,b,c) -> a,b,if c = [[]]
7 13 then "_"
8 14 else String.concat "][" @@ Xlist.map c (fun x ->
9 15 String.concat "|" @@ Xlist.map x ( fun y ->
10 16 String.concat "." y))
11   - | _ -> failwith ("string_of_token: not Lemma") in
12   - String.concat "\t" [string_of_int token.id;
13   - token.orth; lemma; cat; cat; interp; "_"; "_"; "_"; "_"]
14   -
15   -let string_of_sentence sentence =
16   - let rec pom = function
17   - RawSentence text -> failwith ("string_of_sentence: " ^ text)
18   - | StructSentence (tokens, n) -> String.concat "\n" @@ List.map (fun x -> string_of_token x) tokens
  17 + | _ -> failwith ("string_of_token: not Lemma") in
  18 + String.concat "\t" [string_of_int token.id;
  19 + token.orth; lemma; cat; cat; interp; "_"; "_";
  20 + string_of_int token.beg; string_of_int token.len]
  21 +
  22 +let rec string_of_sentence mode = function
  23 + RawSentence s -> if mode = Raw then s else ""
  24 + | StructSentence (tokens, _) -> String.concat "\n" @@ Xlist.map tokens (fun x -> string_of_token mode x)
19 25 | ORSentence (_,_,_,_) -> failwith ("string_of_sentence: ORSentence")
20   - | AltSentence alts -> if List.exists (fun (mode, s) -> mode = CONLL) alts
21   - then pom (snd (List.find (fun (mode, s) -> mode = CONLL) alts))
22   - else failwith ("string_of_sentence: no CONLL mode in AltSentence") in
23   - (if sentence.pid = ""
24   - then ""
25   - else sentence.pid ^ "\n") ^ (pom sentence.psentence)
  26 + | AltSentence alts -> alternative_string string_of_sentence mode alts
26 27  
27   -(******************)
  28 +let string_of_p_record mode p_record =
  29 + (if p_record.pid = "" then "" else p_record.pid ^ "\n") ^
  30 + string_of_sentence mode p_record.psentence
28 31  
29   -exception Empty_line
30   -exception Empty_sentence
31   -exception Id_line of string
  32 +let rec string_of_paragraph mode = function
  33 + RawParagraph s -> if mode = Raw then s else ""
  34 + | StructParagraph (p_records, _) -> String.concat "\n\n" @@ Xlist.map p_records (string_of_p_record mode)
  35 + | AltParagraph alts -> alternative_string string_of_paragraph mode alts
32 36  
33   -let load_token stream =
34   - let line = input_line stream in
35   - if line = ""
36   - then raise Empty_line
37   - else if line.[0] = '#'
38   - then
39   - if Xstring.check_prefix "# trees/" line && Xstring.check_sufix ".xml.trees" line
40   - then let id = Xstring.cut_prefix "# trees/" @@ Xstring.cut_sufix ".xml.trees" line in
41   - raise (Id_line id)
42   - else failwith ("load_token: " ^ line)
43   - else
44   - match Xstring.split "\t" line with
45   - [id; orth; lemma; cat; cat2; interp; super; label; "_"; "_"] ->
46   - (*if cat <> cat2
47   - then failwith ("load_token: " ^ line)
48   - else *)
49   - let interp = if interp = "_"
50   - then [[]]
51   - else [Xlist.map (Xstring.split_delim "|" interp) (fun tag -> [tag])] in
52   - {empty_token with id = int_of_string id; orth = orth; token = Lemma(lemma,cat,interp)}
53   - | _ -> failwith ("load_token: " ^ line)
54   -(* {c_id = List.nth pom 1;
55   - c_lemma = List.nth pom 2;
56   - c_cat = List.nth pom 3;
57   - c_interp = (let interp = List.nth pom 5 in
58   - if interp = "_"
59   - then []
60   - else Str.split (Str.regexp "|") interp);
61   - c_super = -1; c_label = ""; c_beg = -1; c_len = -1} *)
  37 +let rec string_of_text mode = function
  38 + RawText s -> if mode = Raw then s else ""
  39 + | StructText paragraphs -> String.concat "\n\n" @@ Xlist.map paragraphs (string_of_paragraph mode)
  40 + | AltText alts -> alternative_string string_of_text mode alts
62 41  
63   -let load_sentence stream =
64   - let rec pom rev_tokens id =
65   - try
66   - let token = load_token stream in
67   - pom (token :: rev_tokens) id
68   - with Id_line new_id -> pom rev_tokens new_id
69   - | Empty_line -> rev_tokens, id
70   - | End_of_file -> if rev_tokens = []
71   - then raise End_of_file
72   - else rev_tokens, id in
73   - let rev_tokens, id = pom [] "" in
74   - {pid = id; pbeg = -1; plen = -1; psentence = StructSentence(List.rev rev_tokens,-1)}
75   -(* {s_id = id; s_text = ""; s_tokens = (List.rev rev_tokens)} *)
76   -
77   -let load_corpus stream =
78   - let rec pom res =
79   - try
80   - let conll_sentence = load_sentence stream in
81   - pom (conll_sentence :: res)
82   - with e -> print_endline (Printexc.to_string e); res in
83   - pom []
84 42  
85 43 (******************)
86 44  
... ... @@ -121,7 +79,7 @@ let add_to_map map info_str =
121 79 let info_map =
122 80 Xlist.fold info StringMap.empty add_to_map
123 81  
124   -let match_sentence sentence =
  82 +let match_sentence p_record =
125 83 let rec info_token s = match s with
126 84 RawSentence text -> failwith ("match_sentence: " ^ text)
127 85 | StructSentence (tokens, n) -> String.concat " " @@ List.map (fun x -> x.orth) tokens
... ... @@ -129,14 +87,79 @@ let match_sentence sentence =
129 87 | AltSentence alts -> if List.exists (fun (mode, s) -> mode = CONLL) alts
130 88 then info_token (snd (List.find (fun (mode, s) -> mode = CONLL) alts))
131 89 else failwith ("match_sentence: no CONLL mode in AltSentence") in
132   - let info_token = info_token sentence.psentence in
133   - (* try *)
  90 + let info_token = info_token p_record.psentence in
  91 + try
134 92 let id, text = StringMap.find info_map info_token in
135   - let pbeg, plen, n_sentence = establish_lengths text sentence.psentence (* -1, -1, sentence.psentence *) in
136   - AltText[Raw,RawText text;CONLL,StructText([StructParagraph([{pid = sentence.pid; pbeg = pbeg; plen = plen;
  93 + let beg, len, n_sentence = establish_lengths text p_record.psentence (* -1, -1, p_record.psentence *) in
  94 + AltText[Raw,RawText text;CONLL,StructText([StructParagraph([{pid = id; pbeg = beg; plen = len;
137 95 psentence = AltSentence[Raw, RawSentence text; CONLL, n_sentence]}],-1)])]
138 96 (* {s_id = id; s_text = text; s_tokens = sentence.s_tokens} *)
139   - (* with _ -> sentence *)
  97 + with _ -> StructText([StructParagraph([p_record],-1)])
140 98  
141 99 let match_corpus corpus =
142 100 Xlist.map corpus match_sentence
  101 +
  102 +(******************)
  103 +
  104 +exception Empty_line
  105 +exception Empty_sentence
  106 +exception Id_line of string
  107 +
  108 +let load_token in_channel =
  109 + let fail line =
  110 + (* failwith ("load_token: " ^ line) *)
  111 + () in
  112 + let n_token id orth lemma cat interp =
  113 + let interp = if interp = "_"
  114 + then [[]]
  115 + else [Xlist.map (Xstring.split_delim "|" interp) (fun tag -> [tag])] in
  116 + {empty_token with id = int_of_string id; orth = orth; token = Lemma(lemma,cat,interp)} in
  117 + let line = input_line in_channel in
  118 + if line = ""
  119 + then raise Empty_line
  120 + else if line.[0] = '#'
  121 + then
  122 + if Xstring.check_prefix "# trees/" line && Xstring.check_sufix ".xml.trees" line
  123 + then let id = Xstring.cut_prefix "# trees/" @@ Xstring.cut_sufix ".xml.trees" line in
  124 + raise (Id_line id)
  125 + else failwith ("load_token: " ^ line)
  126 + else
  127 + match Xstring.split "\t" line with
  128 + [id; orth; lemma; cat; cat2; interp; super; label; "_"; "_"] ->
  129 + (if cat <> cat2 then fail line; n_token id orth lemma cat interp)
  130 + | id :: orth :: lemma :: cat :: cat2 :: interp :: e ->
  131 + (fail line; n_token id orth lemma cat interp)
  132 + | _ -> failwith ("load_token: " ^ line)
  133 +(* {c_id = List.nth pom 1;
  134 + c_lemma = List.nth pom 2;
  135 + c_cat = List.nth pom 3;
  136 + c_interp = (let interp = List.nth pom 5 in
  137 + if interp = "_"
  138 + then []
  139 + else Str.split (Str.regexp "|") interp);
  140 + c_super = -1; c_label = ""; c_beg = -1; c_len = -1} *)
  141 +
  142 +let load_sentence in_channel =
  143 + let rec pom rev_tokens id =
  144 + try
  145 + let token = load_token in_channel in
  146 + pom (token :: rev_tokens) id
  147 + with Id_line new_id -> pom rev_tokens new_id
  148 + | Empty_line -> rev_tokens, id
  149 + | End_of_file -> if rev_tokens = []
  150 + then raise End_of_file
  151 + else rev_tokens, id in
  152 + let rev_tokens, id = pom [] "" in
  153 + {pid = id; pbeg = -1; plen = -1; psentence = StructSentence(List.rev rev_tokens,-1)}
  154 +(* {s_id = id; s_text = ""; s_tokens = (List.rev rev_tokens)} *)
  155 +
  156 +let load_corpus in_channel =
  157 + let rec pom res =
  158 + try
  159 + let conll_sentence = load_sentence in_channel in
  160 + pom (conll_sentence :: res)
  161 + with End_of_file -> res in
  162 + (* match_corpus @@ *) List.rev @@ pom []
  163 +
  164 +
  165 +
... ...
parser/exec.ml
... ... @@ -142,8 +142,8 @@ let process_query ic oc timeout test_only_flag id full_query max_n =
142 142 let time2 = time_fun () in
143 143 let result = {result with pre_time1=pre_time1; pre_time2=time2 -. time1;
144 144 paths_size=let _,_,next_id = paths in next_id-1} in
145   - if msg <> "" then {result with status=PreprocessingError; msg=msg} else
146   - try
  145 + (*if msg <> "" then*) {result with status=PreprocessingError; msg=msg} (*else*)
  146 + (*try
147 147 let graph = LCGlexicon.create query paths in
148 148 let graph,references,next_reference = LCGchart.lazify graph in
149 149 let time3 = time_fun () in
... ... @@ -222,7 +222,7 @@ let process_query ic oc timeout test_only_flag id full_query max_n =
222 222 {result with status=ParseError; msg=Printexc.to_string e; parse_time=time4 -. time3}
223 223 with e ->
224 224 let time3 = time_fun () in
225   - {result with status=LexiconError; msg=Printexc.to_string e; lex_time=time3 -. time2}
  225 + {result with status=LexiconError; msg=Printexc.to_string e; lex_time=time3 -. time2}*)
226 226  
227 227 let print_result file result =
228 228 Printf.fprintf file "query: %s\n" result.query;
... ...