Commit 7dc3851b214629035560071402796b957b9bf7de
1 parent e2dcc521
Fix the Not_found error
Showing 2 changed files with 103 additions and 80 deletions
corpora/CONLL.ml
 open Xstd
 open PreTypes

-let string_of_token token =
- let lemma,cat,interp = match token.token with
- Lemma(a,b,c) -> a,b,if c = [[]]
+let alternative_string f mode alts = if List.exists (fun (m,_) -> mode = m) alts
+ then f mode (snd @@ List.find (fun (m,_) -> m = mode) alts)
+ else f mode (snd @@ List.find (fun (m,_) -> m = Struct) alts)
+
+let string_of_token mode token = match mode with
+ | Raw -> token.orth
+ | Struct -> failwith ("function string_of_token for mode Struct is not defined")
+ | CONLL -> let lemma,cat,interp = match token.token with
+ | Lemma(a,b,c) -> a,b,if c = [[]]
 then "_"
 else String.concat "][" @@ Xlist.map c (fun x ->
 String.concat "|" @@ Xlist.map x ( fun y ->
 String.concat "." y))
- | _ -> failwith ("string_of_token: not Lemma") in
- String.concat "\t" [string_of_int token.id;
- token.orth; lemma; cat; cat; interp; "_"; "_"; "_"; "_"]
-
-let string_of_sentence sentence =
- let rec pom = function
- RawSentence text -> failwith ("string_of_sentence: " ^ text)
- | StructSentence (tokens, n) -> String.concat "\n" @@ List.map (fun x -> string_of_token x) tokens
+ | _ -> failwith ("string_of_token: not Lemma") in
+ String.concat "\t" [string_of_int token.id;
+ token.orth; lemma; cat; cat; interp; "_"; "_";
+ string_of_int token.beg; string_of_int token.len]
+
+let rec string_of_sentence mode = function
+ RawSentence s -> if mode = Raw then s else ""
+ | StructSentence (tokens, _) -> String.concat "\n" @@ Xlist.map tokens (fun x -> string_of_token mode x)
 | ORSentence (_,_,_,_) -> failwith ("string_of_sentence: ORSentence")
- | AltSentence alts -> if List.exists (fun (mode, s) -> mode = CONLL) alts
- then pom (snd (List.find (fun (mode, s) -> mode = CONLL) alts))
- else failwith ("string_of_sentence: no CONLL mode in AltSentence") in
- (if sentence.pid = ""
- then ""
- else sentence.pid ^ "\n") ^ (pom sentence.psentence)
+ | AltSentence alts -> alternative_string string_of_sentence mode alts

-(******************)
+let string_of_p_record mode p_record =
+ (if p_record.pid = "" then "" else p_record.pid ^ "\n") ^
+ string_of_sentence mode p_record.psentence

-exception Empty_line
-exception Empty_sentence
-exception Id_line of string
+let rec string_of_paragraph mode = function
+ RawParagraph s -> if mode = Raw then s else ""
+ | StructParagraph (p_records, _) -> String.concat "\n\n" @@ Xlist.map p_records (string_of_p_record mode)
+ | AltParagraph alts -> alternative_string string_of_paragraph mode alts

-let load_token stream =
- let line = input_line stream in
- if line = ""
- then raise Empty_line
- else if line.[0] = '#'
- then
- if Xstring.check_prefix "# trees/" line && Xstring.check_sufix ".xml.trees" line
- then let id = Xstring.cut_prefix "# trees/" @@ Xstring.cut_sufix ".xml.trees" line in
- raise (Id_line id)
- else failwith ("load_token: " ^ line)
- else
- match Xstring.split "\t" line with
- [id; orth; lemma; cat; cat2; interp; super; label; "_"; "_"] ->
- (*if cat <> cat2
- then failwith ("load_token: " ^ line)
- else *)
- let interp = if interp = "_"
- then [[]]
- else [Xlist.map (Xstring.split_delim "|" interp) (fun tag -> [tag])] in
- {empty_token with id = int_of_string id; orth = orth; token = Lemma(lemma,cat,interp)}
- | _ -> failwith ("load_token: " ^ line)
-(* {c_id = List.nth pom 1;
- c_lemma = List.nth pom 2;
- c_cat = List.nth pom 3;
- c_interp = (let interp = List.nth pom 5 in
- if interp = "_"
- then []
- else Str.split (Str.regexp "|") interp);
- c_super = -1; c_label = ""; c_beg = -1; c_len = -1} *)
+let rec string_of_text mode = function
+ RawText s -> if mode = Raw then s else ""
+ | StructText paragraphs -> String.concat "\n\n" @@ Xlist.map paragraphs (string_of_paragraph mode)
+ | AltText alts -> alternative_string string_of_text mode alts

-let load_sentence stream =
- let rec pom rev_tokens id =
- try
- let token = load_token stream in
- pom (token :: rev_tokens) id
- with Id_line new_id -> pom rev_tokens new_id
- | Empty_line -> rev_tokens, id
- | End_of_file -> if rev_tokens = []
- then raise End_of_file
- else rev_tokens, id in
- let rev_tokens, id = pom [] "" in
- {pid = id; pbeg = -1; plen = -1; psentence = StructSentence(List.rev rev_tokens,-1)}
-(* {s_id = id; s_text = ""; s_tokens = (List.rev rev_tokens)} *)
-
-let load_corpus stream =
- let rec pom res =
- try
- let conll_sentence = load_sentence stream in
- pom (conll_sentence :: res)
- with e -> print_endline (Printexc.to_string e); res in
- pom []

 (******************)

@@ -121,7 +79,7 @@ let add_to_map map info_str =
 let info_map =
 Xlist.fold info StringMap.empty add_to_map

-let match_sentence sentence =
+let match_sentence p_record =
 let rec info_token s = match s with
 RawSentence text -> failwith ("match_sentence: " ^ text)
 | StructSentence (tokens, n) -> String.concat " " @@ List.map (fun x -> x.orth) tokens
@@ -129,14 +87,79 @@ let match_sentence sentence =
 | AltSentence alts -> if List.exists (fun (mode, s) -> mode = CONLL) alts
 then info_token (snd (List.find (fun (mode, s) -> mode = CONLL) alts))
 else failwith ("match_sentence: no CONLL mode in AltSentence") in
- let info_token = info_token sentence.psentence in
- (* try *)
+ let info_token = info_token p_record.psentence in
+ try
 let id, text = StringMap.find info_map info_token in
- let pbeg, plen, n_sentence = establish_lengths text sentence.psentence (* -1, -1, sentence.psentence *) in
- AltText[Raw,RawText text;CONLL,StructText([StructParagraph([{pid = sentence.pid; pbeg = pbeg; plen = plen;
+ let beg, len, n_sentence = establish_lengths text p_record.psentence (* -1, -1, p_record.psentence *) in
+ AltText[Raw,RawText text;CONLL,StructText([StructParagraph([{pid = id; pbeg = beg; plen = len;
 psentence = AltSentence[Raw, RawSentence text; CONLL, n_sentence]}],-1)])]
 (* {s_id = id; s_text = text; s_tokens = sentence.s_tokens} *)
- (* with _ -> sentence *)
+ with _ -> StructText([StructParagraph([p_record],-1)])

 let match_corpus corpus =
 Xlist.map corpus match_sentence
+
+(******************)
+
+exception Empty_line
+exception Empty_sentence
+exception Id_line of string
+
+let load_token in_channel =
+ let fail line =
+ (* failwith ("load_token: " ^ line) *)
+ () in
+ let n_token id orth lemma cat interp =
+ let interp = if interp = "_"
+ then [[]]
+ else [Xlist.map (Xstring.split_delim "|" interp) (fun tag -> [tag])] in
+ {empty_token with id = int_of_string id; orth = orth; token = Lemma(lemma,cat,interp)} in
+ let line = input_line in_channel in
+ if line = ""
+ then raise Empty_line
+ else if line.[0] = '#'
+ then
+ if Xstring.check_prefix "# trees/" line && Xstring.check_sufix ".xml.trees" line
+ then let id = Xstring.cut_prefix "# trees/" @@ Xstring.cut_sufix ".xml.trees" line in
+ raise (Id_line id)
+ else failwith ("load_token: " ^ line)
+ else
+ match Xstring.split "\t" line with
+ [id; orth; lemma; cat; cat2; interp; super; label; "_"; "_"] ->
+ (if cat <> cat2 then fail line; n_token id orth lemma cat interp)
+ | id :: orth :: lemma :: cat :: cat2 :: interp :: e ->
+ (fail line; n_token id orth lemma cat interp)
+ | _ -> failwith ("load_token: " ^ line)
+(* {c_id = List.nth pom 1;
+ c_lemma = List.nth pom 2;
+ c_cat = List.nth pom 3;
+ c_interp = (let interp = List.nth pom 5 in
+ if interp = "_"
+ then []
+ else Str.split (Str.regexp "|") interp);
+ c_super = -1; c_label = ""; c_beg = -1; c_len = -1} *)
+
+let load_sentence in_channel =
+ let rec pom rev_tokens id =
+ try
+ let token = load_token in_channel in
+ pom (token :: rev_tokens) id
+ with Id_line new_id -> pom rev_tokens new_id
+ | Empty_line -> rev_tokens, id
+ | End_of_file -> if rev_tokens = []
+ then raise End_of_file
+ else rev_tokens, id in
+ let rev_tokens, id = pom [] "" in
+ {pid = id; pbeg = -1; plen = -1; psentence = StructSentence(List.rev rev_tokens,-1)}
+(* {s_id = id; s_text = ""; s_tokens = (List.rev rev_tokens)} *)
+
+let load_corpus in_channel =
+ let rec pom res =
+ try
+ let conll_sentence = load_sentence in_channel in
+ pom (conll_sentence :: res)
+ with End_of_file -> res in
+ (* match_corpus @@ *) List.rev @@ pom []
+
+
+
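A minimal usage sketch of the reworked CONLL.ml interface (not part of the commit): it assumes the mode constructors Raw/Struct/CONLL and the p_record fields used above come from PreTypes, and that the CoNLL data can be read from a plain input channel; the name print_conll_file and its path argument are hypothetical.

let print_conll_file path =
  let ic = open_in path in
  (* load_corpus returns the p_record list in file order (it reverses its accumulator) *)
  let corpus = load_corpus ic in
  close_in ic;
  (* re-serialize each sentence; CONLL mode prints one tab-separated token per line,
     prefixed by the sentence pid when it is non-empty *)
  List.iter (fun p_record -> print_endline (string_of_p_record CONLL p_record)) corpus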
parser/exec.ml
@@ -142,8 +142,8 @@ let process_query ic oc timeout test_only_flag id full_query max_n =
 let time2 = time_fun () in
 let result = {result with pre_time1=pre_time1; pre_time2=time2 -. time1;
 paths_size=let _,_,next_id = paths in next_id-1} in
- if msg <> "" then {result with status=PreprocessingError; msg=msg} else
- try
+ (*if msg <> "" then*) {result with status=PreprocessingError; msg=msg} (*else*)
+ (*try
 let graph = LCGlexicon.create query paths in
 let graph,references,next_reference = LCGchart.lazify graph in
 let time3 = time_fun () in
@@ -222,7 +222,7 @@ let process_query ic oc timeout test_only_flag id full_query max_n =
 {result with status=ParseError; msg=Printexc.to_string e; parse_time=time4 -. time3}
 with e ->
 let time3 = time_fun () in
- {result with status=LexiconError; msg=Printexc.to_string e; lex_time=time3 -. time2}
+ {result with status=LexiconError; msg=Printexc.to_string e; lex_time=time3 -. time2}*)

 let print_result file result =
 Printf.fprintf file "query: %s\n" result.query;
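A hedged sketch of how a caller might inspect the returned result after this change (not from the repository): with the parsing stage commented out, process_query always returns status=PreprocessingError, so only the first branch below would fire. The report name is hypothetical; the status constructors and the msg/query fields are taken from the diff above.

let report result = match result.status with
  | PreprocessingError -> Printf.printf "preprocessing failed for %s: %s\n" result.query result.msg
  | LexiconError | ParseError -> Printf.printf "parsing failed for %s: %s\n" result.query result.msg
  | _ -> Printf.printf "query %s processed\n" result.query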