rozwinięcie drzew zależnościowych

Daniel Oklesiński
1 parent 9f53f85c
Showing 11 changed files with 485 additions and 32959 deletions
corpora/CONLL.ml
corpora/info_sentences.txt
corpora/interpsInCorpus.ml
diagnostics/LCGfields.ml
diagnostics/treeChange.ml
parser/LCGchart.ml
parser/LCGlexicon.ml
parser/LCGrules.ml
parser/exec.ml
parser/pipe.ml
pre/paths.ml
@@ -132,6 +132,71 @@ let match_corpus corpus =
  
 (******************)
  
+type to_text = { t_orth: string; t_cat: string; t_interp: string list list list } (* Per-token summary consumed by get_text:
+     t_orth is the orth field of the token; t_cat and t_interp are the second and
+     third components of a Lemma token, or the empty string and [[[]]] for any
+     other kind of token. *)
+(* Neutral placeholder record: empty orth, empty category, [[[]]] as interpretation.
+   get_text pads the token list with it at the front and uses it as the
+   successor of the last token. *)
+let empty_to_text = { t_orth = ""; t_cat = ""; t_interp = [[[]]] }
+(** [get_text tokens] rebuilds the surface text of a sentence from the
+    [tokens] array (read through [ExtArray.get] / [ExtArray.size]).
+    Every token contributes its orth; heuristics on the neighbouring tokens
+    decide whether a single space is emitted in front of it.
+    Returns the concatenation of all the pieces as one string. *)
+let get_text tokens =
+  let get i = (* builds the to_text summary of the token stored at index i *)
+    let cat = match (ExtArray.get tokens i).token with
+        Lemma(_,cat,_) -> cat
+      | _ -> "" in
+    let interp = match (ExtArray.get tokens i).token with
+        Lemma(_,_,i) -> i (* this i is the interpretation and shadows the index i *)
+      | _ -> [[[]]] in
+    { t_orth = (ExtArray.get tokens i).orth; t_cat = cat; t_interp = interp } in
+  (* Summaries of all tokens, indices size-1 down to 0; presumably Int.fold_down
+     visits the indices in descending order so the consed list ends up in
+     ascending index order - TODO confirm against Int.fold_down. *)
+  let n_tokens = Int.fold_down (ExtArray.size tokens - 1) 0 []
+    (fun acc i -> (get i)::acc)in
+  (* Mutable state shared by consecutive calls of maybe_add_space:
+     quote_open - a double-quote character has been opened and not yet closed;
+     hyphenated - the previous token was a hyphen emitted without spaces. *)
+  let quote_open = ref false in
+  let hyphenated = ref false in
+  (* maybe_add_space pre_previous previous token next returns the orth of
+     token, either bare or prefixed with one space, based on the two preceding
+     entries, the following entry and the two flags above. *)
+  let maybe_add_space pre_previous previous token next =
+    if previous.t_orth = "" && token.t_orth = "\"" (* a quote directly after an empty orth counts as opening *)
+    then quote_open := true;
+    if token.t_cat = "aglt" || (* first group: cases in which the token is glued to its predecessor *)
+      (token.t_orth = "by" && previous.t_cat = "praet") ||
+      (previous.t_orth = "\"" && !quote_open) || (* first token after an opening quote *)
+      previous.t_orth = "(" ||
+      previous.t_orth = "„" ||
+      previous.t_orth = "" || (* predecessor has empty orth, e.g. the padding record *)
+      token.t_orth = "ń" || (* the expression nań *)
+      (token.t_orth = "że" && (previous.t_orth  = "czym" || previous.t_orth  = "Czym")) || (* the expression czymże *)
+(*    (token.orth = "r" && token.cat = "brev") || (* abbreviation r. - e.g. 1991r. *) *)
+      (pre_previous.t_cat = "adj" && previous.t_orth = "." &&
+       token.t_cat = "num" && token.t_interp = [[["pl"];["nom"];["f"];["rec"]]]) (* clock time - e.g. 13.15 *)
+    then token.t_orth (* NOTE(review): hyphenated is not cleared on this path - confirm that is intended *)
+    else if !hyphenated
+      then (hyphenated := false; token.t_orth) (* token right after a glued hyphen: no space, flag consumed *)
+      else match token.t_orth with (* the punctuation marks below are emitted without a leading space *)
+      "." -> "."
+    | "…" -> "…"
+    | "?" -> "?"
+    | "!" -> "!"
+    | "," -> ","
+    | ":" -> ":"
+    | ";" -> ";"
+    | ")" -> ")"
+    | "”" -> "”"
+    | "-" -> if previous.t_cat = "adja" || (* hyphen glued on both sides: after an adja token, or between two subst tokens with equal interpretations *)
+               (previous.t_cat = "subst" && next.t_cat = "subst" && previous.t_interp = next.t_interp)
+               then (hyphenated := true; "-")
+               else " -"
+    | "\"" -> if !quote_open (* closing quote attaches to the left; opening quote gets a space in front *)
+                then (quote_open := false; "\"")
+                else (quote_open := true; " \"")
+    | s -> " "^s in (* default: one space, then the orth *)
+  (* fold4 slides a window of four consecutive entries (pre_previous, previous,
+     token, next) over the list and appends the piece produced for the third
+     entry to acc. The last token is handled with empty_to_text as its
+     successor; the walk ends when only two entries are left. The catch-all
+     case needs fewer than two entries, which the call below never supplies
+     because it always prepends two padding records. *)
+  let rec fold4 acc = function
+    a::b::c::d::t -> fold4 (acc^maybe_add_space a b c d) (b::c::d::t)
+  | a::b::c::[] -> fold4 (acc^maybe_add_space a b c empty_to_text) (b::c::[])
+  | a::b::[] -> acc
+  | _ -> failwith ("get_sentence") in
+  fold4 "" (empty_to_text::empty_to_text::n_tokens)
+
+(******************)
+
 let establish_next tokens paths =
   let n = ExtArray.size tokens in
   Int.iter 1 (n - 2) (fun i ->
@@ -156,12 +221,16 @@ let rec establish_for_token i text tokens = function
       if Xstring.check_prefix " " text
       then establish_for_token (i+100) (Xstring.cut_prefix " " text) tokens l
       else if Xstring.check_prefix h.orth text
+            (* || (h.orth = "m.in." && Xstring.check_prefix "m.in" text) *)
         then
           let n = (List.length @@ Xunicode.utf8_chars_of_utf8_string h.orth) * 100 in
           let n_h = {h with beg = i ; len = n} in
           ExtArray.set tokens id n_h;
+          (* if Xstring.check_prefix h.orth text then *)
           establish_for_token (i+n) (Xstring.cut_prefix h.orth text) tokens t
-        else failwith ("establish_for_token :" ^ h.orth ^ " " ^ text)
+          (* else establish_for_token (i+n) (Xstring.cut_prefix "m.in" text) tokens t *)
+        else (prerr_endline ("establish_for_token :" ^ h.orth ^ " " ^ text);
+            failwith ("establish_for_token :" ^ h.orth ^ " " ^ text))
   | [] -> 100, i
  
 let rec establish_lengths text paths tokens =
@@ -196,16 +265,24 @@ let match_sentence (p_record,tokens) =
         then info_token (snd (List.find (fun (mode, s) -> mode = CONLL) alts))
         else failwith ("match_sentence: no CONLL mode in AltSentence")*) in
   let info_token, paths = info_token p_record.psentence in
-  try
-    let id, text = StringMap.find info_map info_token in
+  (* try *)
+    let id, text = try
+      StringMap.find info_map info_token
+    with
+    | _ -> p_record.pid, get_text tokens in
     let beg, len = establish_lengths text paths tokens (* -1, -1, p_record.psentence *) in
     AltText[Raw,RawText text;CONLL,StructText([StructParagraph[{pid = id; pbeg = beg; plen = len; pnext = beg+len; pfile_prefix="";
      psentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence paths]}]],tokens)]
 (*  {s_id = id; s_text = text; s_tokens = sentence.s_tokens} *)
-  with _ -> AltText[CONLL,StructText([StructParagraph[p_record]],tokens)]
+  (* with _ -> AltText[CONLL,StructText([StructParagraph[p_record]],tokens)] *)
  
 let match_corpus corpus =
-  Xlist.map corpus match_sentence
+  let rec pom f = function (* pom f l applies f to every element of l, keeps the
+       results in their original order and leaves out every element for which
+       the guarded expression raised; not tail-recursive *)
+      [] -> []
+    | a::l -> try
+          let r = f a in r :: pom f l
+        with e -> (*print_endline (Printexc.to_string e);*) pom f l in (* NOTE(review): every exception is discarded without any report, so failing sentences vanish from the result silently *)
+  pom match_sentence corpus
  
 (******************)
  
@@ -232,8 +309,11 @@ let load_token in_channel =
    else if line.[0] = '#'
      then
        if Xstring.check_prefix "# trees/" line && Xstring.check_sufix ".xml.trees" line
-         then let id = Xstring.cut_prefix "# trees/" @@ Xstring.cut_sufix ".xml.trees" line in
-              raise (Id_line id)
+       then let id = Xstring.cut_prefix "# trees/" @@ Xstring.cut_sufix ".xml.trees" line in
+         raise (Id_line id)
+       else if Xstring.check_prefix "# trees/" line && Xstring.check_sufix ".xml.tree" line
+         then let id = Xstring.cut_prefix "# trees/" @@ Xstring.cut_sufix ".xml.tree" line in
+                raise (Id_line id)
          else failwith ("load_token: " ^ line)
      else
        match Xstring.split "\t" line with
@@ -283,5 +363,6 @@ let load_corpus in_channel =
     try
       let conll_sentence, tokens = load_sentence in_channel in
       pom ((conll_sentence, tokens) :: res)
-    with End_of_file -> res in
+    with End_of_file -> res
+    | e -> prerr_endline (Printexc.to_string e); res in
   (* match_corpus @@ *) List.rev @@ pom []