Commit 1cd4d4aa68f8751329d8dd5489dca4d2f8da981a

Authored by Wojciech Jaworski
1 parent 5625b0fd

Improved sentence-splitting algorithms

subsyntax/ENIAM_MWE.ml
... ... @@ -27,6 +27,7 @@ type t =
27 27 L of string * string * sel list
28 28 | O of string
29 29 | D of string * string
  30 + | I of string
30 31  
31 32 let process_interp lemma interp =
32 33 match Xstring.split ":" interp with
... ... @@ -103,7 +104,7 @@ let load_mwe_dict2 filename (dict,dict2) =
103 104 (match List.hd orths with
104 105 L(s,_,_) -> dict, StringMap.add_inc dict2 s [orths,lemma,cat,interp] (fun l -> (orths,lemma,cat,interp) :: l)
105 106 | O s -> StringMap.add_inc dict s [orths,lemma,cat,interp] (fun l -> (orths,lemma,cat,interp) :: l), dict2
106   - | D _ -> failwith "load_mwe_dict2")
  107 + | _ -> failwith "load_mwe_dict2")
107 108 | l -> failwith ("load_mwe_dict2 '" ^ String.concat "\t" l ^ "'"))
108 109  
109 110 let load_mwe_dicts () =
... ... @@ -144,7 +145,7 @@ let preselect orths lemmas rules l =
144 145 let b = Xlist.fold match_list true (fun b -> function
145 146 O s -> StringSet.mem orths s && b
146 147 | L(s,_,_) -> StringSet.mem lemmas s && b
147   - | D(s,_) -> failwith "preselect") in
  148 + | _ -> failwith "preselect") in
148 149 if b then (Xlist.size match_list > 1,match_list,lemma,cat,interp) :: rules else rules)
149 150  
150 151 let preselect_dict orths lemmas dict rules =
... ... @@ -165,6 +166,9 @@ let add_ordnum_rules orths rules =
165 166 (* Printf.printf "%s %s\n%!" orth lemma; *)
166 167 (false,[D(orth,"intnum");O "."],lemma,"ordnum",[]) :: rules))
167 168  
  169 +let add_quot_rule rules =
  170 + (false,[I "„x";I "<sentence>"; I "<clause>"],"„","interp",[]) :: rules
  171 +
168 172 let select_rules paths mwe_dict mwe_dict2 =
169 173 let orths = get_orths paths in
170 174 let lemmas = get_lemmas paths in
... ... @@ -172,6 +176,7 @@ let select_rules paths mwe_dict mwe_dict2 =
172 176 let rules = preselect_dict orths lemmas mwe_dict [] in
173 177 let rules = preselect_dict2 orths lemmas mwe_dict2 rules in
174 178 let rules = add_ordnum_rules intnum_orths rules in
  179 + let rules = add_quot_rule rules in
175 180 rules
176 181  
177 182 let rec check_interp sels = function
... ... @@ -214,6 +219,7 @@ let rec match_path_rec map found (t:token_env) sels rev = function
214 219 if s=s2 && cat=cat2 && check_interp sels (interp,interp2) then
215 220 (new_t,get_sels sels (interp,interp2)) :: found2 else found2)
216 221 | D(s,cat), Dig(s2,cat2) -> if s=s2 && cat=cat2 then (new_t,sels) :: found2 else found2
  222 + | I s, Interp s2 -> if s=s2 then (new_t,sels) :: found2 else found2
217 223 | _ -> found2)) in
218 224 Xlist.fold found2 found (fun found (new_t,sels) -> match_path_rec map found new_t sels (t :: rev) l)
219 225  
... ... @@ -230,6 +236,7 @@ let match_path map = function
230 236 if s=s2 && cat=cat2 && check_interp [] (interp,interp2) then
231 237 (t,get_sels [] (interp,interp2)) :: found else found)
232 238 | D(s,cat), Dig(s2,cat2) -> if s=s2 && cat=cat2 then (t,[]) :: found else found
  239 + | I s, Interp s2 -> if s=s2 then (t,[]) :: found else found
233 240 | _ -> found))) in
234 241 Xlist.fold found [] (fun found (t,sels) -> match_path_rec map found t sels [] l)
235 242  
... ...
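
The diff above extends the MWE pattern type with a new constructor, I of string, which is matched against interpunction (Interp) tokens in match_path / match_path_rec, and registers add_quot_rule, whose pattern [I "„x"; I "<sentence>"; I "<clause>"] maps that token sequence to the lemma „ with category interp. A minimal, self-contained sketch of the matching idea follows; the types are simplified stand-ins, not ENIAM's actual definitions.

(* Toy sketch: how the new I pattern element is matched against Interp
   tokens, alongside the existing O and D cases. *)
type token = Interp of string | Dig of string * string | Orth of string

type pat =
  | O of string            (* match by orthographic form *)
  | D of string * string   (* match a digit token with a given category *)
  | I of string            (* new in this commit: match an Interp token *)

let matches pat tok =
  match pat, tok with
  | O s, Orth s2 -> s = s2
  | D (s, cat), Dig (s2, cat2) -> s = s2 && cat = cat2
  | I s, Interp s2 -> s = s2                   (* the added case *)
  | _ -> false

let () =
  (* The pattern registered by add_quot_rule. *)
  let rule = [I "„x"; I "<sentence>"; I "<clause>"] in
  let toks = [Interp "„x"; Interp "<sentence>"; Interp "<clause>"] in
  assert (List.for_all2 matches rule toks);
  print_endline "quote-opening rule matches"
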
subsyntax/ENIAMsentences.ml
... ... @@ -117,6 +117,16 @@ let find_quoted_sentences paragraph tokens chart last =
117 117 (fun tokens id -> (ExtArray.get tokens id).token = Interp "”s")
118 118 (fun ids -> Tokens("quoted_sentences",ids))
119 119  
  120 +let find_paren_sentences paragraph tokens chart last =
  121 + parse_bracket_rule paragraph tokens chart last
  122 + (fun tokens id -> (ExtArray.get tokens id).token = Interp "(s")
  123 + (fun tokens id ->
  124 + match (ExtArray.get tokens id).token with
  125 + Tokens("sentence",_) -> true
  126 + | _ -> false)
  127 + (fun tokens id -> (ExtArray.get tokens id).token = Interp ")s")
  128 + (fun ids -> Tokens("paren_sentences",ids))
  129 +
120 130 let find_query paragraph tokens chart last =
121 131 parse_bracket_rule paragraph tokens chart last
122 132 (fun tokens id -> (ExtArray.get tokens id).token = Interp "<query>")
... ... @@ -124,6 +134,7 @@ let find_query paragraph tokens chart last =
124 134 match (ExtArray.get tokens id).token with
125 135 Tokens("sentence",_) -> true
126 136 | Tokens("quoted_sentences",_) -> true
  137 + | Tokens("paren_sentences",_) -> true
127 138 | _ -> false)
128 139 (fun tokens id -> (ExtArray.get tokens id).token = Interp "</query>")
129 140 (fun ids -> Tokens("query",ids))
... ... @@ -186,6 +197,10 @@ let rec extract_sentences_rec tokens id =
186 197 [{id=string_of_int id; beg=t.beg; len=t.len; next=t.next; file_prefix="";
187 198 sentence=AltSentence[Raw,RawSentence t.orth;
188 199 Struct,QuotedSentences(List.sort par_compare (List.flatten (Xlist.rev_map ids (extract_sentences_rec tokens))))]}]
  200 + | Tokens("paren_sentences",ids) ->
  201 + [{id=string_of_int id; beg=t.beg; len=t.len; next=t.next; file_prefix="";
  202 + sentence=AltSentence[Raw,RawSentence t.orth;
  203 + Struct,QuotedSentences(List.sort par_compare (List.flatten (Xlist.rev_map ids (extract_sentences_rec tokens))))]}]
189 204 | _ -> []
190 205  
191 206 let extract_sentences pid tokens chart last =
... ... @@ -262,5 +277,6 @@ let split_into_sentences pid paragraph tokens paths =
262 277 find_slash_or_sentence par tokens chart last;
263 278 find_sentence par tokens chart last;
264 279 find_quoted_sentences par tokens chart last;
  280 + find_paren_sentences par tokens chart last;
265 281 find_query par tokens chart last;
266 282 extract_sentences pid tokens chart last
... ...
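
find_paren_sentences mirrors find_quoted_sentences: everything bracketed between Interp "(s" and Interp ")s" is collected into a Tokens("paren_sentences",ids) node, which find_query and extract_sentences_rec now accept (for the moment the result is still wrapped as QuotedSentences; the TODO entry later in this commit records that the distinction is yet to be made). A minimal sketch of the bracketing idea, with toy types standing in for parse_bracket_rule and the ENIAM token structures:

(* Toy sketch (assumed simplification): group the sentence ids found between
   an opening "(s" and a closing ")s" marker into one paren_sentences node,
   leaving all other tokens untouched. *)
type tok =
  | Interp of string
  | Sentence of int                  (* stands for a Tokens("sentence",ids) node *)
  | Group of string * int list       (* stands for Tokens("paren_sentences",ids) *)

let rec bracket = function
  | Interp "(s" :: rest ->
      let rec collect acc = function
        | Sentence id :: l -> collect (id :: acc) l
        | Interp ")s" :: l -> Group ("paren_sentences", List.rev acc) :: bracket l
        | _ -> failwith "unclosed (s"
      in
      collect [] rest
  | t :: rest -> t :: bracket rest
  | [] -> []

let () =
  match bracket [Interp "<query>"; Interp "(s"; Sentence 1; Interp ")s"; Interp "</query>"] with
  | [_; Group ("paren_sentences", [1]); _] -> print_endline "ok"
  | _ -> print_endline "unexpected"
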
subsyntax/ENIAMsubsyntax.ml
... ... @@ -300,11 +300,14 @@ let parse query =
300 300 let l = ENIAMtokenizer.parse query in
301 301 (* print_endline "a6"; *)
302 302 let paths = ENIAMpaths.translate_into_paths l in
303   -(* print_endline "a7"; *)
  303 + (* print_endline "a7";
  304 + print_endline (ENIAMsubsyntaxStringOf.token_list (fst paths)); *)
304 305 let paths = ENIAMpaths.lemmatize paths in
305   -(* print_endline "a8"; *)
  306 + (* print_endline "a8";
  307 + print_endline (ENIAMsubsyntaxStringOf.token_list (fst paths)); *)
306 308 let paths,_ = ENIAM_MWE.process paths in
307   -(* print_endline "a12"; *)
  309 + (* print_endline "a12";
  310 + print_endline (ENIAMsubsyntaxStringOf.token_list paths); *)
308 311 (* let paths = find_proper_names paths in*)
309 312 let paths = List.rev (Xlist.rev_map paths find_proper_names) in
310 313 (* print_endline "a13"; *)
... ...
subsyntax/TODO
... ... @@ -7,3 +7,5 @@
7 7 - przerobić xxx w skrótach na wskazania kategorii składniowych
8 8  
9 9 - dodać część mowy do listy nazw własnych.
  10 +
  11 +- dodać rozróżnienie quoted_sentences od paren_sentences w extract_sentences_rec
... ...
subsyntax/test.ml
... ... @@ -33,8 +33,10 @@ let test_strings = [
33 33 "o trąbach powietrznych";
34 34 "trąba powietrzny"; *)
35 35 (* "ul. III Poprzecznej"; *)
36   - "ul. Stefana Banacha";
37   - "Chłopcy mają ulicę kwiatami.";
  36 + (* "ul. Stefana Banacha";
  37 + "Chłopcy mają ulicę kwiatami."; *)
  38 + (* "„Dialog”"; *)
  39 + (* "( Głosujmy !)"; *)
38 40 ]
39 41  
40 42 let test_strings2 = [
... ... @@ -43,7 +45,12 @@ let test_strings2 = [
43 45 (* "Istniejący od XI w. Czersk uzyskał prawa miejskie w 1350 r. Mazowsze było wtedy samodzielnym księstwem."; *)
44 46 (* "Dyplom uzyskał w 1994.";
45 47 "dyplom uzyskał w 1994"; *)
46   - "Chłopcy mają ulicę kwiatami.";
  48 + (* "Chłopcy mają ulicę kwiatami."; *)
  49 + (* "\"Throw out\" znaczy \"wyrzucić\".";
  50 + "„Dialog”";
  51 + "„Dialog”:"; *)
  52 + "- Votare! ( Głosujmy !)";
  53 + "( Głosujmy !)";
47 54 ]
48 55  
49 56 let _ =
... ...
tokenizer/ENIAMpatterns.ml
... ... @@ -440,6 +440,7 @@ let manage_query_boundaries tokens =
440 440 if find_beg_pattern [I "<query>";I "<or>"] tokens then
441 441 if find_beg_pattern [I "<query>";I "<or>";I "<sentence>"] tokens then tokens else
442 442 replace_beg_pattern [I "<query>";I "<or>"] add_sentence_beg tokens else
  443 + if find_beg_pattern [I "<query>";I "(s";I "<sentence>"] tokens then tokens else
443 444 if find_beg_pattern [I "<query>";I "<sentence>"] tokens then tokens else
444 445 replace_beg_pattern [I "<query>"] add_sentence_beg tokens in
445 446 (* let b =
... ... @@ -456,6 +457,7 @@ let manage_query_boundaries tokens =
456 457 if find_beg_pattern [I "</query>";I "”s";I "</sentence>"] tokens then tokens else
457 458 if find_beg_pattern [I "</query>";I "”s"] tokens then
458 459 replace_beg_pattern [I "</query>";I "”s"] add_sentence_end tokens else
  460 + if find_beg_pattern [I "</query>";I ")s";I "</sentence>"] tokens then tokens else
459 461 replace_beg_pattern [I "</query>"] add_sentence_end tokens in
460 462 let tokens = Xlist.rev_map tokens revert_tokens in
461 463 tokens
... ...
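
The two lines added above make manage_query_boundaries treat the new (s / )s markers like the quote markers already handled there: when the stream already begins with <query> (s <sentence> (or, for the query end processed on the reversed token list, </query> )s </sentence>), no extra sentence boundary is inserted. A minimal sketch of the prefix test, assuming a simplified string-based token stream in place of find_beg_pattern:

(* Toy sketch: does the token stream already start with the given markers? *)
let rec begins_with pattern tokens =
  match pattern, tokens with
  | [], _ -> true
  | p :: ps, t :: ts -> p = t && begins_with ps ts
  | _ :: _, [] -> false

let () =
  let toks = ["<query>"; "(s"; "<sentence>"; "<clause>"; "Głosujmy"] in
  (* The new case: a parenthesised sentence right after <query> already
     carries its own <sentence> marker, so nothing is added. *)
  if begins_with ["<query>"; "(s"; "<sentence>"] toks then
    print_endline "boundary already present; no <sentence> inserted"
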
tokenizer/ENIAMtokens.ml
... ... @@ -157,6 +157,9 @@ let romanmonths = StringSet.of_list ["I"; "II"; "III"; "IV"; "V"; "VI"; "VII"; "
157 157 let s_beg i = Token{empty_token_env with beg=i;len=1;next=i+1; token=Interp "<sentence>"}
158 158 let c_beg i = Token{empty_token_env with beg=i;len=1;next=i+1; token=Interp "<clause>"}
159 159  
  160 +let sc_quot_token orth i token =
  161 + Seq[s_beg i;c_beg (i+1);Token{empty_token_env with orth=orth;beg=i+2;len=factor - 2;next=i+factor; token=token; attrs=[MaybeCS]}]
  162 +
160 163 let dig_token orth i digs token =
161 164 Token{empty_token_env with orth=orth;beg=i;len=Xlist.size digs * factor;next=i+Xlist.size digs * factor; token=token; attrs=[MaybeCS]}
162 165  
... ... @@ -675,26 +678,26 @@ let rec recognize_sign_group poss_s_beg i = function
675 678 | (Sign " ") :: l -> create_sign_token poss_s_beg i [Sign " "] l (Symbol " ")
676 679 | (Sign "\"") :: (Sign ".") :: l -> create_quot_digit_token i [Sign "\""] l
677 680 | (Sign "\"") :: l ->
678   - let t,i = create_empty_sign_token i [Sign "\""] in
679   - Variant[Token{t with token=Interp "„"};Token{t with token=Interp "„s"};Token{t with token=Interp "”"};Token{t with token=Interp "”s"}],i,l,poss_s_beg
  681 + let t,i2 = create_empty_sign_token i [Sign "\""] in
  682 + Variant[sc_quot_token "\"" i (Interp "„x"); Token{t with token=Interp "„"};Token{t with token=Interp "„s"};Token{t with token=Interp "”"};Token{t with token=Interp "”s"}],i2,l,poss_s_beg
680 683 | (Sign "˝") :: (Sign ".") :: l -> create_quot_digit_token i [Sign "˝"] l
681 684 | (Sign "˝") :: l ->
682   - let t,i = create_empty_sign_token i [Sign "˝"] in
683   - Variant[Token{t with token=Interp "„"};Token{t with token=Interp "„s"};Token{t with token=Interp "”"};Token{t with token=Interp "”s"}],i,l,poss_s_beg
  685 + let t,i2 = create_empty_sign_token i [Sign "˝"] in
  686 + Variant[sc_quot_token "\"" i (Interp "„x"); Token{t with token=Interp "„"};Token{t with token=Interp "„s"};Token{t with token=Interp "”"};Token{t with token=Interp "”s"}],i2,l,poss_s_beg
684 687 | (Sign "„") :: l ->
685   - let t,i = create_empty_sign_token i [Sign "„"] in
686   - Variant[Token{t with token=Interp "„"};Token{t with token=Interp "„s"}],i,l,poss_s_beg
  688 + let t,i2 = create_empty_sign_token i [Sign "„"] in
  689 + Variant[sc_quot_token "\"" i (Interp "„x"); Token{t with token=Interp "„"};Token{t with token=Interp "„s"}],i2,l,poss_s_beg
687 690 | (Sign "”") :: (Sign ".") :: l -> create_quot_digit_token i [Sign "”"] l
688 691 | (Sign "”") :: l ->
689 692 let t,i = create_empty_sign_token i [Sign "”"] in
690 693 Variant[Token{t with token=Interp "”"};Token{t with token=Interp "”s"}],i,l,poss_s_beg
691 694 | (Sign "“") :: (Sign ".") :: l -> create_quot_digit_token i [Sign "“"] l
692 695 | (Sign "“") :: l ->
693   - let t,i = create_empty_sign_token i [Sign "“"] in
694   - Variant[Token{t with token=Interp "„"};Token{t with token=Interp "„s"};Token{t with token=Interp "”"};Token{t with token=Interp "”s"}],i,l,poss_s_beg
  696 + let t,i2 = create_empty_sign_token i [Sign "“"] in
  697 + Variant[sc_quot_token "\"" i (Interp "„x"); Token{t with token=Interp "„"};Token{t with token=Interp "„s"};Token{t with token=Interp "”"};Token{t with token=Interp "”s"}],i2,l,poss_s_beg
695 698 | (Sign ",") :: (Sign ",") :: l ->
696   - let t,i = create_empty_sign_token i [Sign ",";Sign ","] in
697   - Variant[Token{t with token=Interp "„"};Token{t with token=Interp "„s"}],i,l,poss_s_beg
  699 + let t,i2 = create_empty_sign_token i [Sign ",";Sign ","] in
  700 + Variant[sc_quot_token "\"" i (Interp "„x"); Token{t with token=Interp "„"};Token{t with token=Interp "„s"}],i2,l,poss_s_beg
698 701 | (Sign ",") :: l ->
699 702 let t,i2 = create_empty_sign_token i [Sign ","] in
700 703 if is_comma_digit_marker l then
... ... @@ -714,7 +717,9 @@ let rec recognize_sign_group poss_s_beg i = function
714 717 | (Sign "(") :: (Sign "!") :: (Sign ")") :: l -> create_sign_token poss_s_beg i ((Sign "(") :: (Sign "!") :: (Sign ")") :: []) l (make_lemma ("(!)","sinterj"))
715 718 | (Sign "(") :: (Sign "-") :: (Sign ")") :: l -> create_sign_token poss_s_beg i ((Sign "(") :: (Sign "-") :: (Sign ")") :: []) l (make_lemma ("(-)","symbol"))
716 719 | (Sign "(") :: (Sign "*") :: (Sign ")") :: l -> create_sign_token poss_s_beg i ((Sign "(") :: (Sign "*") :: (Sign ")") :: []) l (make_lemma ("(*)","symbol"))
717   - | (Sign "(") :: l -> create_sign_token poss_s_beg i [Sign "("] l (Interp "(")
  720 + | (Sign "(") :: l -> (*create_sign_token poss_s_beg i [Sign "("] l (Interp "(")*)
  721 + let t,i = create_empty_sign_token i [Sign "("] in
  722 + Variant[Token{t with token=Interp "("};Token{t with token=Interp "(s"}],i,l,poss_s_beg
718 723 | (Sign ":") :: (Sign "(") :: (Sign "(") :: (Sign "(") :: l -> create_sign_token poss_s_beg i ((Sign ":") :: (Sign "(") :: (Sign "(") :: (Sign "(") :: []) l (make_lemma (":(((","sinterj"))
719 724 | (Sign ":") :: (Sign "(") :: l -> create_sign_token poss_s_beg i ((Sign ":") :: (Sign "(") :: []) l (make_lemma (":(","sinterj"))
720 725 | (Sign ":") :: (Sign "-") :: (Sign "(") :: l -> create_sign_token poss_s_beg i ((Sign ":") :: (Sign "-") :: (Sign "(") :: []) l (make_lemma (":-(","sinterj"))
... ... @@ -735,7 +740,9 @@ let rec recognize_sign_group poss_s_beg i = function
735 740 | (Sign ":") :: (Sign "-") :: (Sign "/") :: l -> create_sign_token poss_s_beg i ((Sign ":") :: (Sign "-") :: (Sign "/") :: []) l (make_lemma (":-/","sinterj"))
736 741 | (Sign ":") :: (Sign ")") :: l -> create_sign_token poss_s_beg i ((Sign ":") :: (Sign ")") :: []) l (make_lemma (":)","sinterj"))
737 742 | (Sign ";") :: (Sign ")") :: l -> create_sign_token poss_s_beg i ((Sign ";") :: (Sign ")") :: []) l (make_lemma (";)","sinterj"))
738   - | (Sign ")") :: l -> create_sign_token poss_s_beg i [Sign ")"] l (Interp ")")
  743 + | (Sign ")") :: l -> (*create_sign_token poss_s_beg i [Sign ")"] l (Interp ")")*)
  744 + let t,i = create_empty_sign_token i [Sign ")"] in
  745 + Variant[Token{t with token=Interp ")"};Token{t with token=Interp ")s"}],i,l,poss_s_beg
739 746 | (Sign "[") :: (Sign ".") :: (Sign ".") :: (Sign ".") :: (Sign "]") :: l -> create_sign_token poss_s_beg i ((Sign "[") :: (Sign ".") :: (Sign ".") :: (Sign ".") :: (Sign "]") :: []) l (make_lemma ("(…)","sinterj"))
740 747 | (Sign "[") :: (Sign ".") :: (Sign ".") :: (Sign "]") :: l -> create_sign_token poss_s_beg i ((Sign "[") :: (Sign ".") :: (Sign ".") :: (Sign "]") :: []) l (make_lemma ("(…)","sinterj"))
741 748 | (Sign "[") :: (Sign "+") :: (Sign "]") :: l -> create_sign_token poss_s_beg i ((Sign "[") :: (Sign "+") :: (Sign "]") :: []) l (make_lemma ("[+]","symbol"))
... ... @@ -921,8 +928,8 @@ let rec recognize_sign_group poss_s_beg i = function
921 928 | (Sign "‘") :: l -> create_sign_token poss_s_beg i [Sign "‘"] l (Interp "‘")
922 929 | (Sign "´") :: l -> create_sign_token poss_s_beg i [Sign "´"] l (Symbol "’")
923 930 | (Sign "`") :: (Sign "`") :: l ->
924   - let t,i = create_empty_sign_token i [Sign "`";Sign "`"] in
925   - Variant[Token{t with token=Interp "„"};Token{t with token=Interp "„s"}],i,l,poss_s_beg
  931 + let t,i2 = create_empty_sign_token i [Sign "`";Sign "`"] in
  932 + Variant[sc_quot_token "\"" i (Interp "„x"); Token{t with token=Interp "„"};Token{t with token=Interp "„s"}],i2,l,poss_s_beg
926 933 | (Sign "`") :: l -> create_sign_token poss_s_beg i [Sign "`"] l (Symbol "’")
927 934 | (Sign "·") :: l -> create_sign_token poss_s_beg i [Sign "·"] l (Interp "·")
928 935 | (Sign "•") :: l -> create_sign_token poss_s_beg i [Sign "•"] l (Interp "•")
... ...
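
On the tokenizer side, every spelling of an opening double quote recognized by recognize_sign_group now also offers a reading that opens a new sentence and clause: the new sc_quot_token packages <sentence>, <clause> and the Interp "„x" mark into one Seq and adds it as an extra alternative in the Variant; plain ( and ) likewise gain (s / )s sentence-bracket readings. A minimal sketch of how such a Variant is assembled, with a simplified token record standing in for ENIAM's token_env (the factor-based positions are an assumption carried over from the diff):

(* Toy sketch: the "„x" reading is a Seq of <sentence>, <clause> and the
   quote itself, offered next to the plain „ / „s readings in a Variant. *)
type tok = { beg : int; len : int; next : int; interp : string }
type tokens = Token of tok | Seq of tokens list | Variant of tokens list

let factor = 100                     (* assumed: position units per input character *)

let interp_tok interp beg len next = Token { beg; len; next; interp }

(* Analogue of sc_quot_token: the sentence and clause openers take the first
   two position units, the quote mark itself takes the rest. *)
let sc_quot_token i interp =
  Seq [ interp_tok "<sentence>" i 1 (i + 1);
        interp_tok "<clause>" (i + 1) 1 (i + 2);
        interp_tok interp (i + 2) (factor - 2) (i + factor) ]

let quote_variants i =
  Variant [ sc_quot_token i "„x";
            interp_tok "„" i factor (i + factor);
            interp_tok "„s" i factor (i + factor) ]

let () =
  match quote_variants 0 with
  | Variant (Seq (Token s :: _) :: _) ->
      Printf.printf "first alternative opens a sentence at position %d\n" s.beg
  | _ -> ()
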
tokenizer/test.ml
... ... @@ -61,8 +61,12 @@ let test_strings = [
61 61 "2 jabłka";
62 62 "- 2 jabłka";*)
63 63 (* "ping-ponga" *)
64   - "drukowanym w „Dialogu”";
65   - "drukowanym w „Dialogu”.";
  64 + (* "drukowanym w „Dialogu”";
  65 + "drukowanym w „Dialogu”."; *)
  66 + "\"Throw out\" znaczy \"wyrzucić\".";
  67 + "- Votare! ( Głosujmy !)";
  68 + "( Głosujmy !)";
  69 + "„Dialog”";
66 70 ]
67 71  
68 72 let _ =
... ...