Commit 1cd4d4aa68f8751329d8dd5489dca4d2f8da981a

Authored by Wojciech Jaworski
1 parent 5625b0fd

Improved sentence-splitting algorithms

subsyntax/ENIAM_MWE.ml
... ... @@ -27,6 +27,7 @@ type t =
27 27 L of string * string * sel list
28 28 | O of string
29 29 | D of string * string
  30 + | I of string
30 31  
31 32 let process_interp lemma interp =
32 33 match Xstring.split ":" interp with
... ... @@ -103,7 +104,7 @@ let load_mwe_dict2 filename (dict,dict2) =
103 104 (match List.hd orths with
104 105 L(s,_,_) -> dict, StringMap.add_inc dict2 s [orths,lemma,cat,interp] (fun l -> (orths,lemma,cat,interp) :: l)
105 106 | O s -> StringMap.add_inc dict s [orths,lemma,cat,interp] (fun l -> (orths,lemma,cat,interp) :: l), dict2
106   - | D _ -> failwith "load_mwe_dict2")
  107 + | _ -> failwith "load_mwe_dict2")
107 108 | l -> failwith ("load_mwe_dict2 '" ^ String.concat "\t" l ^ "'"))
108 109  
109 110 let load_mwe_dicts () =
... ... @@ -144,7 +145,7 @@ let preselect orths lemmas rules l =
144 145 let b = Xlist.fold match_list true (fun b -> function
145 146 O s -> StringSet.mem orths s && b
146 147 | L(s,_,_) -> StringSet.mem lemmas s && b
147   - | D(s,_) -> failwith "preselect") in
  148 + | _ -> failwith "preselect") in
148 149 if b then (Xlist.size match_list > 1,match_list,lemma,cat,interp) :: rules else rules)
149 150  
150 151 let preselect_dict orths lemmas dict rules =
... ... @@ -165,6 +166,9 @@ let add_ordnum_rules orths rules =
165 166 (* Printf.printf "%s %s\n%!" orth lemma; *)
166 167 (false,[D(orth,"intnum");O "."],lemma,"ordnum",[]) :: rules))
167 168  
  169 +let add_quot_rule rules =
  170 + (false,[I "„x";I "<sentence>"; I "<clause>"],"„","interp",[]) :: rules
  171 +
168 172 let select_rules paths mwe_dict mwe_dict2 =
169 173 let orths = get_orths paths in
170 174 let lemmas = get_lemmas paths in
... ... @@ -172,6 +176,7 @@ let select_rules paths mwe_dict mwe_dict2 =
172 176 let rules = preselect_dict orths lemmas mwe_dict [] in
173 177 let rules = preselect_dict2 orths lemmas mwe_dict2 rules in
174 178 let rules = add_ordnum_rules intnum_orths rules in
  179 + let rules = add_quot_rule rules in
175 180 rules
176 181  
177 182 let rec check_interp sels = function
... ... @@ -214,6 +219,7 @@ let rec match_path_rec map found (t:token_env) sels rev = function
214 219 if s=s2 && cat=cat2 && check_interp sels (interp,interp2) then
215 220 (new_t,get_sels sels (interp,interp2)) :: found2 else found2)
216 221 | D(s,cat), Dig(s2,cat2) -> if s=s2 && cat=cat2 then (new_t,sels) :: found2 else found2
  222 + | I s, Interp s2 -> if s=s2 then (new_t,sels) :: found2 else found2
217 223 | _ -> found2)) in
218 224 Xlist.fold found2 found (fun found (new_t,sels) -> match_path_rec map found new_t sels (t :: rev) l)
219 225  
... ... @@ -230,6 +236,7 @@ let match_path map = function
230 236 if s=s2 && cat=cat2 && check_interp [] (interp,interp2) then
231 237 (t,get_sels [] (interp,interp2)) :: found else found)
232 238 | D(s,cat), Dig(s2,cat2) -> if s=s2 && cat=cat2 then (t,[]) :: found else found
  239 + | I s, Interp s2 -> if s=s2 then (t,[]) :: found else found
233 240 | _ -> found))) in
234 241 Xlist.fold found [] (fun found (t,sels) -> match_path_rec map found t sels [] l)
235 242  
... ...
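
The diff above extends the MWE pattern type with a new constructor, I of string, which is matched against interpunction (Interp) tokens in match_path / match_path_rec, and registers add_quot_rule, whose pattern [I "„x"; I "<sentence>"; I "<clause>"] maps that token sequence to the lemma „ with category interp. A minimal, self-contained sketch of the matching idea follows; the types are simplified stand-ins, not ENIAM's actual definitions.

(* Toy sketch: how the new I pattern element is matched against Interp
   tokens, alongside the existing O and D cases. *)
type token = Interp of string | Dig of string * string | Orth of string

type pat =
  | O of string            (* match by orthographic form *)
  | D of string * string   (* match a digit token with a given category *)
  | I of string            (* new in this commit: match an Interp token *)

let matches pat tok =
  match pat, tok with
  | O s, Orth s2 -> s = s2
  | D (s, cat), Dig (s2, cat2) -> s = s2 && cat = cat2
  | I s, Interp s2 -> s = s2                   (* the added case *)
  | _ -> false

let () =
  (* The pattern registered by add_quot_rule. *)
  let rule = [I "„x"; I "<sentence>"; I "<clause>"] in
  let toks = [Interp "„x"; Interp "<sentence>"; Interp "<clause>"] in
  assert (List.for_all2 matches rule toks);
  print_endline "quote-opening rule matches"
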
subsyntax/ENIAMsentences.ml
... ... @@ -117,6 +117,16 @@ let find_quoted_sentences paragraph tokens chart last =
117 117 (fun tokens id -> (ExtArray.get tokens id).token = Interp "”s")
118 118 (fun ids -> Tokens("quoted_sentences",ids))
119 119  
  120 +let find_paren_sentences paragraph tokens chart last =
  121 + parse_bracket_rule paragraph tokens chart last
  122 + (fun tokens id -> (ExtArray.get tokens id).token = Interp "(s")
  123 + (fun tokens id ->
  124 + match (ExtArray.get tokens id).token with
  125 + Tokens("sentence",_) -> true
  126 + | _ -> false)
  127 + (fun tokens id -> (ExtArray.get tokens id).token = Interp ")s")
  128 + (fun ids -> Tokens("paren_sentences",ids))
  129 +
120 130 let find_query paragraph tokens chart last =
121 131 parse_bracket_rule paragraph tokens chart last
122 132 (fun tokens id -> (ExtArray.get tokens id).token = Interp "<query>")
... ... @@ -124,6 +134,7 @@ let find_query paragraph tokens chart last =
124 134 match (ExtArray.get tokens id).token with
125 135 Tokens("sentence",_) -> true
126 136 | Tokens("quoted_sentences",_) -> true
  137 + | Tokens("paren_sentences",_) -> true
127 138 | _ -> false)
128 139 (fun tokens id -> (ExtArray.get tokens id).token = Interp "</query>")
129 140 (fun ids -> Tokens("query",ids))
... ... @@ -186,6 +197,10 @@ let rec extract_sentences_rec tokens id =
186 197 [{id=string_of_int id; beg=t.beg; len=t.len; next=t.next; file_prefix="";
187 198 sentence=AltSentence[Raw,RawSentence t.orth;
188 199 Struct,QuotedSentences(List.sort par_compare (List.flatten (Xlist.rev_map ids (extract_sentences_rec tokens))))]}]
  200 + | Tokens("paren_sentences",ids) ->
  201 + [{id=string_of_int id; beg=t.beg; len=t.len; next=t.next; file_prefix="";
  202 + sentence=AltSentence[Raw,RawSentence t.orth;
  203 + Struct,QuotedSentences(List.sort par_compare (List.flatten (Xlist.rev_map ids (extract_sentences_rec tokens))))]}]
189 204 | _ -> []
190 205  
191 206 let extract_sentences pid tokens chart last =
... ... @@ -262,5 +277,6 @@ let split_into_sentences pid paragraph tokens paths =
262 277 find_slash_or_sentence par tokens chart last;
263 278 find_sentence par tokens chart last;
264 279 find_quoted_sentences par tokens chart last;
  280 + find_paren_sentences par tokens chart last;
265 281 find_query par tokens chart last;
266 282 extract_sentences pid tokens chart last
... ...
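
find_paren_sentences mirrors find_quoted_sentences: everything bracketed between Interp "(s" and Interp ")s" is collected into a Tokens("paren_sentences",ids) node, which find_query and extract_sentences_rec now accept (for the moment the result is still wrapped as QuotedSentences; the TODO entry later in this commit records that the distinction is yet to be made). A minimal sketch of the bracketing idea, with toy types standing in for parse_bracket_rule and the ENIAM token structures:

(* Toy sketch (assumed simplification): group the sentence ids found between
   an opening "(s" and a closing ")s" marker into one paren_sentences node,
   leaving all other tokens untouched. *)
type tok =
  | Interp of string
  | Sentence of int                  (* stands for a Tokens("sentence",ids) node *)
  | Group of string * int list       (* stands for Tokens("paren_sentences",ids) *)

let rec bracket = function
  | Interp "(s" :: rest ->
      let rec collect acc = function
        | Sentence id :: l -> collect (id :: acc) l
        | Interp ")s" :: l -> Group ("paren_sentences", List.rev acc) :: bracket l
        | _ -> failwith "unclosed (s"
      in
      collect [] rest
  | t :: rest -> t :: bracket rest
  | [] -> []

let () =
  match bracket [Interp "<query>"; Interp "(s"; Sentence 1; Interp ")s"; Interp "</query>"] with
  | [_; Group ("paren_sentences", [1]); _] -> print_endline "ok"
  | _ -> print_endline "unexpected"
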
subsyntax/ENIAMsubsyntax.ml
... ... @@ -300,11 +300,14 @@ let parse query =
300 300 let l = ENIAMtokenizer.parse query in
301 301 (* print_endline "a6"; *)
302 302 let paths = ENIAMpaths.translate_into_paths l in
303   -(* print_endline "a7"; *)
  303 + (* print_endline "a7";
  304 + print_endline (ENIAMsubsyntaxStringOf.token_list (fst paths)); *)
304 305 let paths = ENIAMpaths.lemmatize paths in
305   -(* print_endline "a8"; *)
  306 + (* print_endline "a8";
  307 + print_endline (ENIAMsubsyntaxStringOf.token_list (fst paths)); *)
306 308 let paths,_ = ENIAM_MWE.process paths in
307   -(* print_endline "a12"; *)
  309 + (* print_endline "a12";
  310 + print_endline (ENIAMsubsyntaxStringOf.token_list paths); *)
308 311 (* let paths = find_proper_names paths in*)
309 312 let paths = List.rev (Xlist.rev_map paths find_proper_names) in
310 313 (* print_endline "a13"; *)
... ...
subsyntax/TODO
... ... @@ -7,3 +7,5 @@
7 7 - przerobić xxx w skrótach na wskazania kategorii składniowych
8 8  
9 9 - dodać część mowy do listy nazw własnych.
  10 +
  11 +- dodać rozróżnienie quoted_sentences od paren_sentences w extract_sentences_rec
... ...
subsyntax/test.ml
... ... @@ -33,8 +33,10 @@ let test_strings = [
33 33 "o trąbach powietrznych";
34 34 "trąba powietrzny"; *)
35 35 (* "ul. III Poprzecznej"; *)
36   - "ul. Stefana Banacha";
37   - "Chłopcy mają ulicę kwiatami.";
  36 + (* "ul. Stefana Banacha";
  37 + "Chłopcy mają ulicę kwiatami."; *)
  38 + (* "„Dialog”"; *)
  39 + (* "( Głosujmy !)"; *)
38 40 ]
39 41  
40 42 let test_strings2 = [
... ... @@ -43,7 +45,12 @@ let test_strings2 = [
43 45 (* "Istniejący od XI w. Czersk uzyskał prawa miejskie w 1350 r. Mazowsze było wtedy samodzielnym księstwem."; *)
44 46 (* "Dyplom uzyskał w 1994.";
45 47 "dyplom uzyskał w 1994"; *)
46   - "Chłopcy mają ulicę kwiatami.";
  48 + (* "Chłopcy mają ulicę kwiatami."; *)
  49 + (* "\"Throw out\" znaczy \"wyrzucić\".";
  50 + "„Dialog”";
  51 + "„Dialog”:"; *)
  52 + "- Votare! ( Głosujmy !)";
  53 + "( Głosujmy !)";
47 54 ]
48 55  
49 56 let _ =
... ...
tokenizer/ENIAMpatterns.ml
... ... @@ -440,6 +440,7 @@ let manage_query_boundaries tokens =
440 440 if find_beg_pattern [I "<query>";I "<or>"] tokens then
441 441 if find_beg_pattern [I "<query>";I "<or>";I "<sentence>"] tokens then tokens else
442 442 replace_beg_pattern [I "<query>";I "<or>"] add_sentence_beg tokens else
  443 + if find_beg_pattern [I "<query>";I "(s";I "<sentence>"] tokens then tokens else
443 444 if find_beg_pattern [I "<query>";I "<sentence>"] tokens then tokens else
444 445 replace_beg_pattern [I "<query>"] add_sentence_beg tokens in
445 446 (* let b =
... ... @@ -456,6 +457,7 @@ let manage_query_boundaries tokens =
456 457 if find_beg_pattern [I "</query>";I "”s";I "</sentence>"] tokens then tokens else
457 458 if find_beg_pattern [I "</query>";I "”s"] tokens then
458 459 replace_beg_pattern [I "</query>";I "”s"] add_sentence_end tokens else
  460 + if find_beg_pattern [I "</query>";I ")s";I "</sentence>"] tokens then tokens else
459 461 replace_beg_pattern [I "</query>"] add_sentence_end tokens in
460 462 let tokens = Xlist.rev_map tokens revert_tokens in
461 463 tokens
... ...
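
The two lines added above make manage_query_boundaries treat the new (s / )s markers like the quote markers already handled there: when the stream already begins with <query> (s <sentence> (or, for the query end processed on the reversed token list, </query> )s </sentence>), no extra sentence boundary is inserted. A minimal sketch of the prefix test, assuming a simplified string-based token stream in place of find_beg_pattern:

(* Toy sketch: does the token stream already start with the given markers? *)
let rec begins_with pattern tokens =
  match pattern, tokens with
  | [], _ -> true
  | p :: ps, t :: ts -> p = t && begins_with ps ts
  | _ :: _, [] -> false

let () =
  let toks = ["<query>"; "(s"; "<sentence>"; "<clause>"; "Głosujmy"] in
  (* The new case: a parenthesised sentence right after <query> already
     carries its own <sentence> marker, so nothing is added. *)
  if begins_with ["<query>"; "(s"; "<sentence>"] toks then
    print_endline "boundary already present; no <sentence> inserted"
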
tokenizer/ENIAMtokens.ml
... ... @@ -157,6 +157,9 @@ let romanmonths = StringSet.of_list ["I"; "II"; "III"; "IV"; "V"; "VI"; "VII"; "
157 157 let s_beg i = Token{empty_token_env with beg=i;len=1;next=i+1; token=Interp "<sentence>"}
158 158 let c_beg i = Token{empty_token_env with beg=i;len=1;next=i+1; token=Interp "<clause>"}
159 159  
  160 +let sc_quot_token orth i token =
  161 + Seq[s_beg i;c_beg (i+1);Token{empty_token_env with orth=orth;beg=i+2;len=factor - 2;next=i+factor; token=token; attrs=[MaybeCS]}]
  162 +
160 163 let dig_token orth i digs token =
161 164 Token{empty_token_env with orth=orth;beg=i;len=Xlist.size digs * factor;next=i+Xlist.size digs * factor; token=token; attrs=[MaybeCS]}
162 165  
... ... @@ -675,26 +678,26 @@ let rec recognize_sign_group poss_s_beg i = function
675 678 | (Sign " ") :: l -> create_sign_token poss_s_beg i [Sign " "] l (Symbol " ")
676 679 | (Sign "\"") :: (Sign ".") :: l -> create_quot_digit_token i [Sign "\""] l
677 680 | (Sign "\"") :: l ->
678   - let t,i = create_empty_sign_token i [Sign "\""] in
679   - Variant[Token{t with token=Interp "„"};Token{t with token=Interp "„s"};Token{t with token=Interp "”"};Token{t with token=Interp "”s"}],i,l,poss_s_beg
  681 + let t,i2 = create_empty_sign_token i [Sign "\""] in
  682 + Variant[sc_quot_token "\"" i (Interp "„x"); Token{t with token=Interp "„"};Token{t with token=Interp "„s"};Token{t with token=Interp "”"};Token{t with token=Interp "”s"}],i2,l,poss_s_beg
680 683 | (Sign "˝") :: (Sign ".") :: l -> create_quot_digit_token i [Sign "˝"] l
681 684 | (Sign "˝") :: l ->
682   - let t,i = create_empty_sign_token i [Sign "˝"] in
683   - Variant[Token{t with token=Interp "„"};Token{t with token=Interp "„s"};Token{t with token=Interp "”"};Token{t with token=Interp "”s"}],i,l,poss_s_beg
  685 + let t,i2 = create_empty_sign_token i [Sign "˝"] in
  686 + Variant[sc_quot_token "\"" i (Interp "„x"); Token{t with token=Interp "„"};Token{t with token=Interp "„s"};Token{t with token=Interp "”"};Token{t with token=Interp "”s"}],i2,l,poss_s_beg
684 687 | (Sign "„") :: l ->
685   - let t,i = create_empty_sign_token i [Sign "„"] in
686   - Variant[Token{t with token=Interp "„"};Token{t with token=Interp "„s"}],i,l,poss_s_beg
  688 + let t,i2 = create_empty_sign_token i [Sign "„"] in
  689 + Variant[sc_quot_token "\"" i (Interp "„x"); Token{t with token=Interp "„"};Token{t with token=Interp "„s"}],i2,l,poss_s_beg
687 690 | (Sign "”") :: (Sign ".") :: l -> create_quot_digit_token i [Sign "”"] l
688 691 | (Sign "”") :: l ->
689 692 let t,i = create_empty_sign_token i [Sign "”"] in
690 693 Variant[Token{t with token=Interp "”"};Token{t with token=Interp "”s"}],i,l,poss_s_beg
691 694 | (Sign "“") :: (Sign ".") :: l -> create_quot_digit_token i [Sign "“"] l
692 695 | (Sign "“") :: l ->
693   - let t,i = create_empty_sign_token i [Sign "“"] in
694   - Variant[Token{t with token=Interp "„"};Token{t with token=Interp "„s"};Token{t with token=Interp "”"};Token{t with token=Interp "”s"}],i,l,poss_s_beg
  696 + let t,i2 = create_empty_sign_token i [Sign "“"] in
  697 + Variant[sc_quot_token "\"" i (Interp "„x"); Token{t with token=Interp "„"};Token{t with token=Interp "„s"};Token{t with token=Interp "”"};Token{t with token=Interp "”s"}],i2,l,poss_s_beg
695 698 | (Sign ",") :: (Sign ",") :: l ->
696   - let t,i = create_empty_sign_token i [Sign ",";Sign ","] in
697   - Variant[Token{t with token=Interp "„"};Token{t with token=Interp "„s"}],i,l,poss_s_beg
  699 + let t,i2 = create_empty_sign_token i [Sign ",";Sign ","] in
  700 + Variant[sc_quot_token "\"" i (Interp "„x"); Token{t with token=Interp "„"};Token{t with token=Interp "„s"}],i2,l,poss_s_beg
698 701 | (Sign ",") :: l ->
699 702 let t,i2 = create_empty_sign_token i [Sign ","] in
700 703 if is_comma_digit_marker l then
... ... @@ -714,7 +717,9 @@ let rec recognize_sign_group poss_s_beg i = function
714 717 | (Sign "(") :: (Sign "!") :: (Sign ")") :: l -> create_sign_token poss_s_beg i ((Sign "(") :: (Sign "!") :: (Sign ")") :: []) l (make_lemma ("(!)","sinterj"))
715 718 | (Sign "(") :: (Sign "-") :: (Sign ")") :: l -> create_sign_token poss_s_beg i ((Sign "(") :: (Sign "-") :: (Sign ")") :: []) l (make_lemma ("(-)","symbol"))
716 719 | (Sign "(") :: (Sign "*") :: (Sign ")") :: l -> create_sign_token poss_s_beg i ((Sign "(") :: (Sign "*") :: (Sign ")") :: []) l (make_lemma ("(*)","symbol"))
717   - | (Sign "(") :: l -> create_sign_token poss_s_beg i [Sign "("] l (Interp "(")
  720 + | (Sign "(") :: l -> (*create_sign_token poss_s_beg i [Sign "("] l (Interp "(")*)
  721 + let t,i = create_empty_sign_token i [Sign "("] in
  722 + Variant[Token{t with token=Interp "("};Token{t with token=Interp "(s"}],i,l,poss_s_beg
718 723 | (Sign ":") :: (Sign "(") :: (Sign "(") :: (Sign "(") :: l -> create_sign_token poss_s_beg i ((Sign ":") :: (Sign "(") :: (Sign "(") :: (Sign "(") :: []) l (make_lemma (":(((","sinterj"))
719 724 | (Sign ":") :: (Sign "(") :: l -> create_sign_token poss_s_beg i ((Sign ":") :: (Sign "(") :: []) l (make_lemma (":(","sinterj"))
720 725 | (Sign ":") :: (Sign "-") :: (Sign "(") :: l -> create_sign_token poss_s_beg i ((Sign ":") :: (Sign "-") :: (Sign "(") :: []) l (make_lemma (":-(","sinterj"))
... ... @@ -735,7 +740,9 @@ let rec recognize_sign_group poss_s_beg i = function
735 740 | (Sign ":") :: (Sign "-") :: (Sign "/") :: l -> create_sign_token poss_s_beg i ((Sign ":") :: (Sign "-") :: (Sign "/") :: []) l (make_lemma (":-/","sinterj"))
736 741 | (Sign ":") :: (Sign ")") :: l -> create_sign_token poss_s_beg i ((Sign ":") :: (Sign ")") :: []) l (make_lemma (":)","sinterj"))
737 742 | (Sign ";") :: (Sign ")") :: l -> create_sign_token poss_s_beg i ((Sign ";") :: (Sign ")") :: []) l (make_lemma (";)","sinterj"))
738   - | (Sign ")") :: l -> create_sign_token poss_s_beg i [Sign ")"] l (Interp ")")
  743 + | (Sign ")") :: l -> (*create_sign_token poss_s_beg i [Sign ")"] l (Interp ")")*)
  744 + let t,i = create_empty_sign_token i [Sign ")"] in
  745 + Variant[Token{t with token=Interp ")"};Token{t with token=Interp ")s"}],i,l,poss_s_beg
739 746 | (Sign "[") :: (Sign ".") :: (Sign ".") :: (Sign ".") :: (Sign "]") :: l -> create_sign_token poss_s_beg i ((Sign "[") :: (Sign ".") :: (Sign ".") :: (Sign ".") :: (Sign "]") :: []) l (make_lemma ("(…)","sinterj"))
740 747 | (Sign "[") :: (Sign ".") :: (Sign ".") :: (Sign "]") :: l -> create_sign_token poss_s_beg i ((Sign "[") :: (Sign ".") :: (Sign ".") :: (Sign "]") :: []) l (make_lemma ("(…)","sinterj"))
741 748 | (Sign "[") :: (Sign "+") :: (Sign "]") :: l -> create_sign_token poss_s_beg i ((Sign "[") :: (Sign "+") :: (Sign "]") :: []) l (make_lemma ("[+]","symbol"))
... ... @@ -921,8 +928,8 @@ let rec recognize_sign_group poss_s_beg i = function
921 928 | (Sign "‘") :: l -> create_sign_token poss_s_beg i [Sign "‘"] l (Interp "‘")
922 929 | (Sign "´") :: l -> create_sign_token poss_s_beg i [Sign "´"] l (Symbol "’")
923 930 | (Sign "`") :: (Sign "`") :: l ->
924   - let t,i = create_empty_sign_token i [Sign "`";Sign "`"] in
925   - Variant[Token{t with token=Interp "„"};Token{t with token=Interp "„s"}],i,l,poss_s_beg
  931 + let t,i2 = create_empty_sign_token i [Sign "`";Sign "`"] in
  932 + Variant[sc_quot_token "\"" i (Interp "„x"); Token{t with token=Interp "„"};Token{t with token=Interp "„s"}],i2,l,poss_s_beg
926 933 | (Sign "`") :: l -> create_sign_token poss_s_beg i [Sign "`"] l (Symbol "’")
927 934 | (Sign "·") :: l -> create_sign_token poss_s_beg i [Sign "·"] l (Interp "·")
928 935 | (Sign "•") :: l -> create_sign_token poss_s_beg i [Sign "•"] l (Interp "•")
... ...
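
On the tokenizer side, every spelling of an opening double quote recognized by recognize_sign_group now also offers a reading that opens a new sentence and clause: the new sc_quot_token packages <sentence>, <clause> and the Interp "„x" mark into one Seq and adds it as an extra alternative in the Variant; plain ( and ) likewise gain (s / )s sentence-bracket readings. A minimal sketch of how such a Variant is assembled, with a simplified token record standing in for ENIAM's token_env (the factor-based positions are an assumption carried over from the diff):

(* Toy sketch: the "„x" reading is a Seq of <sentence>, <clause> and the
   quote itself, offered next to the plain „ / „s readings in a Variant. *)
type tok = { beg : int; len : int; next : int; interp : string }
type tokens = Token of tok | Seq of tokens list | Variant of tokens list

let factor = 100                     (* assumed: position units per input character *)

let interp_tok interp beg len next = Token { beg; len; next; interp }

(* Analogue of sc_quot_token: the sentence and clause openers take the first
   two position units, the quote mark itself takes the rest. *)
let sc_quot_token i interp =
  Seq [ interp_tok "<sentence>" i 1 (i + 1);
        interp_tok "<clause>" (i + 1) 1 (i + 2);
        interp_tok interp (i + 2) (factor - 2) (i + factor) ]

let quote_variants i =
  Variant [ sc_quot_token i "„x";
            interp_tok "„" i factor (i + factor);
            interp_tok "„s" i factor (i + factor) ]

let () =
  match quote_variants 0 with
  | Variant (Seq (Token s :: _) :: _) ->
      Printf.printf "first alternative opens a sentence at position %d\n" s.beg
  | _ -> ()
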
tokenizer/test.ml
... ... @@ -61,8 +61,12 @@ let test_strings = [
61 61 "2 jabłka";
62 62 "- 2 jabłka";*)
63 63 (* "ping-ponga" *)
64   - "drukowanym w „Dialogu”";
65   - "drukowanym w „Dialogu”.";
  64 + (* "drukowanym w „Dialogu”";
  65 + "drukowanym w „Dialogu”."; *)
  66 + "\"Throw out\" znaczy \"wyrzucić\".";
  67 + "- Votare! ( Głosujmy !)";
  68 + "( Głosujmy !)";
  69 + "„Dialog”";
66 70 ]
67 71  
68 72 let _ =
... ...