Commit 1cd4d4aa68f8751329d8dd5489dca4d2f8da981a
1 parent 5625b0fd
Improved sentence-splitting algorithms
Showing 8 changed files with 72 additions and 24 deletions
subsyntax/ENIAM_MWE.ml
... | ... | @@ -27,6 +27,7 @@ type t = |
27 | 27 | L of string * string * sel list |
28 | 28 | | O of string |
29 | 29 | | D of string * string |
30 | + | I of string | |
30 | 31 | |
31 | 32 | let process_interp lemma interp = |
32 | 33 | match Xstring.split ":" interp with |
... | ... | @@ -103,7 +104,7 @@ let load_mwe_dict2 filename (dict,dict2) = |
103 | 104 | (match List.hd orths with |
104 | 105 | L(s,_,_) -> dict, StringMap.add_inc dict2 s [orths,lemma,cat,interp] (fun l -> (orths,lemma,cat,interp) :: l) |
105 | 106 | | O s -> StringMap.add_inc dict s [orths,lemma,cat,interp] (fun l -> (orths,lemma,cat,interp) :: l), dict2 |
106 | - | D _ -> failwith "load_mwe_dict2") | |
107 | + | _ -> failwith "load_mwe_dict2") | |
107 | 108 | | l -> failwith ("load_mwe_dict2 '" ^ String.concat "\t" l ^ "'")) |
108 | 109 | |
109 | 110 | let load_mwe_dicts () = |
... | ... | @@ -144,7 +145,7 @@ let preselect orths lemmas rules l = |
144 | 145 | let b = Xlist.fold match_list true (fun b -> function |
145 | 146 | O s -> StringSet.mem orths s && b |
146 | 147 | | L(s,_,_) -> StringSet.mem lemmas s && b |
147 | - | D(s,_) -> failwith "preselect") in | |
148 | + | _ -> failwith "preselect") in | |
148 | 149 | if b then (Xlist.size match_list > 1,match_list,lemma,cat,interp) :: rules else rules) |
149 | 150 | |
150 | 151 | let preselect_dict orths lemmas dict rules = |
... | ... | @@ -165,6 +166,9 @@ let add_ordnum_rules orths rules = |
165 | 166 | (* Printf.printf "%s %s\n%!" orth lemma; *) |
166 | 167 | (false,[D(orth,"intnum");O "."],lemma,"ordnum",[]) :: rules)) |
167 | 168 | |
169 | +let add_quot_rule rules = | |
170 | + (false,[I "„x";I "<sentence>"; I "<clause>"],"„","interp",[]) :: rules | |
171 | + | |
168 | 172 | let select_rules paths mwe_dict mwe_dict2 = |
169 | 173 | let orths = get_orths paths in |
170 | 174 | let lemmas = get_lemmas paths in |
... | ... | @@ -172,6 +176,7 @@ let select_rules paths mwe_dict mwe_dict2 = |
172 | 176 | let rules = preselect_dict orths lemmas mwe_dict [] in |
173 | 177 | let rules = preselect_dict2 orths lemmas mwe_dict2 rules in |
174 | 178 | let rules = add_ordnum_rules intnum_orths rules in |
179 | + let rules = add_quot_rule rules in | |
175 | 180 | rules |
176 | 181 | |
177 | 182 | let rec check_interp sels = function |
... | ... | @@ -214,6 +219,7 @@ let rec match_path_rec map found (t:token_env) sels rev = function |
214 | 219 | if s=s2 && cat=cat2 && check_interp sels (interp,interp2) then |
215 | 220 | (new_t,get_sels sels (interp,interp2)) :: found2 else found2) |
216 | 221 | | D(s,cat), Dig(s2,cat2) -> if s=s2 && cat=cat2 then (new_t,sels) :: found2 else found2 |
222 | + | I s, Interp s2 -> if s=s2 then (new_t,sels) :: found2 else found2 | |
217 | 223 | | _ -> found2)) in |
218 | 224 | Xlist.fold found2 found (fun found (new_t,sels) -> match_path_rec map found new_t sels (t :: rev) l) |
219 | 225 | |
... | ... | @@ -230,6 +236,7 @@ let match_path map = function |
230 | 236 | if s=s2 && cat=cat2 && check_interp [] (interp,interp2) then |
231 | 237 | (t,get_sels [] (interp,interp2)) :: found else found) |
232 | 238 | | D(s,cat), Dig(s2,cat2) -> if s=s2 && cat=cat2 then (t,[]) :: found else found |
239 | + | I s, Interp s2 -> if s=s2 then (t,[]) :: found else found | |
233 | 240 | | _ -> found))) in |
234 | 241 | Xlist.fold found [] (fun found (t,sels) -> match_path_rec map found t sels [] l) |
235 | 242 | |
... | ... |
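Note: the new I of string constructor extends the MWE pattern language so that rules can match Interp tokens, and add_quot_rule registers a rule for the „x <sentence> <clause> sequence that the tokenizer now emits for opening quotes, rewriting it to an interp token with lemma „. Below is a minimal sketch (simplified stand-in types, not the ENIAM code) of how an I pattern element is matched against an Interp token, mirroring the new clauses in match_path_rec and match_path.

(* Minimal sketch with simplified stand-in types; in ENIAM the tokens come
   from the tokenizer and the pattern elements from ENIAM_MWE.t. *)
type token = Interp of string | Orth of string
type pat = I of string | O of string

let matches pat tok =
  match pat, tok with
  | I s, Interp s2 -> s = s2      (* the new case added by this commit *)
  | O s, Orth s2 -> s = s2
  | _ -> false

let () =
  (* add_quot_rule looks for exactly this token sequence *)
  let rule = [I "„x"; I "<sentence>"; I "<clause>"] in
  let toks = [Interp "„x"; Interp "<sentence>"; Interp "<clause>"] in
  if List.for_all2 matches rule toks then print_endline "quotation rule matches"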
subsyntax/ENIAMsentences.ml
... | ... | @@ -117,6 +117,16 @@ let find_quoted_sentences paragraph tokens chart last = |
117 | 117 | (fun tokens id -> (ExtArray.get tokens id).token = Interp "”s") |
118 | 118 | (fun ids -> Tokens("quoted_sentences",ids)) |
119 | 119 | |
120 | +let find_paren_sentences paragraph tokens chart last = | |
121 | + parse_bracket_rule paragraph tokens chart last | |
122 | + (fun tokens id -> (ExtArray.get tokens id).token = Interp "(s") | |
123 | + (fun tokens id -> | |
124 | + match (ExtArray.get tokens id).token with | |
125 | + Tokens("sentence",_) -> true | |
126 | + | _ -> false) | |
127 | + (fun tokens id -> (ExtArray.get tokens id).token = Interp ")s") | |
128 | + (fun ids -> Tokens("paren_sentences",ids)) | |
129 | + | |
120 | 130 | let find_query paragraph tokens chart last = |
121 | 131 | parse_bracket_rule paragraph tokens chart last |
122 | 132 | (fun tokens id -> (ExtArray.get tokens id).token = Interp "<query>") |
... | ... | @@ -124,6 +134,7 @@ let find_query paragraph tokens chart last = |
124 | 134 | match (ExtArray.get tokens id).token with |
125 | 135 | Tokens("sentence",_) -> true |
126 | 136 | | Tokens("quoted_sentences",_) -> true |
137 | + | Tokens("paren_sentences",_) -> true | |
127 | 138 | | _ -> false) |
128 | 139 | (fun tokens id -> (ExtArray.get tokens id).token = Interp "</query>") |
129 | 140 | (fun ids -> Tokens("query",ids)) |
... | ... | @@ -186,6 +197,10 @@ let rec extract_sentences_rec tokens id = |
186 | 197 | [{id=string_of_int id; beg=t.beg; len=t.len; next=t.next; file_prefix=""; |
187 | 198 | sentence=AltSentence[Raw,RawSentence t.orth; |
188 | 199 | Struct,QuotedSentences(List.sort par_compare (List.flatten (Xlist.rev_map ids (extract_sentences_rec tokens))))]}] |
200 | + | Tokens("paren_sentences",ids) -> | |
201 | + [{id=string_of_int id; beg=t.beg; len=t.len; next=t.next; file_prefix=""; | |
202 | + sentence=AltSentence[Raw,RawSentence t.orth; | |
203 | + Struct,QuotedSentences(List.sort par_compare (List.flatten (Xlist.rev_map ids (extract_sentences_rec tokens))))]}] | |
189 | 204 | | _ -> [] |
190 | 205 | |
191 | 206 | let extract_sentences pid tokens chart last = |
... | ... | @@ -262,5 +277,6 @@ let split_into_sentences pid paragraph tokens paths = |
262 | 277 | find_slash_or_sentence par tokens chart last; |
263 | 278 | find_sentence par tokens chart last; |
264 | 279 | find_quoted_sentences par tokens chart last; |
280 | + find_paren_sentences par tokens chart last; | |
265 | 281 | find_query par tokens chart last; |
266 | 282 | extract_sentences pid tokens chart last |
... | ... |
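Note: find_paren_sentences is find_quoted_sentences with the bracket tokens swapped: sentences enclosed between Interp "(s" and Interp ")s" are grouped into a Tokens("paren_sentences",ids) node, which find_query and extract_sentences_rec then handle exactly like quoted_sentences. A rough stand-alone sketch of the grouping idea, using a flat token list in place of ENIAM's chart and parse_bracket_rule:

(* Sketch only: a flat token list stands in for the chart that
   parse_bracket_rule operates on in ENIAMsentences.ml. *)
type tok = Interp of string | Sentence of string
type node = Plain of tok | ParenSentences of tok list

let group_parens toks =
  let rec aux acc = function
    | Interp "(s" :: rest ->
        (* collect everything up to the matching ")s" *)
        let rec collect inner = function
          | Interp ")s" :: rest' -> ParenSentences (List.rev inner) :: acc, rest'
          | t :: rest' -> collect (t :: inner) rest'
          | [] -> failwith "unclosed (s"
        in
        let acc, rest = collect [] rest in
        aux acc rest
    | t :: rest -> aux (Plain t :: acc) rest
    | [] -> List.rev acc
  in
  aux [] toks

let () =
  match group_parens [Sentence "- Votare!"; Interp "(s"; Sentence "Głosujmy !"; Interp ")s"] with
  | [Plain (Sentence _); ParenSentences [Sentence _]] -> print_endline "parenthesized sentence grouped"
  | _ -> print_endline "unexpected result"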
subsyntax/ENIAMsubsyntax.ml
... | ... | @@ -300,11 +300,14 @@ let parse query = |
300 | 300 | let l = ENIAMtokenizer.parse query in |
301 | 301 | (* print_endline "a6"; *) |
302 | 302 | let paths = ENIAMpaths.translate_into_paths l in |
303 | -(* print_endline "a7"; *) | |
303 | + (* print_endline "a7"; | |
304 | + print_endline (ENIAMsubsyntaxStringOf.token_list (fst paths)); *) | |
304 | 305 | let paths = ENIAMpaths.lemmatize paths in |
305 | -(* print_endline "a8"; *) | |
306 | + (* print_endline "a8"; | |
307 | + print_endline (ENIAMsubsyntaxStringOf.token_list (fst paths)); *) | |
306 | 308 | let paths,_ = ENIAM_MWE.process paths in |
307 | -(* print_endline "a12"; *) | |
309 | + (* print_endline "a12"; | |
310 | + print_endline (ENIAMsubsyntaxStringOf.token_list paths); *) | |
308 | 311 | (* let paths = find_proper_names paths in*) |
309 | 312 | let paths = List.rev (Xlist.rev_map paths find_proper_names) in |
310 | 313 | (* print_endline "a13"; *) |
... | ... |
subsyntax/TODO
subsyntax/test.ml
... | ... | @@ -33,8 +33,10 @@ let test_strings = [ |
33 | 33 | "o trąbach powietrznych"; |
34 | 34 | "trąba powietrzny"; *) |
35 | 35 | (* "ul. III Poprzecznej"; *) |
36 | - "ul. Stefana Banacha"; | |
37 | - "Chłopcy mają ulicę kwiatami."; | |
36 | + (* "ul. Stefana Banacha"; | |
37 | + "Chłopcy mają ulicę kwiatami."; *) | |
38 | + (* "„Dialog”"; *) | |
39 | + (* "( Głosujmy !)"; *) | |
38 | 40 | ] |
39 | 41 | |
40 | 42 | let test_strings2 = [ |
... | ... | @@ -43,7 +45,12 @@ let test_strings2 = [ |
43 | 45 | (* "Istniejący od XI w. Czersk uzyskał prawa miejskie w 1350 r. Mazowsze było wtedy samodzielnym księstwem."; *) |
44 | 46 | (* "Dyplom uzyskał w 1994."; |
45 | 47 | "dyplom uzyskał w 1994"; *) |
46 | - "Chłopcy mają ulicę kwiatami."; | |
48 | + (* "Chłopcy mają ulicę kwiatami."; *) | |
49 | + (* "\"Throw out\" znaczy \"wyrzucić\"."; | |
50 | + "„Dialog”"; | |
51 | + "„Dialog”:"; *) | |
52 | + "- Votare! ( Głosujmy !)"; | |
53 | + "( Głosujmy !)"; | |
47 | 54 | ] |
48 | 55 | |
49 | 56 | let _ = |
... | ... |
tokenizer/ENIAMpatterns.ml
... | ... | @@ -440,6 +440,7 @@ let manage_query_boundaries tokens = |
440 | 440 | if find_beg_pattern [I "<query>";I "<or>"] tokens then |
441 | 441 | if find_beg_pattern [I "<query>";I "<or>";I "<sentence>"] tokens then tokens else |
442 | 442 | replace_beg_pattern [I "<query>";I "<or>"] add_sentence_beg tokens else |
443 | + if find_beg_pattern [I "<query>";I "(s";I "<sentence>"] tokens then tokens else | |
443 | 444 | if find_beg_pattern [I "<query>";I "<sentence>"] tokens then tokens else |
444 | 445 | replace_beg_pattern [I "<query>"] add_sentence_beg tokens in |
445 | 446 | (* let b = |
... | ... | @@ -456,6 +457,7 @@ let manage_query_boundaries tokens = |
456 | 457 | if find_beg_pattern [I "</query>";I "”s";I "</sentence>"] tokens then tokens else |
457 | 458 | if find_beg_pattern [I "</query>";I "”s"] tokens then |
458 | 459 | replace_beg_pattern [I "</query>";I "”s"] add_sentence_end tokens else |
460 | + if find_beg_pattern [I "</query>";I ")s";I "</sentence>"] tokens then tokens else | |
459 | 461 | replace_beg_pattern [I "</query>"] add_sentence_end tokens in |
460 | 462 | let tokens = Xlist.rev_map tokens revert_tokens in |
461 | 463 | tokens |
... | ... |
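Note: the two added guards in manage_query_boundaries prevent an extra sentence marker from being inserted when the query already opens with a (s-bracketed sentence or closes with a )s bracket (the closing-side patterns start with </query>, so they are evidently checked against a reversed token list, as the neighbouring ”s cases are). A hedged sketch of this kind of prefix guard, with a hypothetical starts_with helper standing in for find_beg_pattern and tokens reduced to plain labels:

(* Sketch only: starts_with is a hypothetical stand-in for find_beg_pattern,
   and tokens are reduced to their interpretation labels. *)
let starts_with prefix tokens =
  let rec aux = function
    | [], _ -> true
    | p :: ps, t :: ts when p = t -> aux (ps, ts)
    | _ -> false
  in
  aux (prefix, tokens)

(* simplified stand-in for replace_beg_pattern ... add_sentence_beg:
   insert sentence/clause markers right after the <query> marker *)
let add_sentence_beg = function
  | "<query>" :: rest -> "<query>" :: "<sentence>" :: "<clause>" :: rest
  | tokens -> tokens

let manage_beg tokens =
  if starts_with ["<query>"; "(s"; "<sentence>"] tokens then tokens   (* new guard *)
  else if starts_with ["<query>"; "<sentence>"] tokens then tokens
  else add_sentence_beg tokens

let () =
  print_endline (String.concat " " (manage_beg ["<query>"; "(s"; "<sentence>"; "Głosujmy"; "!"]));
  print_endline (String.concat " " (manage_beg ["<query>"; "Votare"; "!"]))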
tokenizer/ENIAMtokens.ml
... | ... | @@ -157,6 +157,9 @@ let romanmonths = StringSet.of_list ["I"; "II"; "III"; "IV"; "V"; "VI"; "VII"; " |
157 | 157 | let s_beg i = Token{empty_token_env with beg=i;len=1;next=i+1; token=Interp "<sentence>"} |
158 | 158 | let c_beg i = Token{empty_token_env with beg=i;len=1;next=i+1; token=Interp "<clause>"} |
159 | 159 | |
160 | +let sc_quot_token orth i token = | |
161 | + Seq[s_beg i;c_beg (i+1);Token{empty_token_env with orth=orth;beg=i+2;len=factor - 2;next=i+factor; token=token; attrs=[MaybeCS]}] | |
162 | + | |
160 | 163 | let dig_token orth i digs token = |
161 | 164 | Token{empty_token_env with orth=orth;beg=i;len=Xlist.size digs * factor;next=i+Xlist.size digs * factor; token=token; attrs=[MaybeCS]} |
162 | 165 | |
... | ... | @@ -675,26 +678,26 @@ let rec recognize_sign_group poss_s_beg i = function |
675 | 678 | | (Sign " ") :: l -> create_sign_token poss_s_beg i [Sign " "] l (Symbol " ") |
676 | 679 | | (Sign "\"") :: (Sign ".") :: l -> create_quot_digit_token i [Sign "\""] l |
677 | 680 | | (Sign "\"") :: l -> |
678 | - let t,i = create_empty_sign_token i [Sign "\""] in | |
679 | - Variant[Token{t with token=Interp "„"};Token{t with token=Interp "„s"};Token{t with token=Interp "”"};Token{t with token=Interp "”s"}],i,l,poss_s_beg | |
681 | + let t,i2 = create_empty_sign_token i [Sign "\""] in | |
682 | + Variant[sc_quot_token "\"" i (Interp "„x"); Token{t with token=Interp "„"};Token{t with token=Interp "„s"};Token{t with token=Interp "”"};Token{t with token=Interp "”s"}],i2,l,poss_s_beg | |
680 | 683 | | (Sign "˝") :: (Sign ".") :: l -> create_quot_digit_token i [Sign "˝"] l |
681 | 684 | | (Sign "˝") :: l -> |
682 | - let t,i = create_empty_sign_token i [Sign "˝"] in | |
683 | - Variant[Token{t with token=Interp "„"};Token{t with token=Interp "„s"};Token{t with token=Interp "”"};Token{t with token=Interp "”s"}],i,l,poss_s_beg | |
685 | + let t,i2 = create_empty_sign_token i [Sign "˝"] in | |
686 | + Variant[sc_quot_token "\"" i (Interp "„x"); Token{t with token=Interp "„"};Token{t with token=Interp "„s"};Token{t with token=Interp "”"};Token{t with token=Interp "”s"}],i2,l,poss_s_beg | |
684 | 687 | | (Sign "„") :: l -> |
685 | - let t,i = create_empty_sign_token i [Sign "„"] in | |
686 | - Variant[Token{t with token=Interp "„"};Token{t with token=Interp "„s"}],i,l,poss_s_beg | |
688 | + let t,i2 = create_empty_sign_token i [Sign "„"] in | |
689 | + Variant[sc_quot_token "\"" i (Interp "„x"); Token{t with token=Interp "„"};Token{t with token=Interp "„s"}],i2,l,poss_s_beg | |
687 | 690 | | (Sign "”") :: (Sign ".") :: l -> create_quot_digit_token i [Sign "”"] l |
688 | 691 | | (Sign "”") :: l -> |
689 | 692 | let t,i = create_empty_sign_token i [Sign "”"] in |
690 | 693 | Variant[Token{t with token=Interp "”"};Token{t with token=Interp "”s"}],i,l,poss_s_beg |
691 | 694 | | (Sign "“") :: (Sign ".") :: l -> create_quot_digit_token i [Sign "“"] l |
692 | 695 | | (Sign "“") :: l -> |
693 | - let t,i = create_empty_sign_token i [Sign "“"] in | |
694 | - Variant[Token{t with token=Interp "„"};Token{t with token=Interp "„s"};Token{t with token=Interp "”"};Token{t with token=Interp "”s"}],i,l,poss_s_beg | |
696 | + let t,i2 = create_empty_sign_token i [Sign "“"] in | |
697 | + Variant[sc_quot_token "\"" i (Interp "„x"); Token{t with token=Interp "„"};Token{t with token=Interp "„s"};Token{t with token=Interp "”"};Token{t with token=Interp "”s"}],i2,l,poss_s_beg | |
695 | 698 | | (Sign ",") :: (Sign ",") :: l -> |
696 | - let t,i = create_empty_sign_token i [Sign ",";Sign ","] in | |
697 | - Variant[Token{t with token=Interp "„"};Token{t with token=Interp "„s"}],i,l,poss_s_beg | |
699 | + let t,i2 = create_empty_sign_token i [Sign ",";Sign ","] in | |
700 | + Variant[sc_quot_token "\"" i (Interp "„x"); Token{t with token=Interp "„"};Token{t with token=Interp "„s"}],i2,l,poss_s_beg | |
698 | 701 | | (Sign ",") :: l -> |
699 | 702 | let t,i2 = create_empty_sign_token i [Sign ","] in |
700 | 703 | if is_comma_digit_marker l then |
... | ... | @@ -714,7 +717,9 @@ let rec recognize_sign_group poss_s_beg i = function |
714 | 717 | | (Sign "(") :: (Sign "!") :: (Sign ")") :: l -> create_sign_token poss_s_beg i ((Sign "(") :: (Sign "!") :: (Sign ")") :: []) l (make_lemma ("(!)","sinterj")) |
715 | 718 | | (Sign "(") :: (Sign "-") :: (Sign ")") :: l -> create_sign_token poss_s_beg i ((Sign "(") :: (Sign "-") :: (Sign ")") :: []) l (make_lemma ("(-)","symbol")) |
716 | 719 | | (Sign "(") :: (Sign "*") :: (Sign ")") :: l -> create_sign_token poss_s_beg i ((Sign "(") :: (Sign "*") :: (Sign ")") :: []) l (make_lemma ("(*)","symbol")) |
717 | - | (Sign "(") :: l -> create_sign_token poss_s_beg i [Sign "("] l (Interp "(") | |
720 | + | (Sign "(") :: l -> (*create_sign_token poss_s_beg i [Sign "("] l (Interp "(")*) | |
721 | + let t,i = create_empty_sign_token i [Sign "("] in | |
722 | + Variant[Token{t with token=Interp "("};Token{t with token=Interp "(s"}],i,l,poss_s_beg | |
718 | 723 | | (Sign ":") :: (Sign "(") :: (Sign "(") :: (Sign "(") :: l -> create_sign_token poss_s_beg i ((Sign ":") :: (Sign "(") :: (Sign "(") :: (Sign "(") :: []) l (make_lemma (":(((","sinterj")) |
719 | 724 | | (Sign ":") :: (Sign "(") :: l -> create_sign_token poss_s_beg i ((Sign ":") :: (Sign "(") :: []) l (make_lemma (":(","sinterj")) |
720 | 725 | | (Sign ":") :: (Sign "-") :: (Sign "(") :: l -> create_sign_token poss_s_beg i ((Sign ":") :: (Sign "-") :: (Sign "(") :: []) l (make_lemma (":-(","sinterj")) |
... | ... | @@ -735,7 +740,9 @@ let rec recognize_sign_group poss_s_beg i = function |
735 | 740 | | (Sign ":") :: (Sign "-") :: (Sign "/") :: l -> create_sign_token poss_s_beg i ((Sign ":") :: (Sign "-") :: (Sign "/") :: []) l (make_lemma (":-/","sinterj")) |
736 | 741 | | (Sign ":") :: (Sign ")") :: l -> create_sign_token poss_s_beg i ((Sign ":") :: (Sign ")") :: []) l (make_lemma (":)","sinterj")) |
737 | 742 | | (Sign ";") :: (Sign ")") :: l -> create_sign_token poss_s_beg i ((Sign ";") :: (Sign ")") :: []) l (make_lemma (";)","sinterj")) |
738 | - | (Sign ")") :: l -> create_sign_token poss_s_beg i [Sign ")"] l (Interp ")") | |
743 | + | (Sign ")") :: l -> (*create_sign_token poss_s_beg i [Sign ")"] l (Interp ")")*) | |
744 | + let t,i = create_empty_sign_token i [Sign ")"] in | |
745 | + Variant[Token{t with token=Interp ")"};Token{t with token=Interp ")s"}],i,l,poss_s_beg | |
739 | 746 | | (Sign "[") :: (Sign ".") :: (Sign ".") :: (Sign ".") :: (Sign "]") :: l -> create_sign_token poss_s_beg i ((Sign "[") :: (Sign ".") :: (Sign ".") :: (Sign ".") :: (Sign "]") :: []) l (make_lemma ("(…)","sinterj")) |
740 | 747 | | (Sign "[") :: (Sign ".") :: (Sign ".") :: (Sign "]") :: l -> create_sign_token poss_s_beg i ((Sign "[") :: (Sign ".") :: (Sign ".") :: (Sign "]") :: []) l (make_lemma ("(…)","sinterj")) |
741 | 748 | | (Sign "[") :: (Sign "+") :: (Sign "]") :: l -> create_sign_token poss_s_beg i ((Sign "[") :: (Sign "+") :: (Sign "]") :: []) l (make_lemma ("[+]","symbol")) |
... | ... | @@ -921,8 +928,8 @@ let rec recognize_sign_group poss_s_beg i = function |
921 | 928 | | (Sign "‘") :: l -> create_sign_token poss_s_beg i [Sign "‘"] l (Interp "‘") |
922 | 929 | | (Sign "´") :: l -> create_sign_token poss_s_beg i [Sign "´"] l (Symbol "’") |
923 | 930 | | (Sign "`") :: (Sign "`") :: l -> |
924 | - let t,i = create_empty_sign_token i [Sign "`";Sign "`"] in | |
925 | - Variant[Token{t with token=Interp "„"};Token{t with token=Interp "„s"}],i,l,poss_s_beg | |
931 | + let t,i2 = create_empty_sign_token i [Sign "`";Sign "`"] in | |
932 | + Variant[sc_quot_token "\"" i (Interp "„x"); Token{t with token=Interp "„"};Token{t with token=Interp "„s"}],i2,l,poss_s_beg | |
926 | 933 | | (Sign "`") :: l -> create_sign_token poss_s_beg i [Sign "`"] l (Symbol "’") |
927 | 934 | | (Sign "·") :: l -> create_sign_token poss_s_beg i [Sign "·"] l (Interp "·") |
928 | 935 | | (Sign "•") :: l -> create_sign_token poss_s_beg i [Sign "•"] l (Interp "•") |
... | ... |
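Note: sc_quot_token packages an opening quote as a Seq of three tokens: a <sentence> marker, a <clause> marker, and the quote itself with the new Interp "„x" reading, which is the variant that the MWE rule added in ENIAM_MWE.ml later rewrites. Opening and closing parentheses likewise gain an extra reading, "(s" and ")s", so that ENIAMsentences.ml can bracket embedded sentences. Renaming the shadowed i to i2 in the touched branches keeps the original position i available for sc_quot_token. A rough sketch of the variant construction, with simplified stand-ins for ENIAMtokenizerTypes (orth and attrs omitted; factor is the tokenizer's position granularity, assumed to be 100 here):

(* Sketch only: simplified stand-ins for ENIAMtokenizerTypes. *)
type token = Interp of string
type token_env = { beg : int; len : int; next : int; token : token }
type tokens = Token of token_env | Seq of tokens list | Variant of tokens list

let factor = 100   (* assumed value of the position granularity *)

let s_beg i = Token { beg = i; len = 1; next = i + 1; token = Interp "<sentence>" }
let c_beg i = Token { beg = i; len = 1; next = i + 1; token = Interp "<clause>" }

(* an opening quote that also opens a sentence and a clause *)
let sc_quot_token i tok =
  Seq [ s_beg i; c_beg (i + 1);
        Token { beg = i + 2; len = factor - 2; next = i + factor; token = tok } ]

(* '(' now has two readings: a plain bracket or a sentence-opening bracket "(s" *)
let paren_token i =
  let t = { beg = i; len = factor; next = i + factor; token = Interp "(" } in
  Variant [ Token t; Token { t with token = Interp "(s" } ]

let () =
  ignore (sc_quot_token 0 (Interp "„x"));
  ignore (paren_token factor);
  print_endline "variant tokens built"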
tokenizer/test.ml
... | ... | @@ -61,8 +61,12 @@ let test_strings = [ |
61 | 61 | "2 jabłka"; |
62 | 62 | "- 2 jabłka";*) |
63 | 63 | (* "ping-ponga" *) |
64 | - "drukowanym w „Dialogu”"; | |
65 | - "drukowanym w „Dialogu”."; | |
64 | + (* "drukowanym w „Dialogu”"; | |
65 | + "drukowanym w „Dialogu”."; *) | |
66 | + "\"Throw out\" znaczy \"wyrzucić\"."; | |
67 | + "- Votare! ( Głosujmy !)"; | |
68 | + "( Głosujmy !)"; | |
69 | + "„Dialog”"; | |
66 | 70 | ] |
67 | 71 | |
68 | 72 | let _ = |
... | ... |