Commit 95c86f112d51360e2f641c04f3c359afbb38feb8
1 parent
762b53f4
Poprawki w subsyntax
Showing
8 changed files
with
80 additions
and
9 deletions
exec/semparser.ml
... | ... | @@ -40,6 +40,7 @@ let img = ref 1 |
40 | 40 | let timeout = ref 30. |
41 | 41 | let select_sentence_modes_flag = ref false |
42 | 42 | let select_sentences_flag = ref true |
43 | +let semantic_processing_flag = ref true | |
43 | 44 | let output_dir = ref "results/" |
44 | 45 | |
45 | 46 | let spec_list = [ |
... | ... | @@ -67,6 +68,8 @@ let spec_list = [ |
67 | 68 | "--no_sel_modes", Arg.Unit (fun () -> select_sentence_modes_flag:=false), "Do not select sentence modes (default)"; |
68 | 69 | "--sel_sent", Arg.Unit (fun () -> select_sentences_flag:=true), "Select parsed sentences (default)"; |
69 | 70 | "--no_sel_sent", Arg.Unit (fun () -> select_sentences_flag:=false), "Do not select parsed sentences"; |
71 | + "--sem", Arg.Unit (fun () -> semantic_processing_flag:=true), "Perform semantic processing (default)"; | |
72 | + "--no_sem", Arg.Unit (fun () -> semantic_processing_flag:=false), "Do not perform semantic processing"; | |
70 | 73 | ] |
71 | 74 | |
72 | 75 | let usage_msg = |
... | ... | @@ -103,8 +106,13 @@ let assign_lex_sems proj_map cats_map tokens = |
103 | 106 | let lex_sems = ExtArray.make (ExtArray.size tokens) ENIAMlexSemanticsTypes.empty_lex_sem in |
104 | 107 | let _ = ExtArray.add lex_sems ENIAMlexSemanticsTypes.empty_lex_sem in |
105 | 108 | Int.iter 1 (ExtArray.size tokens - 1) (fun i -> |
109 | + let lemma = ENIAMtokens.get_lemma (ExtArray.get tokens i).token in | |
110 | + let pos = ENIAMtokens.get_pos (ExtArray.get tokens i).token in | |
106 | 111 | let cats = expand_projections proj_map (get_cats cats_map (ExtArray.get tokens i).token) in |
107 | - let lex_sem = {ENIAMlexSemanticsTypes.empty_lex_sem with ENIAMlexSemanticsTypes.cats=cats} in | |
112 | + let frames = | |
113 | + Xlist.rev_map (ENIAMvalence.get_aroles [] lemma pos) (fun (sel,arole,arole_attr,arev) -> | |
114 | + {ENIAMlexSemanticsTypes.empty_frame with ENIAMlexSemanticsTypes.selectors=sel; ENIAMlexSemanticsTypes.arole=arole; ENIAMlexSemanticsTypes.arole_attr=arole_attr; ENIAMlexSemanticsTypes.arev=arev}) in | |
115 | + let lex_sem = {ENIAMlexSemanticsTypes.empty_lex_sem with ENIAMlexSemanticsTypes.cats=cats; ENIAMlexSemanticsTypes.frames=frames} in | |
108 | 116 | let _ = ExtArray.add lex_sems lex_sem in |
109 | 117 | ()); |
110 | 118 | lex_sems |
... | ... | @@ -123,6 +131,7 @@ let rec main_loop sub_in sub_out = |
123 | 131 | let text = ENIAMexec.parse !timeout !verbosity rules dep_rules tokens lex_sems text in |
124 | 132 | let text = if !select_sentence_modes_flag then ENIAMselectSent.select_sentence_modes_text text else text in |
125 | 133 | let text = if !select_sentences_flag then ENIAMselectSent.select_sentences_text ENIAMexecTypes.Struct text else text in |
134 | + let text = if !semantic_processing_flag then ENIAMexec.semantic_processing !verbosity tokens lex_sems text else text in | |
126 | 135 | ENIAMvisualization.print_html_text !output_dir "parsed_text" text !img !verbosity tokens); |
127 | 136 | prerr_endline "Done!"; |
128 | 137 | main_loop sub_in sub_out) |
... | ... |
morphology/resources/alt_supplement.tab
subsyntax/ENIAM_MWE.ml
... | ... | @@ -143,6 +143,31 @@ let get_intnum_orths paths = |
143 | 143 | Dig(lemma,"intnum") -> StringMap.add_inc orths (ENIAMtokens.get_orth t.token) (StringSet.singleton lemma) (fun set -> StringSet.add set lemma) |
144 | 144 | | _ -> orths))) |
145 | 145 | |
146 | +let get_intnum_orths paths = | |
147 | + IntMap.fold paths StringMap.empty (fun orths _ map -> | |
148 | + IntMap.fold map orths (fun orths _ l -> | |
149 | + TokenEnvSet.fold l orths (fun orths t -> | |
150 | + match t.token with | |
151 | + Dig(lemma,"intnum") -> StringMap.add_inc orths (ENIAMtokens.get_orth t.token) (StringSet.singleton lemma) (fun set -> StringSet.add set lemma) | |
152 | + | _ -> orths))) | |
153 | + | |
154 | +let get_year_orths paths = | |
155 | + IntMap.fold paths StringSet.empty (fun orths _ map -> | |
156 | + IntMap.fold map orths (fun orths _ l -> | |
157 | + TokenEnvSet.fold l orths (fun orths t -> | |
158 | + match t.token with | |
159 | + Dig(lemma,"year") -> StringSet.add orths lemma | |
160 | + | _ -> orths))) | |
161 | + | |
162 | +let get_single_letter_orths paths = | |
163 | + IntMap.fold paths StringSet.empty (fun orths _ map -> | |
164 | + IntMap.fold map orths (fun orths _ l -> | |
165 | + TokenEnvSet.fold l orths (fun orths t -> | |
166 | + match t.token with | |
167 | + SmallLetter lemma -> StringSet.add orths lemma | |
168 | + | CapLetter(lemma,_) -> StringSet.add orths lemma | |
169 | + | _ -> orths))) | |
170 | + | |
146 | 171 | let preselect orths lemmas rules l = |
147 | 172 | Xlist.fold l rules (fun rules (match_list,lemma,cat,interp) -> |
148 | 173 | let b = Xlist.fold match_list true (fun b -> function |
... | ... | @@ -172,14 +197,33 @@ let add_ordnum_rules orths rules = |
172 | 197 | let add_quot_rule rules = |
173 | 198 | (false,[I "„x";I "<sentence>"; I "<clause>"],"„","interp",[]) :: rules |
174 | 199 | |
200 | +let add_building_number_rules dig_orths letter_orths rules = | |
201 | + StringSet.fold dig_orths rules (fun rules dig1 -> | |
202 | + let rules = StringSet.fold letter_orths rules (fun rules letter1 -> | |
203 | + (true,[D(dig1,"year");O letter1],dig1^letter1,"building-number",[]) :: rules) in | |
204 | + StringSet.fold dig_orths rules (fun rules dig2 -> | |
205 | + let rules = (true,[D(dig1,"year");O "/";D(dig2,"year")],dig1^"/"^dig2,"building-number",[]) :: rules in | |
206 | + let rules = StringSet.fold letter_orths rules (fun rules letter1 -> | |
207 | + (true,[D(dig1,"year");O letter1;O "/";D(dig2,"year")],dig1^letter1^"/"^dig2,"building-number",[]) :: | |
208 | + (true,[D(dig1,"year");O "/";D(dig2,"year");O letter1],dig1^"/"^dig2^letter1,"building-number",[]) :: rules) in | |
209 | + StringSet.fold dig_orths rules (fun rules dig3 -> | |
210 | + let rules = (true,[D(dig1,"year");O "/";D(dig2,"year");O "/";D(dig3,"year")],dig1^"/"^dig2^"/"^dig3,"building-number",[]) :: rules in | |
211 | + let rules = StringSet.fold letter_orths rules (fun rules letter1 -> | |
212 | + (true,[D(dig1,"year");O letter1;O "/";D(dig2,"year");O "/";D(dig3,"year")],dig1^letter1^"/"^dig2^"/"^dig3,"building-number",[]) :: | |
213 | + (true,[D(dig1,"year");O "/";D(dig2,"year");O letter1;O "/";D(dig3,"year")],dig1^"/"^dig2^letter1^"/"^dig3,"building-number",[]) :: rules) in | |
214 | + rules))) | |
215 | + | |
175 | 216 | let select_rules paths mwe_dict mwe_dict2 = |
176 | 217 | let orths = get_orths paths in |
177 | 218 | let lemmas = get_lemmas paths in |
178 | 219 | let intnum_orths = get_intnum_orths paths in |
220 | + let year_orths = get_year_orths paths in | |
221 | + let letter_orths = get_single_letter_orths paths in | |
179 | 222 | let rules = preselect_dict orths lemmas mwe_dict [] in |
180 | 223 | let rules = preselect_dict2 orths lemmas mwe_dict2 rules in |
181 | 224 | let rules = add_ordnum_rules intnum_orths rules in |
182 | 225 | let rules = add_quot_rule rules in |
226 | + let rules = add_building_number_rules year_orths letter_orths rules in | |
183 | 227 | rules |
184 | 228 | |
185 | 229 | let rec check_interp sels = function |
... | ... | @@ -223,6 +267,8 @@ let rec match_path_rec map found (t:token_env) sels rev = function |
223 | 267 | (new_t,get_sels sels (interp,interp2)) :: found2 else found2) |
224 | 268 | | D(s,cat), Dig(s2,cat2) -> if s=s2 && cat=cat2 then (new_t,sels) :: found2 else found2 |
225 | 269 | | I s, Interp s2 -> if s=s2 then (new_t,sels) :: found2 else found2 |
270 | + (* | SL, SmallLetter _ -> (new_t,sels) :: found | |
271 | + | SL, CapLetter _ -> (new_t,sels) :: found *) | |
226 | 272 | | _ -> found2)) in |
227 | 273 | Xlist.fold found2 found (fun found (new_t,sels) -> match_path_rec map found new_t sels (t :: rev) l) |
228 | 274 | |
... | ... | @@ -240,6 +286,8 @@ let match_path map = function |
240 | 286 | (t,get_sels [] (interp,interp2)) :: found else found) |
241 | 287 | | D(s,cat), Dig(s2,cat2) -> if s=s2 && cat=cat2 then (t,[]) :: found else found |
242 | 288 | | I s, Interp s2 -> if s=s2 then (t,[]) :: found else found |
289 | + (* | SL, SmallLetter _ -> (t,[]) :: found | |
290 | + | SL, CapLetter _ -> (t,[]) :: found *) | |
243 | 291 | | _ -> found))) in |
244 | 292 | Xlist.fold found [] (fun found (t,sels) -> match_path_rec map found t sels [] l) |
245 | 293 | |
... | ... |
subsyntax/ENIAMsubsyntax.ml
... | ... | @@ -92,8 +92,9 @@ let translate_digs paths = |
92 | 92 | | Dig(lemma,"url") -> {t with token=Proper(lemma,"url",[[]],["url"])} |
93 | 93 | | Dig(lemma,"email") -> {t with token=Proper(lemma,"email",[[]],["email"])} |
94 | 94 | | Dig(lemma,"html-tag") -> {t with token=Lemma(lemma,"html-tag",[[]])} |
95 | - | Dig(cat,_) -> failwith ("translate_digs: Dig " ^ cat) | |
96 | - | RomanDig(cat,_) -> failwith ("translate_digs: Romandig " ^ cat) | |
95 | + | Dig(lemma,"list-item") -> {t with token=Lemma(lemma,"list-item",[[]])} | |
96 | + | Dig(lemma,cat) -> failwith ("translate_digs: Dig " ^ cat) | |
97 | + | RomanDig(lemma,cat) -> failwith ("translate_digs: Romandig " ^ cat) | |
97 | 98 | | Compound(cat,_) as t -> failwith ("translate_digs: " ^ ENIAMtokens.string_of_token t) |
98 | 99 | | _ -> t) |
99 | 100 | |
... | ... |
subsyntax/test.ml
... | ... | @@ -37,6 +37,7 @@ let test_strings = [ |
37 | 37 | "Chłopcy mają ulicę kwiatami."; *) |
38 | 38 | (* "„Dialog”"; *) |
39 | 39 | (* "( Głosujmy !)"; *) |
40 | + "Jakie są ceny w obu firmach za a) wymianę płyty głównej; b) wymianę portu HDMI" | |
40 | 41 | ] |
41 | 42 | |
42 | 43 | let test_strings2 = [ |
... | ... | @@ -51,7 +52,7 @@ let test_strings2 = [ |
51 | 52 | "„Dialog”:"; *) |
52 | 53 | (* "- Votare! ( Głosujmy !)"; |
53 | 54 | "( Głosujmy !)"; *) |
54 | - "À propos"; | |
55 | + (* "À propos"; *) | |
55 | 56 | ] |
56 | 57 | |
57 | 58 | let _ = |
... | ... |
tokenizer/ENIAMpatterns.ml
... | ... | @@ -102,7 +102,6 @@ let digit_patterns1 = [ (* FIXME: problem z nadmiarowymi interpretacjami - trzeb |
102 | 102 | [D "hour"; S "."; D "minute"], (function [hour;_;minute] -> Compound("hour-minute",[hour.token;minute.token]) | _ -> failwith "digit_patterns5"); |
103 | 103 | [D "hour"; S ":"; D "minute"], (function [hour;_;minute] -> Compound("hour-minute",[hour.token;minute.token]) | _ -> failwith "digit_patterns6"); |
104 | 104 | [D "intnum"; S ":"; D "intnum"], (function [x;_;y] -> Compound("match-result",[x.token;y.token]) | _ -> failwith "digit_patterns7"); |
105 | - [D "2dig"; S "-"; D "3dig"], (fun tokens -> Proper(concat_orths tokens,"postal-code",[[]],["postal-code"])); | |
106 | 105 | [D "3dig"; S "-"; D "3dig"; S "-"; D "3dig"], (fun tokens -> Proper(concat_orths tokens,"phone-number",[[]],["phone-number"])); |
107 | 106 | [D "3dig"; S " "; D "3dig"; S " "; D "3dig"], (fun tokens -> Proper(concat_orths tokens,"phone-number",[[]],["phone-number"])); |
108 | 107 | [D "3dig"; S "-"; D "2dig"; S "-"; D "2dig"], (fun tokens -> Proper(concat_orths tokens,"phone-number",[[]],["phone-number"])); |
... | ... | @@ -123,7 +122,7 @@ let digit_patterns1 = [ (* FIXME: problem z nadmiarowymi interpretacjami - trzeb |
123 | 122 | [O "0"; S "-"; D "3dig"; S "-"; D "2dig"; S "-"; D "3dig"], (fun tokens -> Proper(concat_orths tokens,"phone-number",[[]],["phone-number"])); |
124 | 123 | [D "3dig"; S " "; D "3dig"; S " "; D "2dig"; S " "; D "2dig"], (fun tokens -> Proper(concat_orths tokens,"phone-number",[[]],["phone-number"])); |
125 | 124 | [D "3dig"; S " "; D "3dig"; S " "; D "4dig"], (fun tokens -> Proper(concat_orths tokens,"phone-number",[[]],["phone-number"])); |
126 | - [D "year"; SL], (fun tokens -> Proper(concat_orths tokens,"building-number",[[]],["building-number"])); (* year - bo jest to dodatnia liczba całkowita *) | |
125 | +(* [D "year"; SL], (fun tokens -> Proper(concat_orths tokens,"building-number",[[]],["building-number"])); (* year - bo jest to dodatnia liczba całkowita *) | |
127 | 126 | [D "year"; S " "; SL2], (fun tokens -> Proper(concat_orths tokens,"building-number",[[]],["building-number"])); (* year - bo jest to dodatnia liczba całkowita *) |
128 | 127 | [D "year"; SL; S "/"; D "year"], (fun tokens -> Proper(concat_orths tokens,"building-number",[[]],["building-number"])); (* year - bo jest to dodatnia liczba całkowita *) |
129 | 128 | [D "year"; S "/"; D "year"], (fun tokens -> Proper(concat_orths tokens,"building-number",[[]],["building-number"])); (* year - bo jest to dodatnia liczba całkowita *) |
... | ... | @@ -132,8 +131,9 @@ let digit_patterns1 = [ (* FIXME: problem z nadmiarowymi interpretacjami - trzeb |
132 | 131 | [D "year"; SL; S "/"; D "year"; S "/"; D "year"], (fun tokens -> Proper(concat_orths tokens,"building-number",[[]],["building-number"])); (* year - bo jest to dodatnia liczba całkowita *) |
133 | 132 | [D "year"; S "/"; D "year"; S "/"; D "year"], (fun tokens -> Proper(concat_orths tokens,"building-number",[[]],["building-number"])); (* year - bo jest to dodatnia liczba całkowita *) |
134 | 133 | [D "year"; S "/"; D "year"; SL; S "/"; D "year"], (fun tokens -> Proper(concat_orths tokens,"building-number",[[]],["building-number"])); (* year - bo jest to dodatnia liczba całkowita *) |
135 | - [D "year"; SL; S "/"; D "year"; SL; S "/"; D "year"], (fun tokens -> Proper(concat_orths tokens,"building-number",[[]],["building-number"])); (* year - bo jest to dodatnia liczba całkowita *) | |
134 | + [D "year"; SL; S "/"; D "year"; SL; S "/"; D "year"], (fun tokens -> Proper(concat_orths tokens,"building-number",[[]],["building-number"])); (* year - bo jest to dodatnia liczba całkowita *)*) | |
136 | 135 | [SL; S ")"], (fun tokens -> Dig(concat_orths tokens,"list-item")); |
136 | + [D "intnum"; S "."; D "dig"], (function [x;_;y] -> Dig(dig_value x ^ "," ^ dig_value y,"realnum") | _ -> failwith "digit_patterns8"); | |
137 | 137 | ] (* bez 1 i *2 *3 *4 mamy rec *) (* w morfeuszu zawsze num:pl?*) |
138 | 138 | |
139 | 139 | let digit_patterns2 = [ |
... | ... | @@ -165,6 +165,7 @@ let compose_ordnum_lemma t interp = |
165 | 165 | let digit_patterns3 = [ |
166 | 166 | [S "-"; D "intnum"], (function [_;x] -> Dig("-" ^ dig_value x,"intnum") | _ -> failwith "digit_patterns10"); |
167 | 167 | [S "-"; D "realnum"], (function [_;x] -> Dig("-" ^ dig_value x,"realnum") | _ -> failwith "digit_patterns10"); |
168 | + [D "2dig"; S "-"; D "3dig"], (fun tokens -> Proper(concat_orths tokens,"postal-code",[[]],["postal-code"])); | |
168 | 169 | [D "intnum"; S "-"; D "intnum"], (function [x;_;y] -> Compound("intnum-interval",[x.token;y.token]) | _ -> failwith "digit_patterns11"); |
169 | 170 | [D "realnum"; S "-"; D "realnum"], (function [x;_;y] -> Compound("realnum-interval",[x.token;y.token]) | _ -> failwith "digit_patterns12"); (* FIXME: konflikt z liczbami ujemnymi *) |
170 | 171 | [D "intnum"; S "-"; D "realnum"], (function [x;_;y] -> Compound("realnum-interval",[x.token;y.token]) | _ -> failwith "digit_patterns12"); (* FIXME: konflikt z liczbami ujemnymi *) |
... | ... | @@ -526,7 +527,7 @@ let match_token = function |
526 | 527 | | CL, AllCap _ -> true |
527 | 528 | | CL, SomeCap _ -> true |
528 | 529 | | SL, SmallLetter _ -> true |
529 | - | SL2, SmallLetter x -> x <> "o" && x <> "w" (* FIXME !!! *) | |
530 | + (* | SL2, SmallLetter x -> x <> "o" && x <> "w" (* FIXME !!! *) *) | |
530 | 531 | | SL, CapLetter _ -> true |
531 | 532 | | I pat, Interp s -> pat = s |
532 | 533 | | _ -> false |
... | ... | @@ -745,5 +746,11 @@ let rec set_next_id n = function |
745 | 746 | let rec remove_spaces rev = function |
746 | 747 | [] -> List.rev rev |
747 | 748 | | x :: Token{token=Symbol " "; next=n} :: l -> remove_spaces rev ((set_next_id n x) :: l) |
749 | + | x :: Token{token=Symbol "\t"; next=n} :: l -> remove_spaces rev ((set_next_id n x) :: l) | |
750 | + | x :: Token{token=Symbol "\n"; next=n} :: l -> remove_spaces rev ((set_next_id n x) :: l) | |
751 | + | x :: Token{token=Symbol "\r"; next=n} :: l -> remove_spaces rev ((set_next_id n x) :: l) | |
748 | 752 | | Token{token=Symbol " "} :: l -> remove_spaces rev l |
753 | + | Token{token=Symbol "\t"} :: l -> remove_spaces rev l | |
754 | + | Token{token=Symbol "\n"} :: l -> remove_spaces rev l | |
755 | + | Token{token=Symbol "\r"} :: l -> remove_spaces rev l | |
749 | 756 | | x :: l -> remove_spaces (x :: rev) l |
... | ... |
tokenizer/ENIAMtokenizerTypes.ml
... | ... | @@ -66,7 +66,7 @@ type tokens = |
66 | 66 | | Variant of tokens list |
67 | 67 | | Seq of tokens list |
68 | 68 | |
69 | -type pat = L | CL | SL | SL2 | D of string | C of string | S of string | RD of string | O of string | I of string | |
69 | +type pat = L | CL | SL | (*SL2 |*) D of string | C of string | S of string | RD of string | O of string | I of string | |
70 | 70 | |
71 | 71 | let empty_token_env = { |
72 | 72 | orth="";corr_orth="";beg=0;len=0;next=0; token=Symbol ""; attrs=[]; weight=0.} |
... | ... |
tokenizer/ENIAMtokens.ml
... | ... | @@ -958,6 +958,7 @@ let rec recognize_sign_group poss_s_beg i = function |
958 | 958 | let t,i = create_empty_sign_token i [Sign "»"] in |
959 | 959 | Variant[Token{t with token=Interp "»"};Token{t with token=Interp "»s"}],i,l,poss_s_beg |
960 | 960 | | (Sign "<") :: (Sign "<") :: l -> create_sign_token poss_s_beg i [Sign "<";Sign "<"] l (Interp "«") (* prawy cudzysłów *) |
961 | + | (Sign "<") :: (Digit "3") :: l -> create_sign_token poss_s_beg i [Sign "<";Sign "3"] l (make_lemma ("<3","sinterj")) | |
961 | 962 | | (Sign "<") :: l -> (* prawy cudzysłów i element wzoru matematycznego *) |
962 | 963 | let t,i = create_empty_sign_token i [Sign "<"] in |
963 | 964 | Variant[Token{t with token=Interp "«"};Token{t with token=Symbol "<"}],i,l,poss_s_beg |
... | ... | @@ -1014,6 +1015,9 @@ let rec recognize_sign_group poss_s_beg i = function |
1014 | 1015 | | (Sign "²") :: l -> create_sign_token poss_s_beg i [Sign "²"] l (Symbol "²") |
1015 | 1016 | | (Sign "°") :: l -> create_sign_token poss_s_beg i [Sign "°"] l (make_lemma ("stopień","subst:_:_:m3")) |
1016 | 1017 | | (Sign "§") :: l -> create_sign_token false i [Sign "§"] l (make_lemma ("paragraf","subst:_:_:m3")) |
1018 | + | (Sign "\t") :: l -> create_sign_token poss_s_beg i [Sign "\t"] l (Symbol "\t") | |
1019 | + | (Sign "\r") :: l -> create_sign_token poss_s_beg i [Sign "\r"] l (Symbol "\r") | |
1020 | + | (Sign "\n") :: l -> create_sign_token poss_s_beg i [Sign "\n"] l (Symbol "\n") | |
1017 | 1021 | | (Sign s) :: l -> print_endline ("recognize_sign_group: " ^ s); create_sign_token poss_s_beg i [Sign s] l (Symbol s) |
1018 | 1022 | | l -> failwith "recognize_sign_group" |
1019 | 1023 | |
... | ... |