Commit 95c86f112d51360e2f641c04f3c359afbb38feb8

Authored by Wojciech Jaworski
1 parent 762b53f4

Poprawki w subsyntax

exec/semparser.ml
... ... @@ -40,6 +40,7 @@ let img = ref 1
40 40 let timeout = ref 30.
41 41 let select_sentence_modes_flag = ref false
42 42 let select_sentences_flag = ref true
  (* Command-line switch for the semantic-processing stage. Defaults to true, is set by
     the --sem / --no_sem options, and is consulted in main_loop before calling
     ENIAMexec.semantic_processing. *)
  43 +let semantic_processing_flag = ref true
43 44 let output_dir = ref "results/"
44 45  
45 46 let spec_list = [
... ... @@ -67,6 +68,8 @@ let spec_list = [
67 68 "--no_sel_modes", Arg.Unit (fun () -> select_sentence_modes_flag:=false), "Do not select sencence modes (default)";
68 69 "--sel_sent", Arg.Unit (fun () -> select_sentences_flag:=true), "Select parsed sentences (default)";
69 70 "--no_sel_sent", Arg.Unit (fun () -> select_sentences_flag:=false), "Do not select parsed sentences";
  71 + "--sem", Arg.Unit (fun () -> semantic_processing_flag:=true), "Perform semantic processing (default)";
  72 + "--no_sem", Arg.Unit (fun () -> semantic_processing_flag:=false), "Do not perforf semantic processing";
70 73 ]
71 74  
72 75 let usage_msg =
... ... @@ -103,8 +106,13 @@ let assign_lex_sems proj_map cats_map tokens =
103 106 let lex_sems = ExtArray.make (ExtArray.size tokens) ENIAMlexSemanticsTypes.empty_lex_sem in
104 107 let _ = ExtArray.add lex_sems ENIAMlexSemanticsTypes.empty_lex_sem in
105 108 Int.iter 1 (ExtArray.size tokens - 1) (fun i ->
  109 + let lemma = ENIAMtokens.get_lemma (ExtArray.get tokens i).token in
  110 + let pos = ENIAMtokens.get_pos (ExtArray.get tokens i).token in
106 111 let cats = expand_projections proj_map (get_cats cats_map (ExtArray.get tokens i).token) in
107   - let lex_sem = {ENIAMlexSemanticsTypes.empty_lex_sem with ENIAMlexSemanticsTypes.cats=cats} in
  112 + let frames =
  113 + Xlist.rev_map (ENIAMvalence.get_aroles [] lemma pos) (fun (sel,arole,arole_attr,arev) ->
  114 + {ENIAMlexSemanticsTypes.empty_frame with ENIAMlexSemanticsTypes.selectors=sel; ENIAMlexSemanticsTypes.arole=arole; ENIAMlexSemanticsTypes.arole_attr=arole_attr; ENIAMlexSemanticsTypes.arev=arev}) in
  115 + let lex_sem = {ENIAMlexSemanticsTypes.empty_lex_sem with ENIAMlexSemanticsTypes.cats=cats; ENIAMlexSemanticsTypes.frames=frames} in
108 116 let _ = ExtArray.add lex_sems lex_sem in
109 117 ());
110 118 lex_sems
... ... @@ -123,6 +131,7 @@ let rec main_loop sub_in sub_out =
123 131 let text = ENIAMexec.parse !timeout !verbosity rules dep_rules tokens lex_sems text in
124 132 let text = if !select_sentence_modes_flag then ENIAMselectSent.select_sentence_modes_text text else text in
125 133 let text = if !select_sentences_flag then ENIAMselectSent.select_sentences_text ENIAMexecTypes.Struct text else text in
  134 + let text = if !semantic_processing_flag then ENIAMexec.semantic_processing !verbosity tokens lex_sems text else text in
126 135 ENIAMvisualization.print_html_text !output_dir "parsed_text" text !img !verbosity tokens);
127 136 prerr_endline "Done!";
128 137 main_loop sub_in sub_out)
... ...
morphology/resources/alt_supplement.tab
... ... @@ -3,4 +3,5 @@ siebie siebie siebie:acc.gen
3 3 sobie siebie siebie:dat.loc
4 4 sobą siebie siebie:inst
5 5 to to pred
  6 +yay yay interj
6 7  
... ...
subsyntax/ENIAM_MWE.ml
... ... @@ -143,6 +143,31 @@ let get_intnum_orths paths =
143 143 Dig(lemma,"intnum") -> StringMap.add_inc orths (ENIAMtokens.get_orth t.token) (StringSet.singleton lemma) (fun set -> StringSet.add set lemma)
144 144 | _ -> orths)))
145 145  
  (* Collects the integer-number tokens found in [paths].
     Folds over the two nested IntMaps and over each TokenEnvSet; for every token of the
     form Dig(lemma,"intnum") it records lemma in a StringSet stored in the resulting
     StringMap under the orth of that token (ENIAMtokens.get_orth). All other tokens
     leave the accumulator unchanged.
     NOTE(review): this hunk adds the definition right below what appears to be an
     identical get_intnum_orths (see the unchanged lines just above); the new copy
     presumably only shadows the old one - confirm and drop one of them. *)
  146 +let get_intnum_orths paths =
  147 + IntMap.fold paths StringMap.empty (fun orths _ map ->
  148 + IntMap.fold map orths (fun orths _ l ->
  149 + TokenEnvSet.fold l orths (fun orths t ->
  150 + match t.token with
  (* presumably add_inc inserts the singleton when the key is new and otherwise applies
     the function to the existing set - verify against StringMap.add_inc *)
  151 + Dig(lemma,"intnum") -> StringMap.add_inc orths (ENIAMtokens.get_orth t.token) (StringSet.singleton lemma) (fun set -> StringSet.add set lemma)
  152 + | _ -> orths)))
  153 +
  (* Collects, as a StringSet, the first component of every Dig(_,"year") token found
     in [paths] (same nested IntMap / TokenEnvSet traversal as get_intnum_orths).
     Note that, despite the name, the value stored is the lemma carried by the Dig
     token, not the result of ENIAMtokens.get_orth. *)
  154 +let get_year_orths paths =
  155 + IntMap.fold paths StringSet.empty (fun orths _ map ->
  156 + IntMap.fold map orths (fun orths _ l ->
  157 + TokenEnvSet.fold l orths (fun orths t ->
  158 + match t.token with
  159 + Dig(lemma,"year") -> StringSet.add orths lemma
  160 + | _ -> orths)))
  161 +
  (* Collects, as a StringSet, the string carried by every SmallLetter token and the
     first component of every CapLetter token found in [paths]. Tokens of any other
     kind are ignored. *)
  162 +let get_single_letter_orths paths =
  163 + IntMap.fold paths StringSet.empty (fun orths _ map ->
  164 + IntMap.fold map orths (fun orths _ l ->
  165 + TokenEnvSet.fold l orths (fun orths t ->
  166 + match t.token with
  167 + SmallLetter lemma -> StringSet.add orths lemma
  168 + | CapLetter(lemma,_) -> StringSet.add orths lemma
  169 + | _ -> orths)))
  170 +
146 171 let preselect orths lemmas rules l =
147 172 Xlist.fold l rules (fun rules (match_list,lemma,cat,interp) ->
148 173 let b = Xlist.fold match_list true (fun b -> function
... ... @@ -172,14 +197,33 @@ let add_ordnum_rules orths rules =
(* Prepends one fixed rule to [rules]: the match list [I "„x"; I "<sentence>"; I "<clause>"]
   yielding "„" with category "interp" and an empty last component.
   NOTE(review): the tuple presumably reads (flag, match_list, lemma, cat, interp); the
   meaning of the leading boolean (false here) is not visible in this hunk - confirm. *)
172 197 let add_quot_rule rules =
173 198 (false,[I "„x";I "<sentence>"; I "<clause>"],"„","interp",[]) :: rules
174 199  
  (* Generates "building-number" rules from the year-category digit strings in
     [dig_orths] and the single letters in [letter_orths], prepending them to [rules].
     Every generated rule has a leading true flag, a match list built from
     D(_,"year"), O letter and O "/" items, a lemma that is the concatenation of the
     matched pieces, and an empty last component. Shapes produced (d = digits, l = letter):
       d l
       d / d      d l / d      d / d l
       d / d / d  d l / d / d  d / d l / d
     The folds over dig_orths are nested three deep, so the number of rules grows with
     the cube of the size of dig_orths (times the size of letter_orths). *)
  200 +let add_building_number_rules dig_orths letter_orths rules =
  201 + StringSet.fold dig_orths rules (fun rules dig1 ->
  (* one number followed directly by a letter *)
  202 + let rules = StringSet.fold letter_orths rules (fun rules letter1 ->
  203 + (true,[D(dig1,"year");O letter1],dig1^letter1,"building-number",[]) :: rules) in
  204 + StringSet.fold dig_orths rules (fun rules dig2 ->
  (* two numbers separated by a slash, optionally with a letter after either number *)
  205 + let rules = (true,[D(dig1,"year");O "/";D(dig2,"year")],dig1^"/"^dig2,"building-number",[]) :: rules in
  206 + let rules = StringSet.fold letter_orths rules (fun rules letter1 ->
  207 + (true,[D(dig1,"year");O letter1;O "/";D(dig2,"year")],dig1^letter1^"/"^dig2,"building-number",[]) ::
  208 + (true,[D(dig1,"year");O "/";D(dig2,"year");O letter1],dig1^"/"^dig2^letter1,"building-number",[]) :: rules) in
  209 + StringSet.fold dig_orths rules (fun rules dig3 ->
  (* three numbers separated by slashes, optionally with a letter after the first or second *)
  210 + let rules = (true,[D(dig1,"year");O "/";D(dig2,"year");O "/";D(dig3,"year")],dig1^"/"^dig2^"/"^dig3,"building-number",[]) :: rules in
  211 + let rules = StringSet.fold letter_orths rules (fun rules letter1 ->
  212 + (true,[D(dig1,"year");O letter1;O "/";D(dig2,"year");O "/";D(dig3,"year")],dig1^letter1^"/"^dig2^"/"^dig3,"building-number",[]) ::
  213 + (true,[D(dig1,"year");O "/";D(dig2,"year");O letter1;O "/";D(dig3,"year")],dig1^"/"^dig2^letter1^"/"^dig3,"building-number",[]) :: rules) in
  214 + rules)))
  215 +
(* Builds the list of rules applicable to [paths].
   First gathers what occurs in the paths (orths, lemmas, intnum orths, year-category
   digit strings and single letters), then accumulates rules in this order: entries
   preselected from [mwe_dict], entries preselected from [mwe_dict2], ordinal-number
   rules, the quotation-mark rule, and finally the building-number rules. Each step
   receives the rules built so far and returns the extended list. *)
175 216 let select_rules paths mwe_dict mwe_dict2 =
176 217 let orths = get_orths paths in
177 218 let lemmas = get_lemmas paths in
178 219 let intnum_orths = get_intnum_orths paths in
  220 + let year_orths = get_year_orths paths in
  221 + let letter_orths = get_single_letter_orths paths in
179 222 let rules = preselect_dict orths lemmas mwe_dict [] in
180 223 let rules = preselect_dict2 orths lemmas mwe_dict2 rules in
181 224 let rules = add_ordnum_rules intnum_orths rules in
182 225 let rules = add_quot_rule rules in
  226 + let rules = add_building_number_rules year_orths letter_orths rules in
183 227 rules
184 228  
185 229 let rec check_interp sels = function
... ... @@ -223,6 +267,8 @@ let rec match_path_rec map found (t:token_env) sels rev = function
223 267 (new_t,get_sels sels (interp,interp2)) :: found2 else found2)
224 268 | D(s,cat), Dig(s2,cat2) -> if s=s2 && cat=cat2 then (new_t,sels) :: found2 else found2
225 269 | I s, Interp s2 -> if s=s2 then (new_t,sels) :: found2 else found2
  270 + (* | SL, SmallLetter _ -> (new_t,sels) :: found
  271 + | SL, CapLetter _ -> (new_t,sels) :: found *)
226 272 | _ -> found2)) in
227 273 Xlist.fold found2 found (fun found (new_t,sels) -> match_path_rec map found new_t sels (t :: rev) l)
228 274  
... ... @@ -240,6 +286,8 @@ let match_path map = function
240 286 (t,get_sels [] (interp,interp2)) :: found else found)
241 287 | D(s,cat), Dig(s2,cat2) -> if s=s2 && cat=cat2 then (t,[]) :: found else found
242 288 | I s, Interp s2 -> if s=s2 then (t,[]) :: found else found
  289 + (* | SL, SmallLetter _ -> (t,[]) :: found
  290 + | SL, CapLetter _ -> (t,[]) :: found *)
243 291 | _ -> found))) in
244 292 Xlist.fold found [] (fun found (t,sels) -> match_path_rec map found t sels [] l)
245 293  
... ...
subsyntax/ENIAMsubsyntax.ml
... ... @@ -92,8 +92,9 @@ let translate_digs paths =
92 92 | Dig(lemma,"url") -> {t with token=Proper(lemma,"url",[[]],["url"])}
93 93 | Dig(lemma,"email") -> {t with token=Proper(lemma,"email",[[]],["email"])}
94 94 | Dig(lemma,"html-tag") -> {t with token=Lemma(lemma,"html-tag",[[]])}
95   - | Dig(cat,_) -> failwith ("translate_digs: Dig " ^ cat)
96   - | RomanDig(cat,_) -> failwith ("translate_digs: Romandig " ^ cat)
  95 + | Dig(lemma,"list-item") -> {t with token=Lemma(lemma,"list-item",[[]])}
  96 + | Dig(lemma,cat) -> failwith ("translate_digs: Dig " ^ cat)
  97 + | RomanDig(lemma,cat) -> failwith ("translate_digs: Romandig " ^ cat)
97 98 | Compound(cat,_) as t -> failwith ("translate_digs: " ^ ENIAMtokens.string_of_token t)
98 99 | _ -> t)
99 100  
... ...
subsyntax/test.ml
... ... @@ -37,6 +37,7 @@ let test_strings = [
37 37 "Chłopcy mają ulicę kwiatami."; *)
38 38 (* "„Dialog”"; *)
39 39 (* "( Głosujmy !)"; *)
  40 + "Jakie są ceny w obu firmach za a) wymianę płyty głównej; b) wymianę portu HDMI"
40 41 ]
41 42  
42 43 let test_strings2 = [
... ... @@ -51,7 +52,7 @@ let test_strings2 = [
51 52 "„Dialog”:"; *)
52 53 (* "- Votare! ( Głosujmy !)";
53 54 "( Głosujmy !)"; *)
54   - "À propos";
  55 + (* "À propos"; *)
55 56 ]
56 57  
57 58 let _ =
... ...
tokenizer/ENIAMpatterns.ml
... ... @@ -102,7 +102,6 @@ let digit_patterns1 = [ (* FIXME: problem z nadmiarowymi interpretacjami - trzeb
102 102 [D "hour"; S "."; D "minute"], (function [hour;_;minute] -> Compound("hour-minute",[hour.token;minute.token]) | _ -> failwith "digit_patterns5");
103 103 [D "hour"; S ":"; D "minute"], (function [hour;_;minute] -> Compound("hour-minute",[hour.token;minute.token]) | _ -> failwith "digit_patterns6");
104 104 [D "intnum"; S ":"; D "intnum"], (function [x;_;y] -> Compound("match-result",[x.token;y.token]) | _ -> failwith "digit_patterns7");
105   - [D "2dig"; S "-"; D "3dig"], (fun tokens -> Proper(concat_orths tokens,"postal-code",[[]],["postal-code"]));
106 105 [D "3dig"; S "-"; D "3dig"; S "-"; D "3dig"], (fun tokens -> Proper(concat_orths tokens,"phone-number",[[]],["phone-number"]));
107 106 [D "3dig"; S " "; D "3dig"; S " "; D "3dig"], (fun tokens -> Proper(concat_orths tokens,"phone-number",[[]],["phone-number"]));
108 107 [D "3dig"; S "-"; D "2dig"; S "-"; D "2dig"], (fun tokens -> Proper(concat_orths tokens,"phone-number",[[]],["phone-number"]));
... ... @@ -123,7 +122,7 @@ let digit_patterns1 = [ (* FIXME: problem z nadmiarowymi interpretacjami - trzeb
123 122 [O "0"; S "-"; D "3dig"; S "-"; D "2dig"; S "-"; D "3dig"], (fun tokens -> Proper(concat_orths tokens,"phone-number",[[]],["phone-number"]));
124 123 [D "3dig"; S " "; D "3dig"; S " "; D "2dig"; S " "; D "2dig"], (fun tokens -> Proper(concat_orths tokens,"phone-number",[[]],["phone-number"]));
125 124 [D "3dig"; S " "; D "3dig"; S " "; D "4dig"], (fun tokens -> Proper(concat_orths tokens,"phone-number",[[]],["phone-number"]));
126   - [D "year"; SL], (fun tokens -> Proper(concat_orths tokens,"building-number",[[]],["building-number"])); (* year - bo jest to dodatnia liczba całkowita *)
  125 +(* [D "year"; SL], (fun tokens -> Proper(concat_orths tokens,"building-number",[[]],["building-number"])); (* year - bo jest to dodatnia liczba całkowita *)
127 126 [D "year"; S " "; SL2], (fun tokens -> Proper(concat_orths tokens,"building-number",[[]],["building-number"])); (* year - bo jest to dodatnia liczba całkowita *)
128 127 [D "year"; SL; S "/"; D "year"], (fun tokens -> Proper(concat_orths tokens,"building-number",[[]],["building-number"])); (* year - bo jest to dodatnia liczba całkowita *)
129 128 [D "year"; S "/"; D "year"], (fun tokens -> Proper(concat_orths tokens,"building-number",[[]],["building-number"])); (* year - bo jest to dodatnia liczba całkowita *)
... ... @@ -132,8 +131,9 @@ let digit_patterns1 = [ (* FIXME: problem z nadmiarowymi interpretacjami - trzeb
132 131 [D "year"; SL; S "/"; D "year"; S "/"; D "year"], (fun tokens -> Proper(concat_orths tokens,"building-number",[[]],["building-number"])); (* year - bo jest to dodatnia liczba całkowita *)
133 132 [D "year"; S "/"; D "year"; S "/"; D "year"], (fun tokens -> Proper(concat_orths tokens,"building-number",[[]],["building-number"])); (* year - bo jest to dodatnia liczba całkowita *)
134 133 [D "year"; S "/"; D "year"; SL; S "/"; D "year"], (fun tokens -> Proper(concat_orths tokens,"building-number",[[]],["building-number"])); (* year - bo jest to dodatnia liczba całkowita *)
135   - [D "year"; SL; S "/"; D "year"; SL; S "/"; D "year"], (fun tokens -> Proper(concat_orths tokens,"building-number",[[]],["building-number"])); (* year - bo jest to dodatnia liczba całkowita *)
  134 + [D "year"; SL; S "/"; D "year"; SL; S "/"; D "year"], (fun tokens -> Proper(concat_orths tokens,"building-number",[[]],["building-number"])); (* year - bo jest to dodatnia liczba całkowita *)*)
136 135 [SL; S ")"], (fun tokens -> Dig(concat_orths tokens,"list-item"));
  136 + [D "intnum"; S "."; D "dig"], (function [x;_;y] -> Dig(dig_value x ^ "," ^ dig_value y,"realnum") | _ -> failwith "digit_patterns8");
137 137 ] (* bez 1 i *2 *3 *4 mamy rec *) (* w morfeuszu zawsze num:pl?*)
138 138  
139 139 let digit_patterns2 = [
... ... @@ -165,6 +165,7 @@ let compose_ordnum_lemma t interp =
165 165 let digit_patterns3 = [
166 166 [S "-"; D "intnum"], (function [_;x] -> Dig("-" ^ dig_value x,"intnum") | _ -> failwith "digit_patterns10");
167 167 [S "-"; D "realnum"], (function [_;x] -> Dig("-" ^ dig_value x,"realnum") | _ -> failwith "digit_patterns10");
  168 + [D "2dig"; S "-"; D "3dig"], (fun tokens -> Proper(concat_orths tokens,"postal-code",[[]],["postal-code"]));
168 169 [D "intnum"; S "-"; D "intnum"], (function [x;_;y] -> Compound("intnum-interval",[x.token;y.token]) | _ -> failwith "digit_patterns11");
169 170 [D "realnum"; S "-"; D "realnum"], (function [x;_;y] -> Compound("realnum-interval",[x.token;y.token]) | _ -> failwith "digit_patterns12"); (* FIXME: konflikt z liczbami ujemnymi *)
170 171 [D "intnum"; S "-"; D "realnum"], (function [x;_;y] -> Compound("realnum-interval",[x.token;y.token]) | _ -> failwith "digit_patterns12"); (* FIXME: konflikt z liczbami ujemnymi *)
... ... @@ -526,7 +527,7 @@ let match_token = function
526 527 | CL, AllCap _ -> true
527 528 | CL, SomeCap _ -> true
528 529 | SL, SmallLetter _ -> true
529   - | SL2, SmallLetter x -> x <> "o" && x <> "w" (* FIXME !!! *)
  530 + (* | SL2, SmallLetter x -> x <> "o" && x <> "w" (* FIXME !!! *) *)
530 531 | SL, CapLetter _ -> true
531 532 | I pat, Interp s -> pat = s
532 533 | _ -> false
... ... @@ -745,5 +746,11 @@ let rec set_next_id n = function
(* Removes whitespace tokens from a token list, accumulating the kept elements in [rev]
   (reversed) and returning them in original order once the input is exhausted.
   A Token whose token field is Symbol " ", "\t", "\n" or "\r" is dropped. When such a
   token directly follows an element x, x is first rewritten with set_next_id using the
   next field of the dropped token and is then put back at the head of the input, so a
   run of several whitespace tokens after x is consumed one at a time.
   The order of the cases matters: the two-element patterns must be tried before the
   single-element ones and before the final catch-all. *)
745 746 let rec remove_spaces rev = function
746 747 [] -> List.rev rev
(* whitespace token preceded by some element x: drop it and re-link x *)
747 748 | x :: Token{token=Symbol " "; next=n} :: l -> remove_spaces rev ((set_next_id n x) :: l)
  749 + | x :: Token{token=Symbol "\t"; next=n} :: l -> remove_spaces rev ((set_next_id n x) :: l)
  750 + | x :: Token{token=Symbol "\n"; next=n} :: l -> remove_spaces rev ((set_next_id n x) :: l)
  751 + | x :: Token{token=Symbol "\r"; next=n} :: l -> remove_spaces rev ((set_next_id n x) :: l)
(* whitespace token at the head of the remaining input: simply drop it *)
748 752 | Token{token=Symbol " "} :: l -> remove_spaces rev l
  753 + | Token{token=Symbol "\t"} :: l -> remove_spaces rev l
  754 + | Token{token=Symbol "\n"} :: l -> remove_spaces rev l
  755 + | Token{token=Symbol "\r"} :: l -> remove_spaces rev l
(* anything else is kept *)
749 756 | x :: l -> remove_spaces (x :: rev) l
... ...
tokenizer/ENIAMtokenizerTypes.ml
... ... @@ -66,7 +66,7 @@ type tokens =
66 66 | Variant of tokens list
67 67 | Seq of tokens list
68 68  
69   -type pat = L | CL | SL | SL2 | D of string | C of string | S of string | RD of string | O of string | I of string
  69 +type pat = L | CL | SL | (*SL2 |*) D of string | C of string | S of string | RD of string | O of string | I of string
70 70  
(* Neutral token_env value: empty orth and corr_orth, beg/len/next all 0, the token
   Symbol "", no attrs and weight 0. *)
71 71 let empty_token_env = {
72 72 orth="";corr_orth="";beg=0;len=0;next=0; token=Symbol ""; attrs=[]; weight=0.}
... ...
tokenizer/ENIAMtokens.ml
... ... @@ -958,6 +958,7 @@ let rec recognize_sign_group poss_s_beg i = function
958 958 let t,i = create_empty_sign_token i [Sign "»"] in
959 959 Variant[Token{t with token=Interp "»"};Token{t with token=Interp "»s"}],i,l,poss_s_beg
960 960 | (Sign "<") :: (Sign "<") :: l -> create_sign_token poss_s_beg i [Sign "<";Sign "<"] l (Interp "«") (* prawy cudzysłów *)
  961 + | (Sign "<") :: (Digit "3") :: l -> create_sign_token poss_s_beg i [Sign "<";Sign "3"] l (make_lemma ("<3","sinterj"))
961 962 | (Sign "<") :: l -> (* prawy cudzysłów i element wzoru matematycznego *)
962 963 let t,i = create_empty_sign_token i [Sign "<"] in
963 964 Variant[Token{t with token=Interp "«"};Token{t with token=Symbol "<"}],i,l,poss_s_beg
... ... @@ -1014,6 +1015,9 @@ let rec recognize_sign_group poss_s_beg i = function
1014 1015 | (Sign "²") :: l -> create_sign_token poss_s_beg i [Sign "²"] l (Symbol "²")
1015 1016 | (Sign "°") :: l -> create_sign_token poss_s_beg i [Sign "°"] l (make_lemma ("stopień","subst:_:_:m3"))
1016 1017 | (Sign "§") :: l -> create_sign_token false i [Sign "§"] l (make_lemma ("paragraf","subst:_:_:m3"))
  1018 + | (Sign "\t") :: l -> create_sign_token poss_s_beg i [Sign "\t"] l (Symbol "\t")
  1019 + | (Sign "\r") :: l -> create_sign_token poss_s_beg i [Sign "\r"] l (Symbol "\r")
  1020 + | (Sign "\n") :: l -> create_sign_token poss_s_beg i [Sign "\n"] l (Symbol "\n")
1017 1021 | (Sign s) :: l -> print_endline ("recognize_sign_group: " ^ s); create_sign_token poss_s_beg i [Sign s] l (Symbol s)
1018 1022 | l -> failwith "recognize_sign_group"
1019 1023  
... ...