Commit 95c86f112d51360e2f641c04f3c359afbb38feb8

Authored by Wojciech Jaworski
1 parent 762b53f4

Poprawki w subsyntax

exec/semparser.ml
@@ -40,6 +40,7 @@ let img = ref 1 @@ -40,6 +40,7 @@ let img = ref 1
40 let timeout = ref 30. 40 let timeout = ref 30.
41 let select_sentence_modes_flag = ref false 41 let select_sentence_modes_flag = ref false
42 let select_sentences_flag = ref true 42 let select_sentences_flag = ref true
  (* Run-time switch, on by default. In this listing it is set by the --sem and
     --no_sem entries of spec_list and read in main_loop, where it decides
     whether ENIAMexec.semantic_processing is applied to the parsed text. *)
  43 +let semantic_processing_flag = ref true
43 let output_dir = ref "results/" 44 let output_dir = ref "results/"
44 45
45 let spec_list = [ 46 let spec_list = [
@@ -67,6 +68,8 @@ let spec_list = [ @@ -67,6 +68,8 @@ let spec_list = [
67 "--no_sel_modes", Arg.Unit (fun () -> select_sentence_modes_flag:=false), "Do not select sencence modes (default)"; 68 "--no_sel_modes", Arg.Unit (fun () -> select_sentence_modes_flag:=false), "Do not select sencence modes (default)";
68 "--sel_sent", Arg.Unit (fun () -> select_sentences_flag:=true), "Select parsed sentences (default)"; 69 "--sel_sent", Arg.Unit (fun () -> select_sentences_flag:=true), "Select parsed sentences (default)";
69 "--no_sel_sent", Arg.Unit (fun () -> select_sentences_flag:=false), "Do not select parsed sentences"; 70 "--no_sel_sent", Arg.Unit (fun () -> select_sentences_flag:=false), "Do not select parsed sentences";
  71 + "--sem", Arg.Unit (fun () -> semantic_processing_flag:=true), "Perform semantic processing (default)";
  72 + "--no_sem", Arg.Unit (fun () -> semantic_processing_flag:=false), "Do not perforf semantic processing";
70 ] 73 ]
71 74
72 let usage_msg = 75 let usage_msg =
@@ -103,8 +106,13 @@ let assign_lex_sems proj_map cats_map tokens = @@ -103,8 +106,13 @@ let assign_lex_sems proj_map cats_map tokens =
103 let lex_sems = ExtArray.make (ExtArray.size tokens) ENIAMlexSemanticsTypes.empty_lex_sem in 106 let lex_sems = ExtArray.make (ExtArray.size tokens) ENIAMlexSemanticsTypes.empty_lex_sem in
104 let _ = ExtArray.add lex_sems ENIAMlexSemanticsTypes.empty_lex_sem in 107 let _ = ExtArray.add lex_sems ENIAMlexSemanticsTypes.empty_lex_sem in
105 Int.iter 1 (ExtArray.size tokens - 1) (fun i -> 108 Int.iter 1 (ExtArray.size tokens - 1) (fun i ->
  109 + let lemma = ENIAMtokens.get_lemma (ExtArray.get tokens i).token in
  110 + let pos = ENIAMtokens.get_pos (ExtArray.get tokens i).token in
106 let cats = expand_projections proj_map (get_cats cats_map (ExtArray.get tokens i).token) in 111 let cats = expand_projections proj_map (get_cats cats_map (ExtArray.get tokens i).token) in
107 - let lex_sem = {ENIAMlexSemanticsTypes.empty_lex_sem with ENIAMlexSemanticsTypes.cats=cats} in 112 + let frames =
  113 + Xlist.rev_map (ENIAMvalence.get_aroles [] lemma pos) (fun (sel,arole,arole_attr,arev) ->
  114 + {ENIAMlexSemanticsTypes.empty_frame with ENIAMlexSemanticsTypes.selectors=sel; ENIAMlexSemanticsTypes.arole=arole; ENIAMlexSemanticsTypes.arole_attr=arole_attr; ENIAMlexSemanticsTypes.arev=arev}) in
  115 + let lex_sem = {ENIAMlexSemanticsTypes.empty_lex_sem with ENIAMlexSemanticsTypes.cats=cats; ENIAMlexSemanticsTypes.frames=frames} in
108 let _ = ExtArray.add lex_sems lex_sem in 116 let _ = ExtArray.add lex_sems lex_sem in
109 ()); 117 ());
110 lex_sems 118 lex_sems
@@ -123,6 +131,7 @@ let rec main_loop sub_in sub_out = @@ -123,6 +131,7 @@ let rec main_loop sub_in sub_out =
123 let text = ENIAMexec.parse !timeout !verbosity rules dep_rules tokens lex_sems text in 131 let text = ENIAMexec.parse !timeout !verbosity rules dep_rules tokens lex_sems text in
124 let text = if !select_sentence_modes_flag then ENIAMselectSent.select_sentence_modes_text text else text in 132 let text = if !select_sentence_modes_flag then ENIAMselectSent.select_sentence_modes_text text else text in
125 let text = if !select_sentences_flag then ENIAMselectSent.select_sentences_text ENIAMexecTypes.Struct text else text in 133 let text = if !select_sentences_flag then ENIAMselectSent.select_sentences_text ENIAMexecTypes.Struct text else text in
  134 + let text = if !semantic_processing_flag then ENIAMexec.semantic_processing !verbosity tokens lex_sems text else text in
126 ENIAMvisualization.print_html_text !output_dir "parsed_text" text !img !verbosity tokens); 135 ENIAMvisualization.print_html_text !output_dir "parsed_text" text !img !verbosity tokens);
127 prerr_endline "Done!"; 136 prerr_endline "Done!";
128 main_loop sub_in sub_out) 137 main_loop sub_in sub_out)
morphology/resources/alt_supplement.tab
@@ -3,4 +3,5 @@ siebie siebie siebie:acc.gen @@ -3,4 +3,5 @@ siebie siebie siebie:acc.gen
3 sobie siebie siebie:dat.loc 3 sobie siebie siebie:dat.loc
4 sobą siebie siebie:inst 4 sobą siebie siebie:inst
5 to to pred 5 to to pred
  6 +yay yay interj
6 7
subsyntax/ENIAM_MWE.ml
@@ -143,6 +143,31 @@ let get_intnum_orths paths = @@ -143,6 +143,31 @@ let get_intnum_orths paths =
143 Dig(lemma,"intnum") -> StringMap.add_inc orths (ENIAMtokens.get_orth t.token) (StringSet.singleton lemma) (fun set -> StringSet.add set lemma) 143 Dig(lemma,"intnum") -> StringMap.add_inc orths (ENIAMtokens.get_orth t.token) (StringSet.singleton lemma) (fun set -> StringSet.add set lemma)
144 | _ -> orths))) 144 | _ -> orths)))
145 145
  (* Walks paths (an IntMap of IntMaps whose leaves are TokenEnvSet values) and,
     for every token of the form Dig(lemma,"intnum"), records lemma under the
     key ENIAMtokens.get_orth t.token in a StringMap of StringSets.
     StringMap.add_inc presumably inserts the singleton when the key is new and
     applies the update function otherwise - TODO confirm against Xstd.
     NOTE(review): this body matches, line for line, the tail of the
     get_intnum_orths definition shown as context directly above this hunk, so
     it looks like an accidental re-definition that only shadows the earlier
     one - confirm and drop one copy. *)
  146 +let get_intnum_orths paths =
  147 + IntMap.fold paths StringMap.empty (fun orths _ map ->
  148 + IntMap.fold map orths (fun orths _ l ->
  149 + TokenEnvSet.fold l orths (fun orths t ->
  150 + match t.token with
  151 + Dig(lemma,"intnum") -> StringMap.add_inc orths (ENIAMtokens.get_orth t.token) (StringSet.singleton lemma) (fun set -> StringSet.add set lemma)
  152 + | _ -> orths)))
  153 +
  (* Returns the StringSet of first components of all Dig(_,"year") tokens found
     anywhere in paths (same two-level IntMap / TokenEnvSet traversal as
     get_intnum_orths). Tokens of any other shape leave the set unchanged.
     Note that, despite the function name and the accumulator name orths, the
     stored value is the lemma field of Dig and not ENIAMtokens.get_orth; this
     is the same field that the D(s,cat) vs Dig(s2,cat2) comparison in
     match_path tests with s=s2. *)
  154 +let get_year_orths paths =
  155 + IntMap.fold paths StringSet.empty (fun orths _ map ->
  156 + IntMap.fold map orths (fun orths _ l ->
  157 + TokenEnvSet.fold l orths (fun orths t ->
  158 + match t.token with
  159 + Dig(lemma,"year") -> StringSet.add orths lemma
  160 + | _ -> orths)))
  161 +
  (* Returns the StringSet of single-letter strings present in paths: the
     argument of every SmallLetter token and the first component of every
     CapLetter token. All other tokens are ignored.
     NOTE(review): which form (upper or lower case) the first component of
     CapLetter holds is not visible here - confirm it is the form that the
     O items built from this set are meant to match. *)
  162 +let get_single_letter_orths paths =
  163 + IntMap.fold paths StringSet.empty (fun orths _ map ->
  164 + IntMap.fold map orths (fun orths _ l ->
  165 + TokenEnvSet.fold l orths (fun orths t ->
  166 + match t.token with
  167 + SmallLetter lemma -> StringSet.add orths lemma
  168 + | CapLetter(lemma,_) -> StringSet.add orths lemma
  169 + | _ -> orths)))
  170 +
146 let preselect orths lemmas rules l = 171 let preselect orths lemmas rules l =
147 Xlist.fold l rules (fun rules (match_list,lemma,cat,interp) -> 172 Xlist.fold l rules (fun rules (match_list,lemma,cat,interp) ->
148 let b = Xlist.fold match_list true (fun b -> function 173 let b = Xlist.fold match_list true (fun b -> function
@@ -172,14 +197,33 @@ let add_ordnum_rules orths rules = @@ -172,14 +197,33 @@ let add_ordnum_rules orths rules =
(* Prepends one fixed rule to rules: a three-item I pattern that yields the
   opening-quote lemma with category interp and an empty interpretation list.
   The leading boolean is false here, whereas add_building_number_rules uses
   true - its meaning is not visible in this listing, TODO confirm. *)
172 let add_quot_rule rules = 197 let add_quot_rule rules =
173 (false,[I "„x";I "<sentence>"; I "<clause>"],"„","interp",[]) :: rules 198 (false,[I "„x";I "<sentence>"; I "<clause>"],"„","interp",[]) :: rules
174 199
  (* Prepends to rules one concrete rule for every building-number shape that
     can be assembled from the given digit strings and letters. Each rule is
     (true, pattern, lemma, "building-number", []) where lemma is the plain
     concatenation of the pattern parts. With d1, d2, d3 ranging over dig_orths
     and x over letter_orths, the generated shapes are:
       d1 x
       d1 / d2          d1 x / d2          d1 / d2 x
       d1 / d2 / d3     d1 x / d2 / d3     d1 / d2 x / d3
     Digits are emitted as D(d,"year") items, so dig_orths has to hold values
     of Dig tokens with category year; letters and the slash are O items.
     The three nested folds over dig_orths mean the number of rules added grows
     with the cube of the size of dig_orths times the size of letter_orths.
     The trailing let rules = ... in rules of the innermost fold just returns
     the accumulated list. *)
  200 +let add_building_number_rules dig_orths letter_orths rules =
  201 + StringSet.fold dig_orths rules (fun rules dig1 ->
  202 + let rules = StringSet.fold letter_orths rules (fun rules letter1 ->
  203 + (true,[D(dig1,"year");O letter1],dig1^letter1,"building-number",[]) :: rules) in
  204 + StringSet.fold dig_orths rules (fun rules dig2 ->
  205 + let rules = (true,[D(dig1,"year");O "/";D(dig2,"year")],dig1^"/"^dig2,"building-number",[]) :: rules in
  206 + let rules = StringSet.fold letter_orths rules (fun rules letter1 ->
  207 + (true,[D(dig1,"year");O letter1;O "/";D(dig2,"year")],dig1^letter1^"/"^dig2,"building-number",[]) ::
  208 + (true,[D(dig1,"year");O "/";D(dig2,"year");O letter1],dig1^"/"^dig2^letter1,"building-number",[]) :: rules) in
  209 + StringSet.fold dig_orths rules (fun rules dig3 ->
  210 + let rules = (true,[D(dig1,"year");O "/";D(dig2,"year");O "/";D(dig3,"year")],dig1^"/"^dig2^"/"^dig3,"building-number",[]) :: rules in
  211 + let rules = StringSet.fold letter_orths rules (fun rules letter1 ->
  212 + (true,[D(dig1,"year");O letter1;O "/";D(dig2,"year");O "/";D(dig3,"year")],dig1^letter1^"/"^dig2^"/"^dig3,"building-number",[]) ::
  213 + (true,[D(dig1,"year");O "/";D(dig2,"year");O letter1;O "/";D(dig3,"year")],dig1^"/"^dig2^letter1^"/"^dig3,"building-number",[]) :: rules) in
  214 + rules)))
  215 +
(* Builds the list of rules to try on paths. It first extracts from paths the
   orths, the lemmas, the intnum orth map, the year digit strings and the
   single letters, then accumulates rules in this order: entries preselected
   from mwe_dict, entries preselected from mwe_dict2, ordinal-number rules,
   the quote rule, and finally the building-number rules generated from the
   year strings and letters. add_quot_rule and add_building_number_rules cons
   onto the list, so their rules end up in front of the preselected ones. *)
175 let select_rules paths mwe_dict mwe_dict2 = 216 let select_rules paths mwe_dict mwe_dict2 =
176 let orths = get_orths paths in 217 let orths = get_orths paths in
177 let lemmas = get_lemmas paths in 218 let lemmas = get_lemmas paths in
178 let intnum_orths = get_intnum_orths paths in 219 let intnum_orths = get_intnum_orths paths in
  220 + let year_orths = get_year_orths paths in
  221 + let letter_orths = get_single_letter_orths paths in
179 let rules = preselect_dict orths lemmas mwe_dict [] in 222 let rules = preselect_dict orths lemmas mwe_dict [] in
180 let rules = preselect_dict2 orths lemmas mwe_dict2 rules in 223 let rules = preselect_dict2 orths lemmas mwe_dict2 rules in
181 let rules = add_ordnum_rules intnum_orths rules in 224 let rules = add_ordnum_rules intnum_orths rules in
182 let rules = add_quot_rule rules in 225 let rules = add_quot_rule rules in
  226 + let rules = add_building_number_rules year_orths letter_orths rules in
183 rules 227 rules
184 228
185 let rec check_interp sels = function 229 let rec check_interp sels = function
@@ -223,6 +267,8 @@ let rec match_path_rec map found (t:token_env) sels rev = function @@ -223,6 +267,8 @@ let rec match_path_rec map found (t:token_env) sels rev = function
223 (new_t,get_sels sels (interp,interp2)) :: found2 else found2) 267 (new_t,get_sels sels (interp,interp2)) :: found2 else found2)
224 | D(s,cat), Dig(s2,cat2) -> if s=s2 && cat=cat2 then (new_t,sels) :: found2 else found2 268 | D(s,cat), Dig(s2,cat2) -> if s=s2 && cat=cat2 then (new_t,sels) :: found2 else found2
225 | I s, Interp s2 -> if s=s2 then (new_t,sels) :: found2 else found2 269 | I s, Interp s2 -> if s=s2 then (new_t,sels) :: found2 else found2
  270 + (* | SL, SmallLetter _ -> (new_t,sels) :: found
  271 + | SL, CapLetter _ -> (new_t,sels) :: found *)
226 | _ -> found2)) in 272 | _ -> found2)) in
227 Xlist.fold found2 found (fun found (new_t,sels) -> match_path_rec map found new_t sels (t :: rev) l) 273 Xlist.fold found2 found (fun found (new_t,sels) -> match_path_rec map found new_t sels (t :: rev) l)
228 274
@@ -240,6 +286,8 @@ let match_path map = function @@ -240,6 +286,8 @@ let match_path map = function
240 (t,get_sels [] (interp,interp2)) :: found else found) 286 (t,get_sels [] (interp,interp2)) :: found else found)
241 | D(s,cat), Dig(s2,cat2) -> if s=s2 && cat=cat2 then (t,[]) :: found else found 287 | D(s,cat), Dig(s2,cat2) -> if s=s2 && cat=cat2 then (t,[]) :: found else found
242 | I s, Interp s2 -> if s=s2 then (t,[]) :: found else found 288 | I s, Interp s2 -> if s=s2 then (t,[]) :: found else found
  289 + (* | SL, SmallLetter _ -> (t,[]) :: found
  290 + | SL, CapLetter _ -> (t,[]) :: found *)
243 | _ -> found))) in 291 | _ -> found))) in
244 Xlist.fold found [] (fun found (t,sels) -> match_path_rec map found t sels [] l) 292 Xlist.fold found [] (fun found (t,sels) -> match_path_rec map found t sels [] l)
245 293
subsyntax/ENIAMsubsyntax.ml
@@ -92,8 +92,9 @@ let translate_digs paths = @@ -92,8 +92,9 @@ let translate_digs paths =
92 | Dig(lemma,"url") -> {t with token=Proper(lemma,"url",[[]],["url"])} 92 | Dig(lemma,"url") -> {t with token=Proper(lemma,"url",[[]],["url"])}
93 | Dig(lemma,"email") -> {t with token=Proper(lemma,"email",[[]],["email"])} 93 | Dig(lemma,"email") -> {t with token=Proper(lemma,"email",[[]],["email"])}
94 | Dig(lemma,"html-tag") -> {t with token=Lemma(lemma,"html-tag",[[]])} 94 | Dig(lemma,"html-tag") -> {t with token=Lemma(lemma,"html-tag",[[]])}
95 - | Dig(cat,_) -> failwith ("translate_digs: Dig " ^ cat)  
96 - | RomanDig(cat,_) -> failwith ("translate_digs: Romandig " ^ cat) 95 + | Dig(lemma,"list-item") -> {t with token=Lemma(lemma,"list-item",[[]])}
  96 + | Dig(lemma,cat) -> failwith ("translate_digs: Dig " ^ cat)
  97 + | RomanDig(lemma,cat) -> failwith ("translate_digs: Romandig " ^ cat)
97 | Compound(cat,_) as t -> failwith ("translate_digs: " ^ ENIAMtokens.string_of_token t) 98 | Compound(cat,_) as t -> failwith ("translate_digs: " ^ ENIAMtokens.string_of_token t)
98 | _ -> t) 99 | _ -> t)
99 100
subsyntax/test.ml
@@ -37,6 +37,7 @@ let test_strings = [ @@ -37,6 +37,7 @@ let test_strings = [
37 "Chłopcy mają ulicę kwiatami."; *) 37 "Chłopcy mają ulicę kwiatami."; *)
38 (* "„Dialog”"; *) 38 (* "„Dialog”"; *)
39 (* "( Głosujmy !)"; *) 39 (* "( Głosujmy !)"; *)
  40 + "Jakie są ceny w obu firmach za a) wymianę płyty głównej; b) wymianę portu HDMI"
40 ] 41 ]
41 42
42 let test_strings2 = [ 43 let test_strings2 = [
@@ -51,7 +52,7 @@ let test_strings2 = [ @@ -51,7 +52,7 @@ let test_strings2 = [
51 "„Dialog”:"; *) 52 "„Dialog”:"; *)
52 (* "- Votare! ( Głosujmy !)"; 53 (* "- Votare! ( Głosujmy !)";
53 "( Głosujmy !)"; *) 54 "( Głosujmy !)"; *)
54 - "À propos"; 55 + (* "À propos"; *)
55 ] 56 ]
56 57
57 let _ = 58 let _ =
tokenizer/ENIAMpatterns.ml
@@ -102,7 +102,6 @@ let digit_patterns1 = [ (* FIXME: problem z nadmiarowymi interpretacjami - trzeb @@ -102,7 +102,6 @@ let digit_patterns1 = [ (* FIXME: problem z nadmiarowymi interpretacjami - trzeb
102 [D "hour"; S "."; D "minute"], (function [hour;_;minute] -> Compound("hour-minute",[hour.token;minute.token]) | _ -> failwith "digit_patterns5"); 102 [D "hour"; S "."; D "minute"], (function [hour;_;minute] -> Compound("hour-minute",[hour.token;minute.token]) | _ -> failwith "digit_patterns5");
103 [D "hour"; S ":"; D "minute"], (function [hour;_;minute] -> Compound("hour-minute",[hour.token;minute.token]) | _ -> failwith "digit_patterns6"); 103 [D "hour"; S ":"; D "minute"], (function [hour;_;minute] -> Compound("hour-minute",[hour.token;minute.token]) | _ -> failwith "digit_patterns6");
104 [D "intnum"; S ":"; D "intnum"], (function [x;_;y] -> Compound("match-result",[x.token;y.token]) | _ -> failwith "digit_patterns7"); 104 [D "intnum"; S ":"; D "intnum"], (function [x;_;y] -> Compound("match-result",[x.token;y.token]) | _ -> failwith "digit_patterns7");
105 - [D "2dig"; S "-"; D "3dig"], (fun tokens -> Proper(concat_orths tokens,"postal-code",[[]],["postal-code"]));  
106 [D "3dig"; S "-"; D "3dig"; S "-"; D "3dig"], (fun tokens -> Proper(concat_orths tokens,"phone-number",[[]],["phone-number"])); 105 [D "3dig"; S "-"; D "3dig"; S "-"; D "3dig"], (fun tokens -> Proper(concat_orths tokens,"phone-number",[[]],["phone-number"]));
107 [D "3dig"; S " "; D "3dig"; S " "; D "3dig"], (fun tokens -> Proper(concat_orths tokens,"phone-number",[[]],["phone-number"])); 106 [D "3dig"; S " "; D "3dig"; S " "; D "3dig"], (fun tokens -> Proper(concat_orths tokens,"phone-number",[[]],["phone-number"]));
108 [D "3dig"; S "-"; D "2dig"; S "-"; D "2dig"], (fun tokens -> Proper(concat_orths tokens,"phone-number",[[]],["phone-number"])); 107 [D "3dig"; S "-"; D "2dig"; S "-"; D "2dig"], (fun tokens -> Proper(concat_orths tokens,"phone-number",[[]],["phone-number"]));
@@ -123,7 +122,7 @@ let digit_patterns1 = [ (* FIXME: problem z nadmiarowymi interpretacjami - trzeb @@ -123,7 +122,7 @@ let digit_patterns1 = [ (* FIXME: problem z nadmiarowymi interpretacjami - trzeb
123 [O "0"; S "-"; D "3dig"; S "-"; D "2dig"; S "-"; D "3dig"], (fun tokens -> Proper(concat_orths tokens,"phone-number",[[]],["phone-number"])); 122 [O "0"; S "-"; D "3dig"; S "-"; D "2dig"; S "-"; D "3dig"], (fun tokens -> Proper(concat_orths tokens,"phone-number",[[]],["phone-number"]));
124 [D "3dig"; S " "; D "3dig"; S " "; D "2dig"; S " "; D "2dig"], (fun tokens -> Proper(concat_orths tokens,"phone-number",[[]],["phone-number"])); 123 [D "3dig"; S " "; D "3dig"; S " "; D "2dig"; S " "; D "2dig"], (fun tokens -> Proper(concat_orths tokens,"phone-number",[[]],["phone-number"]));
125 [D "3dig"; S " "; D "3dig"; S " "; D "4dig"], (fun tokens -> Proper(concat_orths tokens,"phone-number",[[]],["phone-number"])); 124 [D "3dig"; S " "; D "3dig"; S " "; D "4dig"], (fun tokens -> Proper(concat_orths tokens,"phone-number",[[]],["phone-number"]));
126 - [D "year"; SL], (fun tokens -> Proper(concat_orths tokens,"building-number",[[]],["building-number"])); (* year - bo jest to dodatnia liczba całkowita *) 125 +(* [D "year"; SL], (fun tokens -> Proper(concat_orths tokens,"building-number",[[]],["building-number"])); (* year - bo jest to dodatnia liczba całkowita *)
127 [D "year"; S " "; SL2], (fun tokens -> Proper(concat_orths tokens,"building-number",[[]],["building-number"])); (* year - bo jest to dodatnia liczba całkowita *) 126 [D "year"; S " "; SL2], (fun tokens -> Proper(concat_orths tokens,"building-number",[[]],["building-number"])); (* year - bo jest to dodatnia liczba całkowita *)
128 [D "year"; SL; S "/"; D "year"], (fun tokens -> Proper(concat_orths tokens,"building-number",[[]],["building-number"])); (* year - bo jest to dodatnia liczba całkowita *) 127 [D "year"; SL; S "/"; D "year"], (fun tokens -> Proper(concat_orths tokens,"building-number",[[]],["building-number"])); (* year - bo jest to dodatnia liczba całkowita *)
129 [D "year"; S "/"; D "year"], (fun tokens -> Proper(concat_orths tokens,"building-number",[[]],["building-number"])); (* year - bo jest to dodatnia liczba całkowita *) 128 [D "year"; S "/"; D "year"], (fun tokens -> Proper(concat_orths tokens,"building-number",[[]],["building-number"])); (* year - bo jest to dodatnia liczba całkowita *)
@@ -132,8 +131,9 @@ let digit_patterns1 = [ (* FIXME: problem z nadmiarowymi interpretacjami - trzeb @@ -132,8 +131,9 @@ let digit_patterns1 = [ (* FIXME: problem z nadmiarowymi interpretacjami - trzeb
132 [D "year"; SL; S "/"; D "year"; S "/"; D "year"], (fun tokens -> Proper(concat_orths tokens,"building-number",[[]],["building-number"])); (* year - bo jest to dodatnia liczba całkowita *) 131 [D "year"; SL; S "/"; D "year"; S "/"; D "year"], (fun tokens -> Proper(concat_orths tokens,"building-number",[[]],["building-number"])); (* year - bo jest to dodatnia liczba całkowita *)
133 [D "year"; S "/"; D "year"; S "/"; D "year"], (fun tokens -> Proper(concat_orths tokens,"building-number",[[]],["building-number"])); (* year - bo jest to dodatnia liczba całkowita *) 132 [D "year"; S "/"; D "year"; S "/"; D "year"], (fun tokens -> Proper(concat_orths tokens,"building-number",[[]],["building-number"])); (* year - bo jest to dodatnia liczba całkowita *)
134 [D "year"; S "/"; D "year"; SL; S "/"; D "year"], (fun tokens -> Proper(concat_orths tokens,"building-number",[[]],["building-number"])); (* year - bo jest to dodatnia liczba całkowita *) 133 [D "year"; S "/"; D "year"; SL; S "/"; D "year"], (fun tokens -> Proper(concat_orths tokens,"building-number",[[]],["building-number"])); (* year - bo jest to dodatnia liczba całkowita *)
135 - [D "year"; SL; S "/"; D "year"; SL; S "/"; D "year"], (fun tokens -> Proper(concat_orths tokens,"building-number",[[]],["building-number"])); (* year - bo jest to dodatnia liczba całkowita *) 134 + [D "year"; SL; S "/"; D "year"; SL; S "/"; D "year"], (fun tokens -> Proper(concat_orths tokens,"building-number",[[]],["building-number"])); (* year - bo jest to dodatnia liczba całkowita *)*)
136 [SL; S ")"], (fun tokens -> Dig(concat_orths tokens,"list-item")); 135 [SL; S ")"], (fun tokens -> Dig(concat_orths tokens,"list-item"));
  136 + [D "intnum"; S "."; D "dig"], (function [x;_;y] -> Dig(dig_value x ^ "," ^ dig_value y,"realnum") | _ -> failwith "digit_patterns8");
137 ] (* bez 1 i *2 *3 *4 mamy rec *) (* w morfeuszu zawsze num:pl?*) 137 ] (* bez 1 i *2 *3 *4 mamy rec *) (* w morfeuszu zawsze num:pl?*)
138 138
139 let digit_patterns2 = [ 139 let digit_patterns2 = [
@@ -165,6 +165,7 @@ let compose_ordnum_lemma t interp = @@ -165,6 +165,7 @@ let compose_ordnum_lemma t interp =
165 let digit_patterns3 = [ 165 let digit_patterns3 = [
166 [S "-"; D "intnum"], (function [_;x] -> Dig("-" ^ dig_value x,"intnum") | _ -> failwith "digit_patterns10"); 166 [S "-"; D "intnum"], (function [_;x] -> Dig("-" ^ dig_value x,"intnum") | _ -> failwith "digit_patterns10");
167 [S "-"; D "realnum"], (function [_;x] -> Dig("-" ^ dig_value x,"realnum") | _ -> failwith "digit_patterns10"); 167 [S "-"; D "realnum"], (function [_;x] -> Dig("-" ^ dig_value x,"realnum") | _ -> failwith "digit_patterns10");
  168 + [D "2dig"; S "-"; D "3dig"], (fun tokens -> Proper(concat_orths tokens,"postal-code",[[]],["postal-code"]));
168 [D "intnum"; S "-"; D "intnum"], (function [x;_;y] -> Compound("intnum-interval",[x.token;y.token]) | _ -> failwith "digit_patterns11"); 169 [D "intnum"; S "-"; D "intnum"], (function [x;_;y] -> Compound("intnum-interval",[x.token;y.token]) | _ -> failwith "digit_patterns11");
169 [D "realnum"; S "-"; D "realnum"], (function [x;_;y] -> Compound("realnum-interval",[x.token;y.token]) | _ -> failwith "digit_patterns12"); (* FIXME: konflikt z liczbami ujemnymi *) 170 [D "realnum"; S "-"; D "realnum"], (function [x;_;y] -> Compound("realnum-interval",[x.token;y.token]) | _ -> failwith "digit_patterns12"); (* FIXME: konflikt z liczbami ujemnymi *)
170 [D "intnum"; S "-"; D "realnum"], (function [x;_;y] -> Compound("realnum-interval",[x.token;y.token]) | _ -> failwith "digit_patterns12"); (* FIXME: konflikt z liczbami ujemnymi *) 171 [D "intnum"; S "-"; D "realnum"], (function [x;_;y] -> Compound("realnum-interval",[x.token;y.token]) | _ -> failwith "digit_patterns12"); (* FIXME: konflikt z liczbami ujemnymi *)
@@ -526,7 +527,7 @@ let match_token = function @@ -526,7 +527,7 @@ let match_token = function
526 | CL, AllCap _ -> true 527 | CL, AllCap _ -> true
527 | CL, SomeCap _ -> true 528 | CL, SomeCap _ -> true
528 | SL, SmallLetter _ -> true 529 | SL, SmallLetter _ -> true
529 - | SL2, SmallLetter x -> x <> "o" && x <> "w" (* FIXME !!! *) 530 + (* | SL2, SmallLetter x -> x <> "o" && x <> "w" (* FIXME !!! *) *)
530 | SL, CapLetter _ -> true 531 | SL, CapLetter _ -> true
531 | I pat, Interp s -> pat = s 532 | I pat, Interp s -> pat = s
532 | _ -> false 533 | _ -> false
@@ -745,5 +746,11 @@ let rec set_next_id n = function @@ -745,5 +746,11 @@ let rec set_next_id n = function
(* Removes whitespace Symbol tokens (space, tab, newline, carriage return) from
   a token list, accumulating the kept elements in rev and returning them in
   their original order.
   - When an element x is directly followed by a whitespace Token, the
     whitespace is dropped and x is replaced by set_next_id n x, where n is the
     next field of the dropped token; the updated x is put back at the head of
     the input, so a run of several whitespace tokens after x is absorbed one
     at a time.
   - A whitespace Token at the head with nothing before it to patch is simply
     discarded.
   - Anything else is kept unchanged.
   Only elements built with the Token constructor are recognised as
   whitespace; other constructors always fall through to the last case. *)
745 let rec remove_spaces rev = function 746 let rec remove_spaces rev = function
746 [] -> List.rev rev 747 [] -> List.rev rev
747 | x :: Token{token=Symbol " "; next=n} :: l -> remove_spaces rev ((set_next_id n x) :: l) 748 | x :: Token{token=Symbol " "; next=n} :: l -> remove_spaces rev ((set_next_id n x) :: l)
  749 + | x :: Token{token=Symbol "\t"; next=n} :: l -> remove_spaces rev ((set_next_id n x) :: l)
  750 + | x :: Token{token=Symbol "\n"; next=n} :: l -> remove_spaces rev ((set_next_id n x) :: l)
  751 + | x :: Token{token=Symbol "\r"; next=n} :: l -> remove_spaces rev ((set_next_id n x) :: l)
748 | Token{token=Symbol " "} :: l -> remove_spaces rev l 752 | Token{token=Symbol " "} :: l -> remove_spaces rev l
  753 + | Token{token=Symbol "\t"} :: l -> remove_spaces rev l
  754 + | Token{token=Symbol "\n"} :: l -> remove_spaces rev l
  755 + | Token{token=Symbol "\r"} :: l -> remove_spaces rev l
749 | x :: l -> remove_spaces (x :: rev) l 756 | x :: l -> remove_spaces (x :: rev) l
tokenizer/ENIAMtokenizerTypes.ml
@@ -66,7 +66,7 @@ type tokens = @@ -66,7 +66,7 @@ type tokens =
66 | Variant of tokens list 66 | Variant of tokens list
67 | Seq of tokens list 67 | Seq of tokens list
68 68
(* Pattern items used by the tokenizer rules. This change comments the SL2
   constructor out of the type. In this listing the SL2 case of match_token is
   commented out as well, and the only remaining SL2 use in digit_patterns1
   lies inside the commented-out building-number block.
   NOTE(review): confirm no other module still refers to SL2. *)
69 -type pat = L | CL | SL | SL2 | D of string | C of string | S of string | RD of string | O of string | I of string 69 +type pat = L | CL | SL | (*SL2 |*) D of string | C of string | S of string | RD of string | O of string | I of string
70 70
71 let empty_token_env = { 71 let empty_token_env = {
72 orth="";corr_orth="";beg=0;len=0;next=0; token=Symbol ""; attrs=[]; weight=0.} 72 orth="";corr_orth="";beg=0;len=0;next=0; token=Symbol ""; attrs=[]; weight=0.}
tokenizer/ENIAMtokens.ml
@@ -958,6 +958,7 @@ let rec recognize_sign_group poss_s_beg i = function @@ -958,6 +958,7 @@ let rec recognize_sign_group poss_s_beg i = function
958 let t,i = create_empty_sign_token i [Sign "»"] in 958 let t,i = create_empty_sign_token i [Sign "»"] in
959 Variant[Token{t with token=Interp "»"};Token{t with token=Interp "»s"}],i,l,poss_s_beg 959 Variant[Token{t with token=Interp "»"};Token{t with token=Interp "»s"}],i,l,poss_s_beg
960 | (Sign "<") :: (Sign "<") :: l -> create_sign_token poss_s_beg i [Sign "<";Sign "<"] l (Interp "«") (* prawy cudzysłów *) 960 | (Sign "<") :: (Sign "<") :: l -> create_sign_token poss_s_beg i [Sign "<";Sign "<"] l (Interp "«") (* prawy cudzysłów *)
  961 + | (Sign "<") :: (Digit "3") :: l -> create_sign_token poss_s_beg i [Sign "<";Sign "3"] l (make_lemma ("<3","sinterj"))
961 | (Sign "<") :: l -> (* prawy cudzysłów i element wzoru matematycznego *) 962 | (Sign "<") :: l -> (* prawy cudzysłów i element wzoru matematycznego *)
962 let t,i = create_empty_sign_token i [Sign "<"] in 963 let t,i = create_empty_sign_token i [Sign "<"] in
963 Variant[Token{t with token=Interp "«"};Token{t with token=Symbol "<"}],i,l,poss_s_beg 964 Variant[Token{t with token=Interp "«"};Token{t with token=Symbol "<"}],i,l,poss_s_beg
@@ -1014,6 +1015,9 @@ let rec recognize_sign_group poss_s_beg i = function @@ -1014,6 +1015,9 @@ let rec recognize_sign_group poss_s_beg i = function
1014 | (Sign "²") :: l -> create_sign_token poss_s_beg i [Sign "²"] l (Symbol "²") 1015 | (Sign "²") :: l -> create_sign_token poss_s_beg i [Sign "²"] l (Symbol "²")
1015 | (Sign "°") :: l -> create_sign_token poss_s_beg i [Sign "°"] l (make_lemma ("stopień","subst:_:_:m3")) 1016 | (Sign "°") :: l -> create_sign_token poss_s_beg i [Sign "°"] l (make_lemma ("stopień","subst:_:_:m3"))
1016 | (Sign "§") :: l -> create_sign_token false i [Sign "§"] l (make_lemma ("paragraf","subst:_:_:m3")) 1017 | (Sign "§") :: l -> create_sign_token false i [Sign "§"] l (make_lemma ("paragraf","subst:_:_:m3"))
  1018 + | (Sign "\t") :: l -> create_sign_token poss_s_beg i [Sign "\t"] l (Symbol "\t")
  1019 + | (Sign "\r") :: l -> create_sign_token poss_s_beg i [Sign "\r"] l (Symbol "\r")
  1020 + | (Sign "\n") :: l -> create_sign_token poss_s_beg i [Sign "\n"] l (Symbol "\n")
1017 | (Sign s) :: l -> print_endline ("recognize_sign_group: " ^ s); create_sign_token poss_s_beg i [Sign s] l (Symbol s) 1021 | (Sign s) :: l -> print_endline ("recognize_sign_group: " ^ s); create_sign_token poss_s_beg i [Sign s] l (Symbol s)
1018 | l -> failwith "recognize_sign_group" 1022 | l -> failwith "recognize_sign_group"
1019 1023