Commit 02cc66bd1d330f18f407fb8e98b1d6921399acac

Authored by Wojciech Jaworski
1 parent 78e8cc50

Poprawki w przetwarzaniu nazwisk obcych z SGJP

morphology/data/fonetics.dic
... ... @@ -175,8 +175,16 @@ aux {die}dʲi die ε
175 175 aux {bee}b′e bee ε
176 176 aux {bee}b′i bee ε
177 177  
178   -#lemma=e gender=m1
  178 +#lemma=e/ndm gender=m1
  179 +aux {chais}še chais ε
  180 +aux {lais}le lais ε
  181 +aux {nais}ne nais ε
  182 +aux {rès}re rès ε
  183 +aux {rés}re rés ε
  184 +aux {ré}re ré ε
179 185 aux {mée}me mée ε
  186 +
  187 +#lemma=e gender=m1
180 188 aux {ge}g′e ge ε
181 189 aux {ke}k′e ke ε
182 190  
... ... @@ -197,13 +205,9 @@ aux {cy}sʲi cy ε
197 205 aux {şi}sʲi şi ε
198 206 aux {thy}tʲi thy ε
199 207  
200   -#lemma=e/ndm gender=m1
201   -aux {chais}še chais ε
202   -aux {lais}le lais ε
203   -aux {nais}ne nais ε
204   -aux {rès}re rès ε
205   -aux {rés}re rés ε
206   -aux {ré}re ré ε
  208 +aux {ni}n′ ni iε
  209 +aux {ri}rʲ ri iε
  210 +aux {ny}n′ ny iε
207 211  
208 212 #lemma=ε
209 213 aux {de}d de ε
... ... @@ -243,6 +247,7 @@ aux k′i ky ε
243 247 aux k′i kij ε
244 248 aux k′i koj ε
245 249 aux k′i kyj ε
  250 +aux k′i kie ε
246 251 s s k′i
247 252 aux sk′i szky ε
248 253 l l i
... ... @@ -278,6 +283,7 @@ aux d dh ε
278 283 f f ε
279 284 aux f phe ε
280 285 aux f ph ε
  286 +aux f fe ε
281 287 g g ε
282 288 aux g gue ε
283 289 aux g gues ε
... ... @@ -313,6 +319,7 @@ aux t thes ε
313 319 aux t th ε
314 320 aux t the ε
315 321 aux t tt ε
  322 +aux {v}v ve ε
316 323  
317 324 e e j
318 325 aux ej ey ε
... ... @@ -607,10 +614,18 @@ aux {bee}b′e bee α
607 614 aux {bee}b′i bee β
608 615  
609 616 #lemma=e gender=m1
610   -aux {mée}me mée α
611 617 aux {ge}g′e ge α
612 618 aux {ke}k′e ke α
613 619  
  620 +#lemma=e/ndm gender=m1
  621 +aux {chais}še chais ’
  622 +aux {lais}le lais ’
  623 +aux {nais}ne nais ’
  624 +aux {rès}re rès ’
  625 +aux {rés}re rés ’
  626 +aux {ré}re ré ά
  627 +aux {mée}me mée α
  628 +
614 629 #lemma=y gender=m1
615 630 aux {by}b′ by ’eά
616 631 aux {by}b′i by β
... ... @@ -654,13 +669,13 @@ aux {thy}tʲ thy eά
654 669 aux {thy}tʲ thy owieε
655 670 aux {thy}tʲi thy β
656 671  
657   -#lemma=e/ndm gender=m1
658   -aux {chais}še chais ’
659   -aux {lais}le lais ’
660   -aux {nais}ne nais ’
661   -aux {rès}re rès ’
662   -aux {rés}re rés ’
663   -aux {ré}re ré ά
  672 +aux {ni}n′ ni iβ
  673 +aux {ni}n′ ni eά
  674 +aux {ri}rʲ ri eά
  675 +aux {ri}rʲi ri iβ
  676 +aux {ny}n′ ny iβ
  677 +aux {ny}n′ nyi eά
  678 +aux {ny}n′ nyi owieε
664 679  
665 680 #lemma=ε
666 681 aux {de}d de ’
... ... @@ -794,6 +809,7 @@ acro {X}ks X
794 809 acro {Y}y Y
795 810 acro {Z}z Z
796 811 acro {Ż}ž Ż
  812 +acro {Ż}žet Ż
797 813 acro {J}jot J-ot δ
798 814 acro {Z}zet Z-et δ
799 815 acro {Ż}žet Ż-et δ
... ...
morphology/dict.ml
... ... @@ -231,8 +231,14 @@ let epsilon_lemmata = StringSet.of_list [
231 231 ""; ""; ""; ""; ""; ""; ""; ""; "";
232 232 ]
233 233  
234   -let e_lemmata = StringSet.of_list [
235   - "Barrès"; "Beaumarchais"; "Marchais"; "Montesquieu"; "Rabelais"; "Resnais"; "Richelieu"; ""; "";
(* Lemmata for which [get_lemma_suf] returns the "e/y" suffix class.
   The empty string is a member of the original set (it came from unfilled
   placeholder slots) and is kept so that membership tests are unchanged. *)
let e_y_lemmata =
  StringSet.of_list [
    "Montesquieu";
    "Richelieu";
    "Rushdie";
    "Muskie";
    "Gillespie";
    "Depardieu";
    "Christie";
    "Carnegie";
    "Bandtkie";
    "Barrie";
    "";
  ]
  239 +
(* Lemmata for which [get_lemma_suf] returns the "e/ndm" suffix class.
   The empty string is a member of the original set (it came from unfilled
   placeholder slots) and is kept so that membership tests are unchanged. *)
let e_ndm_lemmata =
  StringSet.of_list [
    "Barrès";
    "Beaumarchais";
    "Marchais";
    "Rabelais";
    "Resnais";
    "Mérimée";
    "";
  ]
... ... @@ -247,7 +253,8 @@ let y_lemmata = StringSet.of_list [
247 253 let get_lemma_suf lemma =
248 254 let lemma = Stem.simplify_lemma lemma in
249 255 if StringSet.mem epsilon_lemmata lemma then "ε" else
250   - if StringSet.mem e_lemmata lemma then "e" else
  256 + if StringSet.mem e_ndm_lemmata lemma then "e/ndm" else
  257 + if StringSet.mem e_y_lemmata lemma then "e/y" else
251 258 if StringSet.mem y_lemmata lemma then "y" else
252 259 let lemma_suf =
253 260 if lemma = "" then "" else
... ... @@ -397,10 +404,25 @@ let merge_interps lemma forms =
397 404 | "o",["subst:sg:loc:m1";"subst:sg:voc:m1"] ->
398 405 if orth_suf = "e" then {empty_form with orth=orth; interp="subst:sg:loc.voc:m1"} :: forms
399 406 else {empty_form with orth=orth; interp="subst:sg:loc:m1"} :: {empty_form with orth=orth; interp="subst:sg:voc:m1"} :: forms
  407 + | "e/ndm",["depr:pl:nom.acc.voc:m2";"subst:pl:dat:m1";"subst:pl:gen.acc:m1";"subst:pl:inst:m1";"subst:pl:loc:m1";"subst:pl:nom.voc:m1";"subst:sg:nom:m1";"subst:sg:voc:m1"] ->
  408 + {empty_form with orth=orth; interp="subst:sg:nom.voc:m1|depr:pl:nom.acc.voc:m2"} :: {empty_form with orth=orth; interp="subst:pl:nom.gen.dat.acc.inst.loc.voc:m1"} :: forms
  409 + | "e/ndm",["depr:pl:nom.acc.voc:m2";"subst:pl:dat:m1";"subst:pl:gen.acc:m1";"subst:pl:inst:m1";"subst:pl:loc:m1";"subst:pl:nom.voc:m1";"subst:sg.pl:nom.gen.dat.acc.inst.loc.voc:m1";"subst:sg:nom:m1";"subst:sg:voc:m1"] ->
  410 + {empty_form with orth=orth; interp="subst:sg:nom.voc:m1|depr:pl:nom.acc.voc:m2"} :: {empty_form with orth=orth; interp="subst:pl:nom.gen.dat.acc.inst.loc.voc:m1"} :: {empty_form with orth=orth; interp="subst:sg.pl:nom.gen.dat.acc.inst.loc.voc:m1"} :: forms
  411 + | "e/ndm",["subst:sg:gen.acc:m1";"subst:sg:gen:m1"] -> {empty_form with orth=orth; interp="subst:sg:gen.acc:m1"} :: forms
  412 + | "e/ndm",["subst:sg:inst:m1";"subst:sg:loc:m1"] -> {empty_form with orth=orth; interp="subst:sg:inst.loc:m1"} :: forms
  413 + | "e/y",["depr:pl:nom.acc.voc:m2";"subst:pl:nom.voc:m1";"subst:sg:nom:m1";"subst:sg:voc:m1"] ->
  414 + {empty_form with orth=orth; interp="subst:pl:nom.voc:m1"} :: {empty_form with orth=orth; interp="subst:sg:nom.voc:m1|depr:pl:nom.acc.voc:m2"} :: forms
  415 + | "e/y",["depr:pl:nom.acc.voc:m2";"subst:pl:nom.voc:m1";"subst:sg.pl:nom.gen.dat.acc.inst.loc.voc:m1";"subst:sg:nom:m1";"subst:sg:voc:m1"] ->
  416 + {empty_form with orth=orth; interp="subst:pl:nom.voc:m1"} :: {empty_form with orth=orth; interp="subst:sg:nom.voc:m1|depr:pl:nom.acc.voc:m2"} :: {empty_form with orth=orth; interp="subst:sg.pl:nom.gen.dat.acc.inst.loc.voc:m1"} :: forms
  417 + | "e/y",["depr:pl:nom.acc.voc:m2";"subst:sg:nom:m1";"subst:sg:voc:m1"] -> {empty_form with orth=orth; interp="subst:sg:nom.voc:m1|depr:pl:nom.acc.voc:m2"} :: forms
  418 + | "e/y",["subst:pl:dat:m1";"subst:sg:inst:m1";"subst:sg:loc:m1"] -> {empty_form with orth=orth; interp="subst:sg:inst.loc:m1|subst:pl:dat:m1"} :: forms
  419 + | "e/y",["subst:pl:gen.acc:m1";"subst:pl:loc:m1"] -> {empty_form with orth=orth; interp="subst:pl:gen.acc.loc:m1"} :: forms
  420 + | "e/y",["subst:sg:gen.acc:m1";"subst:sg:gen:m1"] -> {empty_form with orth=orth; interp="subst:sg:gen.acc:m1"} :: forms
  421 + | "e/y",["subst:sg:inst:m1";"subst:sg:loc:m1"] -> {empty_form with orth=orth; interp="subst:sg:inst.loc:m1"} :: forms
400 422 | _,["depr:pl:nom.acc.voc:m2";"subst:sg.pl:nom.gen.dat.acc.inst.loc.voc:m1"] -> {empty_form with orth=orth; interp="subst:sg.pl:nom.gen.dat.acc.inst.loc.voc:m1|depr:pl:nom.acc.voc:m2"} :: forms
401 423 | _,[interp] -> {empty_form with orth=orth; interp=interp} :: forms
402 424 | _,interps ->
403   - (* print_endline ("merge_interps: " ^ lemma_suf ^ " " ^ orth ^ " [\"" ^ String.concat "\";\"" interps ^ "\"]"); *)
  425 + (* print_endline ("merge_interps: " ^ lemma_suf ^ (*" " ^ orth ^*) " [\"" ^ String.concat "\";\"" interps ^ "\"]"); *)
404 426 Xlist.fold interps forms (fun forms interp ->
405 427 {empty_form with orth=orth; interp=interp} :: forms))
406 428  
... ... @@ -428,6 +450,36 @@ let process_interps dict =
428 450 {entry with aspect=aspect; forms=forms} else
429 451 {entry with forms=merge_interps entry.lemma entry.forms})
430 452  
(* Rewrites a dot-separated gender specification: each component is
   translated independently and the result is joined with "." again.
   "p1" becomes "m1"; "n1", "n2", "p2" and "p3" become "n"; every other
   component is passed through unchanged. *)
let process_polimorf_gender g =
  let translate = function
    | "p1" -> "m1"
    | "n1" | "n2" | "p2" | "p3" -> "n"
    | other -> other in
  let components = Xstring.split "\\." g in
  String.concat "." (Xlist.map components translate)
  461 +
(* Rewrites the interpretation tag of every form of every entry.

   - ger / ppas / pact (6 fields), praet (4 or 5 fields) and adj (5 fields):
     the gender field goes through [process_polimorf_gender].
   - a bare "adv" becomes "adv:pos".
   - a 4-field subst whose gender is n1/n2/p1/p2/p3 gets its gender replaced
     and an extra trailing field appended (col / ncol / pt).
   - any other tag is left untouched.

   Raises [Failure "process_polimorf_interps"] for a praet tag with an
   unexpected number of fields.

   Both the entry list and each form list are built with [Xlist.rev_map],
   exactly as in the original. *)
let process_polimorf_interps dict =
  let join = String.concat ":" in
  let translate interp =
    match Xstring.split ":" interp with
    | [("ger" | "ppas" | "pact") as pos; n; c; g; a; ac] ->
        join [pos; n; c; process_polimorf_gender g; a; ac]
    | ["praet"; n; g; a] -> join ["praet"; n; process_polimorf_gender g; a]
    | ["praet"; n; g; a; ac] -> join ["praet"; n; process_polimorf_gender g; a; ac]
    | "praet" :: _ -> failwith "process_polimorf_interps"
    | ["adj"; n; c; g; gr] -> join ["adj"; n; c; process_polimorf_gender g; gr]
    | ["adv"] -> "adv:pos"
    | ["subst"; n; c; "n1"] -> join ["subst"; n; c; "n"; "col"]
    | ["subst"; n; c; "n2"] -> join ["subst"; n; c; "n"; "ncol"]
    | ["subst"; n; c; "p1"] -> join ["subst"; n; c; "m1"; "pt"]
    | ["subst"; n; c; ("p2" | "p3")] -> join ["subst"; n; c; "n"; "pt"]
    | _ -> interp in
  Xlist.rev_map dict (fun entry ->
    let forms = Xlist.rev_map entry.forms (fun f ->
      {f with interp = translate f.interp}) in
    {entry with forms = forms})
  482 +
431 483 (**********************************************************************************)
432 484  
433 485 (*let mark_ndm dict =
... ... @@ -741,6 +793,81 @@ let validate_interp rules dict =
741 793 if candidates = [] then ((*printf "validate_interp: %s\t%s\t%s\n" form.orth entry.lemma form.interp;*) {form with validated=false}) else {form with validated=true}) in
742 794 {entry with forms=forms})
743 795  
(* Map from an interpretation string to the set of interpretation strings
   associated with it. Each of the three keys below is associated with
   itself only. Within this view it is referenced only from the
   commented-out code in [validate_interp_translate]. *)
let interp_translation =
  let add map key value =
    StringMap.add_inc map key (StringSet.singleton value) (fun set -> StringSet.add set value) in
  Xlist.fold [
    "subst:sg.pl:nom.gen.dat.acc.inst.loc.voc:m3";
    "subst:sg.pl:nom.gen.dat.acc.inst.loc.voc:f";
    "subst:sg.pl:nom.gen.dat.acc.inst.loc.voc:n:ncol";
  ] StringMap.empty (fun map interp -> add map interp interp)
  801 +
(* Map from an interpretation string (key) to a set of interpretation strings
   (values). The table is written grouped by value: every pair
   [(target, sources)] adds [target] to the set stored under each element of
   [sources]. The resulting key -> set association is the same as the one
   produced by the flat (key, value) listing this replaces. Within this view
   it is referenced only from the commented-out code in
   [validate_interp_translate]. *)
let neg_interp_translation =
  let adj = "adj:sg.pl:nom.gen.dat.acc.inst.loc.voc:m1.m2.m3.f.n:pos" in
  let adj_adja = "adj:sg.pl:nom.gen.dat.acc.inst.loc.voc:m1.m2.m3.f.n:pos|adja" in
  let n_pt = "subst:pl:nom.gen.dat.acc.inst.loc.voc:n:pt" in
  let m1_pt = "subst:pl:nom.gen.dat.acc.inst.loc.voc:m1:pt" in
  let f = "subst:sg.pl:nom.gen.dat.acc.inst.loc.voc:f" in
  let m1 = "subst:sg.pl:nom.gen.dat.acc.inst.loc.voc:m1" in
  let m1_depr = "subst:sg.pl:nom.gen.dat.acc.inst.loc.voc:m1|depr:pl:nom.acc.voc:m2" in
  let m2 = "subst:sg.pl:nom.gen.dat.acc.inst.loc.voc:m2" in
  let m3 = "subst:sg.pl:nom.gen.dat.acc.inst.loc.voc:m3" in
  let n_ncol = "subst:sg.pl:nom.gen.dat.acc.inst.loc.voc:n:ncol" in
  let sg_gen_f = "subst:sg:gen:f" in
  let pl_nom_acc_voc_f = "subst:pl:nom.acc.voc:f" in
  let add map key value =
    StringMap.add_inc map key (StringSet.singleton value) (fun set -> StringSet.add set value) in
  Xlist.fold [
    m1_pt, [adj; adj_adja; n_pt; f; m1; m1_depr; m2; m3; n_ncol];
    m3, [adj; adj_adja; n_pt; f; m1; m1_depr; m2; n_ncol];
    f, [adj; adj_adja; n_pt; m1; m1_depr; m2; m3; n_ncol;
        "subst:sg:nom.acc:m3"; "subst:sg:nom:m1"];
    n_ncol, [adj; adj_adja; n_pt; f; m1; m1_depr; m2; m3];
    sg_gen_f, [pl_nom_acc_voc_f];
    pl_nom_acc_voc_f, [sg_gen_f];
  ] StringMap.empty (fun map (target, sources) ->
    Xlist.fold sources map (fun map source -> add map source target))
  841 +
(* Expands a compact interpretation string into a flat list of strings.
   The input is split on "|" into alternatives; each alternative is split on
   ":" into fields and each field on "." into its listed values. The value
   lists are combined with [Xlist.multiply_list] (presumably the cartesian
   product of the per-field choices -- TODO confirm against Xlist) and every
   combination is joined back with ":". *)
let expand_interp interp =
  let expand_alternative alternative =
    let fields = Xstring.split ":" alternative in
    let choices = Xlist.map fields (Xstring.split "\\.") in
    let combinations = Xlist.multiply_list choices in
    Xlist.map combinations (String.concat ":") in
  let alternatives = Xstring.split "|" interp in
  List.flatten (Xlist.map alternatives expand_alternative)
  845 +
(* [is_subset rule_interps form_interps] expands both compact interpretation
   strings with [expand_interp] and returns true when the intersection of the
   two expanded sets is as large as the expanded form set, i.e. when every
   expanded form interpretation also occurs among the rule's ones.
   Note the direction: it is the *form* set that must be contained in the
   *rule* set. *)
let is_subset rule_interps form_interps =
  let rule_set = StringSet.of_list (expand_interp rule_interps) in
  let form_set = StringSet.of_list (expand_interp form_interps) in
  let shared = StringSet.intersection rule_set form_set in
  StringSet.size shared = StringSet.size form_set
  850 +
(* Sets the [validated] flag of every form of every entry.
   A form is validated when at least one (stem, rule) pair returned by
   [Rules.CharTrees.find rules form.orth] satisfies both:
   - [stem ^ rule.set] equals the simplified lemma of the entry, and
   - [is_subset rule.interp form.interp] holds.
   Entries and forms are rebuilt with [Xlist.rev_map], as in the original. *)
let validate_interp_translate rules dict =
  Xlist.rev_map dict (fun entry ->
    let simple_lemma = Stem.simplify_lemma entry.lemma in
    let forms = Xlist.rev_map entry.forms (fun form ->
      (* Only the existence of a matching candidate matters, so fold to a
         boolean instead of collecting the matching pairs. *)
      let matched = Xlist.fold (Rules.CharTrees.find rules form.orth) false (fun matched (stem,rule) ->
        matched || (stem ^ rule.set = simple_lemma && is_subset rule.interp form.interp)) in
      {form with validated=matched}) in
    (* Disabled experiment, kept for reference: candidates were split into
       positive/negative ones using interp_translation, and rule/form
       interpretation pairs missing from neg_interp_translation were printed. *)
      (* let pos_candidates,neg_candidates = Xlist.fold candidates ([],[]) (fun (pos_candidates,neg_candidates) (stem,rule) ->
        let interps = try StringMap.find interp_translation rule.interp with Not_found -> StringSet.empty in
        if StringSet.mem interps form.interp then
          (stem,rule) :: pos_candidates,neg_candidates else pos_candidates,(stem,rule) :: neg_candidates) in
      if pos_candidates = [] then (
        Xlist.iter neg_candidates (fun (stem,rule) ->
          let neg_interps = try StringMap.find neg_interp_translation rule.interp with Not_found -> StringSet.empty in
          if StringSet.mem neg_interps form.interp then () else
          printf "  \"%s\", \"%s\";\n" rule.interp form.interp);
        {form with validated=false}) else {form with validated=true}) in *)
    {entry with forms=forms})
  870 +
744 871 let phon_validate_interp rules dict =
745 872 Xlist.rev_map dict (fun entry ->
746 873 let entry = create_candidates true rules entry in
... ... @@ -922,11 +1049,19 @@ let generate_interp_rules rules interp_rules selected_tags path filename rules_f
922 1049 fst (Rules.RuleQMap.fold freq_rules (Rules.RuleQMap.empty,1) (fun (freq_rules,i) rule freq ->
923 1050 Rules.RuleQMap.add_val freq_rules {rule with id = "N" ^ string_of_int i} freq, i+1)) *)
924 1051  
925   -let manage_x_lemma stem suffix lemma =
  1052 +(* let manage_x_lemma stem suffix lemma =
926 1053 match suffix with
927 1054 "s" -> if Xstring.check_sufix "x" lemma && Xstring.check_sufix "k" stem then Xstring.cut_sufix "k" stem else stem
928 1055 | "sa" -> if Xstring.check_sufix "xa" lemma && Xstring.check_sufix "k" stem then Xstring.cut_sufix "k" stem else stem
929   - | _ -> stem
  1056 + | _ -> stem *)
  1057 +
(* Walks two lists in parallel and pushes their common leading elements onto
   [rev]. The walk stops at the first position where the heads differ or
   where either list is exhausted. The accumulator is returned as is, i.e.
   the shared prefix ends up in reverse order on top of the initial [rev]. *)
let rec get_longest_common_prefix_rec rev pair =
  match pair with
  | x :: xs, y :: ys when x = y -> get_longest_common_prefix_rec (x :: rev) (xs, ys)
  | _ -> rev
  1061 +
(* Returns the longest common prefix of [a] and [b]. Both strings are first
   turned into lists with [Xunicode.utf8_chars_of_utf8_string] (presumably
   one list element per UTF-8 character -- TODO confirm against Xunicode),
   the lists are compared element by element, and the shared leading
   elements are concatenated back into a string. *)
let get_longest_common_prefix a b =
  let chars_a = Xunicode.utf8_chars_of_utf8_string a in
  let chars_b = Xunicode.utf8_chars_of_utf8_string b in
  let reversed_prefix = get_longest_common_prefix_rec [] (chars_a, chars_b) in
  String.concat "" (List.rev reversed_prefix)
930 1065  
931 1066 let generate_rule_frequencies rules path filename rules_filename =
932 1067 let dict = load_tab (path ^ filename) in
... ... @@ -944,18 +1079,24 @@ let generate_rule_frequencies rules path filename rules_filename =
944 1079 (* print_endline simple_lemma; *)
945 1080 Xlist.fold entry.forms freq_rules (fun freq_rules form ->
946 1081 let candidates = Xlist.fold form.candidates [] (fun candidates (stem,rule,s) ->
947   - let x_stem = manage_x_lemma stem rule.set simple_lemma in
948   - let rule,pref_stem,short_stem = match rule.pref with
949   - "naj" ->
  1082 + (* let x_stem = manage_x_lemma stem rule.set simple_lemma in *)
  1083 + let rule,short_stem = match rule.pref with
  1084 + "naj" -> rule, Xstring.cut_prefix "naj" (Fonetics.get_short_stem "" ("naj" ^ stem) s.mapping)
  1085 + | "n′e" -> {rule with pref="nie"}, Xstring.cut_prefix "nie" (Fonetics.get_short_stem "" ("n′e" ^ stem) s.mapping)
  1086 + | "" -> rule, Fonetics.get_short_stem "" stem s.mapping
  1087 + | _ -> failwith "generate_rule_frequencies" in
  1088 + let short_stem = get_longest_common_prefix short_stem simple_lemma in
  1089 + let pref_stem = rule.pref ^ short_stem in
  1090 + (* "naj" ->
950 1091 let pref_stem = Fonetics.get_short_stem "" ("naj" ^ x_stem) s.mapping in
951 1092 rule, pref_stem, Xstring.cut_prefix "naj" pref_stem
952   - | "ne" ->
  1093 + | "nie" ->
953 1094 let pref_stem = Fonetics.get_short_stem "" ("nie" ^ x_stem) s.mapping in
954 1095 {rule with pref="nie"}, pref_stem, Xstring.cut_prefix "nie" pref_stem
955 1096 | "" ->
956 1097 let pref_stem = Fonetics.get_short_stem "" x_stem s.mapping in
957 1098 rule, pref_stem, pref_stem
958   - | _ -> failwith "generate_rule_frequencies" in
  1099 + | _ -> failwith "generate_rule_frequencies" in *)
959 1100 (* printf "%s %s %s\n%!" simple_lemma stem pref_stem; *)
960 1101 let rule = {rule with
961 1102 find = Xstring.cut_prefix pref_stem form.orth;
... ... @@ -976,41 +1117,62 @@ let generate_rule_frequencies rules path filename rules_filename =
976 1117 if n > min_n then min_n,min_l else
977 1118 min_n, (stem,rule) :: min_l) in
978 1119 let map = Xlist.fold candidates StringMap.empty (fun map (_,r) -> StringMap.add map (string_of_rule r) r) in
979   - match StringMap.fold map [] (fun l s r -> (s,Rules.get_tag r.tags "con",Rules.get_tag r.tags "group",Rules.get_tag r.tags "lemma",r) :: l) with
  1120 + match StringMap.fold map [] (fun l s r -> (s,Rules.get_tag r.tags "con",Rules.get_tag r.tags "flex",Rules.get_tag r.tags "group",Rules.get_tag r.tags "lemma",r) :: l) with
980 1121 (* match Rules.RuleSet.to_list (Xlist.fold candidates Rules.RuleSet.empty (fun set (_,r) -> Rules.RuleSet.add set r)) with *)
981 1122 [] -> freq_rules
982   - | [_,_,_,_,r] -> Rules.RuleQMap.add freq_rules r
983   - | [_,"ʲ",_,_,_;_,"j",_,_,r] -> Rules.RuleQMap.add freq_rules r
984   - | [_,"ʲ",_,_,_;_,"r",_,_,r] -> Rules.RuleQMap.add freq_rules r
985   - | [_,"ʲ",_,_,_;_,"c",_,_,r] -> Rules.RuleQMap.add freq_rules r
986   - | [_,"ʲ",_,_,_;_,"d",_,_,r] -> Rules.RuleQMap.add freq_rules r
987   - | [_,"ʲ",_,_,_;_,"s",_,_,r] -> Rules.RuleQMap.add freq_rules r
988   - | [_,"ʲ",_,_,_;_,"a",_,_,r] -> Rules.RuleQMap.add freq_rules r
989   - | [_,"m′",_,_,_;_,"m",_,_,r] -> Rules.RuleQMap.add freq_rules r
990   - | [_,"b′",_,_,_;_,"b",_,_,r] -> Rules.RuleQMap.add freq_rules r
991   - | [_,"f′",_,_,_;_,"f",_,_,r] -> Rules.RuleQMap.add freq_rules r
992   - | [_,"v′",_,_,_;_,"j",_,_,r] -> Rules.RuleQMap.add freq_rules r
993   - | [_,"z",_,_,_;_,"s",_,_,r] -> Rules.RuleQMap.add freq_rules r
994   - | [_,"c",_,_,_;_,"z",_,_,r] -> Rules.RuleQMap.add freq_rules r
995   - | [_,"p′",_,_,_;_,"p",_,_,r] -> Rules.RuleQMap.add freq_rules r
996   - | [_,"ǯ",_,_,_;_,"ž",_,_,r] -> Rules.RuleQMap.add freq_rules r
997   - | [_,"v′",_,_,_;_,"v",_,_,r] -> Rules.RuleQMap.add freq_rules r
998   - | [_,"g′",_,_,_;_,"g",_,_,r] -> Rules.RuleQMap.add freq_rules r
999   - | [_,"š",_,_,_;_,"x",_,_,r] -> Rules.RuleQMap.add freq_rules r
1000   - | [_,"j",_,_,r;_,"b′",_,_,_] -> Rules.RuleQMap.add freq_rules r
1001   - | [_,"j",_,_,r;_,"g′",_,_,_] -> Rules.RuleQMap.add freq_rules r
1002   - | [_,"j",_,_,r;_,"k",_,_,_] -> Rules.RuleQMap.add freq_rules r
1003   - | [_,"n′",_,_,_;_,"j",_,_,r] -> Rules.RuleQMap.add freq_rules r
1004   - | [_,"ł",_,_,_;_,"v",_,_,r] -> Rules.RuleQMap.add freq_rules r
1005   - | [_,"k′",_,_,_;_,"k",_,_,r] -> Rules.RuleQMap.add freq_rules r
1006   - | [_,"c",_,_,r;_,"k",_,_,_] -> Rules.RuleQMap.add freq_rules r
1007   - | [_,_,_,"ε",_;_,_,_,"e",r] -> Rules.RuleQMap.add freq_rules r
1008   - | [_,_,_,"y",_;_,_,_,"e",r] -> Rules.RuleQMap.add freq_rules r
1009   - (* | [_;"\t\t\tcat=ndm",r] -> Rules.RuleQMap.add freq_rules r
1010   - | [_;_;"\t\t\tcat=ndm",r] -> Rules.RuleQMap.add freq_rules r *)
  1123 + | [_,_,_,_,_,r] -> Rules.RuleQMap.add freq_rules r
  1124 + | [_,"ʲ",_,_,_,_;_,"j",_,_,_,r] -> Rules.RuleQMap.add freq_rules r
  1125 + | [_,"ʲ",_,_,_,_;_,"r",_,_,_,r] -> Rules.RuleQMap.add freq_rules r
  1126 + | [_,"ʲ",_,_,_,_;_,"c",_,_,_,r] -> Rules.RuleQMap.add freq_rules r
  1127 + | [_,"ʲ",_,_,_,_;_,"d",_,_,_,r] -> Rules.RuleQMap.add freq_rules r
  1128 + | [_,"ʲ",_,_,_,_;_,"s",_,_,_,r] -> Rules.RuleQMap.add freq_rules r
  1129 + | [_,"ʲ",_,_,_,_;_,"a",_,_,_,r] -> Rules.RuleQMap.add freq_rules r
  1130 + | [_,"m′",_,_,_,_;_,"m",_,_,_,r] -> Rules.RuleQMap.add freq_rules r
  1131 + | [_,"b′",_,_,_,_;_,"b",_,_,_,r] -> Rules.RuleQMap.add freq_rules r
  1132 + | [_,"f′",_,_,_,_;_,"f",_,_,_,r] -> Rules.RuleQMap.add freq_rules r
  1133 + | [_,"v′",_,_,_,_;_,"j",_,_,_,r] -> Rules.RuleQMap.add freq_rules r
  1134 + | [_,"z",_,_,_,_;_,"s",_,_,_,r] -> Rules.RuleQMap.add freq_rules r
  1135 + | [_,"c",_,_,_,_;_,"z",_,_,_,r] -> Rules.RuleQMap.add freq_rules r
  1136 + | [_,"p′",_,_,_,_;_,"p",_,_,_,r] -> Rules.RuleQMap.add freq_rules r
  1137 + | [_,"ǯ",_,_,_,_;_,"ž",_,_,_,r] -> Rules.RuleQMap.add freq_rules r
  1138 + | [_,"v′",_,_,_,_;_,"v",_,_,_,r] -> Rules.RuleQMap.add freq_rules r
  1139 + | [_,"g′",_,_,_,_;_,"g",_,_,_,r] -> Rules.RuleQMap.add freq_rules r
  1140 + | [_,"š",_,_,_,_;_,"x",_,_,_,r] -> Rules.RuleQMap.add freq_rules r
  1141 + | [_,"j",_,_,_,r;_,"b′",_,_,_,_] -> Rules.RuleQMap.add freq_rules r
  1142 + | [_,"j",_,_,_,r;_,"g′",_,_,_,_] -> Rules.RuleQMap.add freq_rules r
  1143 + | [_,"j",_,_,_,r;_,"k",_,_,_,_] -> Rules.RuleQMap.add freq_rules r
  1144 + | [_,"j",_,_,_,r;_,"f′",_,_,_,_] -> Rules.RuleQMap.add freq_rules r
  1145 + | [_,"n′",_,_,_,_;_,"j",_,_,_,r] -> Rules.RuleQMap.add freq_rules r
  1146 + | [_,"p′",_,_,_,_;_,"j",_,_,_,r] -> Rules.RuleQMap.add freq_rules r
  1147 + | [_,"m′",_,_,_,_;_,"j",_,_,_,r] -> Rules.RuleQMap.add freq_rules r
  1148 + | [_,"k′",_,_,_,_;_,"j",_,_,_,r] -> Rules.RuleQMap.add freq_rules r
  1149 + | [_,"ł",_,_,_,_;_,"v",_,_,_,r] -> Rules.RuleQMap.add freq_rules r
  1150 + | [_,"k′",_,_,_,_;_,"k",_,_,_,r] -> Rules.RuleQMap.add freq_rules r
  1151 + | [_,"c",_,_,_,r;_,"k",_,_,_,_] -> Rules.RuleQMap.add freq_rules r
  1152 + | [_,"ž",_,_,_,_;_,"t",_,_,_,r] -> Rules.RuleQMap.add freq_rules r
  1153 + | [_,"x",_,_,_,r;_,"h",_,_,_,_] -> Rules.RuleQMap.add freq_rules r
  1154 + | [_,_,_,_,"ε",_;_,_,_,_,"e",r] -> Rules.RuleQMap.add freq_rules r
  1155 + | [_,_,_,_,"y",_;_,_,_,_,"e",r] -> Rules.RuleQMap.add freq_rules r
  1156 + | [_,"v′",_,_,_,_;_,"ł",_,_,_,_;_,"v",_,_,_,r] -> Rules.RuleQMap.add freq_rules r
  1157 + | [_,"k′",_,_,_,r;_,"j",_,_,_,_;_,"k",_,_,_,_] -> Rules.RuleQMap.add freq_rules r
  1158 + | [_,"k",_,_,"y",_;_,"k′",_,_,"e",r;_,"k",_,_,"e",_] -> Rules.RuleQMap.add freq_rules r
  1159 + | [_,"p′",_,_,"y",_;_,"p′",_,_,"e",r;_,"j",_,_,"e",_] -> Rules.RuleQMap.add freq_rules r
  1160 + | [_,"k",_,_,"y",_;_,"k′",_,_,"e",r;_,"j",_,_,"e",_;_,"k",_,_,"e",_] -> Rules.RuleQMap.add freq_rules r
  1161 + | [_,_,"ym",_,_,_;_,_,"ym",_,_,_;_,_,"em",_,_,r] -> Rules.RuleQMap.add freq_rules r
  1162 + | [_,_,"ym",_,_,_;_,_,"ym",_,_,_;_,_,"em",_,_,_;_,"v","em",_,_,r] -> Rules.RuleQMap.add freq_rules r
  1163 + | [_,_,"ym",_,_,_;_,_,"em",_,_,r] -> Rules.RuleQMap.add freq_rules r
  1164 + | [_,"ʲ",_,_,_,_;_,"g′",_,_,_,r;_,"g",_,_,_,_] -> Rules.RuleQMap.add freq_rules r
  1165 + | [_,"ʲ",_,_,_,_;_,"ʲ",_,_,_,_;_,"g′",_,_,_,r;_,"g",_,_,_,_] -> Rules.RuleQMap.add freq_rules r
  1166 + | [_,"ʲ",_,_,_,_;_,"ʲ",_,_,_,_;_,"j",_,_,_,_;_,"g′",_,_,_,r;_,"g",_,_,_,_] -> Rules.RuleQMap.add freq_rules r
  1167 + | [_,"ʲ",_,_,_,_;_,"j",_,_,_,_;_,"g′",_,_,_,r;_,"g",_,_,_,_] -> Rules.RuleQMap.add freq_rules r
  1168 + | [_,"ʲ",_,_,_,_;_,"ʲ",_,_,_,_;_,"j",_,_,_,r] -> Rules.RuleQMap.add freq_rules r
  1169 + | [_,_,_,"e",_,_;_,_,_,"a",_,r] -> Rules.RuleQMap.add freq_rules r
  1170 + | [_,_,_,"Jε",_,_;_,_,_,"J",_,r] -> Rules.RuleQMap.add freq_rules r
  1171 + | [_,_,_,"ε",_,_;_,_,_,"n",_,r] -> Rules.RuleQMap.add freq_rules r
  1172 + | [_,_,_,"a",_,_;_,_,_,"Je",_,r] -> Rules.RuleQMap.add freq_rules r
1011 1173 | l ->
1012   - printf "%s %s\n %s\n" form.orth entry.lemma (String.concat "\n " (Xlist.map l (fun (s,_,_,_,_) -> s)));
1013   - let _,_,_,_,r = List.hd l in
  1174 + printf "%s %s\n %s\n" form.orth entry.lemma (String.concat "\n " (Xlist.map l (fun (s,_,_,_,_,_) -> s)));
  1175 + let _,_,_,_,_,r = List.hd l in
1014 1176 Rules.RuleQMap.add freq_rules r)) in
1015 1177 File.file_out rules_filename (fun file ->
1016 1178 Rules.RuleQMap.iter freq_rules (fun rule freq ->
... ...
morphology/generate.ml
... ... @@ -156,7 +156,7 @@ let _ =
156 156 (* test_process_interps results_path adj_sgjp_filename; *)
157 157 (* test_process_interps results_path verb_sgjp_filename; *)
158 158 (* test_process_interps results_path noun_sgjp_filename; *)
159   - (* test_process_interps results_path "interp_not_validated_lang_all.tab"; *)
  159 + (* test_process_interps results_path "lang_all_sgjp-20170730.tab"; *)
160 160 ()
161 161  
162 162  
... ... @@ -451,6 +451,7 @@ let _ =
451 451 (* test_lemmatize "münsterski" "münstersku"; *)
452 452 (* test_lemmatize "würzburski" "würzburskiemu"; *)
453 453 (* test_lemmatize "polje" "poljom"; *)
  454 + (* test_lemmatize "drivie" "drive"; *)
454 455 (* test_lemmatize "" "";
455 456 test_lemmatize "" "";
456 457 test_lemmatize "" ""; *)
... ... @@ -568,7 +569,14 @@ let _ =
568 569 test_interp_lemmatize "Bogorodckij" "Bogorodckiego";
569 570 test_interp_lemmatize "BUW" "BUW-ie";
570 571 test_interp_lemmatize "Bush" "Bushe"; *)
571   - (* test_interp_lemmatize "ensemble" "ensemblowi"; *)
  572 + (* test_interp_lemmatize "ensemble" "ensemblowi";
  573 + test_interp_lemmatize "Anouilh" "Anouilhe";
  574 + test_interp_lemmatize "Bandtkie" "Bandtkimi";*)
  575 + (* test_interp_lemmatize "Jokai" "Jokaiemu"; *)
  576 +(* test_interp_lemmatize "Joszua" "Joszui";
  577 + test_interp_lemmatize "Linde" "Lindymi";
  578 + test_interp_lemmatize "drive" "drivie"; *)
  579 + (* test_interp_lemmatize "FPŻ" "FPŻ-y"; *)
572 580 (*test_interp_lemmatize "" "";
573 581 test_interp_lemmatize "" "";
574 582 test_interp_lemmatize "" "";*)
... ... @@ -598,11 +606,14 @@ let _ =
598 606  
599 607 (* Generowanie złożonych reguł zaopatrzonych we frekwencje *)
600 608 let _ =
601   - (* Dict.generate_rule_frequencies interp_compound_rule_trees results_path adj_sgjp_filename "results/freq_rules-adj.tab"; *)
602   - (* Dict.generate_rule_frequencies interp_compound_rule_trees results_path "verb_sgjp_no_pref.tab" "results/freq_rules-verb.tab"; *)
  609 + (* Dict.generate_rule_frequencies interp_compound_rule_trees results_path "verb_sgjp_no_pref.tab" "results/freq_rules-verb.tab";
  610 + Dict.generate_rule_frequencies interp_compound_rule_trees results_path adj_sgjp_filename "results/freq_rules-adj.tab";
  611 + Dict.generate_rule_frequencies interp_compound_rule_trees results_path adv_sgjp_filename "results/freq_rules-adv.tab";*)
  612 + (* Dict.generate_rule_frequencies interp_compound_rule_trees results_path noun_sgjp_filename "results/freq_rules-noun.tab"; *)
603 613 (* Dict.generate_rule_frequencies interp_compound_rule_trees results_path verb_sgjp_filename "results/freq_rules-verb.tab"; *)
604 614 (* Dict.generate_rule_frequencies interp_compound_rule_trees results_path "lang_all_sgjp-20170730.tab" "results/freq_rules-lang.tab"; *)
605 615 (* Dict.generate_rule_frequencies interp_compound_rule_trees sgjp_path sgjp_filename "results/freq_rules.tab"; *)
  616 + (* Dict.generate_rule_frequencies interp_compound_rule_trees results_path "noun_zmiekczenie.tab" "results/freq_rules-zmiekczenie.tab"; *)
606 617 (* ignore (Sys.command "totem ~/Dokumenty/Inne/gong/gong_05m_00s.ogg"); *)
607 618 ()
608 619  
... ... @@ -618,11 +629,29 @@ let generate_alt rules_filename path filename out_filename =
618 629 let dict = Dict.remove_validated_forms dict in
619 630 Dict.print out_filename dict
620 631  
(* Loads frequency rules from [rules_filename], loads the dictionary at
   [path ^ filename], runs it through the steps below and prints the result
   to [out_filename]:
     merge entries -> rewrite interpretation tags (process_polimorf_interps)
     -> remove the "cond" category -> validate forms against the rules
     -> remove validated forms.
   [Dict.process_interps] and [Dict.mark_ndm] are not part of this pipeline
   (both calls were commented out in the original). *)
let generate_alt_translate rules_filename path filename out_filename =
  let rule_trees = Rules.CharTrees.create (Rules.load_freq_rules rules_filename) in
  Dict.load_tab (path ^ filename)
  |> Dict.merge_entries
  |> Dict.process_polimorf_interps
  |> Dict.remove_cat "cond"
  |> Dict.validate_interp_translate rule_trees
  |> Dict.remove_validated_forms
  |> Dict.print out_filename
  644 +
621 645 (* Walidacja reguł zaopatrzonych we frekwencje/generowanie listy wyjątków *)
622 646 let _ =
623 647 (* generate_alt "results/freq_rules-adj.tab" results_path adj_sgjp_filename "results/alt-adj.tab"; *)
624 648 (* generate_alt "results/freq_rules.tab" sgjp_path sgjp_filename "results/alt.tab"; *)
625 649 (* generate_alt "results/freq_rules-lang.tab" results_path "lang_all_sgjp-20170730.tab" "results/alt-lang.tab"; *)
  650 + (* generate_alt "results/freq_rules.tab" results_path "lang_all_sgjp-20170730.tab" "results/alt-lang.tab"; *)
  651 + (* generate_alt_translate "results/freq_rules.tab" results_path "lang_all_sgjp-20170730.tab" "results/alt-lang-tr.tab"; *)
  652 + (* generate_alt_translate "results/freq_rules.tab" sgjp_path sgjp_filename "results/alt-tr.tab"; *)
  653 + generate_alt_translate "results/freq_rules.tab" sgjp_path polimorf_filename "results/alt-polimorf-tr.tab";
  654 + (* ignore (Sys.command "totem ~/Dokumenty/Inne/gong/gong_05m_00s.ogg"); *)
626 655 ()
627 656  
628 657 (* Generowanie stemów z regułami *)
... ...