Commit 7028da65ef43a51d60dbb2f01b7ea7d961d0bf7a

Authored by Wojciech Jaworski
1 parent 74683cab

algorytm selekcji lematów

Showing 59 changed files with 266391 additions and 138 deletions

Too many changes to show.

To preserve performance only 13 of 59 files are displayed.

LCGlexicon/ENIAM_LCGlexicon.ml
... ... @@ -380,3 +380,9 @@ let create_entries rules id orth cats valence lex_entries =
380 380 let rules = make_term id orth rules in
381 381 (* print_endline "create_entries 5"; *)
382 382 rules @ l)
  383 +
  384 +let initialize () =
  385 + ENIAMcategoriesPL.initialize ();
  386 + let filenames = [rules_filename; user_lexicon_filename] @ Xlist.map (!theories_paths) (fun path -> path ^ "/lexicon.dic") in
  387 + rules := make_rules_list false filenames;
  388 + dep_rules := make_rules_list true filenames
... ...
LCGlexicon/ENIAM_LCGlexiconTypes.ml
... ... @@ -17,6 +17,8 @@
17 17 * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 18 *)
19 19  
  20 +open Xstd
  21 +
20 22 type categories = {lemma: string; pos: string; pos2: string;
21 23 cat: string; coerced: string list; roles: string list; snode: string list; phrase: string list;
22 24 numbers: string list; cases: string list; genders: string list; persons: string list;
... ... @@ -126,3 +128,7 @@ let subst_time_lexemes_filename = resource_path ^ "/LCGlexicon/subst_time.dat"
126 128  
127 129 let adv_modes_filename = resource_path ^ "/Walenty/adv_modes.tab"
128 130 let num_nsems_filename = resource_path ^ "/LCGlexicon/num.tab"
  131 +
  132 +let rules = ref (StringMap.empty : (entry list StringMap.t * entry list) StringMap.t)
  133 +let dep_rules = ref (StringMap.empty : (entry list StringMap.t * entry list) StringMap.t)
  134 +let theories_paths = ref ([] : string list)
... ...
LCGlexicon/ENIAMcategoriesPL.ml
... ... @@ -29,7 +29,7 @@ let all_persons = ["pri";"sec";"ter"]
29 29  
30 30 let all_phrases = [
31 31 "np";"adjp";"advp";"infp";"ip";
32   - "prepnp";"cp";"ncp";"prepncp";"padvp";"colonp";"mp";"intp";
  32 + "prepnp";"cp";"ncp";"prepncp";"padvp";"colonp";"mp";"intp";"admod";
33 33 "adja";"prepadjp";"comparp";"xp";"xpnom";"xpgen";"symbol";"fixed";
34 34 "s";"<root>";"<sentence>";"<paragraph>";(*"";"";"";"";"";"";"";"";*)]
35 35  
... ...
LCGlexicon/resources/lexicon-pl.dic
... ... @@ -36,9 +36,6 @@ measure_weight=1
36 36  
37 37 @LEXICON
38 38  
39   -pos=ordnum,phrase=adjp:
40   - adjp*number*case*gender*grad*coerced*role*node;
41   -
42 39 #oznaczenia godzin i minut
43 40 pos=hour-minute|hour,phrase=np:
44 41 QUANT[number=sg,case=nom&gen&dat&acc&inst&loc,gender=f,person=ter,role=0]
... ... @@ -211,24 +208,6 @@ pos=subst,case=gen,nsyn=pronoun,cat!=Measure|imię|nazwisko,phrase=np:
211 208 {distant-schema}{\np*unumber*case*ugender*uperson*Measure*Measure*concept,
212 209 schema};
213 210  
214   -# liczebniki
215   -#pos=num|intnum|realnum|intnum-interval|realnum-interval,nsem=count:
216   -# QUANT[role=0]
217   -# num*number*case*gender*person*acm*nsem*role*node; # FIXME: jak usunięcie Phrase ProNG wpływa na pokrycie?
218   -#pos=num|intnum|realnum|intnum-interval|realnum-interval,nsem=mass:
219   -# QUANT[role=0]
220   -# num*number*case*gender*person*acm*nsem*role*node; # FIXME: jak usunięcie Phrase ProNG wpływa na pokrycie?
221   -pos=num|intnum|realnum|intnum-interval|realnum-interval,nsem=count,phrase=np:
222   - QUANT[cat=Number,role=Count]
223   - num*number*case*gender*person*acm*nsem*role*node|(1+fixed*T*OpAdNum*Mod*concept);
224   -pos=num|intnum|realnum|intnum-interval|realnum-interval,nsem=mass,phrase=np:
225   - QUANT[cat=Amount,role=Amount]
226   - num*number*case*gender*person*acm*nsem*role*node|(1+fixed*T*OpAdNum*Mod*concept);
227   -
228   -lemma=jeden,pos=adj,grad=pos,phrase=np:
229   - QUANT[person=all_persons,acm=congr,nsem=count,role=Count]
230   - num*number*case*gender*person*acm*nsem*role*node;
231   -
232 211 # pojemniki
233 212 pos=subst,cat=Measure:
234 213 np*number*case*gender*person*cat*role*node
... ... @@ -240,6 +219,8 @@ pos=subst,case=gen,cat=Measure:
240 219 {distant-schema}{\num*number*case*gender*person*rec*count*T*concept}
241 220 {schema}{local-schema}: measure_weight; # UWAGA: number "sg" i gender "n", żeby uzgadniać z podmiotem czasownika
242 221  
  222 +pos=subst,phrase=xp:
  223 + xp*coerced*role*node{distant-schema}{schema}{local-schema};
243 224  
244 225 #frazy przymiotnikowe
245 226 pos=adj|adjc|adjp,phrase=adjp:
... ... @@ -284,15 +265,15 @@ lemma=po,pos=prep,phrase=prepadjp:
284 265 prepadjp*lemma*case*cat*role*node
285 266 {/adjp*T*case*T*T*cat*CORE*node+adjp*sg*dat*m1*T*cat*CORE*node};
286 267  
287   -lemma=za|zbyt|niezbyt,pos=prep,phrase=adjp:
288   - QUANT[cat=0,number=all_numbers,case=all_cases,gender=all_genders,grad=pos]
289   - adjp*number*case*gender*grad*cat*role*node
290   - {/adjp*number*case*gender*grad*cat*Arg*node};
  268 +#lemma=za|zbyt|niezbyt,pos=x,phrase=adjp:
  269 +# QUANT[cat=0,coerced=0,number=all_numbers,case=all_cases,gender=all_genders,grad=pos]
  270 +# adjp*number*case*gender*grad*cat*role*node
  271 +# {/adjp*number*case*gender*grad*cat*Arg*node};
291 272  
292   -lemma=jak,pos=x,phrase=adjp:
293   - QUANT[cat=0,coerced=0,number=all_numbers,case=all_cases,gender=all_genders]
294   - adjp*number*case*gender*sup*cat*role*node
295   - {/adjp*number*case*gender*sup*cat*Arg*node};
  273 +#lemma=jak,pos=x,phrase=adjp:
  274 +# QUANT[cat=0,coerced=0,number=all_numbers,case=all_cases,gender=all_genders]
  275 +# adjp*number*case*gender*sup*cat*role*node
  276 +# {/adjp*number*case*gender*sup*cat*Arg*node};
296 277  
297 278 pos=compar:
298 279 QUANT[cat=0]
... ... @@ -304,6 +285,22 @@ pos=compar:
304 285 comparp*lemma*case*cat*role*node
305 286 /xp*cat*CORE*node;
306 287  
  288 +#modyfikatory przyimków i przysłówków
  289 +lemma=za,pos=qub,phrase=admod:
  290 + QUANT[grad=pos]
  291 + admod*grad*cat*role*node{distant-schema}{schema}{local-schema};
  292 +
  293 +lemma=bardzo|zbyt|niezbyt,pos=adv,phrase=admod:
  294 + QUANT[grad=pos]
  295 + admod*grad*cat*role*node{distant-schema}{schema}{local-schema};
  296 +
  297 +lemma=nieco|trochę,pos=adv,phrase=admod:
  298 + QUANT[grad=pos&com]
  299 + admod*grad*cat*role*node{distant-schema}{schema}{local-schema};
  300 +
  301 +lemma=jak,pos=adv,phrase=admod:
  302 + QUANT[grad=sup]
  303 + admod*grad*cat*role*node{distant-schema}{schema}{local-schema};
307 304  
308 305 # czasowniki
309 306 pos=ger,phrase=np:
... ...
NKJP2/validateMorphology.ml
... ... @@ -35,7 +35,7 @@ let rec add_ntoken stats = function
35 35 Token t ->
36 36 (try
37 37 let nlemma,ncat,ninterp = get_ntoken t.attrs in
38   - StringQMap.add stats (nlemma ^ "\t" ^ ncat ^ "\t" ^ ENIAMtokens.string_of_interps [ninterp])
  38 + StringQMap.add stats (nlemma ^ "\t" ^ ncat ^ "\t" ^ ENIAMtagset.render [ninterp])
39 39 with Not_found -> stats)
40 40 | Seq l -> Xlist.fold l stats add_ntoken
41 41 | Variant l -> Xlist.fold l stats add_ntoken
... ... @@ -228,7 +228,7 @@ let get_lemma_cat_interp = function
228 228 | nlemma,lemma,"subst","subst",[["pl"];c;g],[["pl"];["nom";"voc"];_] -> lemma,"subst",[["pl"];c;g]
229 229 (* | "5","5","adj","dig",ninterp,[] -> "piąty","adj",ninterp
230 230 | "6","6","adj","dig",ninterp,[] -> "szósty","adj",ninterp *)
231   - (* | "adj","ppas",ninterp,interp -> print_endline (ENIAMtokens.string_of_interps [ninterp] ^ " " ^ ENIAMtokens.string_of_interps [interp]); raise Not_found *)
  231 + (* | "adj","ppas",ninterp,interp -> print_endline (ENIAMtagset.render [ninterp] ^ " " ^ ENIAMtagset.render [interp]); raise Not_found *)
232 232 | _ -> raise Not_found
233 233  
234 234  
... ... @@ -323,7 +323,7 @@ let rec validate_ntoken_token name id_div paragraph stats = function
323 323 (try
324 324 let nlemma,ncat,ninterp = get_ntoken t.attrs in
325 325 process_ntoken2 stats 1 name id_div t.orth t.beg paragraph nlemma ncat ninterp
326   - (* print_endline (nlemma ^ "\t" ^ ncat ^ "\t" ^ ENIAMtokens.string_of_interps [ninterp]);
  326 + (* print_endline (nlemma ^ "\t" ^ ncat ^ "\t" ^ ENIAMtagset.render [ninterp]);
327 327 Printf.printf "%s\t%d\t%s\t%d\n" name id_div t.orth t.beg;
328 328 print_endline paragraph;
329 329 stats *)
... ...
corpora/CONLL2.ml
... ... @@ -46,7 +46,7 @@ let load_token beg compound in_channel =
46 46 [a;b] -> (try int_of_string b - int_of_string a with _ -> failwith "load_token: interval id")
47 47 | _ -> failwith "load_token: interval id" in
48 48 raise (Interval_id len) in
49   - let pos,tags = match ENIAMtokens.parse_postags interp with [x] -> x | _ -> failwith "n_token" in
  49 + let pos,tags = match ENIAMtagset.parse interp with [x] -> x | _ -> failwith "n_token" in
50 50 {empty_token_env with orth = orth; beg=beg; len=len; next=next;
51 51 token = Lemma(lemma,pos,[tags])}, next, id, sl, sem in
52 52 let line = input_line in_channel in
... ... @@ -139,7 +139,7 @@ let get_tagset corpus =
139 139 Int.fold 1 (ExtArray.size tokens - 1) qmap (fun qmap i ->
140 140 let t = ExtArray.get tokens i in
141 141 match t.token with
142   - Lemma(lemma,cat,interp) -> StringQMap.add qmap (cat ^ ":" ^ ENIAMtokens.string_of_interps interp)
  142 + Lemma(lemma,cat,interp) -> StringQMap.add qmap (cat ^ ":" ^ ENIAMtagset.render interp)
143 143 | _ -> failwith "get_tagset"))
144 144  
145 145 let numbers = StringSet.of_list ["sg";"pl"]
... ... @@ -315,9 +315,9 @@ let convert_tagset corpus =
315 315 | _,"discourse:emo",_ -> "discourse:emo"
316 316 | (Lemma(lemma1,"subst",[[_] :: [c1] :: _]) as s),"case",(Lemma(lemma2,"prep",[[c2] :: _]) as t) ->
317 317 if c1 = c2 then "subst" ^ " -> case -> " ^ "prep" else ENIAMtokens.string_of_token s ^ " -> " ^ "case" ^ " -> " ^ ENIAMtokens.string_of_token t
318   - | Lemma(lemma1,cat1,interp1),"case",Lemma(lemma2,"adv",interp2) -> cat1 ^ ":" ^ ENIAMtokens.string_of_interps interp1 ^ " -> case -> " ^ lemma2 ^ ":" ^ "adv" ^ ":" ^ ENIAMtokens.string_of_interps interp2
  318 + | Lemma(lemma1,cat1,interp1),"case",Lemma(lemma2,"adv",interp2) -> cat1 ^ ":" ^ ENIAMtagset.render interp1 ^ " -> case -> " ^ lemma2 ^ ":" ^ "adv" ^ ":" ^ ENIAMtagset.render interp2
319 319 | Lemma(lemma1,cat1,interp1),label,Lemma(lemma2,cat2,interp2) ->
320   - cat1 ^ ":" ^ ENIAMtokens.string_of_interps interp1 ^ " -> " ^ label ^ " -> " ^ cat2 ^ ":" ^ ENIAMtokens.string_of_interps interp2
  320 + cat1 ^ ":" ^ ENIAMtagset.render interp1 ^ " -> " ^ label ^ " -> " ^ cat2 ^ ":" ^ ENIAMtagset.render interp2
321 321 | s,label,t -> ENIAMtokens.string_of_token s ^ " -> " ^ label ^ " -> " ^ ENIAMtokens.string_of_token t
322 322  
323 323 let list_dependencies corpus =
... ... @@ -348,12 +348,12 @@ let string_of_sem sem =
348 348 if sem = "" then "" else "[" ^ sem ^ "]"
349 349  
350 350 let string_of_lci d =
351   - let interp = ENIAMtokens.string_of_interps d.interp in
  351 + let interp = ENIAMtagset.render d.interp in
352 352 if interp = "" then Printf.sprintf "%s,%s" d.lemma d.cat
353 353 else Printf.sprintf "%s,%s:%s" d.lemma d.cat interp
354 354  
355 355 let string_of_phrase (phrase,interp) =
356   - let interp = ENIAMtokens.string_of_interps interp in
  356 + let interp = ENIAMtagset.render interp in
357 357 if interp = "" then phrase
358 358 else Printf.sprintf "%s:%s" phrase interp
359 359  
... ... @@ -585,9 +585,9 @@ let rec flatten_coordination is_coord ulabel usem = function
585 585  
586 586 let string_of_dependency2 is_coord (lemma1,cat1,interp1) label sem (lemma2,cat2,interp2) =
587 587 (if is_coord then "COORD " else "") ^
588   - lemma1 ^ ":" ^ cat1 ^ ":" ^ ENIAMtokens.string_of_interps interp1 ^
  588 + lemma1 ^ ":" ^ cat1 ^ ":" ^ ENIAMtagset.render interp1 ^
589 589 " -> " ^ label ^ (if sem = "" then "" else "["^sem^"]") ^ " -> "
590   - (*^ lemma2 ^ ":"*) ^ cat2 ^ ":" ^ ENIAMtokens.string_of_interps interp2
  590 + (*^ lemma2 ^ ":"*) ^ cat2 ^ ":" ^ ENIAMtagset.render interp2
591 591  
592 592 type sel = Any | Value of string list | Agr of string
593 593 type coord = Coord | Gen
... ... @@ -914,34 +914,34 @@ let rec split_tree forest = function
914 914  
915 915 (* let rec rules_of_tree2 = function
916 916 Dep({sons=[]} as d) ->
917   - d.lemma ^ ":" ^ d.cat ^ ":" ^ ENIAMtokens.string_of_interps d.interp
  917 + d.lemma ^ ":" ^ d.cat ^ ":" ^ ENIAMtagset.render d.interp
918 918 | Dep({sons=[Dep d2]} as d) ->
919   - "[ " ^ d.lemma ^ ":" ^ d.cat ^ ":" ^ ENIAMtokens.string_of_interps d.interp ^
  919 + "[ " ^ d.lemma ^ ":" ^ d.cat ^ ":" ^ ENIAMtagset.render d.interp ^
920 920 " -> " ^ d2.label ^ " -> " ^ rules_of_tree2 (Dep d2) ^ " ]"
921 921 | Dep({sons=[Dep d2;Dep d3]} as d) ->
922   - "[ " ^ d.lemma ^ ":" ^ d.cat ^ ":" ^ ENIAMtokens.string_of_interps d.interp ^
  922 + "[ " ^ d.lemma ^ ":" ^ d.cat ^ ":" ^ ENIAMtagset.render d.interp ^
923 923 " -> " ^ d2.label ^ " -> " ^ rules_of_tree2 (Dep d2) ^ " | " ^ d3.label ^ " -> " ^ rules_of_tree2 (Dep d3) ^ " ]"
924 924 | _ -> failwith "rules_of_tree2" *)
925 925  
926 926 (* let rec rules_of_tree2 = function
927 927 Dep({sons=[]} as d) ->
928   - "_:" ^ d.cat ^ ":" ^ ENIAMtokens.string_of_interps d.interp
  928 + "_:" ^ d.cat ^ ":" ^ ENIAMtagset.render d.interp
929 929 | Dep({sons=[Dep d2]} as d) ->
930   - "[ _:" ^ d.cat ^ ":" ^ ENIAMtokens.string_of_interps d.interp ^
  930 + "[ _:" ^ d.cat ^ ":" ^ ENIAMtagset.render d.interp ^
931 931 " -> " ^ d2.label ^ " -> " ^ rules_of_tree2 (Dep d2) ^ " ]"
932 932 | Dep({sons=[Dep d2;Dep d3]} as d) ->
933   - "[ _:" ^ d.cat ^ ":" ^ ENIAMtokens.string_of_interps d.interp ^
  933 + "[ _:" ^ d.cat ^ ":" ^ ENIAMtagset.render d.interp ^
934 934 " -> " ^ d2.label ^ " -> " ^ rules_of_tree2 (Dep d2) ^ " | " ^ d3.label ^ " -> " ^ rules_of_tree2 (Dep d3) ^ " ]"
935 935 | _ -> failwith "rules_of_tree2" *)
936 936  
937 937 let rec rules_of_tree2 = function
938 938 Dep({sons=[]} as d) ->
939   - "_:" ^ d.cat (*^ ":" ^ ENIAMtokens.string_of_interps d.interp*)
  939 + "_:" ^ d.cat (*^ ":" ^ ENIAMtagset.render d.interp*)
940 940 | Dep({sons=[Dep d2]} as d) ->
941   - "[ _:" ^ d.cat ^ (*":" ^ ENIAMtokens.string_of_interps d.interp ^*)
  941 + "[ _:" ^ d.cat ^ (*":" ^ ENIAMtagset.render d.interp ^*)
942 942 " -> " ^ d2.label ^ " -> " ^ rules_of_tree2 (Dep d2) ^ " ]"
943 943 | Dep({sons=[Dep d2;Dep d3]} as d) ->
944   - "[ _:" ^ d.cat ^ (*":" ^ ENIAMtokens.string_of_interps d.interp ^*)
  944 + "[ _:" ^ d.cat ^ (*":" ^ ENIAMtagset.render d.interp ^*)
945 945 " -> " ^ d2.label ^ " -> " ^ rules_of_tree2 (Dep d2) ^ " | " ^ d3.label ^ " -> " ^ rules_of_tree2 (Dep d3) ^ " ]"
946 946 | _ -> failwith "rules_of_tree2"
947 947  
... ...
lexSemantics/ENIAMvalence.ml
... ... @@ -59,6 +59,7 @@ let transform_phrase pos lemma = function
59 59 | XP as morf -> [morf]
60 60 | AdjA as morf -> [morf]
61 61 | PadvP as morf -> [morf]
  62 + | AdMod _ as morf -> [morf]
62 63 | Qub as morf -> [morf]
63 64 | FixedP _ as morf -> [morf]
64 65 | SymbolP as morf -> [morf]
... ...
lexSemantics/ENIAMwalRenderer.ml
... ... @@ -270,6 +270,8 @@ let render_phrase_cat cat role node = function
270 270 | Measure(AllUAgr) -> Tensor[Atom "measure"; AVar "unumber"; AVar "ucase"; AVar "ugender"; AVar "uperson"] *)
271 271 | Or -> Tensor[Atom "or"; Atom cat; Atom role; Atom node]
272 272 | Qub -> Tensor[Atom "qub"; Atom cat; Atom role; Atom node]
  273 + | AdMod(GradAgr) -> Tensor[Atom "admod"; AVar "grad"; Atom cat; Atom role; Atom node]
  274 + | AdMod(Grad grad) -> Tensor[Atom "admod"; Atom grad; Atom cat; Atom role; Atom node]
273 275 (* | Inclusion -> Tensor[Atom "inclusion"]
274 276 | Adja -> Tensor[Atom "adja"]
275 277 | Aglt -> Tensor[Atom "aglt"; AVar "number"; AVar "person"]
... ...
lexSemantics/ENIAMwalStringOf.ml
... ... @@ -86,6 +86,7 @@ let gender = function
86 86  
87 87 let grad = function
88 88 Grad s -> s
  89 + | GradAgr -> "agr"
89 90 | GradUndef -> "_"
90 91  
91 92 (* let psem = function
... ... @@ -154,6 +155,7 @@ let rec phrase = function
154 155 (* | Num(c,a) -> "num(" ^ case c ^ "," ^ acm a ^ ")" *)
155 156 | Or -> "or"
156 157 | Qub -> "qub"
  158 + | AdMod g -> "admod(" ^ grad g ^ ")"
157 159 | Inclusion -> "inclusion"
158 160 | Pro -> "pro"
159 161 | ProNG -> "prong"
... ...
lexSemantics/ENIAMwalTypes.ml
... ... @@ -29,7 +29,7 @@ type comp = Comp of string | Zeby | Gdy | CompUndef
29 29 type comp_type = Int | Rel | CompTypeUndef (*| CompTypeAgr*)
30 30 type number = Number of string | NumberUndef | NumberAgr
31 31 type gender = Gender of string | GenderUndef | GenderAgr | Genders of string list
32   -type grad = Grad of string | GradUndef
  32 +type grad = Grad of string | GradUndef | GradAgr
33 33 (* type psem = Psem | Pnosem *)
34 34 (* type refl = (*ReflEmpty |*) ReflTrue | ReflFalse | ReflUndef *)
35 35 (* type acm = Acm of string | AcmUndef *)
... ... @@ -87,6 +87,7 @@ type phrase =
87 87 (* | Refl
88 88 | Recip *)
89 89 | Qub
  90 + | AdMod of grad
90 91 | Inclusion
91 92 | Pro
92 93 | ProNG
... ...
morphology/ENIAMinflexion.ml
... ... @@ -108,13 +108,15 @@ let alt = ref (StringMap.empty : (bool * t) list StringMap.t)
108 108 let stems = ref (StringMap.empty : (string * string * StringSet.t) list StringMap.t)
109 109 let rules = ref ([] : (StringMap.key * ENIAMmorphologyRules.CharTrees.t) list)
110 110 let wyglos = ref ([] : (StringMap.key * ENIAMmorphologyRules.CharTrees.t) list)
  111 +let lemmata = ref StringSet.empty
111 112  
112 113 let initialize () =
113 114 alt := prepare_alt StringMap.empty alt_filename;
114 115 alt := prepare_alt !alt alt_supplement_filename;
115 116 stems := load_stems stem_filename;
116 117 rules := prepare_rules rules_filename;
117   - wyglos := prepare_wyglos wyglos_filename
  118 + wyglos := prepare_wyglos wyglos_filename;
  119 + lemmata := StringSet.of_list (File.load_lines lemmata_filename)
118 120  
119 121 (* let initialize () =
120 122 alt := prepare_alt StringMap.empty "resources/alt.tab";
... ...
morphology/ENIAMmorphologyTypes.ml
... ... @@ -65,5 +65,6 @@ let alt_filename = resource_path ^ "/morphology/alt.tab"
65 65 let stem_filename = resource_path ^ "/morphology/stem.tab"
66 66 let rules_filename = resource_path ^ "/morphology/freq_rules.tab"
67 67 let wyglos_filename = resource_path ^ "/morphology/wyglos.tab"
  68 +let lemmata_filename = resource_path ^ "/morphology/lemmata.tab"
68 69  
69 70 let alt_supplement_filename = resource_path ^ "/morphology/alt_supplement.tab"
... ...
morphology/generate.ml
  1 +open Xstd
  2 +
1 3 let nlp_resources_path = "../../NLP resources/"
2 4 let sgjp_path = nlp_resources_path ^ "SGJP/"
3 5 let sgjp_filename = "sgjp-20170730.tab.gz"
... ... @@ -35,9 +37,18 @@ let generate_alt rules_filename path filename out_filename =
35 37 let dict = Dict.remove_validated_forms dict in
36 38 Dict.print out_filename dict
37 39  
  40 +let generate_lemmata path filename out_filename =
  41 + let dict = Dict.load_tab (path ^ filename) in
  42 + let lemmata = Xlist.fold dict StringSet.empty (fun set e ->
  43 + StringSet.add set (Stem.simplify_lemma e.ENIAMmorphologyTypes.lemma)) in
  44 + File.file_out out_filename (fun file ->
  45 + StringSet.iter lemmata (Printf.fprintf file "%s\n"))
  46 +
  47 +
38 48 let _ =
39   - Dict.generate_rule_frequencies_list interp_compound_rule_trees sources "resources/freq_rules.tab";
  49 + (* Dict.generate_rule_frequencies_list interp_compound_rule_trees sources "resources/freq_rules.tab";
40 50 generate_alt "resources/freq_rules.tab" sgjp_path sgjp_filename "resources/alt.tab";
41 51 Dict.generate_stem_dict "resources/freq_rules.tab" sgjp_path sgjp_filename "resources/stem.tab";
42   - Dict.generate_wyglos sgjp_path sgjp_filename "resources/wyglos.tab";
  52 + Dict.generate_wyglos sgjp_path sgjp_filename "resources/wyglos.tab"; *)
  53 + generate_lemmata sgjp_path sgjp_filename "resources/lemmata.tab";
43 54 ()
... ...