Commit 7028da65ef43a51d60dbb2f01b7ea7d961d0bf7a
1 parent 74683cab
Commit message: algorytm selekcji lematów (lemma selection algorithm)
Showing 59 changed files with 266391 additions and 138 deletions
Too many changes to show. To preserve performance, only 13 of 59 files are displayed.
LCGlexicon/ENIAM_LCGlexicon.ml
@@ -380,3 +380,9 @@ let create_entries rules id orth cats valence lex_entries =
 let rules = make_term id orth rules in
 (* print_endline "create_entries 5"; *)
 rules @ l)
+
+let initialize () =
+ ENIAMcategoriesPL.initialize ();
+ let filenames = [rules_filename; user_lexicon_filename] @ Xlist.map (!theories_paths) (fun path -> path ^ "/lexicon.dic") in
+ rules := make_rules_list false filenames;
+ dep_rules := make_rules_list true filenames
LCGlexicon/ENIAM_LCGlexiconTypes.ml
@@ -17,6 +17,8 @@
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *)

+open Xstd
+
 type categories = {lemma: string; pos: string; pos2: string;
 cat: string; coerced: string list; roles: string list; snode: string list; phrase: string list;
 numbers: string list; cases: string list; genders: string list; persons: string list;
@@ -126,3 +128,7 @@ let subst_time_lexemes_filename = resource_path ^ "/LCGlexicon/subst_time.dat"

 let adv_modes_filename = resource_path ^ "/Walenty/adv_modes.tab"
 let num_nsems_filename = resource_path ^ "/LCGlexicon/num.tab"
+
+let rules = ref (StringMap.empty : (entry list StringMap.t * entry list) StringMap.t)
+let dep_rules = ref (StringMap.empty : (entry list StringMap.t * entry list) StringMap.t)
+let theories_paths = ref ([] : string list)
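Taken together, the two diffs above introduce a single entry point for loading the LCG lexicon: ENIAM_LCGlexiconTypes now holds the mutable state (rules, dep_rules, theories_paths), and ENIAM_LCGlexicon.initialize fills it from rules_filename, user_lexicon_filename and one lexicon.dic per registered theory path. A minimal usage sketch (not part of the commit) follows; it assumes these values are exposed by the module interfaces, and the theory path is a hypothetical example.

(* Usage sketch only: illustrates the initialization order implied by the new code. *)
let () =
  (* Theory lexicons must be registered before initialization, because
     initialize () appends "<path>/lexicon.dic" for every registered path. *)
  ENIAM_LCGlexiconTypes.theories_paths := ["theories/numbers"];  (* hypothetical path *)
  (* Loads the Polish category tables, then builds both the phrase-structure
     (rules) and the dependency (dep_rules) rule maps. *)
  ENIAM_LCGlexicon.initialize ();
  print_endline "LCG lexicon loaded"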
LCGlexicon/ENIAMcategoriesPL.ml
@@ -29,7 +29,7 @@ let all_persons = ["pri";"sec";"ter"]

 let all_phrases = [
 "np";"adjp";"advp";"infp";"ip";
- "prepnp";"cp";"ncp";"prepncp";"padvp";"colonp";"mp";"intp";
+ "prepnp";"cp";"ncp";"prepncp";"padvp";"colonp";"mp";"intp";"admod";
 "adja";"prepadjp";"comparp";"xp";"xpnom";"xpgen";"symbol";"fixed";
 "s";"<root>";"<sentence>";"<paragraph>";(*"";"";"";"";"";"";"";"";*)]
LCGlexicon/resources/lexicon-pl.dic
@@ -36,9 +36,6 @@ measure_weight=1

 @LEXICON

-pos=ordnum,phrase=adjp:
- adjp*number*case*gender*grad*coerced*role*node;
-
 #oznaczenia godzin i minut
 pos=hour-minute|hour,phrase=np:
 QUANT[number=sg,case=nom&gen&dat&acc&inst&loc,gender=f,person=ter,role=0]
@@ -211,24 +208,6 @@ pos=subst,case=gen,nsyn=pronoun,cat!=Measure|imię|nazwisko,phrase=np:
 {distant-schema}{\np*unumber*case*ugender*uperson*Measure*Measure*concept,
 schema};

-# liczebniki
-#pos=num|intnum|realnum|intnum-interval|realnum-interval,nsem=count:
-# QUANT[role=0]
-# num*number*case*gender*person*acm*nsem*role*node; # FIXME: jak usunięcie Phrase ProNG wpływa na pokrycie?
-#pos=num|intnum|realnum|intnum-interval|realnum-interval,nsem=mass:
-# QUANT[role=0]
-# num*number*case*gender*person*acm*nsem*role*node; # FIXME: jak usunięcie Phrase ProNG wpływa na pokrycie?
-pos=num|intnum|realnum|intnum-interval|realnum-interval,nsem=count,phrase=np:
- QUANT[cat=Number,role=Count]
- num*number*case*gender*person*acm*nsem*role*node|(1+fixed*T*OpAdNum*Mod*concept);
-pos=num|intnum|realnum|intnum-interval|realnum-interval,nsem=mass,phrase=np:
- QUANT[cat=Amount,role=Amount]
- num*number*case*gender*person*acm*nsem*role*node|(1+fixed*T*OpAdNum*Mod*concept);
-
-lemma=jeden,pos=adj,grad=pos,phrase=np:
- QUANT[person=all_persons,acm=congr,nsem=count,role=Count]
- num*number*case*gender*person*acm*nsem*role*node;
-
 # pojemniki
 pos=subst,cat=Measure:
 np*number*case*gender*person*cat*role*node
@@ -240,6 +219,8 @@ pos=subst,case=gen,cat=Measure:
 {distant-schema}{\num*number*case*gender*person*rec*count*T*concept}
 {schema}{local-schema}: measure_weight; # UWAGA: number "sg" i gender "n", żeby uzgadniać z podmiotem czasownika

+pos=subst,phrase=xp:
+ xp*coerced*role*node{distant-schema}{schema}{local-schema};

 #frazy przymiotnikowe
 pos=adj|adjc|adjp,phrase=adjp:
@@ -284,15 +265,15 @@ lemma=po,pos=prep,phrase=prepadjp:
 prepadjp*lemma*case*cat*role*node
 {/adjp*T*case*T*T*cat*CORE*node+adjp*sg*dat*m1*T*cat*CORE*node};

-lemma=za|zbyt|niezbyt,pos=prep,phrase=adjp:
- QUANT[cat=0,number=all_numbers,case=all_cases,gender=all_genders,grad=pos]
- adjp*number*case*gender*grad*cat*role*node
- {/adjp*number*case*gender*grad*cat*Arg*node};
+#lemma=za|zbyt|niezbyt,pos=x,phrase=adjp:
+# QUANT[cat=0,coerced=0,number=all_numbers,case=all_cases,gender=all_genders,grad=pos]
+# adjp*number*case*gender*grad*cat*role*node
+# {/adjp*number*case*gender*grad*cat*Arg*node};

-lemma=jak,pos=x,phrase=adjp:
- QUANT[cat=0,coerced=0,number=all_numbers,case=all_cases,gender=all_genders]
- adjp*number*case*gender*sup*cat*role*node
- {/adjp*number*case*gender*sup*cat*Arg*node};
+#lemma=jak,pos=x,phrase=adjp:
+# QUANT[cat=0,coerced=0,number=all_numbers,case=all_cases,gender=all_genders]
+# adjp*number*case*gender*sup*cat*role*node
+# {/adjp*number*case*gender*sup*cat*Arg*node};

 pos=compar:
 QUANT[cat=0]
@@ -304,6 +285,22 @@ pos=compar:
 comparp*lemma*case*cat*role*node
 /xp*cat*CORE*node;

+#modyfikatory przyimków i przysłówków
+lemma=za,pos=qub,phrase=admod:
+ QUANT[grad=pos]
+ admod*grad*cat*role*node{distant-schema}{schema}{local-schema};
+
+lemma=bardzo|zbyt|niezbyt,pos=adv,phrase=admod:
+ QUANT[grad=pos]
+ admod*grad*cat*role*node{distant-schema}{schema}{local-schema};
+
+lemma=nieco|trochę,pos=adv,phrase=admod:
+ QUANT[grad=pos&com]
+ admod*grad*cat*role*node{distant-schema}{schema}{local-schema};
+
+lemma=jak,pos=adv,phrase=admod:
+ QUANT[grad=sup]
+ admod*grad*cat*role*node{distant-schema}{schema}{local-schema};

 # czasowniki
 pos=ger,phrase=np:
NKJP2/validateMorphology.ml
@@ -35,7 +35,7 @@ let rec add_ntoken stats = function
 Token t ->
 (try
 let nlemma,ncat,ninterp = get_ntoken t.attrs in
- StringQMap.add stats (nlemma ^ "\t" ^ ncat ^ "\t" ^ ENIAMtokens.string_of_interps [ninterp])
+ StringQMap.add stats (nlemma ^ "\t" ^ ncat ^ "\t" ^ ENIAMtagset.render [ninterp])
 with Not_found -> stats)
 | Seq l -> Xlist.fold l stats add_ntoken
 | Variant l -> Xlist.fold l stats add_ntoken
@@ -228,7 +228,7 @@ let get_lemma_cat_interp = function
 | nlemma,lemma,"subst","subst",[["pl"];c;g],[["pl"];["nom";"voc"];_] -> lemma,"subst",[["pl"];c;g]
 (* | "5","5","adj","dig",ninterp,[] -> "piąty","adj",ninterp
 | "6","6","adj","dig",ninterp,[] -> "szósty","adj",ninterp *)
- (* | "adj","ppas",ninterp,interp -> print_endline (ENIAMtokens.string_of_interps [ninterp] ^ " " ^ ENIAMtokens.string_of_interps [interp]); raise Not_found *)
+ (* | "adj","ppas",ninterp,interp -> print_endline (ENIAMtagset.render [ninterp] ^ " " ^ ENIAMtagset.render [interp]); raise Not_found *)
 | _ -> raise Not_found


@@ -323,7 +323,7 @@ let rec validate_ntoken_token name id_div paragraph stats = function
 (try
 let nlemma,ncat,ninterp = get_ntoken t.attrs in
 process_ntoken2 stats 1 name id_div t.orth t.beg paragraph nlemma ncat ninterp
- (* print_endline (nlemma ^ "\t" ^ ncat ^ "\t" ^ ENIAMtokens.string_of_interps [ninterp]);
+ (* print_endline (nlemma ^ "\t" ^ ncat ^ "\t" ^ ENIAMtagset.render [ninterp]);
 Printf.printf "%s\t%d\t%s\t%d\n" name id_div t.orth t.beg;
 print_endline paragraph;
 stats *)
corpora/CONLL2.ml
@@ -46,7 +46,7 @@ let load_token beg compound in_channel =
 [a;b] -> (try int_of_string b - int_of_string a with _ -> failwith "load_token: interval id")
 | _ -> failwith "load_token: interval id" in
 raise (Interval_id len) in
- let pos,tags = match ENIAMtokens.parse_postags interp with [x] -> x | _ -> failwith "n_token" in
+ let pos,tags = match ENIAMtagset.parse interp with [x] -> x | _ -> failwith "n_token" in
 {empty_token_env with orth = orth; beg=beg; len=len; next=next;
 token = Lemma(lemma,pos,[tags])}, next, id, sl, sem in
 let line = input_line in_channel in
@@ -139,7 +139,7 @@ let get_tagset corpus =
 Int.fold 1 (ExtArray.size tokens - 1) qmap (fun qmap i ->
 let t = ExtArray.get tokens i in
 match t.token with
- Lemma(lemma,cat,interp) -> StringQMap.add qmap (cat ^ ":" ^ ENIAMtokens.string_of_interps interp)
+ Lemma(lemma,cat,interp) -> StringQMap.add qmap (cat ^ ":" ^ ENIAMtagset.render interp)
 | _ -> failwith "get_tagset"))

 let numbers = StringSet.of_list ["sg";"pl"]
@@ -315,9 +315,9 @@ let convert_tagset corpus =
 | _,"discourse:emo",_ -> "discourse:emo"
 | (Lemma(lemma1,"subst",[[_] :: [c1] :: _]) as s),"case",(Lemma(lemma2,"prep",[[c2] :: _]) as t) ->
 if c1 = c2 then "subst" ^ " -> case -> " ^ "prep" else ENIAMtokens.string_of_token s ^ " -> " ^ "case" ^ " -> " ^ ENIAMtokens.string_of_token t
- | Lemma(lemma1,cat1,interp1),"case",Lemma(lemma2,"adv",interp2) -> cat1 ^ ":" ^ ENIAMtokens.string_of_interps interp1 ^ " -> case -> " ^ lemma2 ^ ":" ^ "adv" ^ ":" ^ ENIAMtokens.string_of_interps interp2
+ | Lemma(lemma1,cat1,interp1),"case",Lemma(lemma2,"adv",interp2) -> cat1 ^ ":" ^ ENIAMtagset.render interp1 ^ " -> case -> " ^ lemma2 ^ ":" ^ "adv" ^ ":" ^ ENIAMtagset.render interp2
 | Lemma(lemma1,cat1,interp1),label,Lemma(lemma2,cat2,interp2) ->
- cat1 ^ ":" ^ ENIAMtokens.string_of_interps interp1 ^ " -> " ^ label ^ " -> " ^ cat2 ^ ":" ^ ENIAMtokens.string_of_interps interp2
+ cat1 ^ ":" ^ ENIAMtagset.render interp1 ^ " -> " ^ label ^ " -> " ^ cat2 ^ ":" ^ ENIAMtagset.render interp2
 | s,label,t -> ENIAMtokens.string_of_token s ^ " -> " ^ label ^ " -> " ^ ENIAMtokens.string_of_token t

 let list_dependencies corpus =
@@ -348,12 +348,12 @@ let string_of_sem sem =
 if sem = "" then "" else "[" ^ sem ^ "]"

 let string_of_lci d =
- let interp = ENIAMtokens.string_of_interps d.interp in
+ let interp = ENIAMtagset.render d.interp in
 if interp = "" then Printf.sprintf "%s,%s" d.lemma d.cat
 else Printf.sprintf "%s,%s:%s" d.lemma d.cat interp

 let string_of_phrase (phrase,interp) =
- let interp = ENIAMtokens.string_of_interps interp in
+ let interp = ENIAMtagset.render interp in
 if interp = "" then phrase
 else Printf.sprintf "%s:%s" phrase interp

@@ -585,9 +585,9 @@ let rec flatten_coordination is_coord ulabel usem = function

 let string_of_dependency2 is_coord (lemma1,cat1,interp1) label sem (lemma2,cat2,interp2) =
 (if is_coord then "COORD " else "") ^
- lemma1 ^ ":" ^ cat1 ^ ":" ^ ENIAMtokens.string_of_interps interp1 ^
+ lemma1 ^ ":" ^ cat1 ^ ":" ^ ENIAMtagset.render interp1 ^
 " -> " ^ label ^ (if sem = "" then "" else "["^sem^"]") ^ " -> "
- (*^ lemma2 ^ ":"*) ^ cat2 ^ ":" ^ ENIAMtokens.string_of_interps interp2
+ (*^ lemma2 ^ ":"*) ^ cat2 ^ ":" ^ ENIAMtagset.render interp2

 type sel = Any | Value of string list | Agr of string
 type coord = Coord | Gen
@@ -914,34 +914,34 @@ let rec split_tree forest = function

 (* let rec rules_of_tree2 = function
 Dep({sons=[]} as d) ->
- d.lemma ^ ":" ^ d.cat ^ ":" ^ ENIAMtokens.string_of_interps d.interp
+ d.lemma ^ ":" ^ d.cat ^ ":" ^ ENIAMtagset.render d.interp
 | Dep({sons=[Dep d2]} as d) ->
- "[ " ^ d.lemma ^ ":" ^ d.cat ^ ":" ^ ENIAMtokens.string_of_interps d.interp ^
+ "[ " ^ d.lemma ^ ":" ^ d.cat ^ ":" ^ ENIAMtagset.render d.interp ^
 " -> " ^ d2.label ^ " -> " ^ rules_of_tree2 (Dep d2) ^ " ]"
 | Dep({sons=[Dep d2;Dep d3]} as d) ->
- "[ " ^ d.lemma ^ ":" ^ d.cat ^ ":" ^ ENIAMtokens.string_of_interps d.interp ^
+ "[ " ^ d.lemma ^ ":" ^ d.cat ^ ":" ^ ENIAMtagset.render d.interp ^
 " -> " ^ d2.label ^ " -> " ^ rules_of_tree2 (Dep d2) ^ " | " ^ d3.label ^ " -> " ^ rules_of_tree2 (Dep d3) ^ " ]"
 | _ -> failwith "rules_of_tree2" *)

 (* let rec rules_of_tree2 = function
 Dep({sons=[]} as d) ->
- "_:" ^ d.cat ^ ":" ^ ENIAMtokens.string_of_interps d.interp
+ "_:" ^ d.cat ^ ":" ^ ENIAMtagset.render d.interp
 | Dep({sons=[Dep d2]} as d) ->
- "[ _:" ^ d.cat ^ ":" ^ ENIAMtokens.string_of_interps d.interp ^
+ "[ _:" ^ d.cat ^ ":" ^ ENIAMtagset.render d.interp ^
 " -> " ^ d2.label ^ " -> " ^ rules_of_tree2 (Dep d2) ^ " ]"
 | Dep({sons=[Dep d2;Dep d3]} as d) ->
- "[ _:" ^ d.cat ^ ":" ^ ENIAMtokens.string_of_interps d.interp ^
+ "[ _:" ^ d.cat ^ ":" ^ ENIAMtagset.render d.interp ^
 " -> " ^ d2.label ^ " -> " ^ rules_of_tree2 (Dep d2) ^ " | " ^ d3.label ^ " -> " ^ rules_of_tree2 (Dep d3) ^ " ]"
 | _ -> failwith "rules_of_tree2" *)

 let rec rules_of_tree2 = function
 Dep({sons=[]} as d) ->
- "_:" ^ d.cat (*^ ":" ^ ENIAMtokens.string_of_interps d.interp*)
+ "_:" ^ d.cat (*^ ":" ^ ENIAMtagset.render d.interp*)
 | Dep({sons=[Dep d2]} as d) ->
- "[ _:" ^ d.cat ^ (*":" ^ ENIAMtokens.string_of_interps d.interp ^*)
+ "[ _:" ^ d.cat ^ (*":" ^ ENIAMtagset.render d.interp ^*)
 " -> " ^ d2.label ^ " -> " ^ rules_of_tree2 (Dep d2) ^ " ]"
 | Dep({sons=[Dep d2;Dep d3]} as d) ->
- "[ _:" ^ d.cat ^ (*":" ^ ENIAMtokens.string_of_interps d.interp ^*)
+ "[ _:" ^ d.cat ^ (*":" ^ ENIAMtagset.render d.interp ^*)
 " -> " ^ d2.label ^ " -> " ^ rules_of_tree2 (Dep d2) ^ " | " ^ d3.label ^ " -> " ^ rules_of_tree2 (Dep d3) ^ " ]"
 | _ -> failwith "rules_of_tree2"
lexSemantics/ENIAMvalence.ml
lexSemantics/ENIAMwalRenderer.ml
@@ -270,6 +270,8 @@ let render_phrase_cat cat role node = function
 | Measure(AllUAgr) -> Tensor[Atom "measure"; AVar "unumber"; AVar "ucase"; AVar "ugender"; AVar "uperson"] *)
 | Or -> Tensor[Atom "or"; Atom cat; Atom role; Atom node]
 | Qub -> Tensor[Atom "qub"; Atom cat; Atom role; Atom node]
+ | AdMod(GradAgr) -> Tensor[Atom "admod"; AVar "grad"; Atom cat; Atom role; Atom node]
+ | AdMod(Grad grad) -> Tensor[Atom "admod"; Atom grad; Atom cat; Atom role; Atom node]
 (* | Inclusion -> Tensor[Atom "inclusion"]
 | Adja -> Tensor[Atom "adja"]
 | Aglt -> Tensor[Atom "aglt"; AVar "number"; AVar "person"]
lexSemantics/ENIAMwalStringOf.ml
@@ -86,6 +86,7 @@ let gender = function

 let grad = function
 Grad s -> s
+ | GradAgr -> "agr"
 | GradUndef -> "_"

 (* let psem = function
@@ -154,6 +155,7 @@ let rec phrase = function
 (* | Num(c,a) -> "num(" ^ case c ^ "," ^ acm a ^ ")" *)
 | Or -> "or"
 | Qub -> "qub"
+ | AdMod g -> "admod(" ^ grad g ^ ")"
 | Inclusion -> "inclusion"
 | Pro -> "pro"
 | ProNG -> "prong"
lexSemantics/ENIAMwalTypes.ml
@@ -29,7 +29,7 @@ type comp = Comp of string | Zeby | Gdy | CompUndef
 type comp_type = Int | Rel | CompTypeUndef (*| CompTypeAgr*)
 type number = Number of string | NumberUndef | NumberAgr
 type gender = Gender of string | GenderUndef | GenderAgr | Genders of string list
-type grad = Grad of string | GradUndef
+type grad = Grad of string | GradUndef | GradAgr
 (* type psem = Psem | Pnosem *)
 (* type refl = (*ReflEmpty |*) ReflTrue | ReflFalse | ReflUndef *)
 (* type acm = Acm of string | AcmUndef *)
@@ -87,6 +87,7 @@ type phrase =
 (* | Refl
 | Recip *)
 | Qub
+ | AdMod of grad
 | Inclusion
 | Pro
 | ProNG
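For reference, the new AdMod phrase constructor and the GradAgr degree value combine with the printing and rendering functions patched above as follows. The snippet is an illustration only (not part of the commit) and assumes the usual module interfaces; the expected output strings follow directly from ENIAMwalStringOf.grad and ENIAMwalStringOf.phrase as changed in this commit.

open ENIAMwalTypes

let () =
  (* Positive-degree adverbial modifier, as required by the new lexicon-pl.dic
     entries for "za", "bardzo", "zbyt", "niezbyt". *)
  print_endline (ENIAMwalStringOf.phrase (AdMod (Grad "pos")));   (* admod(pos) *)
  (* Degree agreeing with the modified phrase; rendered with AVar "grad"
     in ENIAMwalRenderer.render_phrase_cat. *)
  print_endline (ENIAMwalStringOf.phrase (AdMod GradAgr));        (* admod(agr) *)
  (* Unspecified degree. *)
  print_endline (ENIAMwalStringOf.phrase (AdMod GradUndef))       (* admod(_) *)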
morphology/ENIAMinflexion.ml
@@ -108,13 +108,15 @@ let alt = ref (StringMap.empty : (bool * t) list StringMap.t)
 let stems = ref (StringMap.empty : (string * string * StringSet.t) list StringMap.t)
 let rules = ref ([] : (StringMap.key * ENIAMmorphologyRules.CharTrees.t) list)
 let wyglos = ref ([] : (StringMap.key * ENIAMmorphologyRules.CharTrees.t) list)
+let lemmata = ref StringSet.empty

 let initialize () =
 alt := prepare_alt StringMap.empty alt_filename;
 alt := prepare_alt !alt alt_supplement_filename;
 stems := load_stems stem_filename;
 rules := prepare_rules rules_filename;
- wyglos := prepare_wyglos wyglos_filename
+ wyglos := prepare_wyglos wyglos_filename;
+ lemmata := StringSet.of_list (File.load_lines lemmata_filename)

 (* let initialize () =
 alt := prepare_alt StringMap.empty "resources/alt.tab";
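The new lemmata set (one lemma per line, read from lemmata.tab, which is produced by morphology/generate.ml below) is presumably the resource the commit's lemma selection consults. The consumer is not part of the displayed diff, so the membership check below is a hypothetical sketch; it assumes the lemmata ref is exposed by the module interface and that Xstd's StringSet.mem takes the set as its first argument.

(* Hypothetical helper, not present in the commit: checks whether a candidate
   lemma is attested in the SGJP-derived lemmata.tab. *)
let is_known_lemma lemma =
  Xstd.StringSet.mem !ENIAMinflexion.lemmata lemma

let () =
  ENIAMinflexion.initialize ();   (* now also fills !lemmata from lemmata_filename *)
  if is_known_lemma "kot" then print_endline "lemma attested in SGJP"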
morphology/ENIAMmorphologyTypes.ml
@@ -65,5 +65,6 @@ let alt_filename = resource_path ^ "/morphology/alt.tab"
 let stem_filename = resource_path ^ "/morphology/stem.tab"
 let rules_filename = resource_path ^ "/morphology/freq_rules.tab"
 let wyglos_filename = resource_path ^ "/morphology/wyglos.tab"
+let lemmata_filename = resource_path ^ "/morphology/lemmata.tab"

 let alt_supplement_filename = resource_path ^ "/morphology/alt_supplement.tab"
morphology/generate.ml
@@ -1,3 +1,5 @@
+open Xstd
+
 let nlp_resources_path = "../../NLP resources/"
 let sgjp_path = nlp_resources_path ^ "SGJP/"
 let sgjp_filename = "sgjp-20170730.tab.gz"
@@ -35,9 +37,18 @@ let generate_alt rules_filename path filename out_filename =
 let dict = Dict.remove_validated_forms dict in
 Dict.print out_filename dict

+let generate_lemmata path filename out_filename =
+ let dict = Dict.load_tab (path ^ filename) in
+ let lemmata = Xlist.fold dict StringSet.empty (fun set e ->
+ StringSet.add set (Stem.simplify_lemma e.ENIAMmorphologyTypes.lemma)) in
+ File.file_out out_filename (fun file ->
+ StringSet.iter lemmata (Printf.fprintf file "%s\n"))
+
+
 let _ =
- Dict.generate_rule_frequencies_list interp_compound_rule_trees sources "resources/freq_rules.tab";
+ (* Dict.generate_rule_frequencies_list interp_compound_rule_trees sources "resources/freq_rules.tab";
 generate_alt "resources/freq_rules.tab" sgjp_path sgjp_filename "resources/alt.tab";
 Dict.generate_stem_dict "resources/freq_rules.tab" sgjp_path sgjp_filename "resources/stem.tab";
- Dict.generate_wyglos sgjp_path sgjp_filename "resources/wyglos.tab";
+ Dict.generate_wyglos sgjp_path sgjp_filename "resources/wyglos.tab"; *)
+ generate_lemmata sgjp_path sgjp_filename "resources/lemmata.tab";
 ()