Commit 7028da65ef43a51d60dbb2f01b7ea7d961d0bf7a

Authored by Wojciech Jaworski
1 parent 74683cab

algorytm selekcji lematów

Showing 59 changed files with 266391 additions and 138 deletions

Too many changes to show.

To preserve performance only 13 of 59 files are displayed.

LCGlexicon/ENIAM_LCGlexicon.ml
@@ -380,3 +380,9 @@ let create_entries rules id orth cats valence lex_entries =
380 let rules = make_term id orth rules in 380 let rules = make_term id orth rules in
381 (* print_endline "create_entries 5"; *) 381 (* print_endline "create_entries 5"; *)
382 rules @ l) 382 rules @ l)
  383 +
  384 +let initialize () =
  385 + ENIAMcategoriesPL.initialize ();
  386 + let filenames = [rules_filename; user_lexicon_filename] @ Xlist.map (!theories_paths) (fun path -> path ^ "/lexicon.dic") in
  387 + rules := make_rules_list false filenames;
  388 + dep_rules := make_rules_list true filenames
LCGlexicon/ENIAM_LCGlexiconTypes.ml
@@ -17,6 +17,8 @@
17 * along with this program. If not, see <http://www.gnu.org/licenses/>. 17 * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 *) 18 *)
19 19
  20 +open Xstd
  21 +
20 type categories = {lemma: string; pos: string; pos2: string; 22 type categories = {lemma: string; pos: string; pos2: string;
21 cat: string; coerced: string list; roles: string list; snode: string list; phrase: string list; 23 cat: string; coerced: string list; roles: string list; snode: string list; phrase: string list;
22 numbers: string list; cases: string list; genders: string list; persons: string list; 24 numbers: string list; cases: string list; genders: string list; persons: string list;
@@ -126,3 +128,7 @@ let subst_time_lexemes_filename = resource_path ^ "/LCGlexicon/subst_time.dat"
126 128
127 let adv_modes_filename = resource_path ^ "/Walenty/adv_modes.tab" 129 let adv_modes_filename = resource_path ^ "/Walenty/adv_modes.tab"
128 let num_nsems_filename = resource_path ^ "/LCGlexicon/num.tab" 130 let num_nsems_filename = resource_path ^ "/LCGlexicon/num.tab"
  131 +
  132 +let rules = ref (StringMap.empty : (entry list StringMap.t * entry list) StringMap.t)
  133 +let dep_rules = ref (StringMap.empty : (entry list StringMap.t * entry list) StringMap.t)
  134 +let theories_paths = ref ([] : string list)
LCGlexicon/ENIAMcategoriesPL.ml
@@ -29,7 +29,7 @@ let all_persons = ["pri";"sec";"ter"]
29 29
30 let all_phrases = [ 30 let all_phrases = [
31 "np";"adjp";"advp";"infp";"ip"; 31 "np";"adjp";"advp";"infp";"ip";
32 - "prepnp";"cp";"ncp";"prepncp";"padvp";"colonp";"mp";"intp"; 32 + "prepnp";"cp";"ncp";"prepncp";"padvp";"colonp";"mp";"intp";"admod";
33 "adja";"prepadjp";"comparp";"xp";"xpnom";"xpgen";"symbol";"fixed"; 33 "adja";"prepadjp";"comparp";"xp";"xpnom";"xpgen";"symbol";"fixed";
34 "s";"<root>";"<sentence>";"<paragraph>";(*"";"";"";"";"";"";"";"";*)] 34 "s";"<root>";"<sentence>";"<paragraph>";(*"";"";"";"";"";"";"";"";*)]
35 35
LCGlexicon/resources/lexicon-pl.dic
@@ -36,9 +36,6 @@ measure_weight=1
36 36
37 @LEXICON 37 @LEXICON
38 38
39 -pos=ordnum,phrase=adjp:  
40 - adjp*number*case*gender*grad*coerced*role*node;  
41 -  
42 #oznaczenia godzin i minut 39 #oznaczenia godzin i minut
43 pos=hour-minute|hour,phrase=np: 40 pos=hour-minute|hour,phrase=np:
44 QUANT[number=sg,case=nom&gen&dat&acc&inst&loc,gender=f,person=ter,role=0] 41 QUANT[number=sg,case=nom&gen&dat&acc&inst&loc,gender=f,person=ter,role=0]
@@ -211,24 +208,6 @@ pos=subst,case=gen,nsyn=pronoun,cat!=Measure|imię|nazwisko,phrase=np:
211 {distant-schema}{\np*unumber*case*ugender*uperson*Measure*Measure*concept, 208 {distant-schema}{\np*unumber*case*ugender*uperson*Measure*Measure*concept,
212 schema}; 209 schema};
213 210
214 -# liczebniki  
215 -#pos=num|intnum|realnum|intnum-interval|realnum-interval,nsem=count:  
216 -# QUANT[role=0]  
217 -# num*number*case*gender*person*acm*nsem*role*node; # FIXME: jak usunięcie Phrase ProNG wpływa na pokrycie?  
218 -#pos=num|intnum|realnum|intnum-interval|realnum-interval,nsem=mass:  
219 -# QUANT[role=0]  
220 -# num*number*case*gender*person*acm*nsem*role*node; # FIXME: jak usunięcie Phrase ProNG wpływa na pokrycie?  
221 -pos=num|intnum|realnum|intnum-interval|realnum-interval,nsem=count,phrase=np:  
222 - QUANT[cat=Number,role=Count]  
223 - num*number*case*gender*person*acm*nsem*role*node|(1+fixed*T*OpAdNum*Mod*concept);  
224 -pos=num|intnum|realnum|intnum-interval|realnum-interval,nsem=mass,phrase=np:  
225 - QUANT[cat=Amount,role=Amount]  
226 - num*number*case*gender*person*acm*nsem*role*node|(1+fixed*T*OpAdNum*Mod*concept);  
227 -  
228 -lemma=jeden,pos=adj,grad=pos,phrase=np:  
229 - QUANT[person=all_persons,acm=congr,nsem=count,role=Count]  
230 - num*number*case*gender*person*acm*nsem*role*node;  
231 -  
232 # pojemniki 211 # pojemniki
233 pos=subst,cat=Measure: 212 pos=subst,cat=Measure:
234 np*number*case*gender*person*cat*role*node 213 np*number*case*gender*person*cat*role*node
@@ -240,6 +219,8 @@ pos=subst,case=gen,cat=Measure:
240 {distant-schema}{\num*number*case*gender*person*rec*count*T*concept} 219 {distant-schema}{\num*number*case*gender*person*rec*count*T*concept}
241 {schema}{local-schema}: measure_weight; # UWAGA: number "sg" i gender "n", żeby uzgadniać z podmiotem czasownika 220 {schema}{local-schema}: measure_weight; # UWAGA: number "sg" i gender "n", żeby uzgadniać z podmiotem czasownika
242 221
  222 +pos=subst,phrase=xp:
  223 + xp*coerced*role*node{distant-schema}{schema}{local-schema};
243 224
244 #frazy przymiotnikowe 225 #frazy przymiotnikowe
245 pos=adj|adjc|adjp,phrase=adjp: 226 pos=adj|adjc|adjp,phrase=adjp:
@@ -284,15 +265,15 @@ lemma=po,pos=prep,phrase=prepadjp:
284 prepadjp*lemma*case*cat*role*node 265 prepadjp*lemma*case*cat*role*node
285 {/adjp*T*case*T*T*cat*CORE*node+adjp*sg*dat*m1*T*cat*CORE*node}; 266 {/adjp*T*case*T*T*cat*CORE*node+adjp*sg*dat*m1*T*cat*CORE*node};
286 267
287 -lemma=za|zbyt|niezbyt,pos=prep,phrase=adjp:  
288 - QUANT[cat=0,number=all_numbers,case=all_cases,gender=all_genders,grad=pos]  
289 - adjp*number*case*gender*grad*cat*role*node  
290 - {/adjp*number*case*gender*grad*cat*Arg*node}; 268 +#lemma=za|zbyt|niezbyt,pos=x,phrase=adjp:
  269 +# QUANT[cat=0,coerced=0,number=all_numbers,case=all_cases,gender=all_genders,grad=pos]
  270 +# adjp*number*case*gender*grad*cat*role*node
  271 +# {/adjp*number*case*gender*grad*cat*Arg*node};
291 272
292 -lemma=jak,pos=x,phrase=adjp:  
293 - QUANT[cat=0,coerced=0,number=all_numbers,case=all_cases,gender=all_genders]  
294 - adjp*number*case*gender*sup*cat*role*node  
295 - {/adjp*number*case*gender*sup*cat*Arg*node}; 273 +#lemma=jak,pos=x,phrase=adjp:
  274 +# QUANT[cat=0,coerced=0,number=all_numbers,case=all_cases,gender=all_genders]
  275 +# adjp*number*case*gender*sup*cat*role*node
  276 +# {/adjp*number*case*gender*sup*cat*Arg*node};
296 277
297 pos=compar: 278 pos=compar:
298 QUANT[cat=0] 279 QUANT[cat=0]
@@ -304,6 +285,22 @@ pos=compar:
304 comparp*lemma*case*cat*role*node 285 comparp*lemma*case*cat*role*node
305 /xp*cat*CORE*node; 286 /xp*cat*CORE*node;
306 287
  288 +#modyfikatory przyimków i przysłówków
  289 +lemma=za,pos=qub,phrase=admod:
  290 + QUANT[grad=pos]
  291 + admod*grad*cat*role*node{distant-schema}{schema}{local-schema};
  292 +
  293 +lemma=bardzo|zbyt|niezbyt,pos=adv,phrase=admod:
  294 + QUANT[grad=pos]
  295 + admod*grad*cat*role*node{distant-schema}{schema}{local-schema};
  296 +
  297 +lemma=nieco|trochę,pos=adv,phrase=admod:
  298 + QUANT[grad=pos&com]
  299 + admod*grad*cat*role*node{distant-schema}{schema}{local-schema};
  300 +
  301 +lemma=jak,pos=adv,phrase=admod:
  302 + QUANT[grad=sup]
  303 + admod*grad*cat*role*node{distant-schema}{schema}{local-schema};
307 304
308 # czasowniki 305 # czasowniki
309 pos=ger,phrase=np: 306 pos=ger,phrase=np:
NKJP2/validateMorphology.ml
@@ -35,7 +35,7 @@ let rec add_ntoken stats = function
35 Token t -> 35 Token t ->
36 (try 36 (try
37 let nlemma,ncat,ninterp = get_ntoken t.attrs in 37 let nlemma,ncat,ninterp = get_ntoken t.attrs in
38 - StringQMap.add stats (nlemma ^ "\t" ^ ncat ^ "\t" ^ ENIAMtokens.string_of_interps [ninterp]) 38 + StringQMap.add stats (nlemma ^ "\t" ^ ncat ^ "\t" ^ ENIAMtagset.render [ninterp])
39 with Not_found -> stats) 39 with Not_found -> stats)
40 | Seq l -> Xlist.fold l stats add_ntoken 40 | Seq l -> Xlist.fold l stats add_ntoken
41 | Variant l -> Xlist.fold l stats add_ntoken 41 | Variant l -> Xlist.fold l stats add_ntoken
@@ -228,7 +228,7 @@ let get_lemma_cat_interp = function
228 | nlemma,lemma,"subst","subst",[["pl"];c;g],[["pl"];["nom";"voc"];_] -> lemma,"subst",[["pl"];c;g] 228 | nlemma,lemma,"subst","subst",[["pl"];c;g],[["pl"];["nom";"voc"];_] -> lemma,"subst",[["pl"];c;g]
229 (* | "5","5","adj","dig",ninterp,[] -> "piąty","adj",ninterp 229 (* | "5","5","adj","dig",ninterp,[] -> "piąty","adj",ninterp
230 | "6","6","adj","dig",ninterp,[] -> "szósty","adj",ninterp *) 230 | "6","6","adj","dig",ninterp,[] -> "szósty","adj",ninterp *)
231 - (* | "adj","ppas",ninterp,interp -> print_endline (ENIAMtokens.string_of_interps [ninterp] ^ " " ^ ENIAMtokens.string_of_interps [interp]); raise Not_found *) 231 + (* | "adj","ppas",ninterp,interp -> print_endline (ENIAMtagset.render [ninterp] ^ " " ^ ENIAMtagset.render [interp]); raise Not_found *)
232 | _ -> raise Not_found 232 | _ -> raise Not_found
233 233
234 234
@@ -323,7 +323,7 @@ let rec validate_ntoken_token name id_div paragraph stats = function
323 (try 323 (try
324 let nlemma,ncat,ninterp = get_ntoken t.attrs in 324 let nlemma,ncat,ninterp = get_ntoken t.attrs in
325 process_ntoken2 stats 1 name id_div t.orth t.beg paragraph nlemma ncat ninterp 325 process_ntoken2 stats 1 name id_div t.orth t.beg paragraph nlemma ncat ninterp
326 - (* print_endline (nlemma ^ "\t" ^ ncat ^ "\t" ^ ENIAMtokens.string_of_interps [ninterp]); 326 + (* print_endline (nlemma ^ "\t" ^ ncat ^ "\t" ^ ENIAMtagset.render [ninterp]);
327 Printf.printf "%s\t%d\t%s\t%d\n" name id_div t.orth t.beg; 327 Printf.printf "%s\t%d\t%s\t%d\n" name id_div t.orth t.beg;
328 print_endline paragraph; 328 print_endline paragraph;
329 stats *) 329 stats *)
corpora/CONLL2.ml
@@ -46,7 +46,7 @@ let load_token beg compound in_channel =
46 [a;b] -> (try int_of_string b - int_of_string a with _ -> failwith "load_token: interval id") 46 [a;b] -> (try int_of_string b - int_of_string a with _ -> failwith "load_token: interval id")
47 | _ -> failwith "load_token: interval id" in 47 | _ -> failwith "load_token: interval id" in
48 raise (Interval_id len) in 48 raise (Interval_id len) in
49 - let pos,tags = match ENIAMtokens.parse_postags interp with [x] -> x | _ -> failwith "n_token" in 49 + let pos,tags = match ENIAMtagset.parse interp with [x] -> x | _ -> failwith "n_token" in
50 {empty_token_env with orth = orth; beg=beg; len=len; next=next; 50 {empty_token_env with orth = orth; beg=beg; len=len; next=next;
51 token = Lemma(lemma,pos,[tags])}, next, id, sl, sem in 51 token = Lemma(lemma,pos,[tags])}, next, id, sl, sem in
52 let line = input_line in_channel in 52 let line = input_line in_channel in
@@ -139,7 +139,7 @@ let get_tagset corpus =
139 Int.fold 1 (ExtArray.size tokens - 1) qmap (fun qmap i -> 139 Int.fold 1 (ExtArray.size tokens - 1) qmap (fun qmap i ->
140 let t = ExtArray.get tokens i in 140 let t = ExtArray.get tokens i in
141 match t.token with 141 match t.token with
142 - Lemma(lemma,cat,interp) -> StringQMap.add qmap (cat ^ ":" ^ ENIAMtokens.string_of_interps interp) 142 + Lemma(lemma,cat,interp) -> StringQMap.add qmap (cat ^ ":" ^ ENIAMtagset.render interp)
143 | _ -> failwith "get_tagset")) 143 | _ -> failwith "get_tagset"))
144 144
145 let numbers = StringSet.of_list ["sg";"pl"] 145 let numbers = StringSet.of_list ["sg";"pl"]
@@ -315,9 +315,9 @@ let convert_tagset corpus =
315 | _,"discourse:emo",_ -> "discourse:emo" 315 | _,"discourse:emo",_ -> "discourse:emo"
316 | (Lemma(lemma1,"subst",[[_] :: [c1] :: _]) as s),"case",(Lemma(lemma2,"prep",[[c2] :: _]) as t) -> 316 | (Lemma(lemma1,"subst",[[_] :: [c1] :: _]) as s),"case",(Lemma(lemma2,"prep",[[c2] :: _]) as t) ->
317 if c1 = c2 then "subst" ^ " -> case -> " ^ "prep" else ENIAMtokens.string_of_token s ^ " -> " ^ "case" ^ " -> " ^ ENIAMtokens.string_of_token t 317 if c1 = c2 then "subst" ^ " -> case -> " ^ "prep" else ENIAMtokens.string_of_token s ^ " -> " ^ "case" ^ " -> " ^ ENIAMtokens.string_of_token t
318 - | Lemma(lemma1,cat1,interp1),"case",Lemma(lemma2,"adv",interp2) -> cat1 ^ ":" ^ ENIAMtokens.string_of_interps interp1 ^ " -> case -> " ^ lemma2 ^ ":" ^ "adv" ^ ":" ^ ENIAMtokens.string_of_interps interp2 318 + | Lemma(lemma1,cat1,interp1),"case",Lemma(lemma2,"adv",interp2) -> cat1 ^ ":" ^ ENIAMtagset.render interp1 ^ " -> case -> " ^ lemma2 ^ ":" ^ "adv" ^ ":" ^ ENIAMtagset.render interp2
319 | Lemma(lemma1,cat1,interp1),label,Lemma(lemma2,cat2,interp2) -> 319 | Lemma(lemma1,cat1,interp1),label,Lemma(lemma2,cat2,interp2) ->
320 - cat1 ^ ":" ^ ENIAMtokens.string_of_interps interp1 ^ " -> " ^ label ^ " -> " ^ cat2 ^ ":" ^ ENIAMtokens.string_of_interps interp2 320 + cat1 ^ ":" ^ ENIAMtagset.render interp1 ^ " -> " ^ label ^ " -> " ^ cat2 ^ ":" ^ ENIAMtagset.render interp2
321 | s,label,t -> ENIAMtokens.string_of_token s ^ " -> " ^ label ^ " -> " ^ ENIAMtokens.string_of_token t 321 | s,label,t -> ENIAMtokens.string_of_token s ^ " -> " ^ label ^ " -> " ^ ENIAMtokens.string_of_token t
322 322
323 let list_dependencies corpus = 323 let list_dependencies corpus =
@@ -348,12 +348,12 @@ let string_of_sem sem =
348 if sem = "" then "" else "[" ^ sem ^ "]" 348 if sem = "" then "" else "[" ^ sem ^ "]"
349 349
350 let string_of_lci d = 350 let string_of_lci d =
351 - let interp = ENIAMtokens.string_of_interps d.interp in 351 + let interp = ENIAMtagset.render d.interp in
352 if interp = "" then Printf.sprintf "%s,%s" d.lemma d.cat 352 if interp = "" then Printf.sprintf "%s,%s" d.lemma d.cat
353 else Printf.sprintf "%s,%s:%s" d.lemma d.cat interp 353 else Printf.sprintf "%s,%s:%s" d.lemma d.cat interp
354 354
355 let string_of_phrase (phrase,interp) = 355 let string_of_phrase (phrase,interp) =
356 - let interp = ENIAMtokens.string_of_interps interp in 356 + let interp = ENIAMtagset.render interp in
357 if interp = "" then phrase 357 if interp = "" then phrase
358 else Printf.sprintf "%s:%s" phrase interp 358 else Printf.sprintf "%s:%s" phrase interp
359 359
@@ -585,9 +585,9 @@ let rec flatten_coordination is_coord ulabel usem = function
585 585
586 let string_of_dependency2 is_coord (lemma1,cat1,interp1) label sem (lemma2,cat2,interp2) = 586 let string_of_dependency2 is_coord (lemma1,cat1,interp1) label sem (lemma2,cat2,interp2) =
587 (if is_coord then "COORD " else "") ^ 587 (if is_coord then "COORD " else "") ^
588 - lemma1 ^ ":" ^ cat1 ^ ":" ^ ENIAMtokens.string_of_interps interp1 ^ 588 + lemma1 ^ ":" ^ cat1 ^ ":" ^ ENIAMtagset.render interp1 ^
589 " -> " ^ label ^ (if sem = "" then "" else "["^sem^"]") ^ " -> " 589 " -> " ^ label ^ (if sem = "" then "" else "["^sem^"]") ^ " -> "
590 - (*^ lemma2 ^ ":"*) ^ cat2 ^ ":" ^ ENIAMtokens.string_of_interps interp2 590 + (*^ lemma2 ^ ":"*) ^ cat2 ^ ":" ^ ENIAMtagset.render interp2
591 591
592 type sel = Any | Value of string list | Agr of string 592 type sel = Any | Value of string list | Agr of string
593 type coord = Coord | Gen 593 type coord = Coord | Gen
@@ -914,34 +914,34 @@ let rec split_tree forest = function
914 914
915 (* let rec rules_of_tree2 = function 915 (* let rec rules_of_tree2 = function
916 Dep({sons=[]} as d) -> 916 Dep({sons=[]} as d) ->
917 - d.lemma ^ ":" ^ d.cat ^ ":" ^ ENIAMtokens.string_of_interps d.interp 917 + d.lemma ^ ":" ^ d.cat ^ ":" ^ ENIAMtagset.render d.interp
918 | Dep({sons=[Dep d2]} as d) -> 918 | Dep({sons=[Dep d2]} as d) ->
919 - "[ " ^ d.lemma ^ ":" ^ d.cat ^ ":" ^ ENIAMtokens.string_of_interps d.interp ^ 919 + "[ " ^ d.lemma ^ ":" ^ d.cat ^ ":" ^ ENIAMtagset.render d.interp ^
920 " -> " ^ d2.label ^ " -> " ^ rules_of_tree2 (Dep d2) ^ " ]" 920 " -> " ^ d2.label ^ " -> " ^ rules_of_tree2 (Dep d2) ^ " ]"
921 | Dep({sons=[Dep d2;Dep d3]} as d) -> 921 | Dep({sons=[Dep d2;Dep d3]} as d) ->
922 - "[ " ^ d.lemma ^ ":" ^ d.cat ^ ":" ^ ENIAMtokens.string_of_interps d.interp ^ 922 + "[ " ^ d.lemma ^ ":" ^ d.cat ^ ":" ^ ENIAMtagset.render d.interp ^
923 " -> " ^ d2.label ^ " -> " ^ rules_of_tree2 (Dep d2) ^ " | " ^ d3.label ^ " -> " ^ rules_of_tree2 (Dep d3) ^ " ]" 923 " -> " ^ d2.label ^ " -> " ^ rules_of_tree2 (Dep d2) ^ " | " ^ d3.label ^ " -> " ^ rules_of_tree2 (Dep d3) ^ " ]"
924 | _ -> failwith "rules_of_tree2" *) 924 | _ -> failwith "rules_of_tree2" *)
925 925
926 (* let rec rules_of_tree2 = function 926 (* let rec rules_of_tree2 = function
927 Dep({sons=[]} as d) -> 927 Dep({sons=[]} as d) ->
928 - "_:" ^ d.cat ^ ":" ^ ENIAMtokens.string_of_interps d.interp 928 + "_:" ^ d.cat ^ ":" ^ ENIAMtagset.render d.interp
929 | Dep({sons=[Dep d2]} as d) -> 929 | Dep({sons=[Dep d2]} as d) ->
930 - "[ _:" ^ d.cat ^ ":" ^ ENIAMtokens.string_of_interps d.interp ^ 930 + "[ _:" ^ d.cat ^ ":" ^ ENIAMtagset.render d.interp ^
931 " -> " ^ d2.label ^ " -> " ^ rules_of_tree2 (Dep d2) ^ " ]" 931 " -> " ^ d2.label ^ " -> " ^ rules_of_tree2 (Dep d2) ^ " ]"
932 | Dep({sons=[Dep d2;Dep d3]} as d) -> 932 | Dep({sons=[Dep d2;Dep d3]} as d) ->
933 - "[ _:" ^ d.cat ^ ":" ^ ENIAMtokens.string_of_interps d.interp ^ 933 + "[ _:" ^ d.cat ^ ":" ^ ENIAMtagset.render d.interp ^
934 " -> " ^ d2.label ^ " -> " ^ rules_of_tree2 (Dep d2) ^ " | " ^ d3.label ^ " -> " ^ rules_of_tree2 (Dep d3) ^ " ]" 934 " -> " ^ d2.label ^ " -> " ^ rules_of_tree2 (Dep d2) ^ " | " ^ d3.label ^ " -> " ^ rules_of_tree2 (Dep d3) ^ " ]"
935 | _ -> failwith "rules_of_tree2" *) 935 | _ -> failwith "rules_of_tree2" *)
936 936
937 let rec rules_of_tree2 = function 937 let rec rules_of_tree2 = function
938 Dep({sons=[]} as d) -> 938 Dep({sons=[]} as d) ->
939 - "_:" ^ d.cat (*^ ":" ^ ENIAMtokens.string_of_interps d.interp*) 939 + "_:" ^ d.cat (*^ ":" ^ ENIAMtagset.render d.interp*)
940 | Dep({sons=[Dep d2]} as d) -> 940 | Dep({sons=[Dep d2]} as d) ->
941 - "[ _:" ^ d.cat ^ (*":" ^ ENIAMtokens.string_of_interps d.interp ^*) 941 + "[ _:" ^ d.cat ^ (*":" ^ ENIAMtagset.render d.interp ^*)
942 " -> " ^ d2.label ^ " -> " ^ rules_of_tree2 (Dep d2) ^ " ]" 942 " -> " ^ d2.label ^ " -> " ^ rules_of_tree2 (Dep d2) ^ " ]"
943 | Dep({sons=[Dep d2;Dep d3]} as d) -> 943 | Dep({sons=[Dep d2;Dep d3]} as d) ->
944 - "[ _:" ^ d.cat ^ (*":" ^ ENIAMtokens.string_of_interps d.interp ^*) 944 + "[ _:" ^ d.cat ^ (*":" ^ ENIAMtagset.render d.interp ^*)
945 " -> " ^ d2.label ^ " -> " ^ rules_of_tree2 (Dep d2) ^ " | " ^ d3.label ^ " -> " ^ rules_of_tree2 (Dep d3) ^ " ]" 945 " -> " ^ d2.label ^ " -> " ^ rules_of_tree2 (Dep d2) ^ " | " ^ d3.label ^ " -> " ^ rules_of_tree2 (Dep d3) ^ " ]"
946 | _ -> failwith "rules_of_tree2" 946 | _ -> failwith "rules_of_tree2"
947 947
lexSemantics/ENIAMvalence.ml
@@ -59,6 +59,7 @@ let transform_phrase pos lemma = function
59 | XP as morf -> [morf] 59 | XP as morf -> [morf]
60 | AdjA as morf -> [morf] 60 | AdjA as morf -> [morf]
61 | PadvP as morf -> [morf] 61 | PadvP as morf -> [morf]
  62 + | AdMod _ as morf -> [morf]
62 | Qub as morf -> [morf] 63 | Qub as morf -> [morf]
63 | FixedP _ as morf -> [morf] 64 | FixedP _ as morf -> [morf]
64 | SymbolP as morf -> [morf] 65 | SymbolP as morf -> [morf]
lexSemantics/ENIAMwalRenderer.ml
@@ -270,6 +270,8 @@ let render_phrase_cat cat role node = function
270 | Measure(AllUAgr) -> Tensor[Atom "measure"; AVar "unumber"; AVar "ucase"; AVar "ugender"; AVar "uperson"] *) 270 | Measure(AllUAgr) -> Tensor[Atom "measure"; AVar "unumber"; AVar "ucase"; AVar "ugender"; AVar "uperson"] *)
271 | Or -> Tensor[Atom "or"; Atom cat; Atom role; Atom node] 271 | Or -> Tensor[Atom "or"; Atom cat; Atom role; Atom node]
272 | Qub -> Tensor[Atom "qub"; Atom cat; Atom role; Atom node] 272 | Qub -> Tensor[Atom "qub"; Atom cat; Atom role; Atom node]
  273 + | AdMod(GradAgr) -> Tensor[Atom "admod"; AVar "grad"; Atom cat; Atom role; Atom node]
  274 + | AdMod(Grad grad) -> Tensor[Atom "admod"; Atom grad; Atom cat; Atom role; Atom node]
273 (* | Inclusion -> Tensor[Atom "inclusion"] 275 (* | Inclusion -> Tensor[Atom "inclusion"]
274 | Adja -> Tensor[Atom "adja"] 276 | Adja -> Tensor[Atom "adja"]
275 | Aglt -> Tensor[Atom "aglt"; AVar "number"; AVar "person"] 277 | Aglt -> Tensor[Atom "aglt"; AVar "number"; AVar "person"]
lexSemantics/ENIAMwalStringOf.ml
@@ -86,6 +86,7 @@ let gender = function
86 86
87 let grad = function 87 let grad = function
88 Grad s -> s 88 Grad s -> s
  89 + | GradAgr -> "agr"
89 | GradUndef -> "_" 90 | GradUndef -> "_"
90 91
91 (* let psem = function 92 (* let psem = function
@@ -154,6 +155,7 @@ let rec phrase = function
154 (* | Num(c,a) -> "num(" ^ case c ^ "," ^ acm a ^ ")" *) 155 (* | Num(c,a) -> "num(" ^ case c ^ "," ^ acm a ^ ")" *)
155 | Or -> "or" 156 | Or -> "or"
156 | Qub -> "qub" 157 | Qub -> "qub"
  158 + | AdMod g -> "admod(" ^ grad g ^ ")"
157 | Inclusion -> "inclusion" 159 | Inclusion -> "inclusion"
158 | Pro -> "pro" 160 | Pro -> "pro"
159 | ProNG -> "prong" 161 | ProNG -> "prong"
lexSemantics/ENIAMwalTypes.ml
@@ -29,7 +29,7 @@ type comp = Comp of string | Zeby | Gdy | CompUndef
29 type comp_type = Int | Rel | CompTypeUndef (*| CompTypeAgr*) 29 type comp_type = Int | Rel | CompTypeUndef (*| CompTypeAgr*)
30 type number = Number of string | NumberUndef | NumberAgr 30 type number = Number of string | NumberUndef | NumberAgr
31 type gender = Gender of string | GenderUndef | GenderAgr | Genders of string list 31 type gender = Gender of string | GenderUndef | GenderAgr | Genders of string list
32 -type grad = Grad of string | GradUndef 32 +type grad = Grad of string | GradUndef | GradAgr
33 (* type psem = Psem | Pnosem *) 33 (* type psem = Psem | Pnosem *)
34 (* type refl = (*ReflEmpty |*) ReflTrue | ReflFalse | ReflUndef *) 34 (* type refl = (*ReflEmpty |*) ReflTrue | ReflFalse | ReflUndef *)
35 (* type acm = Acm of string | AcmUndef *) 35 (* type acm = Acm of string | AcmUndef *)
@@ -87,6 +87,7 @@ type phrase =
87 (* | Refl 87 (* | Refl
88 | Recip *) 88 | Recip *)
89 | Qub 89 | Qub
  90 + | AdMod of grad
90 | Inclusion 91 | Inclusion
91 | Pro 92 | Pro
92 | ProNG 93 | ProNG
morphology/ENIAMinflexion.ml
@@ -108,13 +108,15 @@ let alt = ref (StringMap.empty : (bool * t) list StringMap.t)
108 let stems = ref (StringMap.empty : (string * string * StringSet.t) list StringMap.t) 108 let stems = ref (StringMap.empty : (string * string * StringSet.t) list StringMap.t)
109 let rules = ref ([] : (StringMap.key * ENIAMmorphologyRules.CharTrees.t) list) 109 let rules = ref ([] : (StringMap.key * ENIAMmorphologyRules.CharTrees.t) list)
110 let wyglos = ref ([] : (StringMap.key * ENIAMmorphologyRules.CharTrees.t) list) 110 let wyglos = ref ([] : (StringMap.key * ENIAMmorphologyRules.CharTrees.t) list)
  111 +let lemmata = ref StringSet.empty
111 112
112 let initialize () = 113 let initialize () =
113 alt := prepare_alt StringMap.empty alt_filename; 114 alt := prepare_alt StringMap.empty alt_filename;
114 alt := prepare_alt !alt alt_supplement_filename; 115 alt := prepare_alt !alt alt_supplement_filename;
115 stems := load_stems stem_filename; 116 stems := load_stems stem_filename;
116 rules := prepare_rules rules_filename; 117 rules := prepare_rules rules_filename;
117 - wyglos := prepare_wyglos wyglos_filename 118 + wyglos := prepare_wyglos wyglos_filename;
  119 + lemmata := StringSet.of_list (File.load_lines lemmata_filename)
118 120
119 (* let initialize () = 121 (* let initialize () =
120 alt := prepare_alt StringMap.empty "resources/alt.tab"; 122 alt := prepare_alt StringMap.empty "resources/alt.tab";
morphology/ENIAMmorphologyTypes.ml
@@ -65,5 +65,6 @@ let alt_filename = resource_path ^ "/morphology/alt.tab"
65 let stem_filename = resource_path ^ "/morphology/stem.tab" 65 let stem_filename = resource_path ^ "/morphology/stem.tab"
66 let rules_filename = resource_path ^ "/morphology/freq_rules.tab" 66 let rules_filename = resource_path ^ "/morphology/freq_rules.tab"
67 let wyglos_filename = resource_path ^ "/morphology/wyglos.tab" 67 let wyglos_filename = resource_path ^ "/morphology/wyglos.tab"
  68 +let lemmata_filename = resource_path ^ "/morphology/lemmata.tab"
68 69
69 let alt_supplement_filename = resource_path ^ "/morphology/alt_supplement.tab" 70 let alt_supplement_filename = resource_path ^ "/morphology/alt_supplement.tab"
morphology/generate.ml
  1 +open Xstd
  2 +
1 let nlp_resources_path = "../../NLP resources/" 3 let nlp_resources_path = "../../NLP resources/"
2 let sgjp_path = nlp_resources_path ^ "SGJP/" 4 let sgjp_path = nlp_resources_path ^ "SGJP/"
3 let sgjp_filename = "sgjp-20170730.tab.gz" 5 let sgjp_filename = "sgjp-20170730.tab.gz"
@@ -35,9 +37,18 @@ let generate_alt rules_filename path filename out_filename =
35 let dict = Dict.remove_validated_forms dict in 37 let dict = Dict.remove_validated_forms dict in
36 Dict.print out_filename dict 38 Dict.print out_filename dict
37 39
  40 +let generate_lemmata path filename out_filename =
  41 + let dict = Dict.load_tab (path ^ filename) in
  42 + let lemmata = Xlist.fold dict StringSet.empty (fun set e ->
  43 + StringSet.add set (Stem.simplify_lemma e.ENIAMmorphologyTypes.lemma)) in
  44 + File.file_out out_filename (fun file ->
  45 + StringSet.iter lemmata (Printf.fprintf file "%s\n"))
  46 +
  47 +
38 let _ = 48 let _ =
39 - Dict.generate_rule_frequencies_list interp_compound_rule_trees sources "resources/freq_rules.tab"; 49 + (* Dict.generate_rule_frequencies_list interp_compound_rule_trees sources "resources/freq_rules.tab";
40 generate_alt "resources/freq_rules.tab" sgjp_path sgjp_filename "resources/alt.tab"; 50 generate_alt "resources/freq_rules.tab" sgjp_path sgjp_filename "resources/alt.tab";
41 Dict.generate_stem_dict "resources/freq_rules.tab" sgjp_path sgjp_filename "resources/stem.tab"; 51 Dict.generate_stem_dict "resources/freq_rules.tab" sgjp_path sgjp_filename "resources/stem.tab";
42 - Dict.generate_wyglos sgjp_path sgjp_filename "resources/wyglos.tab"; 52 + Dict.generate_wyglos sgjp_path sgjp_filename "resources/wyglos.tab"; *)
  53 + generate_lemmata sgjp_path sgjp_filename "resources/lemmata.tab";
43 () 54 ()