Commit dd02bfeb64ff545df3950a4d8a4910a09b4c712d
Merge branch 'integration' into dep_trees
Showing
98 changed files
with
512041 additions
and
494159 deletions
Too many changes to show.
To preserve performance only 49 of 98 files are displayed.
LCGlexicon/ENIAM_LCGlexicon.ml
... | ... | @@ -71,6 +71,30 @@ let assign_quantifiers (selectors,rule,weight) = |
71 | 71 | let quant = merge_quant categories quant in |
72 | 72 | selectors, (bracket,quant,syntax),(rule,weight) |
73 | 73 | |
74 | +let rec check_quantifiers_int_rec (selectors,syntax) quants = function | |
75 | + Atom x -> () | |
76 | + | AVar "schema" -> () | |
77 | + | AVar x -> | |
78 | + if not (SelectorSet.mem quants (selector_of_string x)) | |
79 | + then failwith ("Variable '" ^ x ^ "' is not quantified in rule " ^ string_of_selectors selectors ^ ": " ^ ENIAM_LCGstringOf.grammar_symbol 0 syntax) | |
80 | + | With l -> Xlist.iter l (check_quantifiers_int_rec (selectors,syntax) quants) | |
81 | + | Zero -> () | |
82 | + | Top -> () | |
83 | + | |
84 | +let rec check_quantifiers_rec rule quants = function | |
85 | + Tensor l -> Xlist.iter l (check_quantifiers_int_rec rule quants) | |
86 | + | Plus l -> Xlist.iter l (check_quantifiers_rec rule quants) | |
87 | + | Imp(s,d,t) -> check_quantifiers_rec rule quants s; check_quantifiers_rec rule quants t | |
88 | + | One -> () | |
89 | + | ImpSet(s,l) -> check_quantifiers_rec rule quants s; Xlist.iter l (fun (_,t) -> check_quantifiers_rec rule quants t) | |
90 | + | Star s -> check_quantifiers_rec rule quants s | |
91 | + | Maybe s -> check_quantifiers_rec rule quants s | |
92 | + | _ -> failwith "check_quantifiers_rec" | |
93 | + | |
94 | +let check_quantifiers (selectors,(bracket,quant,syntax),_) = | |
95 | + let quants = Xlist.fold quant SelectorSet.empty (fun quants (q,_) -> SelectorSet.add quants q) in | |
96 | + check_quantifiers_rec (selectors,syntax) quants syntax | |
97 | + | |
74 | 98 | let assign_semantics (selectors,(bracket,quant,syntax),(rule,weight)) = |
75 | 99 | let semantics = try |
76 | 100 | let raised,rule = get_raised [] rule in |
... | ... | @@ -135,6 +159,7 @@ let dict_of_grammar grammar = |
135 | 159 | let make_rules x_flag filename = |
136 | 160 | let lexicon = ENIAM_LCGlexiconParser.load_lexicon filename in |
137 | 161 | let lexicon = List.rev (Xlist.rev_map lexicon assign_quantifiers) in |
162 | + Xlist.iter lexicon check_quantifiers; | |
138 | 163 | let lexicon = List.rev (Xlist.rev_map lexicon assign_semantics) in |
139 | 164 | let lexicon = if x_flag then List.rev (Xlist.rev_map lexicon add_x_args) else lexicon in |
140 | 165 | dict_of_grammar lexicon |
... | ... |
LCGlexicon/ENIAM_LCGlexiconTypes.ml
LCGlexicon/ENIAMcategoriesPL.ml
... | ... | @@ -36,7 +36,7 @@ let selector_values = Xlist.fold [ |
36 | 36 | "match-result";"url";"email";"obj-id";"adj";"adjc";"adjp";"adja"; |
37 | 37 | "adv";"ger";"pact";"ppas";"fin";"bedzie";"praet";"winien";"impt"; |
38 | 38 | "imps";"pred";"aglt";"inf";"pcon";"pant";"qub";"part";"comp";"conj";"interj"; |
39 | - "sinterj";"burk";"interp";"unk";"html-tag"]; | |
39 | + "sinterj";"burk";"interp";"xxx";"unk";"html-tag"]; | |
40 | 40 | Pos2, []; |
41 | 41 | Cat, []; |
42 | 42 | Number, all_numbers; |
... | ... | @@ -365,6 +365,8 @@ let clarify_categories proper cat = function |
365 | 365 | | lemma,"interp",[] -> [{empty_cats with lemma=lemma; pos="interp"; pos2="interp"}] |
366 | 366 | | lemma,"unk",[] -> |
367 | 367 | [{empty_cats with lemma=lemma; pos="unk"; pos2="noun"; numbers=all_numbers; cases=all_cases; genders=all_genders; persons=["ter"]}] |
368 | + | lemma,"xxx",[] -> | |
369 | + [{empty_cats with lemma=lemma; pos="xxx"; pos2="noun"; numbers=all_numbers; cases=all_cases; genders=all_genders; persons=["ter"]}] | |
368 | 370 | | lemma,"html-tag",[] -> [{empty_cats with lemma=lemma; pos="html-tag"; pos2="html-tag"}] |
369 | 371 | | lemma,c,l -> failwith ("clarify_categories: " ^ lemma ^ ":" ^ c ^ ":" ^ (String.concat ":" (Xlist.map l (String.concat ".")))) |
370 | 372 | |
... | ... | @@ -557,9 +559,9 @@ let pos_categories = Xlist.fold [ |
557 | 559 | "imps",[Lemma;(*NewLemma;*)Cat;Number;Gender;Person;Aspect;Negation;Mood;Tense;]; |
558 | 560 | "pred",[Lemma;(*NewLemma;*)Cat;Number;Gender;Person;Aspect;Negation;Mood;Tense;]; |
559 | 561 | "aglt",[Lemma;Number;Person;Aspect;]; |
560 | - "inf",[Lemma;(*NewLemma;*)Cat;Aspect;]; | |
561 | - "pcon",[Lemma;(*NewLemma;*)Cat;Aspect;]; | |
562 | - "pant",[Lemma;(*NewLemma;*)Cat;Aspect;]; | |
562 | + "inf",[Lemma;(*NewLemma;*)Cat;Aspect;Negation;]; | |
563 | + "pcon",[Lemma;(*NewLemma;*)Cat;Aspect;Negation;]; | |
564 | + "pant",[Lemma;(*NewLemma;*)Cat;Aspect;Negation;]; | |
563 | 565 | "qub",[Lemma;]; |
564 | 566 | "part",[Lemma;]; |
565 | 567 | "comp",[Lemma;];(* ctype *) |
... | ... | @@ -569,5 +571,6 @@ let pos_categories = Xlist.fold [ |
569 | 571 | "burk",[Lemma;]; |
570 | 572 | "interp",[Lemma;]; |
571 | 573 | "unk",[Lemma;Number;Case;Gender;Person;]; |
574 | + "xxx",[Lemma;Number;Case;Gender;Person;]; | |
572 | 575 | "html-tag",[Lemma;]; |
573 | 576 | ] StringMap.empty (fun map (k,l) -> StringMap.add map k l) |
... | ... |
LCGlexicon/TODO
LCGlexicon/resources/lexicon-pl.dic
... | ... | @@ -104,11 +104,14 @@ pos=subst,case=gen,nsem=measure: |
104 | 104 | measure*sg*case*n2*person{\num*number*case*gender*person*rec}{schema}{\(1+qub),/(1+inclusion)}: measure_weight; # UWAGA: number "sg" i gender "n2", żeby uzgadniać z podmiotem czasownika |
105 | 105 | |
106 | 106 | # frazy przyimkowe |
107 | +#lemma!=temu,pos=prep: prepnp*lemma*case{\(1+advp*T),/np*T*case*T*T}{\(1+qub),/(1+inclusion)}; | |
108 | +#lemma!=temu,pos=prep: prepadjp*lemma*case{\(1+advp*T),/adjp*T*case*T}{\(1+qub),/(1+inclusion)}; | |
107 | 109 | pos=prep: prepnp*lemma*case{\(1+advp*T),/np*T*case*T*T}{\(1+qub),/(1+inclusion)}; |
108 | 110 | pos=prep: prepadjp*lemma*case{\(1+advp*T),/adjp*T*case*T}{\(1+qub),/(1+inclusion)}; |
109 | 111 | lemma=po,pos=prep: QUANT[case=postp] prepadjp*lemma*case{\(1+advp*T),/(adjp*sg*dat*m1+adjp*T*postp*T)}{\(1+qub),/(1+inclusion)}; # po polsku, po kreciemu |
110 | 112 | lemma=z,pos=prep: QUANT[case=postp] prepadjp*lemma*case{\(1+advp*T),/adjp*sg*nom*f}{\(1+qub),/(1+inclusion)}; # z bliska |
111 | 113 | lemma=na,pos=prep: QUANT[case=postp] prepadjp*lemma*case{\(1+advp*T),/advp*T}{\(1+qub),/(1+inclusion)}; # na lewo |
114 | +lemma=temu,pos=prep: prepnp*lemma*case\np*T*case*T*T; # chwilę temu | |
112 | 115 | |
113 | 116 | # przimkowe określenia czasu |
114 | 117 | lemma=z,pos=prep,case=gen: prepnp*lemma*case{\(1+advp*T),/(day-month+day+year+date+hour+hour-minute)}{\(1+qub),/(1+inclusion)}; |
... | ... | @@ -292,6 +295,7 @@ lemma=[,pos=interp: (inclusion/rparen2)/(np*T*T*T*T+ip*T*T*T+adjp*T*T*T+pr |
292 | 295 | lemma=),pos=interp: rparen; |
293 | 296 | lemma=],pos=interp: rparen2; |
294 | 297 | pos=unk: np*number*case*gender*person; |
298 | +pos=xxx: np*number*case*gender*person; | |
295 | 299 | |
296 | 300 | lemma=<conll_root>,pos=interp: <conll_root>/(ip*T*T*T+cp*int*T+np*sg*voc*T*T+interj); |
297 | 301 | |
... | ... | @@ -302,7 +306,7 @@ lemma=<sentence>,pos=interp: BRACKET <root>/s; |
302 | 306 | |
303 | 307 | lemma=:,pos=interp: BRACKET or; |
304 | 308 | lemma=:s,pos=interp: BRACKET <colon>\<speaker>; |
305 | -lemma=:s,pos=interp: BRACKET (<colon>\<speaker>)/<squery>; | |
309 | +lemma=:s,pos=interp: BRACKET (<colon>\<speaker>)/<squery>; #FIXME <squery> nie jest nigdzie generowane | |
306 | 310 | lemma=<or-sentence>,pos=interp: BRACKET <root>/s; |
307 | 311 | lemma=<or-sentence>,pos=interp: BRACKET ((<root>/<speaker-end>)/(ip*T*T*T/or))/or2 SEM[λxλyλz.NODE(yx,z)]; |
308 | 312 | lemma=</or-sentence>,pos=interp: BRACKET or2\?(ip*T*T*T+cp*int*T+np*sg*voc*T*T+interj); |
... | ... |
LCGlexicon/test.ml
... | ... | @@ -19,6 +19,7 @@ |
19 | 19 | |
20 | 20 | open ENIAM_LCGlexiconTypes |
21 | 21 | open ENIAM_LCGtypes |
22 | +open Xstd | |
22 | 23 | |
23 | 24 | let rules = ENIAM_LCGlexicon.make_rules false ENIAM_LCGlexiconTypes.rules_filename |
24 | 25 | (* let rules = ENIAM_LCGlexicon.make_rules false "resources/lexicon-pl.dic" *) |
... | ... | @@ -97,19 +98,32 @@ let create_chart valence tokens last = |
97 | 98 | ENIAM_LCGrenderer.reset_variable_names (); |
98 | 99 | ENIAM_LCGrenderer.add_variable_numbers (); |
99 | 100 | let cats = ENIAMcategoriesPL.clarify_categories proper ["X"] (lemma,pos,interp) in |
100 | - let l = ENIAM_LCGlexicon.create_entries rules id orth cats valence in | |
101 | + let l = ENIAM_LCGlexicon.create_entries rules id orth cats valence [] in | |
101 | 102 | ENIAM_LCGchart.add_inc_list chart lnode rnode l 0) in |
102 | 103 | chart |
103 | 104 | |
105 | +let create_text_fragments tokens last = | |
106 | + let text_fragments = Array.make last IntMap.empty in | |
107 | + Xlist.iter tokens (fun (id,lnode,rnode,orth,lemma,pos,interp,proper) -> | |
108 | + text_fragments.(lnode) <- IntMap.add text_fragments.(lnode) rnode orth); | |
109 | + Int.iter_down 0 (last - 1) (fun i -> | |
110 | + let map = IntMap.fold text_fragments.(i) text_fragments.(i) (fun map j orth -> | |
111 | + if j = last then map else | |
112 | + IntMap.fold text_fragments.(j) map (fun map k orth2 -> | |
113 | + IntMap.add map k (orth ^ " " ^ orth2))) in | |
114 | + text_fragments.(i) <- map); | |
115 | + text_fragments | |
116 | + | |
104 | 117 | let test_example valence (name,tokens,last) = |
105 | 118 | ENIAM_LCGreductions.reset_variant_label (); |
106 | 119 | let chart = create_chart valence tokens last in |
107 | - ENIAM_LCGlatexOf.print_chart "results/" (name^"1_chart") "a1" chart; | |
120 | + let text_fragments = create_text_fragments tokens last in | |
121 | + ENIAM_LCGlatexOf.print_chart "results/" (name^"1_chart") "a1" text_fragments chart; | |
108 | 122 | let chart,references = ENIAM_LCGchart.lazify chart in |
109 | - ENIAM_LCGlatexOf.print_chart "results/" (name^"2_chart") "a4" chart; | |
123 | + ENIAM_LCGlatexOf.print_chart "results/" (name^"2_chart") "a4" text_fragments chart; | |
110 | 124 | ENIAM_LCGlatexOf.print_references "results/" (name^"2_references") "a4" references; |
111 | 125 | let chart = ENIAM_LCGchart.parse chart references 30. Sys.time in (* uwaga: niejawna zmiana imperatywna w references *) |
112 | - ENIAM_LCGlatexOf.print_chart "results/" (name^"3_chart") "a4" chart; | |
126 | + ENIAM_LCGlatexOf.print_chart "results/" (name^"3_chart") "a4" text_fragments chart; | |
113 | 127 | ENIAM_LCGlatexOf.print_references "results/" (name^"3_references") "a4" references; |
114 | 128 | if ENIAM_LCGchart.is_parsed chart then ( |
115 | 129 | let term = ENIAM_LCGchart.get_parsed_term chart in |
... | ... |
LCGparser/ENIAM_LCGgraphOf.ml
... | ... | @@ -25,6 +25,7 @@ let escape_string s = |
25 | 25 | match String.sub s i 1 with |
26 | 26 | "<" -> t ^ "〈" |
27 | 27 | | ">" -> t ^ "〉" |
28 | + | "\"" -> t ^ "\\\"" | |
28 | 29 | | c -> t ^ c) |
29 | 30 | |
30 | 31 | let string_of_node t = |
... | ... | @@ -58,6 +59,10 @@ let rec print_dependency_tree_rec file edge upper id = function |
58 | 59 | | Val s -> |
59 | 60 | fprintf file " %s [shape=box,label=\"%s\"]\n" id s; |
60 | 61 | print_edge file edge upper id |
62 | + | SetAttr(a,s,t) -> | |
63 | + fprintf file " %s [shape=box,label=\"SetAttr(%s,%s)\"]\n" id a (ENIAM_LCGstringOf.linear_term 0 s); | |
64 | + print_edge file edge upper id; | |
65 | + print_dependency_tree_rec2 file "" id t | |
61 | 66 | | Dot -> () |
62 | 67 | (* fprintf file " %s [shape=box,label=\"Dot\"]\n" id; |
63 | 68 | print_edge file edge upper id*) |
... | ... | @@ -87,17 +92,26 @@ let rec print_simplified_dependency_tree_rec2 file edge upper = function |
87 | 92 | Xlist.iter l (fun (i,t) -> print_simplified_dependency_tree_rec2 file i e t) |
88 | 93 | | Dot -> () |
89 | 94 | | Ref i -> print_edge file edge upper ("x" ^ string_of_int i) |
90 | - | t -> failwith ("print_simplified_dependency_tree_rec: " ^ ENIAM_LCGstringOf.linear_term 0 t) | |
95 | + | SetAttr(a,s,t) -> () | |
96 | + (* fprintf file " %s [shape=box,label=\"SetAttr(%s,%s)\"]\n" id a (ENIAM_LCGstringOf.linear_term 0 s); | |
97 | + print_edge file edge upper id; *) | |
98 | + (* print_simplified_dependency_tree_rec2 file "" id t *) | |
99 | + | Node t -> () | |
100 | + | t -> failwith ("print_simplified_dependency_tree_rec2: " ^ ENIAM_LCGstringOf.linear_term 0 t) | |
91 | 101 | |
92 | 102 | let rec print_simplified_dependency_tree_rec file edge upper id = function |
93 | 103 | Node t -> |
94 | - fprintf file " %s [label=\"%s\\n%s:%s\\n%s\\n%f\"]\n" id t.orth t.lemma t.pos (ENIAM_LCGstringOf.linear_term 0 t.symbol) t.weight; | |
104 | + fprintf file " %s [label=\"%s\\n%s:%s\\n%s\\n%f\"]\n" id (escape_string t.orth) (escape_string t.lemma) t.pos (escape_string (ENIAM_LCGstringOf.linear_term 0 t.symbol)) t.weight; | |
95 | 105 | print_edge file edge upper id; |
96 | 106 | print_simplified_dependency_tree_rec2 file "" id t.args |
97 | 107 | | Variant(e,l) -> |
98 | 108 | fprintf file " %s [shape=diamond,label=\"%s\"]\n" id e; |
99 | 109 | print_edge file edge upper id; |
100 | 110 | Xlist.iter l (fun (i,t) -> print_simplified_dependency_tree_rec file i id (id ^ "y" ^ i) t) |
111 | + (* | SetAttr(a,s,t) -> | |
112 | + fprintf file " %s [shape=box,label=\"SetAttr(%s,%s)\"]\n" id a (ENIAM_LCGstringOf.linear_term 0 s); | |
113 | + print_edge file edge upper id; *) | |
114 | + (* print_simplified_dependency_tree_rec2 file "" id t *) | |
101 | 115 | | Dot -> () |
102 | 116 | | t -> failwith ("print_simplified_dependency_tree_rec: " ^ ENIAM_LCGstringOf.linear_term 0 t) |
103 | 117 | |
... | ... |
LCGparser/ENIAM_LCGlatexOf.ml
... | ... | @@ -199,7 +199,7 @@ let rec grammar_symbol c = function |
199 | 199 | | BracketSet d -> "{\\bf BracketSet}(" ^ direction d ^ ")" |
200 | 200 | | Maybe s -> "?" ^ grammar_symbol 2 s |
201 | 201 | |
202 | -let chart page g = | |
202 | +let chart page text_fragments g = | |
203 | 203 | let layers = ENIAM_LCGchart.fold g IntMap.empty (fun layers (symbol,node1,node2,sem,layer) -> |
204 | 204 | let nodes = try IntMap.find layers layer with Not_found -> IntMap.empty in |
205 | 205 | let content = node2, grammar_symbol 0 symbol, linear_term 0 sem in |
... | ... | @@ -207,16 +207,17 @@ let chart page g = |
207 | 207 | let nodes = IntMap.add_inc nodes node1 [content] (fun l -> content :: l) in |
208 | 208 | IntMap.add layers layer nodes) in |
209 | 209 | let n = match page with "a4" -> "10" | "a1" -> "40" | _ -> "20" in |
210 | - "\\begin{longtable}{|l|l|l|p{" ^ n ^ "cm}|}\n\\hline\n" ^ | |
210 | + "\\begin{longtable}{|l|l|l|l|p{" ^ n ^ "cm}|}\n\\hline\n" ^ | |
211 | 211 | String.concat "" (List.rev (IntMap.fold layers [] (fun l layer nodes -> |
212 | 212 | IntMap.fold nodes l (fun l node1 contents -> |
213 | 213 | Xlist.fold contents l (fun l (node2,symbol,sem) -> |
214 | - (Printf.sprintf "%d & %d--%d & $\\begin{array}{l}%s\\end{array}$ & $%s$\\\\\n\\hline\n" layer node1 node2 symbol sem) :: l))))) ^ | |
214 | + let s = try IntMap.find text_fragments.(node1) node2 with Not_found -> failwith (Printf.sprintf "chart: text_fragment not found %d-%d" node1 node2) in | |
215 | + (Printf.sprintf "%d & %d--%d & %s & $\\begin{array}{l}%s\\end{array}$ & $%s$\\\\\n\\hline\n" layer node1 node2 s symbol sem) :: l))))) ^ | |
215 | 216 | "\\end{longtable}" |
216 | 217 | |
217 | -let print_chart path name page g = | |
218 | +let print_chart path name page text_fragments g = | |
218 | 219 | Xlatex.latex_file_out path name page false (fun file -> |
219 | - Printf.fprintf file "%s\n" (chart page g)); | |
220 | + Printf.fprintf file "%s\n" (chart page text_fragments g)); | |
220 | 221 | Xlatex.latex_compile_and_clean path name |
221 | 222 | |
222 | 223 | |
... | ... |
LCGparser/test.ml
... | ... | @@ -18,6 +18,7 @@ |
18 | 18 | *) |
19 | 19 | |
20 | 20 | open ENIAM_LCGtypes |
21 | +open Xstd | |
21 | 22 | |
22 | 23 | type entry = |
23 | 24 | Basic of grammar_symbol |
... | ... | @@ -90,15 +91,28 @@ let create_chart tokens last = |
90 | 91 | ENIAM_LCGchart.add chart lnode rnode (Bracket(lf,rf,syntax),semantics) 0) in |
91 | 92 | chart |
92 | 93 | |
94 | +let create_text_fragments tokens last = | |
95 | + let text_fragments = Array.make last IntMap.empty in | |
96 | + Xlist.iter tokens (fun (lnode,rnode,orth,lemma,pos,entry) -> | |
97 | + text_fragments.(lnode) <- IntMap.add text_fragments.(lnode) rnode orth); | |
98 | + Int.iter_down 0 (last - 1) (fun i -> | |
99 | + let map = IntMap.fold text_fragments.(i) text_fragments.(i) (fun map j orth -> | |
100 | + if j = last then map else | |
101 | + IntMap.fold text_fragments.(j) map (fun map k orth2 -> | |
102 | + IntMap.add map k (orth ^ " " ^ orth2))) in | |
103 | + text_fragments.(i) <- map); | |
104 | + text_fragments | |
105 | + | |
93 | 106 | let test_example (name,tokens,last) = |
94 | 107 | ENIAM_LCGreductions.reset_variant_label (); |
95 | 108 | let chart = create_chart tokens last in |
96 | - ENIAM_LCGlatexOf.print_chart "results/" (name^"1_chart") "a3" chart; | |
109 | + let text_fragments = create_text_fragments tokens last in | |
110 | + ENIAM_LCGlatexOf.print_chart "results/" (name^"1_chart") "a3" text_fragments chart; | |
97 | 111 | let chart,references = ENIAM_LCGchart.lazify chart in |
98 | - ENIAM_LCGlatexOf.print_chart "results/" (name^"2_chart") "a4" chart; | |
112 | + ENIAM_LCGlatexOf.print_chart "results/" (name^"2_chart") "a4" text_fragments chart; | |
99 | 113 | ENIAM_LCGlatexOf.print_references "results/" (name^"2_references") "a4" references; |
100 | 114 | let chart = ENIAM_LCGchart.parse chart references 30. Sys.time in (* uwaga: niejawna zmiana imperatywna w references *) |
101 | - ENIAM_LCGlatexOf.print_chart "results/" (name^"3_chart") "a4" chart; | |
115 | + ENIAM_LCGlatexOf.print_chart "results/" (name^"3_chart") "a4" text_fragments chart; | |
102 | 116 | ENIAM_LCGlatexOf.print_references "results/" (name^"3_references") "a4" references; |
103 | 117 | if ENIAM_LCGchart.is_parsed chart then ( |
104 | 118 | let term = ENIAM_LCGchart.get_parsed_term chart in |
... | ... |
NKJP2/ENIAM_NKJP.ml
... | ... | @@ -330,6 +330,42 @@ let rec split_front rev n p = |
330 | 330 | if n = 0 then List.rev rev, p else |
331 | 331 | split_front (List.hd p :: rev) (n-1) (List.tl p) |
332 | 332 | |
333 | +let rec combine_three = function | |
334 | + [],[],[] -> [] | |
335 | + | x1 :: l1, x2 :: l2, x3 :: l3 -> (x1,x2,x3) :: combine_three (l1,l2,l3) | |
336 | + | _ -> failwith "combine_three" | |
337 | + | |
338 | +type split = Single of string | Split of (string * string * string * string list) list | Correct | |
339 | + | |
340 | +type err = Err | ErrTagE | TagE | TErr | DErr | CErr | Corr | |
341 | + | |
342 | +let parse_err = function | |
343 | + "ERR" -> Err | |
344 | + | "ERR-TAGE" -> ErrTagE | |
345 | + | "TAGE" -> TagE | |
346 | + | "TERR" -> TErr | |
347 | + | "DERR" -> DErr | |
348 | + | "CERR" -> CErr | |
349 | + | s -> failwith ("parser_err: " ^ s) | |
350 | + | |
351 | +let load_err_corr err_corr_filename = | |
352 | + File.fold_tab err_corr_filename StringMap.empty (fun err_corr -> function | |
353 | + [real_orth;lemma;orth;interp;freq;compos;sgjp;common;err] -> | |
354 | + StringMap.add_inc err_corr (real_orth^"\t"^lemma^"\t"^interp) (Single orth,parse_err err) (fun _ -> failwith "load_err_corr") | |
355 | + | [real_orth;lemma;interp;split_orth;split_lemma;split_interp;freq;compos;sgjp;common;err] -> | |
356 | + let l = combine_three (Xstring.split "|" split_orth,Xstring.split "|" split_lemma,Xstring.split "|" split_interp) in | |
357 | + let l = Xlist.map l (fun (orth,lemma,interp) -> | |
358 | + match Xstring.split ":" interp with | |
359 | + cat :: interp -> orth,lemma,cat,interp | |
360 | + | _ -> failwith "load_err_corr") in | |
361 | + StringMap.add_inc err_corr (real_orth^"\t"^lemma^"\t"^interp) (Split l,parse_err err) (fun _ -> failwith "load_err_corr") | |
362 | + | l -> print_endline ("load_err_corr: " ^ String.concat "\t" l); err_corr) | |
363 | + (* oooo o o interj 1 NCOMPOS NON-SGJP CW DERR | |
364 | + o opylać opyla fin:sg:ter:imperf 1 NCOMPOS NON-SGJP CW ERR *) | |
365 | + (* napewno napewno qub na|pewno na|pewno prep:acc|adv:pos 2 NCOMPOS NON-SGJP CW ERR *) | |
366 | + | |
367 | +let err_corr = load_err_corr "../resources/NKJP1M/NKJP1M-frequency-with-corrections.tab" | |
368 | + | |
333 | 369 | let match_tokens name id_p s sentences = |
334 | 370 | let p = Xunicode.utf8_chars_of_utf8_string s in |
335 | 371 | let len = Xlist.size p in |
... | ... | @@ -340,8 +376,43 @@ let match_tokens name id_p s sentences = |
340 | 376 | (* if no_spaces>0 && (nps || i=0) then Printf.printf "match_tokens spaces: %s %n i=%d beg=%d len=%d\n" name id_p i beg len; *) |
341 | 377 | let i = i+no_spaces in |
342 | 378 | let real_orth,p = split_front [] len p in |
343 | - if beg = i then i+len, p, (beg,len,no_spaces,String.concat "" real_orth,orth,lemma,cat,interp) :: tokens else | |
344 | - failwith (Printf.sprintf "match_tokens 1: %s %n i=%d beg=%d len=%d" name id_p i beg len)) in | |
379 | + let split,err = try StringMap.find err_corr (orth ^ "\t" ^ lemma ^ "\t" ^ String.concat ":" (cat :: interp)) with Not_found -> Correct,Corr in | |
380 | + if beg <> i then failwith (Printf.sprintf "match_tokens 1: %s %n i=%d beg=%d len=%d" name id_p i beg len) else ( | |
381 | + (* if err <> Corr then Printf.printf "match_tokens err: orth=%s lemma=%s cat=%s\n" orth lemma cat; *) | |
382 | + match split with | |
383 | + Correct -> | |
384 | + i+len, p, (beg,len,no_spaces,String.concat "" real_orth,orth,lemma,cat,interp) :: tokens | |
385 | + | Single new_orth -> | |
386 | + let new_orth = if err = TErr then orth else new_orth in | |
387 | + i+len, p, (beg,len,no_spaces,String.concat "" real_orth,new_orth,lemma,cat,interp) :: tokens | |
388 | + | Split["w",lemma1,cat1,interp1;"ogóle",lemma2,cat2,interp2] -> | |
389 | + if "wogole" = String.concat "" real_orth then | |
390 | + i+len, p, (beg+1,len-1,0,"ogole","ogóle",lemma2,cat2,interp2) :: (beg,1,no_spaces,"w","w",lemma1,cat1,interp1) :: tokens else | |
391 | + if "wogóle" <> String.concat "" real_orth then failwith (Printf.sprintf "match_tokens 3: wogole") else | |
392 | + i+len, p, (beg+1,len-1,0,"ogóle","ogóle",lemma2,cat2,interp2) :: (beg,1,no_spaces,"w","w",lemma1,cat1,interp1) :: tokens | |
393 | + | Split["z",lemma1,cat1,interp1;"pewnością",lemma2,cat2,interp2] -> | |
394 | + if "spewnością" <> String.concat "" real_orth then failwith (Printf.sprintf "match_tokens 3: spewnością") else | |
395 | + i+len, p, (beg+1,len-1,0,"pewnością","pewnością",lemma2,cat2,interp2) :: (beg,1,no_spaces,"s","z",lemma1,cat1,interp1) :: tokens | |
396 | + | Split["z",lemma1,cat1,interp1;"powrotem",lemma2,cat2,interp2] -> | |
397 | + if "spowrotem" <> String.concat "" real_orth then failwith (Printf.sprintf "match_tokens 3: spowrotem") else | |
398 | + i+len, p, (beg+1,len-1,0,"powrotem","powrotem",lemma2,cat2,interp2) :: (beg,1,no_spaces,"s","z",lemma1,cat1,interp1) :: tokens | |
399 | + | Split["Słyszała",lemma1,cat1,interp1;"m",lemma2,cat2,interp2] -> | |
400 | + if "Słyszalam" <> String.concat "" real_orth then failwith (Printf.sprintf "match_tokens 3: Słyszalam") else | |
401 | + i+len, p, (beg+len-1,1,0,"m","m",lemma2,cat2,interp2) :: (beg,len-1,no_spaces,"Słyszala","Słyszała",lemma1,cat1,interp1) :: tokens | |
402 | + | Split[orth1,lemma1,cat1,interp1;orth2,lemma2,cat2,interp2] -> | |
403 | + if orth1 ^ orth2 <> String.concat "" real_orth then failwith (Printf.sprintf "match_tokens 3: %s|%s <> %s" orth1 orth2 (String.concat "" real_orth)) else | |
404 | + let len1 = Xlist.size (Xunicode.utf8_chars_of_utf8_string orth1) in | |
405 | + let len2 = Xlist.size (Xunicode.utf8_chars_of_utf8_string orth2) in | |
406 | + if len1 + len2 <> len then failwith "match_tokens 4" else | |
407 | + i+len, p, (beg+len1,len2,0,orth2,orth2,lemma2,cat2,interp2) :: (beg,len1,no_spaces,orth1,orth1,lemma1,cat1,interp1) :: tokens | |
408 | + | Split[orth1,lemma1,cat1,interp1;orth2,lemma2,cat2,interp2;orth3,lemma3,cat3,interp3] -> | |
409 | + if orth1 ^ orth2 ^ orth3 <> String.concat "" real_orth then failwith (Printf.sprintf "match_tokens 5: %s|%s|%s <> %s" orth1 orth2 orth3 (String.concat "" real_orth)) else | |
410 | + let len1 = Xlist.size (Xunicode.utf8_chars_of_utf8_string orth1) in | |
411 | + let len2 = Xlist.size (Xunicode.utf8_chars_of_utf8_string orth2) in | |
412 | + let len3 = Xlist.size (Xunicode.utf8_chars_of_utf8_string orth3) in | |
413 | + if len1 + len2 + len3 <> len then failwith "match_tokens 6" else | |
414 | + i+len, p, (beg+len1+len2,len3,0,orth3,orth3,lemma3,cat3,interp3) :: (beg+len1,len2,0,orth2,orth2,lemma2,cat2,interp2) :: (beg,len1,no_spaces,orth1,orth1,lemma1,cat1,interp1) :: tokens | |
415 | + | Split _ -> failwith "match_tokens: ni")) in | |
345 | 416 | i,p,(id_s,List.rev tokens,named_tokens) :: sentences) in |
346 | 417 | let no_spaces,p = get_spaces 0 p in |
347 | 418 | if i+no_spaces <> len then failwith (Printf.sprintf "match_tokens 2: %s %n i=%d len=%d p='%s'" name id_p i len (String.concat "" p)) |
... | ... |
NKJP2/spelling.ml
... | ... | @@ -34,7 +34,7 @@ let generate_error_sentences sentences = |
34 | 34 | let no_tokens = Xlist.size tokens in |
35 | 35 | let tokens,prev_orth,prev_cat = Xlist.fold tokens ([],prev_orth,prev_cat) (fun (tokens,prev_orth,prev_cat) (_,_,no_spaces,real_orth,orth,_,cat,_) -> |
36 | 36 | let tokens = Int.fold 1 no_spaces tokens (fun tokens _ -> xml_space :: tokens) in |
37 | - let tokens = if no_spaces = 0 && ValidateTokenizer.is_space_required prev_orth prev_cat orth cat then xml_err_space:: tokens else tokens in | |
37 | + let tokens = if no_spaces = 0 && ValidateTokenizer.is_space_required prev_orth prev_cat orth cat then xml_err_space :: tokens else tokens in | |
38 | 38 | (make_xml_token real_orth orth) :: tokens, orth, cat) in |
39 | 39 | Xml.Element("s",["id",id_s;"length",string_of_int no_tokens],merge_pcdata (List.rev tokens)) :: sentences,prev_orth,prev_cat) in |
40 | 40 | Xml.Element("p",[],List.rev sentences) |
... | ... |
exec/ENIAMexec.ml
... | ... | @@ -81,9 +81,25 @@ let create_chart rules tokens lex_sems paths last = |
81 | 81 | ENIAM_LCGchart.add_inc_list chart lnode rnode l 0) in |
82 | 82 | chart |
83 | 83 | |
84 | +let create_text_fragments tokens paths last = | |
85 | + let text_fragments = Array.make last IntMap.empty in | |
86 | + Xlist.iter paths (fun (id,lnode,rnode) -> | |
87 | + let t = ExtArray.get tokens id in | |
88 | + let orth = if t.ENIAMtokenizerTypes.beg + t.ENIAMtokenizerTypes.len = t.ENIAMtokenizerTypes.next | |
89 | + then t.ENIAMtokenizerTypes.orth else t.ENIAMtokenizerTypes.orth ^ " " in | |
90 | + text_fragments.(lnode) <- IntMap.add text_fragments.(lnode) rnode orth); | |
91 | + Int.iter_down 0 (last - 1) (fun i -> | |
92 | + let map = IntMap.fold text_fragments.(i) text_fragments.(i) (fun map j orth -> | |
93 | + if j = last then map else | |
94 | + IntMap.fold text_fragments.(j) map (fun map k orth2 -> | |
95 | + IntMap.add map k (orth ^ orth2))) in | |
96 | + text_fragments.(i) <- map); | |
97 | + text_fragments | |
98 | + | |
84 | 99 | let eniam_parse_sentence timeout verbosity rules tokens lex_sems paths last = |
85 | 100 | ENIAM_LCGreductions.reset_variant_label (); |
86 | 101 | let result = {empty_eniam_parse_result with paths_size = Xlist.size paths} in |
102 | + let result = if verbosity = 0 then result else {result with text_fragments=create_text_fragments tokens paths last} in | |
87 | 103 | let time1 = time_fun () in |
88 | 104 | try |
89 | 105 | let chart = create_chart rules tokens lex_sems paths last in |
... | ... |
exec/ENIAMexecTypes.ml
... | ... | @@ -18,6 +18,7 @@ |
18 | 18 | *) |
19 | 19 | |
20 | 20 | open ENIAM_LCGtypes |
21 | +open Xstd | |
21 | 22 | |
22 | 23 | type status = Idle | PreprocessingError | LexiconError | ParseError | ParseTimeout | Parsed | TooManyNodes | NotParsed | NotReduced | ReductionError | SemError | NotTranslated |
23 | 24 | |
... | ... | @@ -41,6 +42,7 @@ type eniam_parse_result = { |
41 | 42 | dependency_tree4: linear_term array; |
42 | 43 | dependency_tree5: linear_term array; |
43 | 44 | dependency_tree6: linear_term array; |
45 | + text_fragments: string IntMap.t array; | |
44 | 46 | } |
45 | 47 | (* |
46 | 48 | type conll_parse_result = { |
... | ... | @@ -175,6 +177,7 @@ let empty_eniam_parse_result = { |
175 | 177 | dependency_tree4=[| |]; |
176 | 178 | dependency_tree5=[| |]; |
177 | 179 | dependency_tree6=[| |]; |
180 | + text_fragments=[| |]; | |
178 | 181 | } |
179 | 182 | |
180 | 183 | (* |
... | ... |
exec/ENIAMsemLexicon.ml
0 → 100644
1 | +(* | |
2 | + * ENIAM_LCGlexicon is a library that provides LCG lexicon form Polish | |
3 | + * Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl> | |
4 | + * Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences | |
5 | + * | |
6 | + * This library is free software: you can redistribute it and/or modify | |
7 | + * it under the terms of the GNU Lesser General Public License as published by | |
8 | + * the Free Software Foundation, either version 3 of the License, or | |
9 | + * (at your option) any later version. | |
10 | + * | |
11 | + * This library is distributed in the hope that it will be useful, | |
12 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | + * GNU Lesser General Public License for more details. | |
15 | + * | |
16 | + * You should have received a copy of the GNU Lesser General Public License | |
17 | + * along with this program. If not, see <http://www.gnu.org/licenses/>. | |
18 | + *) | |
19 | + | |
20 | +open Xstd | |
21 | +open ENIAM_LCGtypes | |
22 | +open Lexer | |
23 | +open ENIAMwalTypes | |
24 | +open ENIAMlexSemanticsTypes | |
25 | + | |
26 | +let remove_comments line = | |
27 | + try | |
28 | + let n = String.index line '#' in | |
29 | + String.sub line 0 n | |
30 | + with Not_found -> line | |
31 | + | |
32 | +let rec manage_tokens = function | |
33 | + [arg;[T role]] -> [arg,role] | |
34 | + | arg :: (T role :: arg2) :: tokens -> (arg,role) :: manage_tokens (arg2 :: tokens) | |
35 | + | _ -> failwith "manage_tokens" | |
36 | + | |
37 | +let parse_dir p = function | |
38 | + T "/" :: tokens -> tokens, {p with dir=Forward_} | |
39 | + | T "\\" :: tokens -> tokens, {p with dir=Backward_} | |
40 | + | T "|" :: tokens -> tokens, {p with dir=Both_} | |
41 | + | tokens -> failwith ("parse_dir: " ^ Lexer.string_of_token_list tokens) | |
42 | + | |
43 | +let parse_multi p = function | |
44 | + T "?" :: tokens -> tokens, {p with is_necessary=Multi} | |
45 | + | tokens -> tokens,p | |
46 | + | |
47 | +let parse_morf p = function | |
48 | + [T "1"] -> {p with is_necessary=Opt} | |
49 | + | tokens -> | |
50 | + let l = Xlist.map (Lexer.split_symbol (T "*") [] tokens) (function | |
51 | + [T s] -> Atom s | |
52 | + | tokens -> failwith ("parse_morf: " ^ Lexer.string_of_token_list tokens)) in | |
53 | + {p with morfs=LCG (Tensor l) :: p.morfs} | |
54 | + | |
55 | +let parse_arg tokens p = | |
56 | + (* Printf.printf "parse_arg: %s\n" (Lexer.string_of_token_list tokens); *) | |
57 | + let tokens,p = parse_dir p tokens in | |
58 | + let tokens,p = parse_multi p tokens in | |
59 | + match Lexer.find_brackets ["(",")"] [] tokens with | |
60 | + [B("(",")",tokens)] -> Xlist.fold (Lexer.split_symbol (T "+") [] tokens) p parse_morf | |
61 | + | tokens -> parse_morf p tokens | |
62 | + | |
63 | + | |
64 | +let parse_role p = function | |
65 | + "adjunct" -> {p with gf=ADJUNCT} | |
66 | + | "unk" -> {p with role="unk"} | |
67 | + | "nosem" -> {p with gf=NOSEM} | |
68 | + | "Count" -> {p with role="Count"} | |
69 | + | "Measure" -> {p with role="Measure"} | |
70 | + | s -> failwith ("parse_role: " ^ s) | |
71 | + | |
72 | +let parse_entry = function | |
73 | + [T symbol; T ":"; T "null"] -> symbol,[] | |
74 | + | T symbol :: T ":" :: tokens -> | |
75 | + (* Printf.printf "parse_entry: %s\n" (Lexer.string_of_token_list tokens); *) | |
76 | + let tokens = Lexer.split_symbol (T ":") [] tokens in | |
77 | + let tokens = manage_tokens tokens in | |
78 | + let positions = Xlist.map tokens (fun (arg,role) -> | |
79 | + parse_arg arg (parse_role {empty_position with is_necessary=Req} role)) in | |
80 | + symbol,positions | |
81 | + | tokens -> failwith ("parse_entry: " ^ Lexer.string_of_token_list tokens) | |
82 | + | |
83 | +let load_lexicon filename = | |
84 | + let lines = File.load_lines filename in | |
85 | + let lines = List.rev (Xlist.rev_map lines remove_comments) in | |
86 | + let tokens = List.flatten (List.rev (Xlist.rev_map lines (Lexer.split "\\]\\| \\|\t\\|\r\\|\\?\\|:\\|;\\|&\\|!\\|=\\|}\\|{\\|,\\|\\*\\|/\\|\\+\\|)\\|(\\||\\|\\[\\|\\"))) in | |
87 | + let tokens = List.rev (Xlist.fold tokens [] (fun tokens -> function | |
88 | + T " " -> tokens | |
89 | + | T "\t" -> tokens | |
90 | + | T "\r" -> tokens | |
91 | + | t -> t :: tokens)) in | |
92 | + let entries = Lexer.split_symbol (T ";") [] tokens in | |
93 | + Xlist.fold entries StringMap.empty (fun map entry -> | |
94 | + let symbol,args = parse_entry entry in | |
95 | + StringMap.add_inc map symbol args (fun _ -> failwith ("load_lexicon: " ^ symbol))) | |
96 | + | |
97 | +let sem_lexicon = load_lexicon "resources/lexicon-pl.dic" | |
98 | + | |
99 | +let extend_frame symbol frame = | |
100 | + try | |
101 | + let positions = StringMap.find sem_lexicon symbol in | |
102 | + {frame with positions=positions @ frame.positions} | |
103 | + with Not_found -> failwith ("extend_frame: " ^ symbol) | |
... | ... |
exec/ENIAMsemValence.ml
0 → 100644
1 | +(* | |
2 | + * ENIAMexec implements ENIAM processing stream | |
3 | + * Copyright (C) 2016-2017 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl> | |
4 | + * Copyright (C) 2016-2017 Institute of Computer Science Polish Academy of Sciences | |
5 | + * | |
6 | + * This library is free software: you can redistribute it and/or modify | |
7 | + * it under the terms of the GNU Lesser General Public License as published by | |
8 | + * the Free Software Foundation, either version 3 of the License, or | |
9 | + * (at your option) any later version. | |
10 | + * | |
11 | + * This library is distributed in the hope that it will be useful, | |
12 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | + * GNU Lesser General Public License for more details. | |
15 | + * | |
16 | + * You should have received a copy of the GNU Lesser General Public License | |
17 | + * along with this program. If not, see <http://www.gnu.org/licenses/>. | |
18 | + *) | |
19 | + | |
20 | +open ENIAMexecTypes | |
21 | +open ENIAM_LCGtypes | |
22 | +open ENIAM_LCGlexiconTypes | |
23 | +open ENIAMlexSemanticsTypes | |
24 | +open Xstd | |
25 | + | |
26 | +type pos = {role: linear_term; role_attr: linear_term; selprefs: linear_term; gf: ENIAMwalTypes.gf; | |
27 | + cr: string list; ce: string list; | |
28 | + is_necessary: bool; is_pro: bool; is_prong: bool; is_multi: bool; dir: string; morfs: StringSet.t} | |
29 | + | |
30 | +let match_value v2 = function | |
31 | + Val v -> if v = v2 then Val v else raise Not_found | |
32 | + | _ -> failwith "match_value" | |
33 | + | |
34 | +let match_neg_value vals = function | |
35 | + Val v -> if Xlist.mem vals v then raise Not_found else Val v | |
36 | + | _ -> failwith "match_neg_value" | |
37 | + | |
38 | +let rec apply_selector v2 = function | |
39 | + (sel,[]) -> failwith ("apply_selector: " ^ ENIAMcategoriesPL.string_of_selector sel) | |
40 | + | Negation,("NEGATION",v) :: l -> ("NEGATION",match_value v2 v) :: l | |
41 | + | Aspect,("ASPECT",v) :: l -> ("ASPECT",match_value v2 v) :: l | |
42 | + | Mood,("MOOD",v) :: l -> ("MOOD",match_value v2 v) :: l | |
43 | + | Nsyn,("NSYN",v) :: l -> ("NSYN",match_value v2 v) :: l | |
44 | + | Nsem,("NSEM",v) :: l -> ("NSEM",match_value v2 v) :: l | |
45 | + | Case,("CASE",v) :: l -> ("CASE",match_value v2 v) :: l | |
46 | + | Mode,("MODE",v) :: l -> ("MODE",match_value v2 v) :: l | |
47 | + | sel,(attr,v) :: l -> print_endline ("apply_selector: " ^ ENIAMcategoriesPL.string_of_selector sel ^ " " ^ attr); (attr,v) :: (apply_selector v2 (sel,l)) | |
48 | + | |
49 | +let rec apply_neg_selector vals = function | |
50 | + (sel,[]) -> failwith ("apply_neg_selector: " ^ ENIAMcategoriesPL.string_of_selector sel) | |
51 | + | Nsem,("NSEM",v) :: l -> ("NSEM",match_neg_value vals v) :: l | |
52 | + | Case,("CASE",v) :: l -> ("CASE",match_neg_value vals v) :: l | |
53 | + | sel,(attr,v) :: l -> (*print_endline ("apply_neg_selector: " ^ ENIAMcategoriesPL.string_of_selector sel ^ " " ^ attr);*) (attr,v) :: (apply_neg_selector vals (sel,l)) | |
54 | + | |
55 | +let rec apply_selectors attrs = function | |
56 | + [] -> attrs | |
57 | + | (sel,Eq,[v]) :: l -> apply_selectors (apply_selector v (sel,attrs)) l | |
58 | + | (sel,Neq,vals) :: l -> apply_selectors (apply_neg_selector vals (sel,attrs)) l | |
59 | + | _ -> failwith "apply_selectors" | |
60 | + | |
61 | +module OrderedStringDir = | |
62 | + struct | |
63 | + type t = string * string | |
64 | + let compare = compare | |
65 | + end | |
66 | + | |
67 | +module StringDirMap = Xmap.Make(OrderedStringDir) | |
68 | + | |
69 | +let rec get_arg_symbols_variant arg_symbols = function | |
70 | + Ref i -> | |
71 | + let l,dir = arg_symbols.(i) in | |
72 | + Xlist.map l (fun s -> (s,dir),Ref i) | |
73 | + | Variant(e,l) -> | |
74 | + let map = Xlist.fold l StringDirMap.empty (fun map (i,t) -> | |
75 | + Xlist.fold (get_arg_symbols_variant arg_symbols t) map (fun map (arg_symbol,t) -> | |
76 | + StringDirMap.add_inc map arg_symbol [i,t] (fun l -> (i,t) :: l))) in | |
77 | + StringDirMap.fold map [] (fun found arg_symbol l -> (arg_symbol,Variant(e,l)) :: found) | |
78 | + | t -> failwith ("get_arg_symbols_variant: " ^ ENIAM_LCGstringOf.linear_term 0 t) | |
79 | + | |
80 | +let rec get_arg_symbols_tuple arg_symbols rev = function | |
81 | + Dot -> rev | |
82 | + | Tuple l -> Xlist.fold l rev (get_arg_symbols_tuple arg_symbols) | |
83 | + | t -> (get_arg_symbols_variant arg_symbols t) :: rev | |
84 | + | |
85 | +let string_of_argdir = function | |
86 | + "forward" -> "/" | |
87 | + | "backward" -> "\\" | |
88 | + | "both" -> "|" | |
89 | + | _ -> failwith "string_of_argdir" | |
90 | + | |
91 | +let string_of_arg arg = | |
92 | + String.concat ", " (Xlist.map arg (fun ((arg_symbol,dir),t) -> (string_of_argdir dir) ^ arg_symbol ^ ":" ^ ENIAM_LCGstringOf.linear_term 0 t)) | |
93 | + | |
94 | +let string_of_position p = | |
95 | + (string_of_argdir p.dir) ^ String.concat "+" (StringSet.to_list p.morfs) | |
96 | + | |
97 | +let rec match_arg_positions arg rev = function | |
98 | + p :: positions -> | |
99 | + Printf.printf "match_arg_positions: arg=%s rev=[%s] positions=%s :: [%s]\n%!" (string_of_arg arg) (String.concat "; " (Xlist.map rev string_of_position)) (string_of_position p) (String.concat "; " (Xlist.map positions string_of_position)); | |
100 | + let l = Xlist.fold arg [] (fun l ((arg_symbol,dir),t) -> | |
101 | + if StringSet.mem p.morfs arg_symbol && p.dir = dir then t :: l else l) in | |
102 | + (match l with | |
103 | + [] -> print_endline "match_arg_positions: not matched"; match_arg_positions arg (p :: rev) positions | |
104 | + | [t] -> | |
105 | + let t = if p.gf = ENIAMwalTypes.SUBJ || p.gf = ENIAMwalTypes.OBJ || p.gf = ENIAMwalTypes.ARG then | |
106 | + SetAttr("role",p.role,SetAttr("role_attr",p.role_attr,SetAttr("selprefs",p.selprefs,t))) | |
107 | + else if p.gf = ENIAMwalTypes.ADJUNCT then t else failwith "match_arg_positions: ni 2" in | |
108 | + let t = SetAttr("gf",Val (ENIAMwalStringOf.gf p.gf),t) in | |
109 | + let t = Xlist.fold p.cr t (fun t cr -> SetAttr("controller",Val cr,t)) in | |
110 | + let t = Xlist.fold p.ce t (fun t ce -> SetAttr("controllee",Val ce,t)) in | |
111 | + if p.is_multi then (t, rev @ (p :: positions)) :: (match_arg_positions arg (p :: rev) positions) | |
112 | + else (t, rev @ positions) :: (match_arg_positions arg (p :: rev) positions) | |
113 | + | _ -> failwith "match_arg_positions: ni") | |
114 | + | [] -> Printf.printf "match_arg_positions: arg=%s rev=[%s] positions=[]\n%!" (string_of_arg arg) (String.concat "; " (Xlist.map rev string_of_position)); [] | |
115 | + | |
116 | +(* Jeśli ta funkcja zwróci pustą listę, oznacza to, że argumentów nie dało się dopasować do pozycji *) | |
117 | +let rec match_args_positions_rec positions = function | |
118 | + arg :: args -> | |
119 | + Printf.printf "match_args_positions_rec: args=%s :: [%s] positions=[%s]\n%!" (string_of_arg arg) (String.concat "; " (Xlist.map args string_of_arg)) (String.concat "; " (Xlist.map positions string_of_position)); | |
120 | + Xlist.fold (match_arg_positions arg [] positions) [] (fun found (arg_pos,positions) -> | |
121 | + Xlist.fold (match_args_positions_rec positions args) found (fun found l -> (arg_pos :: l) :: found)) | |
122 | + | [] -> | |
123 | + Printf.printf "match_args_positions_rec: args=[] positions=[%s]\n%!" (String.concat "; " (Xlist.map positions string_of_position)); | |
124 | + let b = Xlist.fold positions false (fun b p -> p.is_necessary || b) in | |
125 | + if b then print_endline "match_args_positions: not matched"; | |
126 | + if b then [] else | |
127 | + [Xlist.fold positions [] (fun found p -> | |
128 | + if not p.is_pro then found else | |
129 | + let attrs = ["role",p.role; "role_attr",p.role_attr; "selprefs",p.selprefs; "gf",Val (ENIAMwalStringOf.gf p.gf)] in | |
130 | + let attrs = if p.is_prong then attrs else attrs in (* FIXME: dodać number, gender *) | |
131 | + let attrs = Xlist.fold p.cr attrs (fun attrs cr -> ("controller",Val cr) :: attrs) in | |
132 | + let attrs = Xlist.fold p.ce attrs (fun attrs ce -> ("controllee",Val ce) :: attrs) in | |
133 | + Node{ENIAM_LCGrenderer.empty_node with lemma="pro"; pos="pro"; attrs=attrs} :: found)] | |
134 | + | |
135 | +(* FIXME: opcjonalność podrzędników argumentów zleksykalizowanych *) | |
136 | + | |
137 | +(* Jeśli ta funkcja zwróci pustą listę, oznacza to, że argumentów nie dało się dopasować do pozycji *) | |
138 | +let match_args_positions args positions = | |
139 | + Printf.printf "match_args_positions: args=[%s] positions=[%s]\n%!" (String.concat "; " (Xlist.map args string_of_arg)) (String.concat "; " (Xlist.map positions string_of_position)); | |
140 | + Xlist.rev_map (match_args_positions_rec positions args) (function | |
141 | + [] -> Dot | |
142 | + | [t] -> t | |
143 | + | l -> Tuple l) | |
144 | + | |
145 | +let translate_selprefs = function | |
146 | + ENIAMwalTypes.SynsetId _ -> failwith "translate_selprefs" | |
147 | + | ENIAMwalTypes.Predef _ -> failwith "translate_selprefs" | |
148 | + | ENIAMwalTypes.SynsetName s -> s | |
149 | + | ENIAMwalTypes.RelationRole _ -> "ALL" | |
150 | + | |
151 | +let string_of_internal_morf = function | |
152 | + Atom s -> s | |
153 | + | AVar s -> s | |
154 | + | Top -> "T" | |
155 | + | t -> failwith ("string_of_internal_morf: " ^ ENIAM_LCGstringOf.internal_grammar_symbol_prime t) | |
156 | + | |
157 | + | |
158 | +let string_of_morf = function | |
159 | + ENIAMwalTypes.LCG Tensor l -> String.concat "*" (Xlist.map l string_of_internal_morf) | |
160 | + | ENIAMwalTypes.LCG t -> failwith ("string_of_morf: " ^ ENIAM_LCGstringOf.grammar_symbol_prime t) | |
161 | + | _ -> failwith "string_of_morf" | |
162 | + | |
163 | +let rec string_of_arg_symbol = function | |
164 | + Dot -> "" | |
165 | + | Val s -> s | |
166 | + | Tuple l -> String.concat "*" (Xlist.map l string_of_arg_symbol) | |
167 | + | t -> failwith ("string_of_arg_symbol: " ^ ENIAM_LCGstringOf.linear_term 0 t) | |
168 | + | |
169 | +let translate_dir = function | |
170 | + ENIAMwalTypes.Both_ -> "both" | |
171 | + | ENIAMwalTypes.Forward_ -> "forward" | |
172 | + | ENIAMwalTypes.Backward_ -> "backward" | |
173 | + | |
174 | +let translate_position id p = | |
175 | + {role = Val p.ENIAMwalTypes.role; | |
176 | + role_attr = Val p.ENIAMwalTypes.role_attr; | |
177 | + selprefs = (match Xlist.map p.ENIAMwalTypes.sel_prefs translate_selprefs with | |
178 | + [] -> Dot | |
179 | + | [s] -> Val s | |
180 | + | l -> Tuple(Xlist.rev_map l (fun s -> Val s))); | |
181 | + gf=p.ENIAMwalTypes.gf; | |
182 | + cr=Xlist.map p.ENIAMwalTypes.cr (fun cr -> id ^ "-" ^ cr); | |
183 | + ce=Xlist.map p.ENIAMwalTypes.ce (fun ce -> id ^ "-" ^ ce); | |
184 | + is_necessary = p.ENIAMwalTypes.is_necessary = ENIAMwalTypes.Req(*Xlist.fold p.ENIAMwalTypes.morfs true (fun b -> function ENIAMwalTypes.LCG One -> false | _ -> b)*); | |
185 | + is_pro = p.ENIAMwalTypes.is_necessary = ENIAMwalTypes.Pro || p.ENIAMwalTypes.is_necessary = ENIAMwalTypes.ProNG; | |
186 | + is_prong = p.ENIAMwalTypes.is_necessary = ENIAMwalTypes.ProNG; | |
187 | + is_multi = p.ENIAMwalTypes.is_necessary = ENIAMwalTypes.Multi; | |
188 | + dir= translate_dir p.ENIAMwalTypes.dir; | |
189 | + morfs = Xlist.fold p.ENIAMwalTypes.morfs StringSet.empty (fun morfs morf -> | |
190 | + if morf = ENIAMwalTypes.LCG One then (Printf.printf "translate_position: One%!"; morfs) else | |
191 | + StringSet.add morfs (string_of_morf morf))} | |
192 | + | |
193 | +let get_phrase_symbol = function | |
194 | + Tuple[Val "lex";Val "się";Val "qub"] -> "lex-się-qub" | |
195 | + | Tuple(Val s :: _) -> s | |
196 | + | Val s -> s | |
197 | + (* | Dot -> "dot" *) | |
198 | + | t -> failwith ("get_phrase_symbol: " ^ ENIAM_LCGstringOf.linear_term 0 t) | |
199 | + | |
200 | +(* let extend_frame symbol = function *) | |
201 | + | |
202 | +let rec assign_frames_rec tokens lex_sems tree arg_symbols visited = function | |
203 | + Ref i -> | |
204 | + if IntSet.mem visited i then Ref i,visited else | |
205 | + let t,visited = assign_frames_rec tokens lex_sems tree arg_symbols (IntSet.add visited i) tree.(i) in | |
206 | + tree.(i) <- t; | |
207 | + Ref i,visited | |
208 | + | Node t -> | |
209 | + let args,visited = assign_frames_rec tokens lex_sems tree arg_symbols visited t.args in | |
210 | + let t = {t with args=args} in | |
211 | + if t.symbol = Dot then Node t,visited else | |
212 | + let args = get_arg_symbols_tuple arg_symbols [] args in | |
213 | + let s = ExtArray.get lex_sems t.id in | |
214 | + let symbol = get_phrase_symbol t.symbol in | |
215 | + let frames = Xlist.fold s.ENIAMlexSemanticsTypes.frames [] (fun frames frame -> | |
216 | + print_endline ("selectors: " ^ ENIAMcategoriesPL.string_of_selectors frame.selectors); | |
217 | + try | |
218 | + let attrs = apply_selectors t.attrs frame.selectors in | |
219 | + let frame = ENIAMsemLexicon.extend_frame symbol frame in | |
220 | + print_endline "passed"; | |
221 | + (attrs,frame,Xlist.rev_map frame.positions (translate_position (string_of_int t.id))) :: frames | |
222 | + with Not_found -> print_endline "rejected"; frames) in | |
223 | + if frames = [] then failwith "assign_frames_rec: no frame" else | |
224 | + let e = ENIAM_LCGreductions.get_variant_label () in | |
225 | + let l,_ = Xlist.fold frames ([],1) (fun (l,n) (attrs,frame,positions) -> | |
226 | + Printf.printf "assign_frames_rec: lemma=%s args=[%s] positions=[%s]\n%!" t.lemma (String.concat "; " (Xlist.map args string_of_arg)) (String.concat "; " (Xlist.map positions string_of_position)); | |
227 | + if frame.meanings = [] then failwith "assign_frames_rec: no meanings" else | |
228 | + Xlist.fold (match_args_positions args positions) (l,n) (fun (l,n) args -> | |
229 | + Xlist.fold frame.meanings (l,n) (fun (l,n) (meaning,hipero,weight) -> | |
230 | + (string_of_int n, Node{t with attrs= | |
231 | + ("meaning",Val meaning) :: | |
232 | + ("hipero",ENIAM_LCGrules.make_variant (Xlist.map hipero (fun (h,n) -> Tuple[Val h;Val(string_of_int n)]))) :: | |
233 | + ("arole",Val frame.arole) :: | |
234 | + ("arole-attr",Val frame.arole_attr) :: | |
235 | + ("arev",Val (if frame.arev then "+" else "-")) :: | |
236 | + ("fopinion",Val (ENIAMwalStringOf.opinion frame.fopinion)) :: | |
237 | + ("sopinion",Val (ENIAMwalStringOf.opinion frame.sopinion)) :: t.attrs; args=args}) :: | |
238 | + l,n+1))) in | |
239 | + if l = [] then failwith ("assign_frames_rec: no frame assingment found for " ^ t.lemma ^ " " ^ ENIAM_LCGstringOf.linear_term 0 t.symbol) else | |
240 | + Variant(e,l),visited | |
241 | + | Variant(e,l) -> | |
242 | + let l,visited = Xlist.fold l ([],visited) (fun (l,visited) (i,t) -> | |
243 | + let t,visited = assign_frames_rec tokens lex_sems tree arg_symbols visited t in | |
244 | + (i,t) :: l, visited) in | |
245 | + Variant(e,List.rev l),visited | |
246 | + | Tuple l -> | |
247 | + let l,visited = Xlist.fold l ([],visited) (fun (l,visited) t -> | |
248 | + let t,visited = assign_frames_rec tokens lex_sems tree arg_symbols visited t in | |
249 | + t :: l, visited) in | |
250 | + Tuple(List.rev l),visited | |
251 | + | Dot -> Dot,visited | |
252 | + | t -> failwith ("assign_frames_rec: " ^ ENIAM_LCGstringOf.linear_term 0 t) | |
253 | + | |
254 | +let rec get_arg_symbols = function | |
255 | + Node{arg_symbol=Tuple([Val "cp"; Val "T"; Val "T"]); | |
256 | + symbol=Tuple([Val "cp"; ctype; comp]); arg_dir=dir} -> | |
257 | + [string_of_arg_symbol (Tuple([Val "cp"; Val "T"; Val "T"])); | |
258 | + string_of_arg_symbol (Tuple([Val "cp"; ctype; comp]))],dir | |
259 | + | Node{arg_symbol=Tuple([Val "ncp"; Val "T"; Val arg_case; Val "T"; Val "T"; Val "T"; Val "T"]); | |
260 | + symbol=Tuple([Val "ncp"; number; case; gender; person; ctype; comp]); arg_dir=dir} -> | |
261 | + [string_of_arg_symbol (Tuple([Val "ncp"; Val "T"; Val arg_case; Val "T"; Val "T"; Val "T"; Val "T"])); | |
262 | + string_of_arg_symbol (Tuple([Val "ncp"; Val "T"; Val arg_case; Val "T"; Val "T"; ctype; comp]))],dir | |
263 | + | Node{arg_symbol=Tuple([Val "prepncp"; Val arg_prep; Val arg_case; Val "T"; Val "T"]); | |
264 | + symbol=Tuple([Val "prepncp"; prep; case; ctype; comp]); arg_dir=dir} -> | |
265 | + [string_of_arg_symbol (Tuple([Val "prepncp"; Val arg_prep; Val arg_case; Val "T"; Val "T"])); | |
266 | + string_of_arg_symbol (Tuple([Val "prepncp"; prep; case; ctype; comp]))],dir | |
267 | + | Node t -> [string_of_arg_symbol t.arg_symbol], t.arg_dir | |
268 | + | t -> failwith ("get_arg_symbols: " ^ ENIAM_LCGstringOf.linear_term 0 t) | |
269 | + | |
270 | +let assign_frames tokens lex_sems tree = | |
271 | + print_endline "assign_frames"; | |
272 | + let tree = Array.copy tree in | |
273 | + let arg_symbols = Array.make (Array.length tree) ([],"") in | |
274 | + Int.iter 0 (Array.length tree - 1) (fun i -> | |
275 | + arg_symbols.(i) <- get_arg_symbols tree.(i)); | |
276 | + let _ = assign_frames_rec tokens lex_sems tree arg_symbols IntSet.empty (Ref 0) in | |
277 | + tree | |
278 | + | |
279 | +let assign tokens lex_sems text = | |
280 | + map_text Struct (fun mode -> function | |
281 | + ENIAMSentence result -> | |
282 | + if result.status <> Parsed then ENIAMSentence result else | |
283 | + ENIAMSentence {result with dependency_tree6=assign_frames tokens lex_sems result.dependency_tree6} | |
284 | + | t -> t) text | |
285 | + | |
286 | +let rec cut_nodes result_tree = function | |
287 | + | Node t -> | |
288 | + let i = ExtArray.add result_tree (Node t) in | |
289 | + Ref i | |
290 | + | Variant(e,l) -> | |
291 | + let l = Xlist.rev_map l (fun (i,t) -> i, cut_nodes result_tree t) in | |
292 | + Variant(e,List.rev l) | |
293 | + | Tuple l -> | |
294 | + let l = Xlist.rev_map l (cut_nodes result_tree) in | |
295 | + Tuple(List.rev l) | |
296 | + | Dot -> Dot | |
297 | + | t -> failwith ("cut_nodes: " ^ ENIAM_LCGstringOf.linear_term 0 t) | |
298 | + | |
299 | +let rec reduce_set_attr attr v = function | |
300 | + Node t -> Node{t with attrs=(attr,v) :: t.attrs} | |
301 | + | Variant(e,l) -> | |
302 | + Variant(e,List.rev (Xlist.rev_map l (fun (i,t) -> | |
303 | + i, reduce_set_attr attr v t))) | |
304 | + | t -> failwith ("reduce_set_attr: " ^ ENIAM_LCGstringOf.linear_term 0 t) | |
305 | + | |
306 | +let rec reduce_tree_rec tokens lex_sems result_tree mid_tree orig_tree = function | |
307 | + Ref i -> | |
308 | + if mid_tree.(i) <> Dot then mid_tree.(i) else | |
309 | + let t = reduce_tree_rec tokens lex_sems result_tree mid_tree orig_tree orig_tree.(i) in | |
310 | + mid_tree.(i) <- t; | |
311 | + t | |
312 | + | Node t -> | |
313 | + let args = reduce_tree_rec tokens lex_sems result_tree mid_tree orig_tree t.args in | |
314 | + print_endline ("reduce_tree_rec 1: " ^ ENIAM_LCGstringOf.linear_term 0 args); | |
315 | + let args = cut_nodes result_tree args in | |
316 | + print_endline ("reduce_tree_rec 2: " ^ ENIAM_LCGstringOf.linear_term 0 args); | |
317 | + let id = | |
318 | + if t.id = 0 then | |
319 | + let id = ExtArray.add tokens {ENIAMtokenizerTypes.empty_token_env with ENIAMtokenizerTypes.token=ENIAMtokenizerTypes.Lemma("pro","pro",[[]])} in | |
320 | + let _ = ExtArray.add lex_sems empty_lex_sem in | |
321 | + id | |
322 | + else t.id in | |
323 | + Node{t with args=args; id=id} | |
324 | + | Variant(e,l) -> | |
325 | + let l = Xlist.rev_map l (fun (i,t) -> i, reduce_tree_rec tokens lex_sems result_tree mid_tree orig_tree t) in | |
326 | + Variant(e,List.rev l) | |
327 | + | Tuple l -> | |
328 | + let l = Xlist.rev_map l (reduce_tree_rec tokens lex_sems result_tree mid_tree orig_tree) in | |
329 | + Tuple(List.rev l) | |
330 | + | Dot -> Dot | |
331 | + | SetAttr(attr,v,t) -> | |
332 | + let t = reduce_tree_rec tokens lex_sems result_tree mid_tree orig_tree t in | |
333 | + reduce_set_attr attr v t | |
334 | + | t -> failwith ("reduce_tree_rec: " ^ ENIAM_LCGstringOf.linear_term 0 t) | |
335 | + | |
336 | +let reduce_tree tokens lex_sems orig_tree = | |
337 | + print_endline "reduce_tree"; | |
338 | + let mid_tree = Array.make (Array.length orig_tree) Dot in | |
339 | + let result_tree = ExtArray.make (Array.length orig_tree) Dot in | |
340 | + let _ = ExtArray.add result_tree Dot in | |
341 | + let t = reduce_tree_rec tokens lex_sems result_tree mid_tree orig_tree orig_tree.(0) in | |
342 | + ExtArray.set result_tree 0 t; | |
343 | + ExtArray.to_array result_tree | |
344 | + | |
345 | +let reduce tokens lex_sems text = | |
346 | + map_text Struct (fun mode -> function | |
347 | + ENIAMSentence result -> | |
348 | + if result.status <> Parsed then ENIAMSentence result else | |
349 | + ENIAMSentence {result with dependency_tree6=reduce_tree tokens lex_sems result.dependency_tree6} | |
350 | + | t -> t) text | |
... | ... |
exec/ENIAMvisualization.ml
... | ... | @@ -774,8 +774,8 @@ let html_of_eniam_sentence path file_prefix img verbosity tokens (result : eniam |
774 | 774 | | LexiconError -> sprintf "error_lex: %s paths_size=%d\n" result.msg result.paths_size |
775 | 775 | | ParseError -> |
776 | 776 | if verbosity = 0 then () else ( |
777 | - ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_1_chart") "a1" result.chart1; | |
778 | - ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_2_chart") "a4" result.chart2; | |
777 | + ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_1_chart") "a1" result.text_fragments result.chart1; | |
778 | + ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_2_chart") "a4" result.text_fragments result.chart2; | |
779 | 779 | ENIAM_LCGlatexOf.print_references path (file_prefix ^ "_2_references") "a0" result.references2); |
780 | 780 | sprintf "error_parse: %s paths_size=%d\n" result.msg result.paths_size ^ |
781 | 781 | (if verbosity = 0 then "" else |
... | ... | @@ -785,10 +785,10 @@ let html_of_eniam_sentence path file_prefix img verbosity tokens (result : eniam |
785 | 785 | "" |
786 | 786 | | ParseTimeout -> |
787 | 787 | if verbosity < 2 then () else ( |
788 | - ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_1_chart") "a1" result.chart1; | |
788 | + ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_1_chart") "a1" result.text_fragments result.chart1; | |
789 | 789 | ENIAM_LCGlatexOf.print_references path (file_prefix ^ "_2_references") "a0" result.references2); |
790 | 790 | if verbosity = 0 then () else ( |
791 | - ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_2_chart") "a4" result.chart2); | |
791 | + ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_2_chart") "a4" result.text_fragments result.chart2); | |
792 | 792 | sprintf "timeout: %s paths_size=%d\n" result.msg result.paths_size ^ |
793 | 793 | (if verbosity < 2 then "" else |
794 | 794 | sprintf "<BR><A HREF=\"%s_1_chart.pdf\">Chart 1</A>\n" file_prefix ^ |
... | ... | @@ -798,13 +798,13 @@ let html_of_eniam_sentence path file_prefix img verbosity tokens (result : eniam |
798 | 798 | "" |
799 | 799 | | NotParsed -> |
800 | 800 | if verbosity = 0 then () else ( |
801 | - ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_1_chart") "a1" result.chart1); | |
801 | + ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_1_chart") "a1" result.text_fragments result.chart1); | |
802 | 802 | if verbosity < 2 then () else ( |
803 | - ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_2_chart") "a4" result.chart2; | |
803 | + ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_2_chart") "a4" result.text_fragments result.chart2; | |
804 | 804 | ENIAM_LCGlatexOf.print_references path (file_prefix ^ "_2_references") "a0" result.references2; |
805 | 805 | ENIAM_LCGlatexOf.print_references path (file_prefix ^ "_3_references") "a0" result.references3); |
806 | 806 | if verbosity = 0 then () else ( |
807 | - ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_3_chart") "a4" result.chart3); | |
807 | + ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_3_chart") "a4" result.text_fragments result.chart3); | |
808 | 808 | sprintf "not_parsed: paths_size=%d chart_size=%d\n" result.paths_size result.chart_size ^ |
809 | 809 | (if verbosity = 0 then "" else |
810 | 810 | sprintf "<BR><A HREF=\"%s_1_chart.pdf\">Chart 1</A>\n" file_prefix) ^ |
... | ... | @@ -817,11 +817,11 @@ let html_of_eniam_sentence path file_prefix img verbosity tokens (result : eniam |
817 | 817 | "" |
818 | 818 | | ReductionError -> |
819 | 819 | if verbosity < 2 then () else ( |
820 | - ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_2_chart") "a4" result.chart2; | |
820 | + ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_2_chart") "a4" result.text_fragments result.chart2; | |
821 | 821 | ENIAM_LCGlatexOf.print_references path (file_prefix ^ "_2_references") "a0" result.references2; |
822 | - ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_3_chart") "a4" result.chart3); | |
822 | + ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_3_chart") "a4" result.text_fragments result.chart3); | |
823 | 823 | if verbosity = 0 then () else ( |
824 | - ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_1_chart") "a1" result.chart1; | |
824 | + ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_1_chart") "a1" result.text_fragments result.chart1; | |
825 | 825 | ENIAM_LCGlatexOf.print_references path (file_prefix ^ "_3_references") "a0" result.references3); |
826 | 826 | (if verbosity < 2 then "" else |
827 | 827 | sprintf "error_reduction: %s paths_size=%d chart_size=%d\n" result.msg result.paths_size result.chart_size ^ |
... | ... | @@ -834,10 +834,10 @@ let html_of_eniam_sentence path file_prefix img verbosity tokens (result : eniam |
834 | 834 | "" |
835 | 835 | | TooManyNodes -> |
836 | 836 | if verbosity < 2 then () else ( |
837 | - ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_1_chart") "a1" result.chart1; | |
838 | - ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_2_chart") "a4" result.chart2; | |
837 | + ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_1_chart") "a1" result.text_fragments result.chart1; | |
838 | + ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_2_chart") "a4" result.text_fragments result.chart2; | |
839 | 839 | ENIAM_LCGlatexOf.print_references path (file_prefix ^ "_2_references") "a0" result.references2; |
840 | - ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_3_chart") "a4" result.chart3; | |
840 | + ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_3_chart") "a4" result.text_fragments result.chart3; | |
841 | 841 | ENIAM_LCGlatexOf.print_references path (file_prefix ^ "_3_references") "a0" result.references3); |
842 | 842 | sprintf "to_many_nodes: paths_size=%d chart_size=%d\n" result.paths_size result.chart_size ^ |
843 | 843 | (if verbosity < 2 then "" else |
... | ... | @@ -849,10 +849,10 @@ let html_of_eniam_sentence path file_prefix img verbosity tokens (result : eniam |
849 | 849 | "" |
850 | 850 | | NotReduced -> |
851 | 851 | if verbosity < 2 then () else ( |
852 | - ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_1_chart") "a1" result.chart1; | |
853 | - ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_2_chart") "a4" result.chart2; | |
852 | + ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_1_chart") "a1" result.text_fragments result.chart1; | |
853 | + ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_2_chart") "a4" result.text_fragments result.chart2; | |
854 | 854 | ENIAM_LCGlatexOf.print_references path (file_prefix ^ "_2_references") "a0" result.references2; |
855 | - ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_3_chart") "a4" result.chart3); | |
855 | + ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_3_chart") "a4" result.text_fragments result.chart3); | |
856 | 856 | if verbosity = 0 then () else ( |
857 | 857 | ENIAM_LCGlatexOf.print_references path (file_prefix ^ "_3_references") "a0" result.references3; |
858 | 858 | Xlatex.latex_file_out path (file_prefix ^ "_4_term") "a4" false (fun file -> |
... | ... | @@ -872,10 +872,10 @@ let html_of_eniam_sentence path file_prefix img verbosity tokens (result : eniam |
872 | 872 | "" |
873 | 873 | | SemError -> |
874 | 874 | if verbosity < 2 then () else ( |
875 | - ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_1_chart") "a1" result.chart1; | |
876 | - ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_2_chart") "a4" result.chart2; | |
875 | + ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_1_chart") "a1" result.text_fragments result.chart1; | |
876 | + ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_2_chart") "a4" result.text_fragments result.chart2; | |
877 | 877 | ENIAM_LCGlatexOf.print_references path (file_prefix ^ "_2_references") "a0" result.references2; |
878 | - ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_3_chart") "a4" result.chart3); | |
878 | + ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_3_chart") "a4" result.text_fragments result.chart3); | |
879 | 879 | if verbosity = 0 then () else ( |
880 | 880 | ENIAM_LCGlatexOf.print_references path (file_prefix ^ "_3_references") "a0" result.references3; |
881 | 881 | Xlatex.latex_file_out path (file_prefix ^ "_4_term") "a4" false (fun file -> |
... | ... | @@ -895,10 +895,10 @@ let html_of_eniam_sentence path file_prefix img verbosity tokens (result : eniam |
895 | 895 | "" |
896 | 896 | | Parsed -> |
897 | 897 | if verbosity < 2 then () else ( |
898 | - ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_1_chart") "a1" result.chart1; | |
899 | - ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_2_chart") "a4" result.chart2; | |
898 | + ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_1_chart") "a1" result.text_fragments result.chart1; | |
899 | + ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_2_chart") "a4" result.text_fragments result.chart2; | |
900 | 900 | ENIAM_LCGlatexOf.print_references path (file_prefix ^ "_2_references") "a0" result.references2; |
901 | - ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_3_chart") "a4" result.chart3; | |
901 | + ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_3_chart") "a4" result.text_fragments result.chart3; | |
902 | 902 | ENIAM_LCGlatexOf.print_references path (file_prefix ^ "_3_references") "a0" result.references3; |
903 | 903 | Xlatex.latex_file_out path (file_prefix ^ "_4_term") "a4" false (fun file -> |
904 | 904 | Printf.fprintf file "\\[%s\\]\n" (ENIAM_LCGlatexOf.linear_term 0 result.term4)); |
... | ... |
exec/TODO
exec/makefile
... | ... | @@ -9,15 +9,15 @@ OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa b |
9 | 9 | eniam-lexSemantics.cmxa #eniam-exec.cmxa |
10 | 10 | INSTALLDIR=`ocamlc -where`/eniam |
11 | 11 | |
12 | -SOURCES= ENIAMexecTypes.ml ENIAMexec.ml ENIAMselectSent.ml ENIAMvisualization.ml | |
12 | +SOURCES= ENIAMexecTypes.ml ENIAMexec.ml ENIAMselectSent.ml ENIAMsemLexicon.ml ENIAMsemValence.ml ENIAMvisualization.ml | |
13 | 13 | |
14 | 14 | all: eniam-exec.cma eniam-exec.cmxa |
15 | 15 | |
16 | 16 | install: all |
17 | 17 | mkdir -p $(INSTALLDIR) |
18 | 18 | cp eniam-exec.cmxa eniam-exec.a eniam-exec.cma $(INSTALLDIR) |
19 | - cp ENIAMexecTypes.cmi ENIAMexec.cmi ENIAMselectSent.cmi ENIAMvisualization.cmi $(INSTALLDIR) | |
20 | - cp ENIAMexecTypes.cmx ENIAMexec.cmx ENIAMselectSent.cmx ENIAMvisualization.cmx $(INSTALLDIR) | |
19 | + cp ENIAMexecTypes.cmi ENIAMexec.cmi ENIAMselectSent.cmi ENIAMsemLexicon.cmi ENIAMsemValence.cmi ENIAMvisualization.cmi $(INSTALLDIR) | |
20 | + cp ENIAMexecTypes.cmx ENIAMexec.cmx ENIAMselectSent.cmx ENIAMsemLexicon.cmx ENIAMsemValence.cmx ENIAMvisualization.cmx $(INSTALLDIR) | |
21 | 21 | |
22 | 22 | |
23 | 23 | eniam-exec.cma: $(SOURCES) |
... | ... |
exec/parser.ml
... | ... | @@ -30,6 +30,7 @@ let img = ref 1 |
30 | 30 | let timeout = ref 30. |
31 | 31 | let select_sentence_modes_flag = ref false |
32 | 32 | let select_sentences_flag = ref true |
33 | +let assign_semantic_valence_flag = ref true | |
33 | 34 | let output_dir = ref "results/" |
34 | 35 | let spec_list = [ |
35 | 36 | (* "-s", Arg.Unit (fun () -> sentence_split:=true), "Split input into sentences (default)"; |
... | ... | @@ -56,6 +57,8 @@ let spec_list = [ |
56 | 57 | "--no_sel_modes", Arg.Unit (fun () -> select_sentence_modes_flag:=false), "Do not select sencence modes (default)"; |
57 | 58 | "--sel_sent", Arg.Unit (fun () -> select_sentences_flag:=true), "Select parsed sentences (default)"; |
58 | 59 | "--no_sel_sent", Arg.Unit (fun () -> select_sentences_flag:=false), "Do not select parsed sentences"; |
60 | + "--sem_valence", Arg.Unit (fun () -> assign_semantic_valence_flag:=true), "Assign semantic valence (default)"; | |
61 | + "--no_sem_valence", Arg.Unit (fun () -> assign_semantic_valence_flag:=false), "Do not assign semantic valence"; | |
59 | 62 | ] |
60 | 63 | |
61 | 64 | let usage_msg = |
... | ... | @@ -93,6 +96,8 @@ let rec main_loop sub_in sub_out = |
93 | 96 | let text = ENIAMexec.parse !timeout !verbosity rules tokens lex_sems text in |
94 | 97 | let text = if !select_sentence_modes_flag then ENIAMselectSent.select_sentence_modes_text text else text in |
95 | 98 | let text = if !select_sentences_flag then ENIAMselectSent.select_sentences_text ENIAMexecTypes.Struct text else text in |
99 | + let text = if !assign_semantic_valence_flag then ENIAMsemValence.assign tokens lex_sems text else text in | |
100 | + let text = if !assign_semantic_valence_flag then ENIAMsemValence.reduce tokens lex_sems text else text in | |
96 | 101 | ENIAMvisualization.print_html_text !output_dir "parsed_text" text !img !verbosity tokens); |
97 | 102 | prerr_endline "Done!"; |
98 | 103 | main_loop sub_in sub_out) |
... | ... | @@ -106,10 +111,7 @@ let _ = |
106 | 111 | prerr_endline message; |
107 | 112 | ENIAMcategoriesPL.initialize (); |
108 | 113 | Arg.parse spec_list anon_fun usage_msg; |
109 | - if !lexSemantics_built_in then ( | |
110 | - ENIAMsubsyntax.initialize (); | |
111 | - ENIAMwalParser.initialize (); | |
112 | - ENIAMwalReduce.initialize ()); | |
114 | + if !lexSemantics_built_in then ENIAMlexSemantics.initialize (); | |
113 | 115 | Gc.compact (); |
114 | 116 | let sub_in,sub_out = |
115 | 117 | if !lexSemantics_built_in then stdin,stdout |
... | ... |
exec/resources/lexicon-pl.dic
0 → 100644
1 | + | |
2 | +day-lex: /(date+day+day-month):unk; | |
3 | +date: /(1+year-lex):unk; | |
4 | +day: /month-lex:unk; | |
5 | +day-interval: /month-lex:unk; | |
6 | +day-month: /(1+year-lex):unk; | |
7 | +year-lex: |(1+adjp*number*case*gender):unk; | |
8 | +month-lex: /(1+year+np*T*gen*T*T):unk; | |
9 | + | |
10 | +date-interval: null; | |
11 | +day-month-interval: null; | |
12 | +month-interval: null; | |
13 | +year: null; | |
14 | +year-interval: null; | |
15 | +roman: null; | |
16 | +roman-interval: null; | |
17 | +hour-minute: null; | |
18 | +hour: null; | |
19 | +hour-minute-interval: null; | |
20 | +hour-interval: null; | |
21 | +obj-id: null; | |
22 | +match-result: null; | |
23 | +url: null; | |
24 | +email: null; | |
25 | + | |
26 | +np: | |
27 | + \(1+num*number*case*gender*person*congr+num*number*case*gender*person*rec):Count \(1+qub):adjunct /(1+inclusion):adjunct | |
28 | + \(1+measure*unumber*ucase*ugender*uperson):Measure | |
29 | + /(1+date+day+day-month+date-interval+day-interval+day-month-interval+year+year-interval+hour+hour-minute+hour-interval+hour-minute-interval):unk | |
30 | + |(1+year):unk /(1+obj-id):unk; | |
31 | + | |
32 | +num: \(1+qub):adjunct /(1+inclusion):adjunct; | |
33 | + | |
34 | +measure: | |
35 | + \(1+num*number*case*gender*person*congr+num*number*case*gender*person*rec):Count \(1+qub):adjunct /(1+inclusion):adjunct; | |
36 | + | |
37 | +prepnp: \(1+advp*T):adjunct /(np*T*case*T*T+day-month+day+year+date+hour+hour-minute):unk \(1+qub):adjunct /(1+inclusion):adjunct; | |
38 | +prepadjp: \(1+advp*T):adjunct /(adjp*T*case*T+adjp*sg*dat*m1+adjp*T*postp*T+adjp*sg*nom*f+advp*T):unk \(1+qub):adjunct /(1+inclusion):adjunct; | |
39 | +compar: \(1+advp*T):adjunct /(np*T*case*T*T+prepnp*T*T+prepadjp*T*T):unk \(1+qub):adjunct /(1+inclusion):adjunct; | |
40 | + | |
41 | +adjp: \(1+qub):adjunct /(1+inclusion):adjunct \(1+adja):unk; | |
42 | + | |
43 | +adja: /hyphen:nosem; | |
44 | + | |
45 | +advp: \(1+qub):adjunct /(1+inclusion):adjunct \(1+adja):unk; | |
46 | + | |
47 | +#FIXME: sprawdzić czy 'or' czy 'or2' | |
48 | +ip: /(1+int):unk \(1+qub):adjunct /(1+inclusion):adjunct \(1+nie):nosem |(1+aux-imp):nosem | |
49 | + |(1+aux-fut*number*gender*person+aux-past*number*gender*person):nosem |(1+aglt*number*person):nosem |(1+by):nosem /(1+or):adjunct; | |
50 | + | |
51 | +aux-fut: null; | |
52 | +aux-past: null; | |
53 | +aglt: null; | |
54 | + | |
55 | +infp: \(1+qub):adjunct /(1+inclusion):adjunct \(1+nie):nosem; | |
56 | +padvp: \(1+qub):adjunct /(1+inclusion):adjunct \(1+nie):nosem; | |
57 | + | |
58 | +cp: /ip*T*T*T:unk; | |
59 | +ncp: \(1+qub):adjunct /(1+inclusion):adjunct /cp*ctype*plemma:unk; | |
60 | + | |
61 | +#lemma=i|lub|czy|bądź,pos=conj: | |
62 | +# QUANT[number=all_numbers,gender=all_genders,person=all_persons] | |
63 | +# (ip*number*gender*person/ip*T*T*T)\ip*T*T*T; | |
64 | +#lemma=,|i|lub|czy|bądź,pos=conj: (advp*mod/prepnp*T*T)\prepnp*T*T; | |
65 | +#lemma=,|i|lub|czy|bądź,pos=conj: QUANT[mode=0] (advp*mode/advp*mode)\prepnp*T*T; | |
66 | +#lemma=,|i|lub|czy|bądź,pos=conj: QUANT[mode=0] (advp*mode/prepnp*T*T)\advp*mode; | |
67 | +#lemma=,|i|lub|czy|bądź,pos=conj: (advp*mod/advp*T)\advp*T; #FIXME: przydałaby się wersja zachowująca mode | |
68 | +#lemma=,|i|lub|czy|bądź,pos=conj: | |
69 | +# QUANT[plemma=0,case=all_cases] | |
70 | +# (prepnp*plemma*case/prepnp*plemma*case)\prepnp*plemma*case; | |
71 | +#lemma=,|i|lub|czy|bądź,pos=conj: | |
72 | +# QUANT[number=all_numbers,case=all_cases,gender=all_genders,person=all_persons] | |
73 | +# (np*number*case*gender*person/np*T*case*T*T)\np*T*case*T*T; | |
74 | +#lemma=,|i|lub|czy|bądź,pos=conj: | |
75 | +# QUANT[number=all_numbers,case=all_cases,gender=all_genders] | |
76 | +# (adjp*number*case*gender/adjp*number*case*gender)\adjp*number*case*gender; | |
77 | + | |
78 | +lex-się-qub: null; | |
79 | +nie: null; | |
80 | +by: null; | |
81 | +aux-imp: null; | |
82 | +qub: null; | |
83 | +interj: null; | |
84 | +hyphen: null; | |
85 | +int: null; | |
86 | +#lemma=„,pos=interp: QUANT[number=0,case=0,gender=0,person=0] (np*number*case*gender*person/rquot)/np*number*case*gender*person; #SetAttr("QUOT",Val "+",Var "x" | |
87 | +#lemma=«,pos=interp: QUANT[number=0,case=0,gender=0,person=0] (np*number*case*gender*person/rquot2)/np*number*case*gender*person; #SetAttr("QUOT",Val "+",Var "x" | |
88 | +#lemma=»,pos=interp: QUANT[number=0,case=0,gender=0,person=0] (np*number*case*gender*person/rquot3)/np*number*case*gender*person; #SetAttr("QUOT",Val "+",Var "x" | |
89 | +rquot: null; | |
90 | +rquot2: null; | |
91 | +rquot3: null; | |
92 | +#lemma=(,pos=interp: (inclusion/rparen)/(np*T*T*T*T+ip*T*T*T+adjp*T*T*T+prepnp*T*T); #SetAttr("INCLUSION",Val "+", | |
93 | +#lemma=[,pos=interp: (inclusion/rparen2)/(np*T*T*T*T+ip*T*T*T+adjp*T*T*T+prepnp*T*T); #SetAttr("INCLUSION",Val "+", | |
94 | +rparen: null; | |
95 | +rparen2: null; | |
96 | + | |
97 | +<conll_root>: /(ip*T*T*T+cp*int*T+np*sg*voc*T*T+interj):unk; | |
98 | +s: \?(ip*T*T*T+cp*int*T+np*sg*voc*T*T+interj):unk; | |
99 | +<root>: /(1+s):unk /(1+<speaker-end>):unk /(1+or):unk /(1+np*T*nom*T*T):unk /(1+ip*T*T*T):unk; | |
100 | + | |
101 | +or: null; | |
102 | +<colon>: \<speaker>:unk /(1+<squery>):unk; | |
103 | +or2: \?(ip*T*T*T+cp*int*T+np*sg*voc*T*T+interj):unk; | |
104 | +<speaker-end>: null | |
... | ... |
lexSemantics/ENIAMadjuncts.ml
... | ... | @@ -268,6 +268,27 @@ let add_adjuncts preps compreps compars pos2 (selectors,schema) = |
268 | 268 | | "adv" -> [selectors,schema @ ENIAMwalRenderer.adv_adjuncts_simp @ compars] |
269 | 269 | | _ -> [] |
270 | 270 | |
271 | +open ENIAMlexSemanticsTypes | |
272 | + | |
273 | +let add_subj_cr cr positions = | |
274 | + Xlist.map positions (fun p -> | |
275 | + if p.gf = SUBJ then {p with cr=cr :: p.cr} else p) | |
276 | + | |
277 | +let add_connected_adjuncts preps compreps compars pos2 frame = | |
278 | + let compreps = Xlist.rev_map compreps ENIAMwalRenderer.render_connected_comprep in | |
279 | + let prepnps = Xlist.rev_map preps (fun (prep,cases) -> ENIAMwalRenderer.render_connected_prepnp prep cases) in | |
280 | + let prepadjps = Xlist.rev_map preps (fun (prep,cases) -> ENIAMwalRenderer.render_connected_prepadjp prep cases) in | |
281 | + let compars = Xlist.rev_map compars ENIAMwalRenderer.render_connected_compar in | |
282 | + match pos2 with | |
283 | + "verb" -> [{frame with positions=(add_subj_cr "3" frame.positions) @ ENIAMwalRenderer.verb_connected_adjuncts_simp @ prepnps @ prepadjps @ compreps @ compars}] | |
284 | + | "noun" -> [ | |
285 | + {frame with selectors=[Nsyn,Eq,["proper"]] @ frame.selectors; positions=ENIAMwalRenderer.proper_noun_connected_adjuncts_simp @ prepnps @ compreps @ compars}; | |
286 | + {frame with selectors=[Nsyn,Eq,["common"];Nsem,Eq,["measure"]] @ frame.selectors; positions=ENIAMwalRenderer.measure_noun_connected_adjuncts_simp @ prepnps @ compreps @ compars}; | |
287 | + {frame with selectors=[Nsyn,Eq,["common"];Nsem,Neq,["measure"]] @ frame.selectors; positions=frame.positions @ ENIAMwalRenderer.common_noun_connected_adjuncts_simp @ prepnps @ compreps @ compars}] | |
288 | + | "adj" -> [{frame with positions=frame.positions @ ENIAMwalRenderer.adj_connected_adjuncts_simp @ compars}] | |
289 | + | "adv" -> [{frame with positions=frame.positions @ ENIAMwalRenderer.adv_connected_adjuncts_simp @ compars}] | |
290 | + | _ -> [] | |
291 | + | |
271 | 292 | (* let _ = |
272 | 293 | let schemata,entries = ENIAMvalence.prepare_all_valence ENIAMwalParser.phrases ENIAMwalParser.schemata ENIAMwalParser.entries in |
273 | 294 | let _ = Entries.map2 schemata (fun pos lemma schemata -> simplify_schemata pos (ENIAMvalence.simplify_pos pos) lemma schemata) in |
... | ... |
lexSemantics/ENIAMlexSemantics.ml
... | ... | @@ -23,12 +23,61 @@ open ENIAMlexSemanticsTypes |
23 | 23 | open ENIAMwalTypes |
24 | 24 | open Xstd |
25 | 25 | |
26 | -(*let find_senses t = (* FIXME: sensy zawierające 'się' *) | |
27 | - match t.token with | |
28 | - Lemma(lemma,pos,_) -> ENIAMplWordnet.find_senses lemma pos | |
29 | - | Proper(_,_,_,senses) -> ENIAMplWordnet.find_proper_senses senses | |
30 | - | _ -> [] | |
31 | -*) | |
26 | +let find_meaning m = | |
27 | + try | |
28 | + ENIAMplWordnet.find_meaning m.plwnluid | |
29 | + with Not_found -> | |
30 | + m.name ^ "-" ^ m.variant, [], unknown_meaning_weight | |
31 | + | |
32 | +let find_prep_meaning lemma hipero = | |
33 | + let hipero = match hipero with | |
34 | + [Predef hipero] -> hipero | |
35 | + | _ -> failwith "find_prep_meaning" in | |
36 | + if hipero = "ALL" then lemma, [hipero,0], unknown_meaning_weight else | |
37 | + let syn_id = StringMap.find !ENIAMplWordnet.predef hipero in | |
38 | + let hipero = IntMap.fold (ENIAMplWordnet.get_hipero syn_id) [] (fun hipero syn_id cost -> (ENIAMplWordnet.synset_name syn_id, cost) :: hipero) in | |
39 | + lemma, hipero, unknown_meaning_weight | |
40 | + | |
41 | +let lex_sie = LCG (ENIAMwalRenderer.render_morf (SimpleLexArg("się",QUB))) | |
42 | + | |
43 | +(* FIXME: naiwnie wierzymy, że jeśli leksem jest opisany semantycznie w walentym to zawiera ramy dla wszystkich sensów *) | |
44 | +let find_senses t s = | |
45 | + (*let set = Xlist.fold s.frames StringSet.empty (fun set frame -> | |
46 | + Xlist.fold frame.meanings set (fun set (name,hipero,weight) -> | |
47 | + StringSet.add set name)) in*) | |
48 | + let senses = match t.token with | |
49 | + Lemma(lemma,pos,_) -> ENIAMplWordnet.find_senses lemma pos | |
50 | + | Proper(_,_,_,senses) -> ENIAMplWordnet.find_proper_senses senses | |
51 | + | _ -> [] in | |
52 | + (* let senses = | |
53 | + Xlist.fold senses [] (fun senses (name,hipero,weight) -> | |
54 | + if StringSet.mem set name then senses else (name,hipero,weight) :: senses) in *) | |
55 | + let senses_sie = match t.token with | |
56 | + Lemma(lemma,pos,_) -> ENIAMplWordnet.find_senses (lemma ^ " się") pos | |
57 | + | Proper(_,_,_,senses) -> [] | |
58 | + | _ -> [] in | |
59 | +(* let senses_sie = Xlist.fold senses_sie [] (fun senses_sie (name,hipero,weight) -> | |
60 | + if StringSet.mem set name then senses_sie else (name,hipero,weight) :: senses_sie) in | |
61 | + let frames = if senses = [] then s.frames else {empty_frame with meanings=senses} :: s.frames in | |
62 | + let frames = if senses_sie = [] then frames else {empty_frame with meanings=senses_sie; | |
63 | + positions=[{empty_position with role="Lemma"; mode=["lemma"]; morfs=[lex_sie]; is_necessary=Req}]} :: frames in*) (* FIXME: czy to nie usuwa elementów z ramy? *) | |
64 | + let frames = Xlist.fold s.frames [] (fun frames f -> | |
65 | + if f.meanings <> [] then f :: frames else | |
66 | + if senses = [] && senses_sie = [] then {f with meanings=[ENIAMtokens.get_lemma t.token, [], unknown_meaning_weight]} :: frames else | |
67 | + (if senses_sie = [] then [] else [{f with meanings=senses_sie; positions={empty_position with role="Lemma"; mode=["lemma"]; morfs=[lex_sie]; is_necessary=Req} :: f.positions}]) @ | |
68 | + [{f with meanings=senses}] @ frames) in | |
69 | + {s with frames=frames} | |
70 | + | |
71 | +let find_selprefs schema = (* FIXME: RelationRole *) | |
72 | + Xlist.map schema (fun p -> | |
73 | + let l = Xlist.fold p.sel_prefs [] (fun l -> function | |
74 | + SynsetId id -> (try ENIAMplWordnet.synset_name id :: l with ENIAMplWordnet.SynsetNotFound -> l) | |
75 | + | Predef s -> s :: l | |
76 | + | SynsetName _ -> failwith "find_selprefs" | |
77 | + | RelationRole _ -> l) in | |
78 | + let l = if l = [] then ["ALL"] else l in | |
79 | + {p with sel_prefs=Xlist.map l (fun s -> SynsetName s)}) | |
80 | + | |
32 | 81 | let rec find a l i = |
33 | 82 | if a.(i) = max_int then ( |
34 | 83 | a.(i) <- i; |
... | ... | @@ -102,6 +151,33 @@ let get_preps tokens group = (* FIXME: To nie zadziała przy kilku wystąpieniac |
102 | 151 | | _ -> preps,compars) in |
103 | 152 | StringMap.fold preps [] (fun l prep v -> (prep, StringSet.to_list v) :: l), StringSet.to_list compars |
104 | 153 | |
154 | +let make_unique schemata = | |
155 | + let map = Xlist.fold schemata StringMap.empty (fun map (selectors,schema) -> | |
156 | + let s = "[" ^ ENIAMcategoriesPL.string_of_selectors selectors ^ "] {" ^ ENIAMwalStringOf.schema schema ^ "}" in | |
157 | + StringMap.add map s (selectors,schema)) in | |
158 | + StringMap.fold map [] (fun l _ (selectors,schema) -> (selectors,schema) :: l) | |
159 | + | |
160 | +let semantize lemma pos (selectors,schema) = | |
161 | + let schema = Xlist.rev_map schema (fun p -> | |
162 | + {p with role="Arg"; sel_prefs=[Predef "X"]}) in (* FIXME: zaślepka, żeby preferować znane argumenty *) | |
163 | + Xlist.rev_map (ENIAMvalence.get_aroles schema lemma pos) (fun (sel,arole,arole_attr,arev) -> | |
164 | + {empty_frame with selectors=sel @ selectors; positions=schema; | |
165 | + arole=arole; arole_attr=arole_attr; arev=arev}) | |
166 | + | |
167 | +let assign_prep_semantics lemma = | |
168 | + let roles = try StringMap.find ENIAMlexSemanticsData.prep_roles lemma with Not_found -> [] in | |
169 | + Printf.printf "assign_prep_semantics: |roles|=%d\n%!" (Xlist.size roles); | |
170 | + Xlist.map roles (function (case,arole,arole_attr,hipero,sel_prefs) -> | |
171 | + Printf.printf "assign_prep_semantics: case=%s arole=%s arole_attr=%s\n%!" case arole arole_attr; | |
172 | + let meaning = find_prep_meaning lemma hipero in (* FIXME: zaślepka dla meaning i weight *) | |
173 | + print_endline "assign_prep_semantics 1"; | |
174 | + let positions = [{empty_position with | |
175 | + sel_prefs=sel_prefs; dir=if lemma="temu" then Backward_ else Forward_; | |
176 | + morfs=ENIAMwalRenderer.assing_pref_morfs (lemma,case); is_necessary=Req}] in | |
177 | + print_endline "assign_prep_semantics 2"; | |
178 | + {empty_frame with selectors=[ENIAM_LCGlexiconTypes.Case,ENIAM_LCGlexiconTypes.Eq,[case]]; meanings=[meaning]; positions=find_selprefs positions; | |
179 | + arole=arole; arole_attr=arole_attr; arev=false}) | |
180 | + | |
105 | 181 | let assign_valence tokens lex_sems group = |
106 | 182 | let lexemes = Xlist.fold group StringSet.empty (fun lexemes id -> |
107 | 183 | let lemma = ENIAMtokens.get_lemma (ExtArray.get tokens id).token in |
... | ... | @@ -118,10 +194,10 @@ let assign_valence tokens lex_sems group = |
118 | 194 | (* Printf.printf "A %s %s %s |schemata|=%d\n" lemma pos pos2 (Xlist.size schemata); *) |
119 | 195 | let entries = Entries.find entries pos lemma in |
120 | 196 | let connected = Entries.find connected pos2 lemma in |
121 | - let schemata = List.flatten (Xlist.map schemata (fun (opinion,neg,pred,aspect,schema) -> | |
122 | - ENIAMvalence.transform_entry pos lemma neg pred aspect schema)) in (* FIXME: gubię opinię *) | |
197 | + let schemata1 = List.flatten (Xlist.map schemata (fun (opinion,neg,pred,aspect,schema) -> | |
198 | + ENIAMvalence.transform_entry pos lemma neg pred aspect schema)) in (* gubię opinię *) | |
123 | 199 | (* Printf.printf "B %s |schemata|=%d\n" lemma (Xlist.size schemata); *) |
124 | - let schemata = ENIAMadjuncts.simplify_schemata lexemes pos pos2 lemma schemata in | |
200 | + let schemata = ENIAMadjuncts.simplify_schemata lexemes pos pos2 lemma schemata1 in | |
125 | 201 | (* Printf.printf "C %s |schemata|=%d\n" lemma (Xlist.size schemata); *) |
126 | 202 | let schemata = Xlist.rev_map schemata (fun (selectors,schema) -> |
127 | 203 | selectors,ENIAMwalRenderer.render_simple_schema schema) in |
... | ... | @@ -130,13 +206,31 @@ let assign_valence tokens lex_sems group = |
130 | 206 | let entries = List.flatten (Xlist.rev_map entries (ENIAMvalence.transform_lex_entry pos lemma)) in |
131 | 207 | let entries = Xlist.map entries (fun (selectors,entry) -> |
132 | 208 | selectors,ENIAMwalRenderer.render_lex_entry entry) in |
133 | - let connected = List.flatten (Xlist.map connected (fun (sopinion,fopinion,meanings,neg,pred,aspect,schema) -> | |
134 | - Xlist.rev_map (ENIAMvalence.transform_entry pos lemma neg pred aspect schema) (fun (selectors,schema) -> | |
135 | - selectors,meanings,schema))) in (* FIXME: gubię opinię *) | |
136 | - let connected = Xlist.fold connected [] (fun connected (selectors,meanings,schema) -> | |
137 | - if ENIAMadjuncts.check_selector_lex_constraints lexemes pos selectors then (selectors,meanings,schema) :: connected else connected) in | |
138 | - let connected = Xlist.rev_map connected (fun (selectors,meanings,schema) -> | |
139 | - selectors,meanings,ENIAMwalRenderer.render_connected_schema schema) in | |
209 | + let connected = List.flatten (Xlist.map connected (fun (sopinion,fopinion,meanings,neg,pred,aspect,schema1) -> | |
210 | + List.flatten (Xlist.rev_map (ENIAMvalence.transform_entry pos lemma neg pred aspect schema1) (fun (selectors,schema) -> | |
211 | + Xlist.rev_map (ENIAMvalence.get_aroles schema1 lemma pos) (fun (sel,arole,arole_attr,arev) -> | |
212 | + {selectors=sel @ selectors; meanings=Xlist.map meanings find_meaning; positions=schema; | |
213 | + arole=arole; arole_attr=arole_attr; arev=arev; sopinion=sopinion; fopinion=fopinion}))))) in | |
214 | + (* Printf.printf "E %s |connected|=%d\n" lemma (Xlist.size connected); *) | |
215 | + let connected = if connected = [] then List.flatten (Xlist.rev_map (make_unique schemata1) (semantize lemma pos)) else connected in | |
216 | + (* Printf.printf "F %s |connected|=%d\n" lemma (Xlist.size connected); *) | |
217 | + let connected = Xlist.fold connected [] (fun connected frame -> | |
218 | + if ENIAMadjuncts.check_selector_lex_constraints lexemes pos frame.selectors then frame :: connected else connected) in | |
219 | + (* Printf.printf "G %s |connected|=%d\n" lemma (Xlist.size connected); *) | |
220 | + let connected = Xlist.rev_map connected (fun frame -> | |
221 | + {frame with | |
222 | + positions = find_selprefs (ENIAMwalRenderer.render_connected_schema (ENIAMwalReduce.set_necessary frame.positions))}) in | |
223 | + (* Printf.printf "H %s |connected|=%d\n" lemma (Xlist.size connected); *) | |
224 | + let connected = List.flatten (Xlist.rev_map connected (ENIAMadjuncts.add_connected_adjuncts preps compreps compars pos2)) in | |
225 | + (* Printf.printf "I %s |connected|=%d\n" lemma (Xlist.size connected); *) | |
226 | + let connected = if pos = "prep" then | |
227 | + if connected <> [] then failwith "assign_valence" else | |
228 | + assign_prep_semantics lemma else connected in | |
229 | + (* Printf.printf "J %s |connected|=%d\n" lemma (Xlist.size connected); *) | |
230 | + let connected = if connected = [] then | |
231 | + Xlist.rev_map (ENIAMvalence.get_aroles [] lemma pos) (fun (sel,arole,arole_attr,arev) -> | |
232 | + {empty_frame with selectors=sel; arole=arole; arole_attr=arole_attr; arev=arev}) else connected in | |
233 | + (* Printf.printf "K %s |connected|=%d\n" lemma (Xlist.size connected); *) | |
140 | 234 | ExtArray.set lex_sems id {(ExtArray.get lex_sems id) with |
141 | 235 | schemata=schemata; lex_entries=entries; frames=connected}) |
142 | 236 | |
... | ... | @@ -177,60 +271,54 @@ let assign_valence tokens lex_sems group = |
177 | 271 | (if pos = "subst" || pos = "depr" then "p" ^ pos else pos)) (fun frame -> 0,frame) with Not_found -> [](*failwith ("assign_valence: Proper(" ^ lemma ^ "," ^ pos ^ ")")*))}; |
178 | 272 | ExtArray.set tokens id {(ExtArray.get tokens id) with token=Lemma(lemma,pos,interp)} |
179 | 273 | | _ -> ()) |
274 | +*) | |
180 | 275 | |
181 | -let get_prefs_schema prefs schema = | |
182 | - Xlist.fold schema prefs (fun prefs t -> | |
183 | - Xlist.fold t.sel_prefs prefs StringSet.add) | |
184 | - | |
185 | -let map_prefs_schema senses schema = | |
186 | - Xlist.map schema (fun t -> | |
187 | - if Xlist.mem t.morfs (Phrase Pro) || Xlist.mem t.morfs (Phrase ProNG) then t else | |
188 | - {t with sel_prefs = Xlist.fold t.sel_prefs [] (fun l s -> | |
189 | - if StringSet.mem senses s then s :: l else l)}) | |
190 | 276 | |
191 | 277 | let disambiguate_senses lex_sems group = |
192 | 278 | let prefs = Xlist.fold group (StringSet.singleton "ALL") (fun prefs id -> |
193 | - Xlist.fold (ExtArray.get lex_sems id).valence prefs (fun prefs -> function | |
194 | - _,Frame(_,schema) -> get_prefs_schema prefs schema | |
195 | - | _,LexFrame(_,_,_,schema) -> get_prefs_schema prefs schema | |
196 | - | _,ComprepFrame(_,_,_,schema) -> get_prefs_schema prefs schema)) in | |
197 | - let hipero = Xlist.fold group (StringSet.singleton "ALL") (fun hipero id -> | |
279 | + Xlist.fold (ExtArray.get lex_sems id).frames prefs (fun prefs frame -> | |
280 | + Xlist.fold frame.positions prefs (fun prefs t -> | |
281 | + Xlist.fold t.sel_prefs prefs (fun prefs -> function | |
282 | + SynsetName s -> StringSet.add prefs s | |
283 | + | _ -> failwith "disambiguate_senses")))) in | |
284 | + (*let hipero = Xlist.fold group (StringSet.singleton "ALL") (fun hipero id -> | |
198 | 285 | Xlist.fold (ExtArray.get lex_sems id).senses hipero (fun hipero (_,l,_) -> |
199 | 286 | Xlist.fold l hipero StringSet.add)) in |
200 | 287 | let senses = StringSet.intersection prefs hipero in |
201 | 288 | let is_zero = StringSet.mem hipero "0" in |
202 | - let senses = if is_zero then StringSet.add senses "0" else senses in | |
289 | + let senses = if is_zero then StringSet.add senses "0" else senses in*) | |
203 | 290 | Xlist.iter group (fun id -> |
204 | 291 | let t = ExtArray.get lex_sems id in |
205 | - ExtArray.set lex_sems id {t with valence = if is_zero then t.valence else | |
206 | - Xlist.map t.valence (function | |
207 | - n,Frame(a,schema) -> n,Frame(a,map_prefs_schema senses schema) | |
208 | - | n,LexFrame(s,p,r,schema) -> n,LexFrame(s,p,r,map_prefs_schema senses schema) | |
209 | - | n,ComprepFrame(s,p,r,schema) -> n,ComprepFrame(s,p,r,map_prefs_schema senses schema)); | |
210 | - senses = Xlist.map t.senses (fun (s,l,w) -> | |
211 | - s, List.rev (Xlist.fold l [] (fun l s -> if StringSet.mem senses s then s :: l else l)),w)}) | |
212 | - | |
213 | -*) | |
292 | + ExtArray.set lex_sems id {t with frames=Xlist.map t.frames (fun frame -> | |
293 | + let meanings = Xlist.map frame.meanings (fun (name,hipero,weight) -> | |
294 | + let hipero = Xlist.fold hipero ["ALL",0] (fun hipero (name,cost) -> | |
295 | + if StringSet.mem prefs name then (name,cost) :: hipero else hipero) in | |
296 | + name,hipero,weight) in | |
297 | + {frame with meanings=meanings})}) | |
214 | 298 | |
299 | +let remove_unused_tokens tokens groups = | |
300 | + let set = Xlist.fold groups IntSet.empty (fun set group -> | |
301 | + Xlist.fold group set IntSet.add) in | |
302 | + Int.iter 1 (ExtArray.size tokens - 1) (fun i -> | |
303 | + if IntSet.mem set i then () else | |
304 | + ExtArray.set tokens i ENIAMtokenizerTypes.empty_token_env) | |
215 | 305 | |
216 | 306 | let assign tokens text = |
217 | 307 | let lex_sems = ExtArray.make (ExtArray.size tokens) empty_lex_sem in |
218 | 308 | let _ = ExtArray.add lex_sems empty_lex_sem in |
219 | 309 | Int.iter 1 (ExtArray.size tokens - 1) (fun i -> |
220 | - (* let token = ExtArray.get tokens i in | |
221 | - (* ExtArray.set tokens i token; *) | |
222 | - let senses = find_senses token in *) | |
223 | - let lex_sem = {empty_lex_sem with senses=[](*senses*)} in | |
224 | - let _ = ExtArray.add lex_sems lex_sem in | |
225 | - ()); | |
310 | + ignore (ExtArray.add lex_sems empty_lex_sem)); | |
226 | 311 | let groups = split_tokens_into_groups (ExtArray.size tokens) text in |
227 | 312 | (* Xlist.iter groups (fun group -> print_endline (String.concat " " (Xlist.map group string_of_int))); *) |
313 | + remove_unused_tokens tokens groups; | |
228 | 314 | Xlist.iter groups (fun group -> assign_valence tokens lex_sems group); |
229 | - (* Xlist.iter groups (fun group -> assign_valence tokens lex_sems group); | |
315 | + Int.iter 1 (ExtArray.size tokens - 1) (fun i -> | |
316 | + let token = ExtArray.get tokens i in | |
317 | + let lex_sem = ExtArray.get lex_sems i in | |
318 | + let lex_sem = find_senses token lex_sem in | |
319 | + ExtArray.set lex_sems i lex_sem); | |
230 | 320 | Xlist.iter groups (fun group -> disambiguate_senses lex_sems group); |
231 | - Xlist.iter groups (fun group -> assign_simplified_valence tokens lex_sems group); | |
232 | - Xlist.iter groups (fun group -> assign_very_simplified_valence tokens lex_sems group); | |
233 | - Xlist.iter groups (fun group -> ENIAMlexSemanticsData.assign_semantics tokens lex_sems group); *) | |
321 | + (*Xlist.iter groups (fun group -> ENIAMlexSemanticsData.assign_semantics tokens lex_sems group); *) | |
234 | 322 | lex_sems |
235 | 323 | |
236 | 324 | let catch_assign tokens text = |
... | ... | @@ -239,3 +327,11 @@ let catch_assign tokens text = |
239 | 327 | with e -> |
240 | 328 | ExtArray.make 0 empty_lex_sem, |
241 | 329 | Printexc.to_string e |
330 | + | |
331 | +let initialize () = | |
332 | + ENIAMsubsyntax.initialize (); | |
333 | + ENIAMwalParser.initialize (); | |
334 | + ENIAMwalReduce.initialize (); | |
335 | + ENIAMplWordnet.initialize (); | |
336 | + ENIAMcategoriesPL.initialize (); | |
337 | + () | |
... | ... |
lexSemantics/ENIAMlexSemanticsData.ml
... | ... | @@ -21,13 +21,13 @@ open ENIAMtokenizerTypes |
21 | 21 | open ENIAMlexSemanticsTypes |
22 | 22 | open Xstd |
23 | 23 | |
24 | -let subst_inst_roles = Xlist.fold [ | |
25 | - "wiosna", "Time",""; | |
26 | - "lato", "Time",""; | |
27 | - "jesień", "Time",""; | |
28 | - "zima", "Time",""; | |
29 | - "wieczór", "Time",""; | |
30 | - ] StringMap.empty (fun map (k,r,a) -> StringMap.add map k (r,a)) | |
24 | +let subst_inst_time = StringSet.of_list [ | |
25 | + "wiosna"; | |
26 | + "lato"; | |
27 | + "jesień"; | |
28 | + "zima"; | |
29 | + "wieczór"; | |
30 | + ] | |
31 | 31 | |
32 | 32 | let adj_roles = Xlist.fold [ |
33 | 33 | "ten", "Apoz",""; |
... | ... | @@ -59,7 +59,7 @@ let adj_roles = Xlist.fold [ |
59 | 59 | "taki", "Attribute",""; |
60 | 60 | "czyj", "Possesive",""; |
61 | 61 | "który", "Attribute",""; |
62 | - ] StringMap.empty (fun map (k,r,a) -> StringMap.add map k (r,a)) | |
62 | + ] StringMap.empty (fun map (k,r,a) -> StringMap.add_inc map k [r,a] (fun l -> (r,a) :: l)) | |
63 | 63 | |
64 | 64 | let adv_roles = Xlist.fold [ (* FIXME: problem z podwójnymi przypisaniami *) |
65 | 65 | (* operators: nielokalnie zmieniaja formułe logiczna *) |
... | ... | @@ -80,7 +80,7 @@ let adv_roles = Xlist.fold [ (* FIXME: problem z podwójnymi przypisaniami *) |
80 | 80 | "dlatego", "Condition",""; (* odniesieniem argumentu jest sytuacji/kontekst *) |
81 | 81 | "tak", "Manner",""; (* odniesieniem argumentu jest sytuacji/kontekst, byc może deiktyczny *) |
82 | 82 | |
83 | - "skąd", "Location","Source"; | |
83 | +(* "skąd", "Location","Source"; | |
84 | 84 | "skądkolwiek", "Location","Source"; |
85 | 85 | "skądś", "Location","Source"; |
86 | 86 | "skądże", "Location","Source"; |
... | ... | @@ -209,8 +209,8 @@ let adv_roles = Xlist.fold [ (* FIXME: problem z podwójnymi przypisaniami *) |
209 | 209 | "ongi", "Time",""; |
210 | 210 | "ongiś", "Time",""; |
211 | 211 | "wczas", "Time",""; |
212 | - "wonczas", "Time",""; | |
213 | - ] StringMap.empty (fun map (k,r,a) -> StringMap.add map k (r,a)) | |
212 | + "wonczas", "Time","";*) | |
213 | + ] StringMap.empty (fun map (k,r,a) -> StringMap.add_inc map k [r,a] (fun l -> (r,a) :: l)) | |
214 | 214 | |
215 | 215 | let qub_roles = Xlist.fold [ |
216 | 216 | "tylko", "Quantifier",""; |
... | ... | @@ -236,10 +236,10 @@ let qub_roles = Xlist.fold [ |
236 | 236 | "ponad", "Mod",""; |
237 | 237 | "prawie", "Mod",""; |
238 | 238 | "przynajmniej", "Mod",""; |
239 | - ] StringMap.empty (fun map (k,r,a) -> StringMap.add map k (r,a)) | |
239 | + ] StringMap.empty (fun map (k,r,a) -> StringMap.add_inc map k [r,a] (fun l -> (r,a) :: l)) | |
240 | 240 | |
241 | 241 | |
242 | -let prep_roles = Xlist.fold [ (* lemma,case,role,role_attr,meaning/hipero,sel_prefs *)(* FIXME: problem z podwójnymi przypisaniami *) | |
242 | +let prep_roles = Xlist.fold [ (* lemma,case,role,role_attr,meaning/hipero,sel_prefs *) | |
243 | 243 | "od","gen", "Location","Source",["POŁOŻENIE"],["POŁOŻENIE"]; |
244 | 244 | "spod","gen", "Location","Source",["POŁOŻENIE"],["POŁOŻENIE"]; |
245 | 245 | "spomiędzy","gen", "Location","Source",["POŁOŻENIE"],["POŁOŻENIE"]; |
... | ... | @@ -249,12 +249,14 @@ let prep_roles = Xlist.fold [ (* lemma,case,role,role_attr,meaning/hipero,sel_pr |
249 | 249 | "spoza","gen", "Location","Source",["POŁOŻENIE"],["POŁOŻENIE"]; |
250 | 250 | "sprzed","gen", "Location","Source",["POŁOŻENIE"],["POŁOŻENIE"]; |
251 | 251 | "z","gen", "Location","Source",["POŁOŻENIE"],["POŁOŻENIE"]; |
252 | + "z","postp", "Location","Source",["POŁOŻENIE"],["POŁOŻENIE"]; | |
252 | 253 | "znad","gen", "Location","Source",["POŁOŻENIE"],["POŁOŻENIE"]; |
253 | 254 | "zza","gen", "Location","Source",["POŁOŻENIE"],["POŁOŻENIE"]; |
254 | 255 | "do","gen", "Location","Goal",["POŁOŻENIE"],["POŁOŻENIE"]; |
255 | 256 | "ku","dat", "Location","Goal",["POŁOŻENIE"],["POŁOŻENIE"]; |
256 | 257 | "między","acc", "Location","Goal",["POŁOŻENIE"],["POŁOŻENIE"]; |
257 | 258 | "na","acc", "Location","Goal",["POŁOŻENIE"],["POŁOŻENIE"]; |
259 | + "na","postp", "Location","Goal",["POŁOŻENIE"],["POŁOŻENIE"]; | |
258 | 260 | "nad","acc", "Location","Goal",["POŁOŻENIE"],["POŁOŻENIE"]; |
259 | 261 | "nieopodal","gen", "Location","Goal",["POŁOŻENIE"],["POŁOŻENIE"]; |
260 | 262 | "opodal","gen", "Location","Goal",["POŁOŻENIE"],["POŁOŻENIE"]; |
... | ... | @@ -267,6 +269,7 @@ let prep_roles = Xlist.fold [ (* lemma,case,role,role_attr,meaning/hipero,sel_pr |
267 | 269 | "za","acc", "Location","Goal",["POŁOŻENIE"],["POŁOŻENIE"]; |
268 | 270 | "dzięki","dat", "Condition","",["CZEMU"],[]; |
269 | 271 | "na","acc", "Condition","",["CZEMU"],[]; |
272 | + "na","postp", "Condition","",["CZEMU"],[]; | |
270 | 273 | "od","gen", "Condition","",["CZEMU"],[]; |
271 | 274 | "przez","acc", "Condition","",["CZEMU"],[]; |
272 | 275 | "wskutek","gen", "Condition","",["CZEMU"],[]; |
... | ... | @@ -275,6 +278,7 @@ let prep_roles = Xlist.fold [ (* lemma,case,role,role_attr,meaning/hipero,sel_pr |
275 | 278 | "do","gen", "Purpose","",["CZEMU"],[]; |
276 | 279 | "ku","dat", "Purpose","",["CZEMU"],[]; |
277 | 280 | "na","acc", "Purpose","",["CZEMU"],[]; |
281 | + "na","postp", "Purpose","",["CZEMU"],[]; | |
278 | 282 | "po","acc", "Purpose","",["CZEMU"],[]; |
279 | 283 | "do","gen", "Duration","",["CZAS"],["CZAS"]; |
280 | 284 | "od","gen", "Duration","",["CZAS"],["CZAS"]; |
... | ... | @@ -285,6 +289,7 @@ let prep_roles = Xlist.fold [ (* lemma,case,role,role_attr,meaning/hipero,sel_pr |
285 | 289 | "między","inst", "Location","",["POŁOŻENIE"],["POŁOŻENIE"]; |
286 | 290 | "nad","inst", "Location","",["POŁOŻENIE"],["POŁOŻENIE"]; |
287 | 291 | "na","loc", "Location","",["POŁOŻENIE"],["POŁOŻENIE"]; |
292 | + "na","postp", "Location","",["POŁOŻENIE"],["POŁOŻENIE"]; | |
288 | 293 | "naokoło","gen", "Location","",["POŁOŻENIE"],["POŁOŻENIE"]; |
289 | 294 | "naprzeciw","gen", "Location","",["POŁOŻENIE"],["POŁOŻENIE"]; |
290 | 295 | "naprzeciwko","gen", "Location","",["POŁOŻENIE"],["POŁOŻENIE"]; |
... | ... | @@ -327,6 +332,7 @@ let prep_roles = Xlist.fold [ (* lemma,case,role,role_attr,meaning/hipero,sel_pr |
327 | 332 | "jak","str", "Manner","",[],[];*) |
328 | 333 | "pod","acc", "Manner","",["ALL"],[]; |
329 | 334 | "z","inst", "Manner","",["ALL"],[]; |
335 | + "z","postp", "Manner","",["ALL"],[]; | |
330 | 336 | "dokoła","gen", "Path","",["POŁOŻENIE"],["POŁOŻENIE"]; |
331 | 337 | "dookoła","gen", "Path","",["POŁOŻENIE"],["POŁOŻENIE"]; |
332 | 338 | "koło","gen", "Path","",["POŁOŻENIE"],["POŁOŻENIE"]; |
... | ... | @@ -363,10 +369,10 @@ let prep_roles = Xlist.fold [ (* lemma,case,role,role_attr,meaning/hipero,sel_pr |
363 | 369 | "temu","acc", "Time","",["CZAS"],["CZAS"]; (* dodane *) |
364 | 370 | "za","gen", "Time","",["CZAS"],["CZAS"]; (* dodane *) |
365 | 371 | ] StringMap.empty (fun map (lemma,case,role,role_attr,hipero,sel_prefs) -> |
366 | - let hipero = Xlist.fold hipero StringSet.empty ENIAMplWordnet.get_hipero_rec in | |
367 | - let map2 = try StringMap.find map lemma with Not_found -> StringMap.empty in | |
368 | - let map2 = StringMap.add_inc map2 case [case,role,role_attr,hipero,sel_prefs] (fun l -> (case,role,role_attr,hipero,sel_prefs) :: l) in | |
369 | - StringMap.add map lemma map2) | |
372 | + let hipero = Xlist.map hipero (fun hipero -> ENIAMwalTypes.Predef hipero) in | |
373 | + let sel_prefs = Xlist.map sel_prefs (fun sel_prefs -> ENIAMwalTypes.Predef sel_prefs) in | |
374 | + StringMap.add_inc map lemma [case,role,role_attr,hipero,sel_prefs] | |
375 | + (fun l -> (case,role,role_attr,hipero,sel_prefs) :: l)) | |
370 | 376 | (* "przeciwko","dat","Dat"; |
371 | 377 | "przeciw","dat","Dat"; |
372 | 378 | "o","acc","Theme"; |
... | ... | @@ -374,14 +380,7 @@ let prep_roles = Xlist.fold [ (* lemma,case,role,role_attr,meaning/hipero,sel_pr |
374 | 380 | "według","gen","Manr"; |
375 | 381 | "wobec","gen","Dat";*) |
376 | 382 | |
377 | -let assign_prep_semantics lemma cases t = | |
378 | - try | |
379 | - let map = StringMap.find prep_roles lemma in | |
380 | - let l = List.flatten (Xlist.map cases (fun case -> | |
381 | - try StringMap.find map case with Not_found -> [])) in | |
382 | - if l = [] then Normal else PrepSemantics l | |
383 | - with Not_found -> Normal | |
384 | - | |
383 | +(* | |
385 | 384 | let subst_special_lexemes = Xlist.fold [ |
386 | 385 | "jutro", ["indexical"];(*"dzień"*) |
387 | 386 | "pojutrze", ["indexical"];(*"dzień"*) |
... | ... | @@ -553,3 +552,4 @@ let assign_semantics tokens lex_sems group = |
553 | 552 | {t with semantics=assign_prep_semantics lemma (StringSet.to_list cases) t} |
554 | 553 | | _ -> t in |
555 | 554 | ExtArray.set lex_sems id t) |
555 | +*) | |
... | ... |
lexSemantics/ENIAMlexSemanticsHTMLof.ml
... | ... | @@ -63,10 +63,10 @@ let html_of_lex_sems tokens lex_sems = |
63 | 63 | let schemata = Xlist.map t.schemata (fun (selectors,l) -> |
64 | 64 | "  [" ^ ENIAMcategoriesPL.string_of_selectors selectors ^ "] {" ^ String.concat ", " (Xlist.map l (fun (d,s) -> |
65 | 65 | ENIAM_LCGstringOf.direction d ^ ENIAM_LCGstringOf.grammar_symbol 0 s)) ^ "}") in |
66 | - let frames = Xlist.map t.frames (fun (selectors,meanings,schema) -> | |
66 | + (* let frames = Xlist.map t.frames (fun (selectors,meanings,schema) -> FIXME | |
67 | 67 | "  [" ^ ENIAMcategoriesPL.string_of_selectors selectors ^ "] {" ^ ENIAMwalStringOf.schema schema ^ "} " ^ |
68 | - String.concat ", " (Xlist.map meanings (fun m -> ENIAMwalStringOf.meaning m))) in | |
69 | - (String.concat "<br>\n " ([core] @ schemata @ frames @ lex_entries)) :: l))) ^ | |
68 | + String.concat ", " (Xlist.map meanings (fun m -> ENIAMwalStringOf.meaning m))) in *) | |
69 | + (String.concat "<br>\n " ([core] @ schemata (*@ frames*) @ lex_entries)) :: l))) ^ | |
70 | 70 | "</P>" |
71 | 71 | |
72 | 72 | (* schemata: ((ENIAM_LCGlexiconTypes.selector * ENIAM_LCGlexiconTypes.selector_relation * string list) list * |
... | ... |
lexSemantics/ENIAMlexSemanticsStringOf.ml
... | ... | @@ -27,6 +27,11 @@ let lex_sems t = |
27 | 27 | let t2 = ExtArray.get t id in |
28 | 28 | (Printf.sprintf "%3d %s" id (lex_sem t2)) :: l)))*) |
29 | 29 | |
30 | +let arole f = | |
31 | + (if f.arole = "" then "" else "," ^ f.arole) ^ | |
32 | + (if f.arole_attr = "" then "" else "," ^ f.arole_attr) ^ | |
33 | + (if f.arev then ",rev" else "") | |
34 | + | |
30 | 35 | let string_of_lex_sems tokens lex_sems = |
31 | 36 | String.concat "\n" (List.rev (Int.fold 0 (ExtArray.size lex_sems - 1) [] (fun l id -> |
32 | 37 | let t = ExtArray.get lex_sems id in |
... | ... | @@ -35,18 +40,22 @@ let string_of_lex_sems tokens lex_sems = |
35 | 40 | let lemma = ENIAMtokens.string_of_token t2.ENIAMtokenizerTypes.token in |
36 | 41 | let core = Printf.sprintf "%3d %s %s" id orth lemma in |
37 | 42 | let lex_entries = Xlist.map t.lex_entries (fun (selectors,s) -> |
38 | - "[" ^ ENIAMcategoriesPL.string_of_selectors selectors ^ "] " ^ ENIAM_LCGstringOf.grammar_symbol 0 s) in | |
43 | + "&[" ^ ENIAMcategoriesPL.string_of_selectors selectors ^ "] " ^ ENIAM_LCGstringOf.grammar_symbol 0 s) in | |
39 | 44 | let schemata = Xlist.map t.schemata (fun (selectors,l) -> |
40 | 45 | "[" ^ ENIAMcategoriesPL.string_of_selectors selectors ^ "] {" ^ String.concat "," (Xlist.map l (fun (d,s) -> |
41 | 46 | ENIAM_LCGstringOf.direction d ^ ENIAM_LCGstringOf.grammar_symbol 0 s)) ^ "}") in |
42 | - let frames = Xlist.map t.frames (fun (selectors,meanings,schema) -> | |
43 | - "[" ^ ENIAMcategoriesPL.string_of_selectors selectors ^ "] {" ^ ENIAMwalStringOf.schema schema ^ "} " ^ | |
47 | +(* let frames = Xlist.map t.frames (fun (selectors,meanings,schema) -> | |
48 | + "*[" ^ ENIAMcategoriesPL.string_of_selectors selectors ^ "] {" ^ ENIAMwalStringOf.schema schema ^ "} " ^ | |
44 | 49 | String.concat "," (Xlist.map meanings (fun m -> ENIAMwalStringOf.meaning m))) in |
45 | - (String.concat "\n " ([core] @ schemata @ frames @ lex_entries)) :: l))) | |
50 | + let senses = Xlist.map t.senses (fun (sense,hipero,weight) -> | |
51 | + Printf.sprintf "%s[%s]%.2f" sense (String.concat "," (Xlist.map hipero (fun (s,n) -> s ^ " " ^ string_of_int n))) weight) in*) | |
52 | + let frames = Xlist.map t.frames (fun f -> | |
53 | + "*" ^ arole f ^ "[" ^ ENIAMcategoriesPL.string_of_selectors f.selectors ^ "] {" ^ ENIAMwalStringOf.schema f.positions ^ "} " ^ | |
54 | + String.concat "," (Xlist.map f.meanings (fun (sense,hipero,weight) -> | |
55 | + Printf.sprintf "%s[%s]%.2f" sense (String.concat "," (Xlist.map hipero (fun (s,n) -> s ^ " " ^ string_of_int n))) weight))) in | |
56 | + (String.concat "\n " ([core] @ (*senses @*) schemata @ frames @ lex_entries)) :: l))) | |
46 | 57 | (* let lroles = if snd t.lroles = "" then fst t.lroles else fst t.lroles ^ " " ^ snd t.lroles in |
47 | 58 | let core = Printf.sprintf "%3d %s %s %s" id orth lemma lroles in |
48 | - let senses = Xlist.map t.senses (fun (sense,hipero,weight) -> | |
49 | - Printf.sprintf "%s[%s]%.2f" sense (String.concat "," hipero) weight) in | |
50 | 59 | let valence = Xlist.map t.valence (ENIAMwalStringOf.fnum_frame "") in |
51 | 60 | let simple_valence = Xlist.map t.simple_valence (ENIAMwalStringOf.fnum_frame "") in |
52 | 61 | (* let semantics = *) |
... | ... |
lexSemantics/ENIAMlexSemanticsTypes.ml
... | ... | @@ -36,20 +36,33 @@ type semantics = |
36 | 36 | | SpecialMod of string * (type_arg list * type_term)*) |
37 | 37 | | PrepSemantics of (string * string * string * StringSet.t * string list) list (* case,role,role_attr,hipero,sel_prefs *) |
38 | 38 | |
39 | +type frame = { | |
40 | + selectors: (ENIAM_LCGlexiconTypes.selector * ENIAM_LCGlexiconTypes.selector_relation * string list) list; | |
41 | + meanings: ((*ENIAMwalTypes.meaning **) string * (string * int) list * float) list; | |
42 | + positions: ENIAMwalTypes.position list; | |
43 | + arole: string; | |
44 | + arole_attr: string; | |
45 | + arev: bool; | |
46 | + sopinion: ENIAMwalTypes.opinion; | |
47 | + fopinion: ENIAMwalTypes.opinion; | |
48 | + } | |
49 | + | |
50 | +let empty_frame = {selectors=[]; meanings=[]; positions=[]; arole=""; arole_attr=""; arev=false; | |
51 | + sopinion=ENIAMwalTypes.Nieokreslony; fopinion=ENIAMwalTypes.Nieokreslony} | |
52 | + | |
39 | 53 | type lex_sem = { |
40 | 54 | schemata: ((ENIAM_LCGlexiconTypes.selector * ENIAM_LCGlexiconTypes.selector_relation * string list) list * |
41 | 55 | (ENIAM_LCGtypes.direction * ENIAM_LCGtypes.grammar_symbol) list) list; |
42 | 56 | lex_entries: ((ENIAM_LCGlexiconTypes.selector * ENIAM_LCGlexiconTypes.selector_relation * string list) list * |
43 | 57 | ENIAM_LCGtypes.grammar_symbol) list; |
44 | - frames: ((ENIAM_LCGlexiconTypes.selector * ENIAM_LCGlexiconTypes.selector_relation * string list) list * | |
45 | - ENIAMwalTypes.meaning list * ENIAMwalTypes.position list) list; | |
58 | + frames: frame list; | |
46 | 59 | cats: string list; |
47 | 60 | (* e: labels; *) |
48 | 61 | (* valence: (int * ENIAMwalTypes.frame) list; |
49 | 62 | simple_valence: (int * ENIAMwalTypes.frame) list; |
50 | 63 | very_simple_valence: ((ENIAM_LCGgrammarPLtypes.cat * ENIAM_LCGgrammarPLtypes.selector_relation * string list) list * ENIAM_LCGtypes.grammar_symbol) list; *) |
51 | - senses: (string * string list * float) list; | |
52 | - lroles: string * string; | |
64 | + (* senses: (string * (string * int) list * float) list; *) | |
65 | + (* lroles: string * string; *) | |
53 | 66 | semantics: semantics; |
54 | 67 | } |
55 | 68 | |
... | ... | @@ -63,8 +76,8 @@ type lex_sem = { |
63 | 76 | |
64 | 77 | let empty_lex_sem = { |
65 | 78 | schemata=[]; lex_entries=[]; frames=[]; cats=["X"]; |
66 | - (*e=empty_labels;*) (*valence=[]; simple_valence=[]; very_simple_valence=[];*) senses=[]; | |
67 | - lroles="",""; semantics=Normal} | |
79 | + (*e=empty_labels;*) (*valence=[]; simple_valence=[]; very_simple_valence=[];*) (*senses=[];*) | |
80 | + (*lroles="","";*) semantics=Normal} | |
68 | 81 | |
69 | 82 | (* FIXME: poprawić katalog *) |
70 | 83 | (*let subst_uncountable_lexemes_filename = resource_path ^ "/lexSemantics/subst_uncountable.dat" |
... | ... | @@ -77,3 +90,13 @@ let subst_uncountable_lexemes_filename2 = resource_path ^ "/Walenty/subst_uncoun |
77 | 90 | let subst_container_lexemes_filename = resource_path ^ "/Walenty/subst_container.dat" |
78 | 91 | let subst_numeral_lexemes_filename = resource_path ^ "/Walenty/subst_numeral.dat" |
79 | 92 | let subst_time_lexemes_filename = resource_path ^ "/Walenty/subst_time.dat" *) |
93 | + | |
94 | +let hipero_threshold = 3 | |
95 | +let unknown_meaning_weight = -1. | |
96 | + | |
97 | +let lu_filename = resource_path ^ "/plWordnet/lu.tab" | |
98 | +let ex_hipo_filename = resource_path ^ "/plWordnet/ex_hipo.tab" | |
99 | +let syn_filename = resource_path ^ "/plWordnet/syn.tab" | |
100 | + | |
101 | +let predef_filename = resource_path ^ "/lexSemantics/predef_prefs.tab" | |
102 | +let proper_classes_filename = resource_path ^ "/lexSemantics/proper_classes.tab" | |
... | ... |
lexSemantics/ENIAMplWordnet.ml
0 → 100644
1 | +(* | |
2 | + * ENIAMlexSemantics is a library that assigns tokens with lexicosemantic information. | |
3 | + * Copyright (C) 2016-2017 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl> | |
4 | + * Copyright (C) 2016-2017 Institute of Computer Science Polish Academy of Sciences | |
5 | + * | |
6 | + * This library is free software: you can redistribute it and/or modify | |
7 | + * it under the terms of the GNU Lesser General Public License as published by | |
8 | + * the Free Software Foundation, either version 3 of the License, or | |
9 | + * (at your option) any later version. | |
10 | + * | |
11 | + * This library is distributed in the hope that it will be useful, | |
12 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | + * GNU Lesser General Public License for more details. | |
15 | + * | |
16 | + * You should have received a copy of the GNU Lesser General Public License | |
17 | + * along with this program. If not, see <http://www.gnu.org/licenses/>. | |
18 | + *) | |
19 | + | |
20 | +open Xstd | |
21 | +open ENIAMlexSemanticsTypes | |
22 | + | |
23 | +let lu_names = ref IntMap.empty | |
24 | +let lumap = ref StringMap.empty | |
25 | +let synmap = ref IntMap.empty | |
26 | +let ex_hipo = ref IntMap.empty | |
27 | +let predef_names = ref IntMap.empty | |
28 | +let proper_classes = ref StringMap.empty | |
29 | +let predef = ref StringMap.empty | |
30 | + | |
31 | +let load_lu filename = | |
32 | + File.fold_tab filename (IntMap.empty,StringMap.empty) (fun (lu_names,lumap) -> function | |
33 | + [lu_id; lemma; variant; syn_id] -> | |
34 | + let v = variant,int_of_string syn_id in | |
35 | + IntMap.add lu_names (int_of_string lu_id) (lemma,variant,int_of_string syn_id), | |
36 | + StringMap.add_inc lumap lemma [v] (fun l -> v :: l) | |
37 | + | l -> failwith ("load_lu: " ^ String.concat "\t" l)) | |
38 | + | |
39 | +let load_syn filename = | |
40 | + File.fold_tab filename IntMap.empty (fun synmap -> function | |
41 | + syn_id :: pos :: lu_ids -> | |
42 | + let lu_ids = Xlist.map lu_ids int_of_string in | |
43 | + IntMap.add synmap (int_of_string syn_id) (pos,lu_ids) | |
44 | + | l -> failwith ("load_syn: " ^ String.concat "\t" l)) | |
45 | + | |
46 | +let load_ex_hipo filename = | |
47 | + File.fold_tab filename IntMap.empty (fun ex_hipo -> function | |
48 | + [parent; child; cost] -> | |
49 | + let parent = int_of_string parent in | |
50 | + let child = int_of_string child in | |
51 | + let cost = int_of_string cost in | |
52 | + let children = try IntMap.find ex_hipo parent with Not_found -> IntMap.empty in | |
53 | + let children = IntMap.add_inc children child cost (fun _ -> failwith "load_ex_hipo") in | |
54 | + IntMap.add ex_hipo parent children | |
55 | + | l -> failwith ("load_ex_hipo: " ^ String.concat "\t" l)) | |
56 | + | |
57 | +let syn_id_of_sense sense = | |
58 | + let lemma,variant = | |
59 | + match List.rev (Xstring.split " " sense) with | |
60 | + variant :: l -> String.concat " " (List.rev l), variant | |
61 | + | _ -> failwith "syn_id_of_sense 1" in | |
62 | + let l = Xlist.fold (try StringMap.find !lumap lemma with Not_found -> failwith ("syn_id_of_sense: " ^ lemma)) [] (fun l (variant2,syn_id) -> | |
63 | + if variant = variant2 then syn_id :: l else l) in | |
64 | + match l with | |
65 | + [syn_id] -> syn_id | |
66 | + | _ -> failwith ("syn_id_of_sense 2: " ^ lemma) | |
67 | + | |
68 | +let load_predef ex_hipo filename = | |
69 | + let ex_hipo,predef_names,predef,_ = | |
70 | + File.fold_tab filename (ex_hipo,IntMap.empty,StringMap.empty,-1) (fun (ex_hipo,predef_names,predef,id) -> function | |
71 | + name :: senses -> | |
72 | + let ex_hipo = Xlist.fold senses ex_hipo (fun ex_hipo sense -> | |
73 | + let hipo_id = try StringMap.find predef sense with Not_found -> syn_id_of_sense sense in | |
74 | + let children = try IntMap.find ex_hipo hipo_id with Not_found -> IntMap.empty in | |
75 | + let children = IntMap.add_inc children id 0 (fun _ -> failwith "load_predef 1") in | |
76 | + IntMap.add ex_hipo hipo_id children) in | |
77 | + let predef_names = IntMap.add predef_names id name in | |
78 | + let predef = StringMap.add_inc predef name id (fun _ -> failwith "load_predef 2") in | |
79 | + ex_hipo, predef_names, predef, id-1 | |
80 | + | l -> failwith ("load_predef: " ^ String.concat "\t" l)) in | |
81 | + ex_hipo,predef_names,predef | |
82 | + | |
83 | +let rec get_hipero_rec found ex_hipo id cost = | |
84 | + let cost2 = try IntMap.find found id with Not_found -> max_int in | |
85 | + if cost2 <= cost || cost > hipero_threshold then found else | |
86 | + let found = IntMap.add found id cost in | |
87 | + let map = try IntMap.find ex_hipo id with Not_found -> IntMap.empty in | |
88 | + IntMap.fold map found (fun found id2 cost2 -> | |
89 | + get_hipero_rec found ex_hipo id2 (cost + cost2)) | |
90 | + | |
91 | +let get_hipero syn_id = | |
92 | + get_hipero_rec IntMap.empty !ex_hipo syn_id 0 | |
93 | + | |
94 | +exception SynsetNotFound | |
95 | + | |
96 | +let synset_name syn_id = | |
97 | + if IntMap.mem !predef_names syn_id then IntMap.find !predef_names syn_id else | |
98 | + let lemma,variant,_ = | |
99 | + try IntMap.find !lu_names (List.hd (snd (IntMap.find !synmap syn_id))) | |
100 | + with Not_found -> raise SynsetNotFound (*failwith ("synset_name: " ^ string_of_int syn_id)*) in | |
101 | + lemma ^ "-" ^ variant | |
102 | + | |
103 | +let load_proper_classes filename = | |
104 | + File.fold_tab filename StringMap.empty (fun map -> function | |
105 | + id :: senses -> | |
106 | + let senses = Xlist.map senses (fun sense -> | |
107 | + match List.rev (Str.split (Str.regexp " ") sense) with | |
108 | + weight :: l -> String.concat " " (List.rev l), (try float_of_string weight with _ -> failwith "load_proper_classes 2") | |
109 | + | _ -> failwith "load_proper_classes 4") in | |
110 | + let senses = Xlist.map senses (fun (sense,weight) -> | |
111 | + (* let sense = if sense = "antroponim 1" then "nazwa własna 1" else sense in | |
112 | + let sense = if sense = "godzina 4" then "godzina 3" else sense in *) | |
113 | +(* print_endline sense; *) | |
114 | + let syn_id = syn_id_of_sense sense in | |
115 | + sense,IntMap.fold (get_hipero syn_id) [] (fun hipero syn_id cost -> (synset_name syn_id, cost) :: hipero),weight) in | |
116 | + StringMap.add_inc map id senses (fun _ -> failwith ("load_proper_classes 3: " ^ id)) | |
117 | + | l -> failwith ("load_proper_classes: " ^ String.concat "\t" l)) | |
118 | + | |
119 | +let simplify_pos = function | |
120 | + "subst" -> "noun" | |
121 | + | "depr" -> "noun" | |
122 | + | "adj" -> "adj" | |
123 | + | "adja" -> "adj" | |
124 | + | "adjc" -> "adj" | |
125 | + | "adjp" -> "adj" | |
126 | + | "ger" -> "verb" | |
127 | + | "pact" -> "verb" | |
128 | + | "ppas" -> "verb" | |
129 | + | "fin" -> "verb" | |
130 | + | "bedzie" -> "verb" | |
131 | + | "praet" -> "verb" | |
132 | + | "winien" -> "verb" | |
133 | + | "impt" -> "verb" | |
134 | + | "imps" -> "verb" | |
135 | + | "inf" -> "verb" | |
136 | + | "pcon" -> "verb" | |
137 | + | "pant" -> "verb" | |
138 | + | "pred" -> "verb" | |
139 | + | s -> s | |
140 | + | |
141 | +let find_senses lemma pos = | |
142 | +(*if pos = "ppron12" || pos = "ppron3" || pos = "siebie" then {t with senses=[lemma,["0"],0.]} else*) (* FIXME: ustalić co z zaimkami *) | |
143 | + let l = try StringMap.find !lumap lemma with Not_found -> [] in | |
144 | + let pos = simplify_pos pos in | |
145 | + Xlist.fold l [] (fun l (variant,syn_id) -> | |
146 | + let pos2,_ = try IntMap.find !synmap syn_id with Not_found -> failwith "find_senses" in | |
147 | + if pos <> pos2 then l else | |
148 | + (lemma ^ "-" ^ variant, | |
149 | + IntMap.fold (get_hipero syn_id) [] (fun hipero syn_id cost -> (synset_name syn_id, cost) :: hipero), | |
150 | + log10 (1. /. (try float_of_string variant with _ -> 3.))) :: l) | |
151 | + | |
152 | +let find_proper_senses senses = | |
153 | + List.flatten (Xlist.rev_map senses (fun sense -> | |
154 | + try StringMap.find !proper_classes sense with Not_found -> failwith ("find_proper_senses: " ^ sense))) | |
155 | + | |
156 | +let find_meaning lu_id = | |
157 | + let lemma,variant,syn_id = IntMap.find !lu_names lu_id in | |
158 | + lemma ^ "-" ^ variant, | |
159 | + IntMap.fold (get_hipero syn_id) [] (fun hipero syn_id cost -> (synset_name syn_id, cost) :: hipero), | |
160 | + log10 (1. /. (try float_of_string variant with _ -> 3.)) | |
161 | + | |
162 | +let initialize () = | |
163 | + let a,b = load_lu lu_filename in | |
164 | + lu_names := a; | |
165 | + lumap := b; | |
166 | + synmap := load_syn syn_filename; | |
167 | + ex_hipo := load_ex_hipo ex_hipo_filename; | |
168 | + let a,b,c = load_predef !ex_hipo predef_filename in | |
169 | + ex_hipo := a; | |
170 | + predef_names := b; | |
171 | + predef := c; | |
172 | + proper_classes := load_proper_classes proper_classes_filename; | |
173 | + () | |
... | ... |
lexSemantics/ENIAMvalence.ml
... | ... | @@ -613,3 +613,34 @@ let get_default_valence = function |
613 | 613 | | "adj" -> [Nieokreslony,NegationUndef,PredFalse,AspectUndef,[]] |
614 | 614 | | "adv" -> [Nieokreslony,NegationUndef,PredFalse,AspectUndef,[]] |
615 | 615 | | _ -> [] |
616 | + | |
617 | +open ENIAMcategoriesPL | |
618 | + | |
619 | +let get_aroles schema lemma = function | |
620 | + "pact" -> [Xlist.fold schema ([],"Arg","",true) (fun (sel,arole,arole_attr,arev) p -> | |
621 | + if p.gf = SUBJ then sel,p.role,p.role_attr,arev else sel,arole,arole_attr,arev)] | |
622 | + | "ppas" -> [Xlist.fold schema ([],"Arg","",true) (fun (sel,arole,arole_attr,arev) p -> | |
623 | + if p.gf = OBJ then sel,p.role,p.role_attr,arev else sel,arole,arole_attr,arev)] | |
624 | + | "subst" -> [ | |
625 | + [Case,Eq,["dat"]],"Recipent","",false; | |
626 | + [Case,Eq,["inst"]],(if StringSet.mem ENIAMlexSemanticsData.subst_inst_time lemma then "Time" else "Instrument"),"",false; | |
627 | + [Case,Neq,["dat";"inst"]],"","",false] | |
628 | + | "adj" | "adjc" | "adjp" -> (* FIXME czy adjc i adjp mogą być adjunctami? *) | |
629 | + let l = try StringMap.find ENIAMlexSemanticsData.adj_roles lemma with Not_found -> ["Attribute",""] in | |
630 | + Xlist.map l (fun (role,role_attr) -> [],role,role_attr,false) | |
631 | + | "adv" -> | |
632 | + let modes = ENIAMcategoriesPL.adv_mode lemma in | |
633 | + let roles = try StringMap.find ENIAMlexSemanticsData.adv_roles lemma with Not_found -> ["Manner",""] in | |
634 | + Xlist.fold modes [] (fun l -> function | |
635 | + "mod" -> Xlist.fold roles l (fun l (role,role_attr) -> ([Mode,Eq,["mod"]],role,role_attr,false) :: l) | |
636 | + | "abl" -> ([Mode,Eq,["abl"]],"Location","Souce",false) :: l | |
637 | + | "adl" -> ([Mode,Eq,["adl"]],"Location","Goal",false) :: l | |
638 | + | "locat" -> ([Mode,Eq,["locat"]],"Location","",false) :: l | |
639 | + | "perl" -> ([Mode,Eq,["perl"]],"Path","",false) :: l | |
640 | + | "dur" -> ([Mode,Eq,["dur"]],"Duration","",false) :: l | |
641 | + | "temp" -> ([Mode,Eq,["temp"]],"Time","",false) :: l | |
642 | + | _ -> failwith "get_aroles") | |
643 | + | "qub" -> | |
644 | + let l = try StringMap.find ENIAMlexSemanticsData.qub_roles lemma with Not_found -> ["Arg",""] in | |
645 | + Xlist.map l (fun (role,role_attr) -> [],role,role_attr,false) | |
646 | + | _ -> [[],"","",false] | |
... | ... |
lexSemantics/ENIAMwalFrames.ml deleted
1 | -(* | |
2 | - * ENIAMlexSemantics is a library that assigns tokens with lexicosemantic information. | |
3 | - * Copyright (C) 2016-2017 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl> | |
4 | - * Copyright (C) 2016-2017 Institute of Computer Science Polish Academy of Sciences | |
5 | - * | |
6 | - * This library is free software: you can redistribute it and/or modify | |
7 | - * it under the terms of the GNU Lesser General Public License as published by | |
8 | - * the Free Software Foundation, either version 3 of the License, or | |
9 | - * (at your option) any later version. | |
10 | - * | |
11 | - * This library is distributed in the hope that it will be useful, | |
12 | - * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | - * GNU Lesser General Public License for more details. | |
15 | - * | |
16 | - * You should have received a copy of the GNU Lesser General Public License | |
17 | - * along with this program. If not, see <http://www.gnu.org/licenses/>. | |
18 | - *) | |
19 | - | |
20 | -open ENIAMwalTypes | |
21 | -open Xstd | |
22 | - | |
23 | -let expands,compreps,comprep_reqs,subtypes,equivs = ENIAMwalParser.load_realizations () | |
24 | -(*let verb_frames = ENIAMwalParser.load_frames (Paths.walenty_path ^ Paths.verb_filename) | |
25 | -let noun_frames = ENIAMwalParser.load_frames (Paths.walenty_path ^ Paths.noun_filename) | |
26 | -let adj_frames = ENIAMwalParser.load_frames (Paths.walenty_path ^ Paths.adj_filename) | |
27 | -let adv_frames = ENIAMwalParser.load_frames (Paths.walenty_path ^ Paths.adv_filename) *) | |
28 | - | |
29 | -let walenty = (*StringMap.empty*)ENIAMwalTEI.load_walenty2 () | |
30 | - | |
31 | -(*let _ = StringMap.iter walenty (fun pos map -> | |
32 | - StringMap.iter map (fun lexeme frames -> | |
33 | - Printf.printf "%s %s %d\n%!" pos lexeme (Xlist.size frames)))*) | |
34 | - | |
35 | -(*let all_frames = | |
36 | - ["subst",noun_frames; | |
37 | - "adj",adj_frames; | |
38 | - "adv",adv_frames; | |
39 | - "ger",verb_frames; | |
40 | - "pact",verb_frames; | |
41 | - "ppas",verb_frames; | |
42 | - "fin",verb_frames; | |
43 | - "praet",verb_frames; | |
44 | - "impt",verb_frames; | |
45 | - "imps",verb_frames; | |
46 | - "inf",verb_frames; | |
47 | - "pcon",verb_frames]*) | |
48 | - | |
49 | -let rec get_role_and_sense = function | |
50 | - Phrase(Lex "się") -> "Theme","", [] | |
51 | - | PhraseAbbr(Xp "abl",_) -> "Location","Source", [] | |
52 | - | PhraseAbbr(Xp "adl",_) -> "Location","Goal", [] | |
53 | - | PhraseAbbr(Xp "caus",_) -> "Condition","", [] | |
54 | - | PhraseAbbr(Xp "dest",_) -> "Purpose","", [] | |
55 | - | PhraseAbbr(Xp "dur",_) -> "Duration","", [] | |
56 | - | PhraseAbbr(Xp "instr",_) -> "Instrument","", [] | |
57 | - | PhraseAbbr(Xp "locat",_) -> "Location","", [] | |
58 | - | PhraseAbbr(Xp "mod",_) -> "Manner","", [] | |
59 | - | PhraseAbbr(Xp "perl",_) -> "Path","", [] | |
60 | - | PhraseAbbr(Xp "temp",_) -> "Time","", [] | |
61 | - | PhraseAbbr(Advp "abl",_) -> "Location","Source", [] | |
62 | - | PhraseAbbr(Advp "adl",_) -> "Location","Goal", [] | |
63 | - | PhraseAbbr(Advp "dur",_) -> "Duration","", [] | |
64 | - | PhraseAbbr(Advp "locat",_) -> "Location","", [] | |
65 | - | PhraseAbbr(Advp "mod",_) -> "Manner","", [] | |
66 | - | PhraseAbbr(Advp "perl",_) -> "Path","", [] | |
67 | - | PhraseAbbr(Advp "temp",_) -> "Time","", [] | |
68 | -(* | PhraseAbbr(Advp "pron",_) -> "Arg","", [] | |
69 | - | PhraseAbbr(Advp "misc",_) -> "Arg","", []*) | |
70 | - | PhraseAbbr(Distrp,_) -> "Distributive","", [] (* FIXME: to jest kwantyfikator *) | |
71 | - | PhraseAbbr(Possp,_) -> "Possesive","", [] | |
72 | - | LexPhraseMode("abl",_,_) -> "Location","Source", [] | |
73 | - | LexPhraseMode("adl",_,_) -> "Location","Goal", [] | |
74 | - | LexPhraseMode("caus",_,_) -> "Condition","", [] | |
75 | - | LexPhraseMode("dest",_,_) -> "Purpose","", [] | |
76 | - | LexPhraseMode("dur",_,_) -> "Duration","", [] | |
77 | - | LexPhraseMode("instr",_,_) -> "Instrument","", [] | |
78 | - | LexPhraseMode("locat",_,_) -> "Location","", [] | |
79 | - | LexPhraseMode("mod",_,_) -> "Manner","", [] | |
80 | - | LexPhraseMode("perl",_,_) -> "Path","", [] | |
81 | - | LexPhraseMode("temp",_,_) -> "Time","", [] | |
82 | - | _ -> "Arg","", [] | |
83 | - | |
84 | - | |
85 | -(*let rec get_gf_role = function | |
86 | - [],Phrase(NP case) -> "C", "", ["T"] | |
87 | - | [],Phrase(AdjP case) -> "R", "", ["T"] | |
88 | - | [],Phrase(NumP(case,_)) -> "C", "", ["T"] | |
89 | - | [],Phrase(PrepNP _) -> "C", "", ["T"] | |
90 | - | [],Phrase(PrepAdjP _) -> "C", "", ["T"] | |
91 | - | [],Phrase(PrepNumP _) -> "C", "", ["T"] | |
92 | - | [],Phrase(ComprepNP _) -> "C", "", ["T"] | |
93 | - | [],Phrase(ComparP _) -> "C", "", ["T"] | |
94 | - | [],Phrase(CP _) -> "C", "", ["T"] | |
95 | - | [],Phrase(NCP(case,_,_)) -> "C", "", ["T"] | |
96 | - | [],Phrase(PrepNCP _) -> "C", "", ["T"] | |
97 | - | [],Phrase(InfP _) -> "C", "", ["T"] | |
98 | - | [],Phrase(FixedP _) -> "C", "", ["T"] | |
99 | - | [],Phrase Or -> "C", "", ["T"] (* FIXME: zbadać w walentym faktyczne użycia or, bo to nie tylko zdania, ale też np(nom) w cudzysłowach *) | |
100 | - | [],Phrase(Lex "się") -> "C", "Ptnt", ["T"] | |
101 | - | [],PhraseAbbr(Xp mode,_) -> "C", mode, ["T"] | |
102 | - | [],PhraseAbbr(Advp "pron",_) -> "R", "", ["T"] | |
103 | - | [],PhraseAbbr(Advp "misc",_) -> "R", "", ["T"] | |
104 | - | [],PhraseAbbr(Advp mode,_) -> "C", mode, ["T"] | |
105 | - | [],PhraseAbbr(Nonch,_) -> "C", "", ["T"] | |
106 | - | [],PhraseAbbr(Distrp,_) -> "C", "Distr", ["T"] | |
107 | - | [],PhraseAbbr(Possp,_) -> "C", "Poss", ["T"] | |
108 | - | [],LexPhraseMode(mode,_,_) -> "C", mode, ["T"] | |
109 | - | [],LexPhrase((SUBST(_,case),_) :: _,_) -> "C", "", ["T"] | |
110 | - | [],LexPhrase((PREP _,_) :: _,_) -> "C", "", ["T"] | |
111 | - | [],LexPhrase((NUM(case,_,_),_) :: _,_) -> "C", "", ["T"] | |
112 | - | [],LexPhrase((ADJ(_,case,_,_),_) :: _,_) -> "C", "", ["T"] | |
113 | - | [],LexPhrase((ADV _,_) :: _,_) -> "C", "", ["T"] | |
114 | - | [],LexPhrase((GER(_,case,_,_,_,_),_) :: _,_) -> "C", "", ["T"] | |
115 | - | [],LexPhrase((PACT(_,case,_,_,_,_),_) :: _,_) -> "C", "", ["T"] | |
116 | - | [],LexPhrase((PPAS(_,case,_,_,_),_) :: _,_) -> "C", "", ["T"] | |
117 | - | [],LexPhrase((INF _,_) :: _,_) -> "C", "", ["T"] | |
118 | - | [],LexPhrase((QUB,_) :: _,_) -> "C", "", ["T"] | |
119 | - | [],LexPhrase((COMPAR,_) :: _,_) -> "C", "", ["T"] | |
120 | - | [],LexPhrase((COMP _,_) :: _,_) -> "C", "", ["T"] | |
121 | - | [],morf -> print_endline(*failwith*) ("get_gf: []," ^ ENIAMwalStringOf.morf morf);"","",[] | |
122 | - | _,Phrase(InfP _) -> "X", "", ["T"] | |
123 | - | _,Phrase(CP _) -> "X", "", ["T"] (* zwykle możliwa koordynacja z infp *) | |
124 | - | _,Phrase _ -> "X", "", ["T"] | |
125 | - | _,PhraseAbbr _ -> "X", "", ["T"] | |
126 | - | _,LexPhraseMode _ -> "X", "", ["T"] | |
127 | - | _,LexPhrase((INF _,_) :: _,_) -> "X", "", ["T"] | |
128 | - | _,LexPhrase _ -> "X", "", ["T"] | |
129 | - | _,morf -> failwith ("get_gf: _," ^ ENIAMwalStringOf.morf morf)*) | |
130 | - | |
131 | -(*let gf_rank = Xlist.fold [ | |
132 | - "",1; | |
133 | - ] StringMap.empty (fun gf_rank (gf,v) -> StringMap.add gf_rank gf v)*) | |
134 | - | |
135 | -(*let agregate_gfs s gfs_roles = | |
136 | -(* fst (Xlist.fold gfs ("",0) (fun (best_gf,best_rank) gf -> | |
137 | - let rank = try StringMap.find gf_rank gf with Not_found -> failwith ("agregate_gfs: " ^ gf) in | |
138 | - if rank > best_rank then gf, rank else best_gf, best_rank))*) | |
139 | -(* let gfs,roles = List.split gfs_roles in | |
140 | - let gfs = StringSet.to_list (Xlist.fold gfs StringSet.empty StringSet.add) in | |
141 | - if Xlist.size gfs > 1 then print_endline ("agregate_gfs: " ^ String.concat " " gfs); | |
142 | - if Xlist.size roles > 1 then print_endline ("agregate_gfs: " ^ String.concat " " roles);*) | |
143 | - let gf,role,prefs = List.hd gfs_roles in | |
144 | - {s with gf=gf; role=role; prefs=prefs} | |
145 | - | |
146 | -let rec make_gfs schema = | |
147 | - let schema = Xlist.map schema (function | |
148 | - {gf="subj"} as s -> {s with gf="SUBJ"; role="Agnt"; prefs=["T"]; morfs=make_gfs_morfs s.morfs} | |
149 | - | {gf="obj"} as s -> {s with gf="OBJ"; role="Ptnt"; prefs=["T"]; morfs=make_gfs_morfs s.morfs} | |
150 | - | {gf=""} as s -> agregate_gfs {s with morfs=make_gfs_morfs s.morfs} (Xlist.map s.morfs (fun morf -> get_gf_role (s.ce,morf))) | |
151 | - | {gf=t} -> failwith ("make_gfs: " ^ t)) in | |
152 | -(* let schema = List.rev (fst (Xlist.fold schema ([],StringMap.empty) (fun (schema,map) s -> | |
153 | - try | |
154 | - let n = StringMap.find map s.gf in | |
155 | - {s with gf=s.gf ^ string_of_int (n+1)} :: schema, | |
156 | - StringMap.add map s.gf (n+1) | |
157 | - with Not_found -> | |
158 | - s :: schema, StringMap.add map s.gf 1))) in*) | |
159 | - schema | |
160 | - | |
161 | -and make_gfs_morfs morfs = | |
162 | - List.flatten (Xlist.map morfs (function | |
163 | - Phrase _ as morf -> [morf] | |
164 | - | PhraseAbbr(Advp _,[]) -> [Phrase AdvP] | |
165 | - | PhraseAbbr(_,[]) -> failwith "make_gfs_morfs" | |
166 | - | PhraseAbbr(_,morfs) -> make_gfs_morfs morfs | |
167 | - | LexPhrase(pos_lex,(restr,schema)) -> [LexPhrase(pos_lex,(restr,make_gfs schema))] | |
168 | - | LexPhraseMode(_,pos_lex,(restr,schema)) -> [LexPhrase(pos_lex,(restr,make_gfs schema))] | |
169 | - | _ -> failwith "make_gfs_morfs"))*) | |
170 | - | |
171 | -let mark_nosem_morfs morfs = | |
172 | - Xlist.map morfs (function | |
173 | - Phrase(PrepNP(_,prep,c)) -> Phrase(PrepNP(NoSem,prep,c)) | |
174 | - | Phrase(PrepAdjP(_,prep,c)) -> Phrase(PrepAdjP(NoSem,prep,c)) | |
175 | - | Phrase(PrepNumP(_,prep,c)) -> Phrase(PrepNumP(NoSem,prep,c)) | |
176 | -(* | Phrase(ComprepNP(_,prep)) -> Phrase(ComprepNP(NoSem,prep)) *) (* FIXME: na razie ComprepNP są zawsze semantyczne *) | |
177 | -(* | Phrase(ComparNP(_,prep,c)) -> Phrase(ComparNP(NoSem,prep,c)) (* FIXME: pomijam niesemantyczny compar *) | |
178 | - | Phrase(ComparPP(_,prep)) -> Phrase(ComparPP(NoSem,prep))*) | |
179 | - | Phrase(PrepNCP(_,prep,c,ct,co)) -> Phrase(PrepNCP(NoSem,prep,c,ct,co)) | |
180 | - | t -> t) | |
181 | - | |
182 | - | |
183 | -let agregate_role_and_sense s l = | |
184 | - let roles,senses = Xlist.fold l (StringSet.empty,StringSet.empty) (fun (roles,senses) (role,role_attr,sense) -> | |
185 | - StringSet.add roles (role ^ " " ^ role_attr), | |
186 | - Xlist.fold sense senses StringSet.add) in | |
187 | - let roles = if StringSet.size roles = 1 then roles else StringSet.remove roles "Arg " in | |
188 | - let role,role_attr = | |
189 | - match Str.split (Str.regexp " ") (StringSet.min_elt roles) with | |
190 | - [r;a] -> r,a | |
191 | - | [r] -> r,"" | |
192 | - | _ -> failwith "agregate_role_and_sense" in | |
193 | - {s with role=role; role_attr=role_attr(*; sel_prefs=StringSet.to_list senses*)} | |
194 | - | |
195 | -let rec assign_role_and_sense schema = | |
196 | - Xlist.map schema (function | |
197 | - {gf=SUBJ} as s -> | |
198 | - if s.role = "" then {s with role="Initiator"; sel_prefs=["ALL"]; morfs=assign_role_and_sense_morfs s.morfs} | |
199 | - else {s with morfs=assign_role_and_sense_morfs (mark_nosem_morfs s.morfs)} | |
200 | - | {gf=OBJ} as s -> | |
201 | - if s.role = "" then {s with role="Theme"; sel_prefs=["ALL"]; morfs=assign_role_and_sense_morfs s.morfs} | |
202 | - else {s with morfs=assign_role_and_sense_morfs (mark_nosem_morfs s.morfs)} | |
203 | - | {gf=ARG} as s -> | |
204 | - if s.role = "" then agregate_role_and_sense {s with sel_prefs=["ALL"]; morfs=assign_role_and_sense_morfs s.morfs} | |
205 | - (Xlist.map s.morfs (fun morf -> get_role_and_sense morf)) | |
206 | - else {s with morfs=assign_role_and_sense_morfs (mark_nosem_morfs s.morfs)} | |
207 | - | _ -> failwith "assign_role_and_sense") | |
208 | - | |
209 | -and assign_role_and_sense_morfs morfs = | |
210 | - List.flatten (Xlist.map morfs (function | |
211 | - Phrase _ as morf -> [morf] | |
212 | - | E _ as morf -> [morf] | |
213 | - | PhraseAbbr(Advp _,[]) -> [Phrase AdvP] | |
214 | - | PhraseAbbr(_,[]) -> failwith "assign_role_and_sense_morfs" | |
215 | - | PhraseAbbr(_,morfs) -> assign_role_and_sense_morfs morfs | |
216 | - | LexPhrase(pos_lex,(restr,schema)) -> [LexPhrase(pos_lex,(restr,assign_role_and_sense schema))] | |
217 | - | LexPhraseMode(_,pos_lex,(restr,schema)) -> [LexPhrase(pos_lex,(restr,assign_role_and_sense schema))] | |
218 | - | _ -> failwith "assign_role_and_sense_morfs")) | |
219 | - | |
220 | -(*let _ = | |
221 | - Xlist.iter walenty_filenames (fun filename -> | |
222 | - print_endline filename; | |
223 | - let frames = load_frames (walenty_path ^ filename) in | |
224 | - StringMap.iter frames (fun _ l -> | |
225 | - Xlist.iter l (fun (refl,opinion,negation,pred,aspect,schema) -> | |
226 | - ignore (process_opinion opinion); | |
227 | - ignore (process_negation [Text negation]); | |
228 | - ignore (process_pred [Text pred]); | |
229 | - ignore (process_aspect [Text aspect]); | |
230 | - ignore (assign_pro_args (make_gfs (process_schema expands subtypes equivs schema))))))*) | |
231 | - | |
232 | -let remove_trivial_args schema = | |
233 | - Xlist.fold schema [] (fun l (_,_,_,morfs) -> | |
234 | - let morfs = Xlist.fold morfs [] (fun morfs -> function | |
235 | - Phrase(AdjP _) -> morfs | |
236 | - | Phrase(NP(Case "gen")) -> morfs | |
237 | - | Phrase(NCP(Case "gen",_,_)) -> morfs | |
238 | - | Phrase(PrepNP _) -> morfs | |
239 | - | Phrase(FixedP _) -> morfs | |
240 | - | LexPhrase([ADJ _,_],_) -> morfs | |
241 | - | LexPhrase([PPAS _,_],_) -> morfs | |
242 | - | LexPhrase([PACT _,_],_) -> morfs | |
243 | - | LexPhrase([SUBST(_,Case "gen"),_],_) -> morfs | |
244 | - | LexPhrase([PREP _,_;_],_) -> morfs | |
245 | - | morf -> morf :: morfs) in | |
246 | - if morfs = [] then l else morfs :: l) | |
247 | - | |
248 | -(* leksykalizacje do zmiany struktury | |
249 | -lex([PREP(gen),'z';SUBST(sg,gen),'nazwa'],atr1[OBL{lex([QUB,'tylko'],natr[])}]) | |
250 | -lex([PREP(loc),'na';SUBST(sg,loc),'papier'],atr1[OBL{lex([QUB,'tylko'],natr[])}]) | |
251 | -lex([PREP(acc),'w';SUBST(pl,acc),'oko'],atr1[OBL-MOD{lex([ADV(pos),'prosto'],natr[])}]) | |
252 | -lex([PREP(gen),'z';SUBST(sg,gen),'most'],ratr1[OBL-MOD{lex([ADV(pos),'prosto'],natr[])}]) | |
253 | -lex([PREP(acc),'w';SUBST(pl,acc),'oko'],atr1[OBL-MOD{lex([ADV(pos),'prosto'],natr[])}]) | |
254 | -lex([PREP(gen),'z';SUBST(sg,gen),'most'],ratr1[OBL-MOD{lex([ADV(pos),'prosto'],natr[])}]) | |
255 | -lex([PREP(acc),'w';SUBST(pl,acc),'oko'],atr1[OBL-MOD{lex([ADV(pos),'prosto'],natr[])}]) | |
256 | -lex([PREP(acc),'w';SUBST(pl,acc),'oko'],atr1[OBL-MOD{lex([ADV(pos),'prosto'],natr[])}]) | |
257 | -lex([PREP(acc),'w';SUBST(pl,acc),'oko'],atr1[OBL-MOD{lex([ADV(pos),'prosto'],natr[])}]) | |
258 | -lex([PREP(acc),'w';SUBST(pl,acc),'oko'],atr1[OBL-MOD{lex([ADV(pos),'prosto'],natr[])}]) | |
259 | -lex([PREP(acc),'w';SUBST(pl,acc),'oko'],atr1[OBL-MOD{lex([ADV(pos),'prosto'],natr[])}]) | |
260 | -*) | |
261 | - | |
262 | -let num_arg_schema_field morfs = | |
263 | - {gf=CORE; role="QUANT-ARG"; role_attr=""; sel_prefs=["ALL"]; cr=[]; ce=[]; dir=Forward; morfs=morfs} | |
264 | - | |
265 | -let std_arg_schema_field dir morfs = | |
266 | - {gf=ARG; role="Arg"; role_attr=""; sel_prefs=["ALL"]; cr=[]; ce=[]; dir=dir; morfs=morfs} | |
267 | - | |
268 | -let simple_arg_schema_field morfs = | |
269 | - {gf=ARG; role=""; role_attr=""; sel_prefs=["ALL"]; cr=[]; ce=[]; dir=Both; morfs=morfs} | |
270 | - | |
271 | -let nosem_refl_schema_field = | |
272 | - {gf=NOSEM; role=""; role_attr=""; sel_prefs=["ALL"]; cr=[]; ce=[]; dir=Both; morfs=[Phrase(Lex "się")]} | |
273 | - | |
274 | - | |
275 | -let expand_lexicalizations = function | |
276 | - Frame(atrs,schema) -> Frame(atrs,expand_lexicalizations_schema schema) | |
277 | -(* ComprepFrame(s,morfs) -> ComprepFrame(atrs,expand_lexicalizations_morfs morfs) *) | |
278 | - | _ -> failwith "expand_lexicalizations" | |
279 | - | |
280 | - | |
281 | -let prepare_schema_comprep expands subtypes equivs schema = | |
282 | - assign_pro_args (assign_role_and_sense (ENIAMwalParser.expand_equivs_schema equivs (ENIAMwalParser.expand_subtypes subtypes (ENIAMwalParser.expand_schema expands schema)))) | |
283 | - | |
284 | -let prepare_schema expands subtypes equivs schema = | |
285 | - prepare_schema_comprep expands subtypes equivs (ENIAMwalParser.parse_schema schema) | |
286 | - | |
287 | -let prepare_schema_sem expands subtypes equivs schema = | |
288 | - prepare_schema_comprep expands subtypes equivs schema | |
289 | - | |
290 | - | |
291 | -let convert_frame expands subtypes equivs lexemes valence lexeme pos (refl,opinion,negation,pred,aspect,schema) = | |
292 | -(* Printf.printf "convert_frame %s %s\n" lexeme pos; *) | |
293 | - try | |
294 | - if refl = "się" && not (StringMap.mem lexemes "się") then raise ImpossibleSchema else | |
295 | - let frame = | |
296 | - try StringMap.find default_frames refl (* w refl jest przekazywana informacja o typie domyślnej ramki *) | |
297 | - with Not_found -> | |
298 | - Frame(DefaultAtrs([],ENIAMwalParser.parse_refl [Text refl], | |
299 | - ENIAMwalParser.parse_opinion opinion, | |
300 | - ENIAMwalParser.parse_negation [Text negation], | |
301 | - ENIAMwalParser.parse_pred [Text pred], | |
302 | - ENIAMwalParser.parse_aspect [Text aspect]), | |
303 | - prepare_schema expands subtypes equivs schema) in | |
304 | - let frame = if StringMap.is_empty lexemes then frame else reduce_schema_frame lexemes frame in | |
305 | - let frame = expand_lexicalizations frame in | |
306 | - Xlist.fold (extract_lex_frames lexeme pos [] frame) valence (fun valence -> function | |
307 | - lexeme,pos,Frame(atrs,schema) -> | |
308 | - let schemas = simplify_lex (split_xor (split_or_coord schema)) in | |
309 | - Xlist.fold schemas valence (fun valence schema -> | |
310 | - let poss = try StringMap.find valence lexeme with Not_found -> StringMap.empty in | |
311 | - let poss = StringMap.add_inc poss pos [Frame(atrs,schema)] (fun l -> Frame(atrs,schema) :: l) in | |
312 | - StringMap.add valence lexeme poss) | |
313 | - | lexeme,pos,LexFrame(id,pos2,restr,schema) -> | |
314 | - let schemas = simplify_lex (split_xor (split_or_coord schema)) in | |
315 | - Xlist.fold schemas valence (fun valence schema -> | |
316 | - let poss = try StringMap.find valence lexeme with Not_found -> StringMap.empty in | |
317 | - let poss = StringMap.add_inc poss pos [LexFrame(id,pos2,restr,schema)] (fun l -> LexFrame(id,pos2,restr,schema) :: l) in | |
318 | - StringMap.add valence lexeme poss) | |
319 | - | _ -> failwith "convert_frame") | |
320 | - with ImpossibleSchema -> valence | |
321 | - | |
322 | -let convert_frame_sem expands subtypes equivs lexemes valence lexeme pos = function | |
323 | - Frame(DefaultAtrs(meanings,refl,opinion,negation,pred,aspect),positions) -> | |
324 | -(* Printf.printf "convert_frame_sem %s\n" (ENIAMwalStringOf.frame lexeme (Frame(DefaultAtrs(meanings,refl,opinion,negation,pred,aspect),positions))); *) | |
325 | - (try | |
326 | - if refl = ReflSie && not (StringMap.mem lexemes "się") then raise ImpossibleSchema else | |
327 | - let frame = | |
328 | - Frame(DefaultAtrs(meanings,refl,opinion,negation,pred,aspect), | |
329 | - prepare_schema_sem expands subtypes equivs positions) in | |
330 | - let frame = if StringMap.is_empty lexemes then frame else reduce_schema_frame lexemes frame in | |
331 | - let frame = expand_lexicalizations frame in | |
332 | - Xlist.fold (extract_lex_frames lexeme pos [] frame) valence (fun valence -> function | |
333 | - lexeme,pos,Frame(atrs,schema) -> | |
334 | - let schemas = simplify_lex (split_xor (split_or_coord schema)) in | |
335 | - Xlist.fold schemas valence (fun valence schema -> | |
336 | - let poss = try StringMap.find valence lexeme with Not_found -> StringMap.empty in | |
337 | - let poss = StringMap.add_inc poss pos [Frame(atrs,schema)] (fun l -> Frame(atrs,schema) :: l) in | |
338 | - StringMap.add valence lexeme poss) | |
339 | - | lexeme,pos,LexFrame(id,pos2,restr,schema) -> | |
340 | - let schemas = simplify_lex (split_xor (split_or_coord schema)) in | |
341 | - Xlist.fold schemas valence (fun valence schema -> | |
342 | - let poss = try StringMap.find valence lexeme with Not_found -> StringMap.empty in | |
343 | - let poss = StringMap.add_inc poss pos [LexFrame(id,pos2,restr,schema)] (fun l -> LexFrame(id,pos2,restr,schema) :: l) in | |
344 | - StringMap.add valence lexeme poss) | |
345 | - | _ -> failwith "convert_frame_sem") | |
346 | - with ImpossibleSchema -> valence) | |
347 | - | _ -> failwith "convert_frame_sem" | |
348 | - | |
349 | -let make_comprep_frames_of_schema s = function | |
350 | - [{cr=[];ce=[]; morfs=[LexPhrase([pos,Lexeme lex],(restr,schema))]}] -> | |
351 | - lex, | |
352 | - (match get_pos lex pos with [pos] -> pos | _ -> failwith "make_comprep_frame_of_schema 2"), | |
353 | - ComprepFrame(s,pos,restr,schema) | |
354 | - | schema -> failwith ("make_comprep_frame_of_schema: " ^ ENIAMwalStringOf.schema schema) | |
355 | - | |
356 | -let convert_comprep_frame expands subtypes equivs lexemes valence lexeme pos (s,morf) = | |
357 | - try | |
358 | - let schema = prepare_schema_comprep expands subtypes equivs [simple_arg_schema_field [morf]] in | |
359 | - let schema = if StringMap.is_empty lexemes then schema else reduce_schema lexemes schema in | |
360 | - let schema = expand_lexicalizations_schema schema in | |
361 | - let lexeme,pos,frame = make_comprep_frames_of_schema s schema in | |
362 | - Xlist.fold (extract_lex_frames lexeme pos [] frame) valence (fun valence -> function | |
363 | - lexeme,pos,ComprepFrame(s,pos2,restr,schema) -> | |
364 | - let schemas = simplify_lex (split_xor (split_or_coord schema)) in | |
365 | - Xlist.fold schemas valence (fun valence schema -> | |
366 | - let poss = try StringMap.find valence lexeme with Not_found -> StringMap.empty in | |
367 | - let poss = StringMap.add_inc poss pos [ComprepFrame(s,pos2,restr,schema)] (fun l -> ComprepFrame(s,pos2,restr,schema) :: l) in | |
368 | - StringMap.add valence lexeme poss) | |
369 | - | lexeme,pos,LexFrame(id,pos2,restr,schema) -> | |
370 | - let schemas = simplify_lex (split_xor (split_or_coord schema)) in | |
371 | - Xlist.fold schemas valence (fun valence schema -> | |
372 | - let poss = try StringMap.find valence lexeme with Not_found -> StringMap.empty in | |
373 | - let poss = StringMap.add_inc poss pos [LexFrame(id,pos2,restr,schema)] (fun l -> LexFrame(id,pos2,restr,schema) :: l) in | |
374 | - StringMap.add valence lexeme poss) | |
375 | - | _ -> failwith "convert_comprep_frame") | |
376 | - with ImpossibleSchema -> valence | |
377 | - | |
378 | -let remove_pro_args schema = (* FIXME: sprawdzić czy Pro i Null są zawsze na początku *) | |
379 | - List.rev (Xlist.fold schema [] (fun schema -> function | |
380 | - {morfs=[Phrase Pro]} -> schema | |
381 | - | {morfs=(Phrase Pro) :: morfs} as s -> {s with morfs=morfs} :: schema | |
382 | - | {morfs=[Phrase Null]} -> schema | |
383 | - | {morfs=(Phrase Null) :: morfs} as s -> {s with morfs=morfs} :: schema | |
384 | - | s -> s :: schema)) | |
385 | - | |
386 | - | |
387 | - | |
388 | -(*let _ = | |
389 | - let valence = Xlist.fold all_frames StringMap.empty (fun valence (pos,frame_map) -> | |
390 | - print_endline pos; | |
391 | - StringMap.fold frame_map valence (fun valence lexeme frames -> | |
392 | - Xlist.fold frames valence (fun valence frame -> | |
393 | -(* print_endline (ENIAMwalStringOf.unparsed_frame lexeme frame); *) | |
394 | - convert_frame expands subtypes equivs StringMap.empty valence lexeme pos frame))) in | |
395 | - print_endline "comprepnp"; | |
396 | - let valence = StringMap.fold compreps valence (fun valence lexeme frames -> | |
397 | - Xlist.fold frames valence (fun valence (pos,frame) -> | |
398 | - convert_comprep_frame expands subtypes equivs StringMap.empty valence lexeme pos frame)) in | |
399 | - print_endline "expand_restr"; | |
400 | - let valence = StringMap.mapi valence (fun lexeme poss -> | |
401 | - StringMap.mapi poss (fun pos frames -> | |
402 | - List.flatten (Xlist.map frames (expand_restr valence lexeme pos)))) in | |
403 | - print_endline "transform_frame"; | |
404 | - let _ = StringMap.mapi valence (fun lexeme poss -> | |
405 | - StringMap.mapi poss (fun pos frames -> | |
406 | -(* print_endline lexeme; *) | |
407 | - List.flatten (Xlist.map frames (transform_frame lexeme pos)))) in | |
408 | - print_endline "done"; | |
409 | - ()*) | |
410 | -(* StringMap.iter valence (fun lexeme poss -> | |
411 | - StringMap.iter poss (fun pos frames -> | |
412 | - Xlist.iter frames (fun frame -> print_endline (ENIAMwalStringOf.frame lexeme frame))))*) |
lexSemantics/ENIAMwalReduce.ml
... | ... | @@ -90,36 +90,21 @@ let select_comprep_adjuncts lexemes = |
90 | 90 | not (StringSet.is_empty (StringSet.intersection reqs lexemes)) then s :: l else l) |
91 | 91 | with Not_found -> l) |
92 | 92 | |
93 | - | |
94 | - | |
95 | -(* let rec assign_pro_args schema = | |
96 | - Xlist.map schema (fun s -> | |
97 | - let morfs = match s.morfs with | |
98 | - (E p) :: l -> E Pro :: (E p) :: l | |
99 | - | [LexPhrase _] as morfs -> morfs | |
100 | - | [Phrase(FixedP _)] as morfs -> morfs | |
101 | - | [Phrase(Lex _)] as morfs -> morfs | |
102 | - (* | [Phrase Refl] as morfs -> morfs | |
103 | - | [Phrase Recip] as morfs -> morfs*) | |
104 | - | Phrase Null :: _ as morfs -> morfs | |
105 | - | Phrase Pro :: _ as morfs -> morfs | |
106 | - | morfs -> if s.gf <> SUBJ && s.cr = [] && s.ce = [] then (Phrase Null) :: morfs else (Phrase Pro) :: morfs in (* FIXME: ustalić czy są inne przypadki uzgodnienia *) | |
107 | - (* let morfs = assign_pro_args_lex morfs in *) (* bez pro wewnątrz leksykalizacji *) | |
108 | - {s with morfs=morfs}) *) | |
109 | - | |
110 | -(*let assign_pro_args_lex morfs = | |
111 | - Xlist.map morfs (function | |
112 | - Lex(morf,specs,lex,restr) -> LexN(morf,specs,lex,assign_pro_args_restr restr) | |
113 | - | LexNum(morf,lex1,lex2,restr) -> LexNum(morf,lex1,lex2,assign_pro_args_restr restr) | |
114 | - | LexCompar(morf,l) -> LexCompar(morf,make_gfs_lex l) | |
115 | - | morf -> morf) | |
116 | - | |
117 | - and assign_pro_args_restr = function | |
118 | - Natr -> Natr | |
119 | - | Ratr1 schema -> Ratr1(assign_pro_args schema) | |
120 | - | Atr1 schema -> Atr1(assign_pro_args schema) | |
121 | - | Ratr schema -> Ratr(assign_pro_args schema) | |
122 | - | Atr schema -> Atr(assign_pro_args schema)*) | |
93 | +let set_necessary schema = | |
94 | + Xlist.map schema (fun p -> | |
95 | + let nec = | |
96 | + if p.gf = ADJUNCT then Opt else | |
97 | + if Xlist.fold p.morfs false (fun b -> function | |
98 | + SimpleLexArg _ -> true | |
99 | + | LexArg _ -> true | |
100 | + | FixedP _ -> true | |
101 | + | _ -> b) then Req else | |
102 | + if p.gf <> SUBJ && p.cr = [] && p.ce = [] then Opt else | |
103 | + if Xlist.fold p.morfs false (fun b -> function | |
104 | + NP NomAgr -> true | |
105 | + | NCP(NomAgr,_,_) -> true | |
106 | + | _ -> b) then ProNG else Pro in | |
107 | + {p with is_necessary=nec}) | |
123 | 108 | |
124 | 109 | exception ImpossibleSchema |
125 | 110 | |
... | ... |
lexSemantics/ENIAMwalRenderer.ml
... | ... | @@ -217,20 +217,39 @@ let render_lex_entry = function |
217 | 217 | (* Printf.printf "%s %s %s\n" pos lemma (ENIAMwalStringOf.schema schema); *) |
218 | 218 | selectors,render_lex_entry entry) *) |
219 | 219 | |
220 | +let adjunct morfs = {empty_position with gf=ADJUNCT; is_necessary=Opt; morfs=Xlist.map morfs (fun morf -> LCG morf)} | |
221 | +let adjunct_multi dir morfs = {empty_position with gf=ADJUNCT; is_necessary=Multi; dir=dir; morfs=Xlist.map morfs (fun morf -> LCG morf)} | |
222 | +let adjunct_dir dir morfs = {empty_position with gf=ADJUNCT; is_necessary=Opt; dir=dir; morfs=Xlist.map morfs (fun morf -> LCG morf)} | |
223 | +let adjunct_ce ce morfs = {empty_position with gf=ADJUNCT; ce=[ce]; is_necessary=Opt; morfs=Xlist.map morfs (fun morf -> LCG morf)} | |
224 | + | |
220 | 225 | let render_comprep prep = Both,Plus[One;Tensor[Atom "comprepnp"; Atom prep]] |
221 | 226 | |
227 | +let render_connected_comprep prep = adjunct [Tensor[Atom "comprepnp"; Atom prep]] | |
228 | + | |
222 | 229 | let render_prepnp prep cases = |
223 | 230 | Both,Plus(One :: List.flatten (Xlist.map cases (fun case -> |
224 | 231 | [Tensor[Atom "prepnp"; Atom prep; Atom case]; |
225 | 232 | Tensor[Atom "prepncp"; Atom prep; Atom case; Top; Top]]))) |
226 | 233 | |
234 | +let render_connected_prepnp prep cases = | |
235 | + adjunct (List.flatten (Xlist.map cases (fun case -> | |
236 | + [Tensor[Atom "prepnp"; Atom prep; Atom case]; | |
237 | + Tensor[Atom "prepncp"; Atom prep; Atom case; Top; Top]]))) | |
238 | + | |
227 | 239 | let render_prepadjp prep cases = |
228 | 240 | let postp = if prep = "z" || prep = "po" || prep = "na" then [Tensor[Atom "prepadjp"; Atom prep; Atom "postp"]] else [] in |
229 | 241 | Both,Plus(One :: postp @ (Xlist.map cases (fun case -> |
230 | 242 | Tensor[Atom "prepadjp"; Atom prep; Atom case]))) |
231 | 243 | |
244 | +let render_connected_prepadjp prep cases = | |
245 | + let postp = if prep = "z" || prep = "po" || prep = "na" then [Tensor[Atom "prepadjp"; Atom prep; Atom "postp"]] else [] in | |
246 | + adjunct (postp @ (Xlist.map cases (fun case -> | |
247 | + Tensor[Atom "prepadjp"; Atom prep; Atom case]))) | |
248 | + | |
232 | 249 | let render_compar prep = Both,Plus[One;Tensor[Atom "compar"; Atom prep; Top]] |
233 | 250 | |
251 | +let render_connected_compar prep = adjunct [Tensor[Atom "compar"; Atom prep; Top]] | |
252 | + | |
234 | 253 | let verb_adjuncts_simp = [ |
235 | 254 | Both, Plus[One;Tensor[Atom "advp"; Atom "pron"]]; |
236 | 255 | Both, Plus[One;Tensor[Atom "advp"; Atom "locat"]]; |
... | ... | @@ -249,6 +268,24 @@ let verb_adjuncts_simp = [ |
249 | 268 | Both, Plus[One;Tensor[Atom "padvp"]]; |
250 | 269 | ] |
251 | 270 | |
271 | +let verb_connected_adjuncts_simp = [ | |
272 | + adjunct [Tensor[Atom "advp"; Atom "pron"]]; | |
273 | + adjunct [Tensor[Atom "advp"; Atom "locat"]]; | |
274 | + adjunct [Tensor[Atom "advp"; Atom "abl"]]; | |
275 | + adjunct [Tensor[Atom "advp"; Atom "adl"]]; | |
276 | + adjunct [Tensor[Atom "advp"; Atom "perl"]]; | |
277 | + adjunct [Tensor[Atom "advp"; Atom "temp"]]; | |
278 | + adjunct [Tensor[Atom "advp"; Atom "dur"]]; | |
279 | + adjunct [Tensor[Atom "advp"; Atom "mod"]]; | |
280 | + adjunct [Tensor[Atom "np";Top;Atom "dat"; Top; Top];Tensor[Atom "ncp"; Top; Atom "dat"; Top; Top; Top; Top]]; | |
281 | + adjunct [Tensor[Atom "np";Top;Atom "inst"; Top; Top];Tensor[Atom "ncp"; Top; Atom "inst"; Top; Top; Top; Top]]; | |
282 | + adjunct [Tensor[Atom "date"];Tensor[Atom "day-lex"];Tensor[Atom "day-month"];Tensor[Atom "day"]]; | |
283 | + adjunct_dir Forward_ [Tensor[Atom "cp";Top; Top]]; | |
284 | + adjunct [Tensor[Atom "or"]]; | |
285 | + adjunct [Tensor[Atom "lex";Atom "się";Atom "qub"]]; | |
286 | + adjunct_ce "3" [Tensor[Atom "padvp"]]; | |
287 | +] | |
288 | + | |
252 | 289 | let proper_noun_adjuncts_simp = [ |
253 | 290 | Both, Plus[One;Tensor[Atom "np";Top;Atom "gen"; Top; Top];Tensor[Atom "ncp"; Top; Atom "gen"; Top; Top; Top; Top]]; |
254 | 291 | Forward, Plus[One;Tensor[Atom "np";Top;Atom "nom"; Top; Top];Tensor[Atom "np";Top;AVar "case"; Top; Top]]; |
... | ... | @@ -256,6 +293,13 @@ let proper_noun_adjuncts_simp = [ |
256 | 293 | Forward, Plus[One;Tensor[Atom "adjp"; AVar "number"; AVar "case"; AVar "gender"]]; |
257 | 294 | ] |
258 | 295 | |
296 | +let proper_noun_connected_adjuncts_simp = [ | |
297 | + adjunct [Tensor[Atom "np";Top;Atom "gen"; Top; Top];Tensor[Atom "ncp"; Top; Atom "gen"; Top; Top; Top; Top]]; | |
298 | + adjunct_dir Forward_ [Tensor[Atom "np";Top;Atom "nom"; Top; Top];Tensor[Atom "np";Top;AVar "case"; Top; Top]]; | |
299 | + adjunct_multi Backward_ [Tensor[Atom "adjp"; AVar "number"; AVar "case"; AVar "gender"]]; | |
300 | + adjunct_dir Forward_ [Tensor[Atom "adjp"; AVar "number"; AVar "case"; AVar "gender"]]; | |
301 | +] | |
302 | + | |
259 | 303 | let common_noun_adjuncts_simp = [ |
260 | 304 | Both, Plus[One;Tensor[Atom "np";Top;Atom "gen"; Top; Top];Tensor[Atom "ncp"; Top; Atom "gen"; Top; Top; Top; Top]]; |
261 | 305 | Forward, Plus[One;Tensor[Atom "np";Top;Atom "nom"; Top; Top];Tensor[Atom "np";Top;AVar "case"; Top; Top]]; |
... | ... | @@ -263,15 +307,45 @@ let common_noun_adjuncts_simp = [ |
263 | 307 | Forward, Plus[One;Tensor[Atom "adjp"; AVar "number"; AVar "case"; AVar "gender"]]; |
264 | 308 | ] |
265 | 309 | |
310 | +let common_noun_connected_adjuncts_simp = [ | |
311 | + adjunct [Tensor[Atom "np";Top;Atom "gen"; Top; Top];Tensor[Atom "ncp"; Top; Atom "gen"; Top; Top; Top; Top]]; | |
312 | + adjunct_dir Forward_ [Tensor[Atom "np";Top;Atom "nom"; Top; Top];Tensor[Atom "np";Top;AVar "case"; Top; Top]]; | |
313 | + adjunct_multi Backward_ [Tensor[Atom "adjp"; AVar "number"; AVar "case"; AVar "gender"]]; | |
314 | + adjunct_dir Forward_ [Tensor[Atom "adjp"; AVar "number"; AVar "case"; AVar "gender"]]; | |
315 | +] | |
316 | + | |
266 | 317 | let measure_noun_adjuncts_simp = [ |
267 | 318 | Backward, Maybe(Tensor[Atom "adjp"; AVar "number"; AVar "case"; AVar "gender"]); |
268 | 319 | Forward, Plus[One;Tensor[Atom "adjp"; AVar "number"; AVar "case"; AVar "gender"]]; |
269 | 320 | ] |
270 | 321 | |
322 | +let measure_noun_connected_adjuncts_simp = [ | |
323 | + adjunct_multi Backward_ [Tensor[Atom "adjp"; AVar "number"; AVar "case"; AVar "gender"]]; | |
324 | + adjunct_dir Forward_ [Tensor[Atom "adjp"; AVar "number"; AVar "case"; AVar "gender"]]; | |
325 | +] | |
326 | + | |
271 | 327 | let adj_adjuncts_simp = [ |
272 | 328 | Both, Plus[One;Tensor[Atom "advp"; Top]]; |
273 | 329 | ] |
274 | 330 | |
331 | +let adj_connected_adjuncts_simp = [ | |
332 | + adjunct [Tensor[Atom "advp"; Top]]; | |
333 | +] | |
334 | + | |
275 | 335 | let adv_adjuncts_simp = [ |
276 | 336 | Both, Plus[One;Tensor[Atom "advp"; Top]]; |
277 | 337 | ] |
338 | + | |
339 | +let adv_connected_adjuncts_simp = [ | |
340 | + adjunct [Tensor[Atom "advp"; Top]]; | |
341 | + ] | |
342 | + | |
343 | +let assing_pref_morfs = function | |
344 | + "po","postp" -> [ | |
345 | + LCG(Tensor[Atom "adjp"; Atom "sg"; Atom "dat"; Atom "m1"]); | |
346 | + LCG(Tensor[Atom "adjp"; Top; Atom "postp"; Top])] | |
347 | + | "z","postp" -> [LCG(Tensor[Atom "adjp"; Atom "sg"; Atom "nom"; Atom "f"])] | |
348 | + | "na","postp" -> [LCG(Tensor[Atom "advp"; Top])] | |
349 | + | _,case -> [ | |
350 | + LCG(Tensor[Atom "np"; Top; Atom case; Top; Top]); | |
351 | + LCG(Tensor[Atom "adjp"; Top; Atom case; Top])] | |
... | ... |
lexSemantics/ENIAMwalStringOf.ml
... | ... | @@ -101,6 +101,9 @@ let gf = function |
101 | 101 | SUBJ -> "subj" |
102 | 102 | | OBJ -> "obj" |
103 | 103 | | ARG -> "arg"(*""*) |
104 | + | ADJUNCT -> "adjunct" | |
105 | + | CORE -> "core" | |
106 | + | NOSEM -> "nosem" | |
104 | 107 | |
105 | 108 | let pos = function |
106 | 109 | SUBST(n,c) -> "SUBST(" ^ number n ^ "," ^ case c ^ ")" |
... | ... | @@ -173,13 +176,26 @@ let controllees l = |
173 | 176 | "1" -> "controllee" |
174 | 177 | | n -> "controllee" ^ n) |
175 | 178 | |
179 | +let necessary = function | |
180 | + Opt -> "opt" | |
181 | + | Req -> "req" | |
182 | + | Pro -> "pro" | |
183 | + | ProNG -> "prong" | |
184 | + | Multi -> "multi" | |
185 | + | |
186 | +let dir = function | |
187 | + Both_ -> "" | |
188 | + | Forward_ -> "/" | |
189 | + | Backward_ -> "\\" | |
190 | + | |
176 | 191 | let rec schema schema = |
177 | 192 | String.concat "+" (Xlist.map schema (fun s -> |
178 | 193 | String.concat "," ( |
179 | 194 | (if s.gf = ARG then [] else [gf s.gf])@s.mode@ |
195 | + (if s.is_necessary = Opt then [] else [necessary s.is_necessary])@ | |
180 | 196 | (if s.role = "" then [] else [s.role])@ |
181 | 197 | (if s.role_attr = "" then [] else [s.role_attr])@ |
182 | - (*s.sel_prefs@*)(controllers s.cr)@(controllees s.ce)) ^ "{" ^ String.concat ";" (Xlist.map s.morfs phrase) ^ "}")) | |
198 | + (*s.sel_prefs@*)(controllers s.cr)@(controllees s.ce)) ^ (dir s.dir) ^ "{" ^ String.concat ";" (Xlist.map s.morfs phrase) ^ "}")) | |
183 | 199 | (* |
184 | 200 | and morf = function |
185 | 201 | Phrase p -> phrase p |
... | ... | @@ -222,7 +238,7 @@ let rec connected_schema schema = |
222 | 238 | *) |
223 | 239 | |
224 | 240 | let meaning m = |
225 | - m.name ^ "-" ^ m.variant | |
241 | + m.name ^ "-" ^ m.variant | |
226 | 242 | |
227 | 243 | let lex_entry = function |
228 | 244 | SimpleLexEntry(le,p) -> |
... | ... |
lexSemantics/ENIAMwalTypes.ml
... | ... | @@ -39,7 +39,7 @@ type aux = NoAux | PastAux | FutAux | ImpAux |
39 | 39 | |
40 | 40 | type nsem = Common of string | Time*) |
41 | 41 | |
42 | -type gf = SUBJ | OBJ | ARG | |
42 | +type gf = SUBJ | OBJ | ARG | ADJUNCT | CORE | NOSEM | |
43 | 43 | |
44 | 44 | type pos = |
45 | 45 | SUBST of number * case |
... | ... | @@ -99,13 +99,19 @@ type restr = Natr | Ratr | Ratrs | Ratr1 | Atr | Atr1 | NoRestr |
99 | 99 | type sel_prefs = |
100 | 100 | SynsetId of int |
101 | 101 | | Predef of string |
102 | - | RelationRole of string * string * string (* relacji * rola * atrybut roli *) | |
102 | + | SynsetName of string | |
103 | + | RelationRole of string * string * string (* relacja * rola * atrybut roli *) | |
104 | + | |
105 | +type necessary = Req | Opt | Pro | ProNG | Multi | |
106 | + | |
107 | +type direction = Both_ | Forward_ | Backward_ | |
103 | 108 | |
104 | 109 | type position = {psn_id: int; gf: gf; role: string; role_attr: string; sel_prefs: sel_prefs list; |
105 | - mode: string list; cr: string list; ce: string list; morfs: phrase list} | |
110 | + mode: string list; cr: string list; ce: string list; morfs: phrase list; | |
111 | + dir: direction; is_necessary: necessary} | |
106 | 112 | |
107 | 113 | let empty_position = |
108 | - {psn_id=(-1); gf=ARG; role=""; role_attr=""; mode=[]; sel_prefs=[]; cr=[]; ce=[]; morfs=[]} | |
114 | + {psn_id=(-1); gf=ARG; role=""; role_attr=""; mode=[]; sel_prefs=[]; cr=[]; ce=[]; dir=Both_; morfs=[]; is_necessary=Opt} | |
109 | 115 | |
110 | 116 | type meaning = {mng_id: int; |
111 | 117 | name: string; |
... | ... |
lexSemantics/interface.ml
... | ... | @@ -83,10 +83,7 @@ let rec main_loop in_chan out_chan = |
83 | 83 | let _ = |
84 | 84 | prerr_endline message; |
85 | 85 | Arg.parse spec_list anon_fun usage_msg; |
86 | - ENIAMsubsyntax.initialize (); | |
87 | - ENIAMcategoriesPL.initialize (); | |
88 | - ENIAMwalParser.initialize (); | |
89 | - ENIAMwalReduce.initialize (); | |
86 | + ENIAMlexSemantics.initialize (); | |
90 | 87 | Gc.compact (); |
91 | 88 | prerr_endline "Ready!"; |
92 | 89 | if !comm_stdio then main_loop stdin stdout |
... | ... |
lexSemantics/makefile
... | ... | @@ -3,34 +3,33 @@ OCAMLOPT=ocamlopt |
3 | 3 | OCAMLDEP=ocamldep |
4 | 4 | INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I +eniam |
5 | 5 | OCAMLFLAGS=$(INCLUDES) -g |
6 | -#OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa eniam-lcg-parser.cmxa eniam-lcg-lexicon.cmxa #eniam-plWordnet.cmxa #eniam-lexSemantics.cmxa | |
7 | 6 | OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa eniam-lcg-parser.cmxa eniam-lcg-lexicon.cmxa eniam-lexSemantics.cmxa |
8 | 7 | INSTALLDIR=`ocamlc -where`/eniam |
9 | 8 | |
10 | -SOURCES= entries.ml ENIAMwalTypes.ml ENIAMwalStringOf.ml ENIAMwalParser.ml ENIAMwalReduce.ml ENIAMvalence.ml ENIAMwalRenderer.ml ENIAMadjuncts.ml \ | |
11 | - ENIAMlexSemanticsTypes.ml ENIAMlexSemanticsStringOf.ml ENIAMlexSemanticsHTMLof.ml ENIAMlexSemanticsXMLof.ml ENIAMlexSemantics.ml #ENIAMlexSemanticsData.ml | |
9 | +SOURCES= entries.ml ENIAMwalTypes.ml ENIAMwalStringOf.ml ENIAMwalParser.ml ENIAMwalReduce.ml ENIAMlexSemanticsTypes.ml ENIAMlexSemanticsData.ml ENIAMvalence.ml ENIAMwalRenderer.ml ENIAMadjuncts.ml \ | |
10 | + ENIAMlexSemanticsStringOf.ml ENIAMlexSemanticsHTMLof.ml ENIAMlexSemanticsXMLof.ml ENIAMplWordnet.ml ENIAMlexSemantics.ml | |
12 | 11 | |
13 | 12 | all: eniam-lexSemantics.cma eniam-lexSemantics.cmxa |
14 | 13 | |
15 | 14 | install: all |
16 | 15 | mkdir -p $(INSTALLDIR) |
17 | 16 | cp eniam-lexSemantics.cmxa eniam-lexSemantics.a eniam-lexSemantics.cma $(INSTALLDIR) |
18 | - cp entries.cmi ENIAMwalTypes.cmi ENIAMwalStringOf.cmi ENIAMwalParser.cmi ENIAMwalReduce.cmi ENIAMvalence.cmi ENIAMwalRenderer.cmi ENIAMadjuncts.cmi ENIAMlexSemanticsTypes.cmi ENIAMlexSemanticsStringOf.cmi ENIAMlexSemanticsHTMLof.cmi ENIAMlexSemanticsXMLof.cmi ENIAMlexSemantics.cmi $(INSTALLDIR) | |
19 | - cp entries.cmx ENIAMwalTypes.cmx ENIAMwalStringOf.cmx ENIAMwalParser.cmx ENIAMwalReduce.cmx ENIAMvalence.cmx ENIAMwalRenderer.cmx ENIAMadjuncts.cmx ENIAMlexSemanticsTypes.cmx ENIAMlexSemanticsStringOf.cmx ENIAMlexSemanticsHTMLof.cmx ENIAMlexSemanticsXMLof.cmx ENIAMlexSemantics.cmx $(INSTALLDIR) | |
17 | + cp entries.cmi ENIAMwalTypes.cmi ENIAMwalStringOf.cmi ENIAMwalParser.cmi ENIAMwalReduce.cmi ENIAMlexSemanticsData.cmi ENIAMvalence.cmi ENIAMwalRenderer.cmi ENIAMadjuncts.cmi ENIAMlexSemanticsTypes.cmi ENIAMlexSemanticsStringOf.cmi ENIAMlexSemanticsHTMLof.cmi ENIAMlexSemanticsXMLof.cmi ENIAMplWordnet.cmi ENIAMlexSemantics.cmi $(INSTALLDIR) | |
18 | + cp entries.cmx ENIAMwalTypes.cmx ENIAMwalStringOf.cmx ENIAMwalParser.cmx ENIAMwalReduce.cmx ENIAMlexSemanticsData.cmx ENIAMvalence.cmx ENIAMwalRenderer.cmx ENIAMadjuncts.cmx ENIAMlexSemanticsTypes.cmx ENIAMlexSemanticsStringOf.cmx ENIAMlexSemanticsHTMLof.cmx ENIAMlexSemanticsXMLof.cmx ENIAMplWordnet.cmx ENIAMlexSemantics.cmx $(INSTALLDIR) | |
20 | 19 | mkdir -p /usr/share/eniam/lexSemantics |
21 | -# cp resources/* /usr/share/eniam/lexSemantics | |
22 | - ln -s /usr/share/eniam/lexSemantics/proper_names_20160104.tab /usr/share/eniam/lexSemantics/proper_names.tab | |
23 | - ln -s /usr/share/eniam/lexSemantics/proper_names_sgjp_polimorf_20151020.tab /usr/share/eniam/lexSemantics/proper_names_sgjp_polimorf.tab | |
20 | + cp resources/* /usr/share/eniam/lexSemantics | |
21 | +# ln -s /usr/share/eniam/lexSemantics/proper_names_20160104.tab /usr/share/eniam/lexSemantics/proper_names.tab | |
22 | +# ln -s /usr/share/eniam/lexSemantics/proper_names_sgjp_polimorf_20151020.tab /usr/share/eniam/lexSemantics/proper_names_sgjp_polimorf.tab | |
24 | 23 | |
25 | 24 | install-local: all |
26 | 25 | mkdir -p $(INSTALLDIR) |
27 | 26 | cp eniam-lexSemantics.cmxa eniam-lexSemantics.a eniam-lexSemantics.cma $(INSTALLDIR) |
28 | - cp entries.cmi ENIAMwalTypes.cmi ENIAMwalStringOf.cmi ENIAMwalParser.cmi ENIAMwalReduce.cmi ENIAMvalence.cmi ENIAMwalRenderer.cmi ENIAMadjuncts.cmi ENIAMlexSemanticsTypes.cmi ENIAMlexSemantics.cmi $(INSTALLDIR) | |
29 | - cp entries.cmx ENIAMwalTypes.cmx ENIAMwalStringOf.cmx ENIAMwalParser.cmx ENIAMwalReduce.cmx ENIAMvalence.cmx ENIAMwalRenderer.cmx ENIAMadjuncts.cmx ENIAMlexSemanticsTypes.cmx ENIAMlexSemantics.cmx $(INSTALLDIR) | |
27 | + cp entries.cmi ENIAMwalTypes.cmi ENIAMwalStringOf.cmi ENIAMwalParser.cmi ENIAMwalReduce.cmi ENIAMlexSemanticsData.cmi ENIAMvalence.cmi ENIAMwalRenderer.cmi ENIAMadjuncts.cmi ENIAMlexSemanticsTypes.cmi ENIAMlexSemanticsStringOf.cmi ENIAMlexSemanticsHTMLof.cmi ENIAMlexSemanticsXMLof.cmi ENIAMplWordnet.cmi ENIAMlexSemantics.cmi $(INSTALLDIR) | |
28 | + cp entries.cmx ENIAMwalTypes.cmx ENIAMwalStringOf.cmx ENIAMwalParser.cmx ENIAMwalReduce.cmx ENIAMlexSemanticsData.cmx ENIAMvalence.cmx ENIAMwalRenderer.cmx ENIAMadjuncts.cmx ENIAMlexSemanticsTypes.cmx ENIAMlexSemanticsStringOf.cmx ENIAMlexSemanticsHTMLof.cmx ENIAMlexSemanticsXMLof.cmx ENIAMplWordnet.cmx ENIAMlexSemantics.cmx $(INSTALLDIR) | |
30 | 29 | mkdir -p /usr/local/share/eniam/lexSemantics |
31 | -# cp resources/* /usr/local/share/eniam/lexSemantics | |
32 | - ln -s /usr/local/share/eniam/lexSemantics/proper_names_20160104.tab /usr/local/share/eniam/lexSemantics/proper_names.tab | |
33 | - ln -s /usr/local/share/eniam/lexSemantics/proper_names_sgjp_polimorf_20151020.tab /usr/local/share/eniam/lexSemantics/proper_names_sgjp_polimorf.tab | |
30 | + cp resources/* /usr/local/share/eniam/lexSemantics | |
31 | +# ln -s /usr/local/share/eniam/lexSemantics/proper_names_20160104.tab /usr/local/share/eniam/lexSemantics/proper_names.tab | |
32 | +# ln -s /usr/local/share/eniam/lexSemantics/proper_names_sgjp_polimorf_20151020.tab /usr/local/share/eniam/lexSemantics/proper_names_sgjp_polimorf.tab | |
34 | 33 | |
35 | 34 | eniam-lexSemantics.cma: $(SOURCES) |
36 | 35 | ocamlc -linkall -a -o eniam-lexSemantics.cma $(OCAMLFLAGS) $^ |
... | ... | @@ -38,7 +37,7 @@ eniam-lexSemantics.cma: $(SOURCES) |
38 | 37 | eniam-lexSemantics.cmxa: $(SOURCES) |
39 | 38 | ocamlopt -linkall -a -o eniam-lexSemantics.cmxa $(INCLUDES) $^ |
40 | 39 | |
41 | -test: $(SOURCES) test.ml | |
40 | +test: test.ml | |
42 | 41 | $(OCAMLOPT) -o test $(OCAMLOPTFLAGS) $^ |
43 | 42 | |
44 | 43 | interface: interface.ml |
... | ... |
plWordnet/resources/predef_prefs.tab renamed to lexSemantics/resources/predef_prefs.tab
1 | -LUDZIE osoba 1 grupa ludzi 1 | |
1 | +LUDZIE osoba 1 ludzie 1 | |
2 | 2 | ISTOTY LUDZIE osoba 1 istota żywa 1 grupa istot 1 |
3 | 3 | PODMIOTY LUDZIE podmiot 3 |
4 | 4 | JADŁO pokarm 1 napój 1 |
... | ... | @@ -17,4 +17,4 @@ SYTUACJA CZYNNOŚĆ czynność 1 zdarzenie 2 okoliczność 1 ciąg zdarzeń 1 |
17 | 17 | KIEDY CZAS SYTUACJA |
18 | 18 | CZEMU CECHA SYTUACJA LUDZIE |
19 | 19 | ILOŚĆ ilość 1 rozmiar 1 rozmiar 2 jednostka 4 wielkość 6 |
20 | -ALL PODMIOTY ISTOTY DOBRA SYTUACJA | |
20 | +#ALL PODMIOTY ISTOTY DOBRA SYTUACJA | |
... | ... |
plWordnet/resources/proper_classes.tab renamed to lexSemantics/resources/proper_classes.tab
lexSemantics/test.ml
... | ... | @@ -23,6 +23,7 @@ let test_strings = [ |
23 | 23 | (* "Kot miauczy w październiku."; *) |
24 | 24 | (* "Arabia Saudyjska biegnie."; *) |
25 | 25 | "Chłopcy mają ulicę kwiatami."; |
26 | + "Kot miałczy."; | |
26 | 27 | (* "Np. Ala."; |
27 | 28 | "Kot np. miauczy."; |
28 | 29 | "Szpak frunie. Kot miauczy."; |
... | ... | @@ -61,6 +62,7 @@ let test_strings = [ |
61 | 62 | ] |
62 | 63 | |
63 | 64 | let _ = |
65 | + ENIAMlexSemantics.initialize (); | |
64 | 66 | print_endline "Testy wbudowane"; |
65 | 67 | Xlist.iter test_strings (fun s -> |
66 | 68 | print_endline ("\nTEST: " ^ s); |
... | ... |
morphology/check_rule_compos.py
0 → 100644
1 | +# Blame Szymon Rutkowski - szymon@szymonrutkowski.pl - Oct 2016. | |
2 | +# This file is intended to check the NKJP1M frequency list against rules derived from SGJP. | |
3 | +# If you want to use this, review the end of this file (filenames, column structure) and run with python3. | |
4 | + | |
5 | +import re | |
6 | + | |
7 | +def load_rules_file(fname): | |
8 | + rule_list = [] | |
9 | + contents = '' | |
10 | + | |
11 | + with open(fname) as inp: | |
12 | + contents = inp.read() | |
13 | + | |
14 | + contents = contents.split('\n') | |
15 | + | |
16 | + for line in contents: | |
17 | + data = line.split('\t') | |
18 | + if len(data) != 7: | |
19 | + print('Skipped line in rules: '+line) | |
20 | + rule_list.append(tuple(data)) | |
21 | + | |
22 | + return rule_list | |
23 | + | |
24 | +def make_rules_table(rule_list): | |
25 | + "Given rule_list as list of tuples (name, freq, classification, prefix, suffix, stem ending, \ | |
26 | + tag), create a dictionary: ending -> list of applicable rules, also as tuples. Indices are \ | |
27 | + prefixes followed by - (hyphen) and suffixes preced by -, up to three characters; longer \ | |
28 | + affixes are included in the lists for their outermost three-character parts. If both empty \ | |
29 | + affixes are empty, rule gets listed under '-'." | |
30 | + | |
31 | + rtable = dict() | |
32 | + | |
33 | + for rl in rule_list: | |
34 | + if len(rl) != 7: | |
35 | + print("Skipped invalid rule: "+str(rl)) | |
36 | + continue | |
37 | + | |
38 | + index = '-' | |
39 | + | |
40 | + if rl[3] != '': | |
41 | + index = rl[3] + '-' | |
42 | + elif rl[4] != '': | |
43 | + index = '-' + rl[4] | |
44 | + | |
45 | + if len(index) > 4: | |
46 | + if index[0] == '-': # suffix | |
47 | + index = '-' + index[-3:] | |
48 | + else: # prefix | |
49 | + index = index[:3] + '-' | |
50 | + | |
51 | + if index in rtable: | |
52 | + rtable[index].append(rl) | |
53 | + else: | |
54 | + rtable[index] = [ rl ] | |
55 | + | |
56 | + return rtable | |
57 | + | |
58 | +# just ripped from compare_morphosyn.py, guess it'll be better to keep those scripts self-contained | |
59 | +# note that liberal_tagcomp is mainly suitable for checking NKJP against SGJP, when checking | |
60 | +# a resource obeying more SJGP'ish tagging convention the strict_tagcomp will be better | |
61 | +def strict_tagcomp(tag1, tag2): | |
62 | + tag1_items = tag1.split(':') | |
63 | + tag2_items = tag2.split(':') | |
64 | + | |
65 | + if (tag1_items[0] != tag2_items[0] # POS | |
66 | + or len(tag1_items) != len(tag2_items)): | |
67 | + return False | |
68 | + | |
69 | + for (i, item) in enumerate(tag1_items): | |
70 | + if not item in tag2_items[i].split('.'): | |
71 | + return False | |
72 | + | |
73 | + return True | |
74 | + | |
75 | +def liberal_tagcomp(tag1, tag2): | |
76 | + tag1_items = tag1.split(':') | |
77 | + tag2_items = tag2.split(':') | |
78 | + | |
79 | + if (tag1_items[0] != tag2_items[0] # POS | |
80 | + or len(tag1_items) != len(tag2_items)): | |
81 | + return False | |
82 | + | |
83 | + for (i, item) in enumerate(tag1_items): | |
84 | + # remove tags n1, f1... | |
85 | + item = re.sub(r'(n1|n2|n3)', 'n', item) | |
86 | + model = re.sub(r'(n1|n2|n3|p2|p3)', 'n', tag2_items[i]).split('.') | |
87 | + if not item in model and model[0] != '_': # underscore as a catchall | |
88 | + return False | |
89 | + | |
90 | + return True | |
91 | + | |
92 | +def is_recognizable(entry, rules_table): | |
93 | + "Check whether entry, given as triple (word_form, lemma, tags) is recognizable using \ | |
94 | + rules_table as obtained from make_rules_table() function. Return the rule's class \ | |
95 | + (third column, usually empty string)." | |
96 | + | |
97 | + for chunk_size in range(3, -1, -1): | |
98 | + if len(entry[0]) < chunk_size: | |
99 | + continue | |
100 | + | |
101 | + rule_candidates = [] | |
102 | + | |
103 | + pref_ind = entry[0][:chunk_size]+'-' | |
104 | + suf_ind = '-'+entry[0][-chunk_size:] | |
105 | + if pref_ind in rules_table: | |
106 | + rule_candidates += rules_table[ pref_ind ] | |
107 | + if suf_ind in rules_table: | |
108 | + rule_candidates += rules_table[ suf_ind ] | |
109 | + | |
110 | + if len(rule_candidates) == 0: | |
111 | + continue | |
112 | + for rl in rule_candidates: | |
113 | + # check first the prefix and suffix (the above code just finds rules that are | |
114 | + # potentially relevant), and tag; then proceed to reconstructing the lemma | |
115 | + if (entry[0][:len(rl[3])] == rl[3] and | |
116 | + # check for empty suffix, since string[-0:] returns the string unchanged | |
117 | + (len(rl[4]) == 0 or entry[0][-len(rl[4]):] == rl[4]) and | |
118 | + liberal_tagcomp(entry[2], rl[6])): | |
119 | + # trim the prefix and suffix, and glue the ending suggested by the rule; | |
120 | + # compare with the original lemma | |
121 | + if (entry[0][len(rl[3]):-len(rl[4])]+rl[5] == entry[1] | |
122 | + # another corner case, str[:-0] would be '' | |
123 | + or (len(rl[4]) == 0 and entry[0][len(rl[3]):]+rl[5] == entry[1])): | |
124 | + return rl[2] | |
125 | + | |
126 | + return False | |
127 | + | |
128 | +rlist = load_rules_file('../resources/SGJP/freq_rules.tab') | |
129 | +rtable = make_rules_table(rlist) | |
130 | + | |
131 | +def esccurl(string) : | |
132 | + "Escape the curly brackets in the string, for using it with the string formatter." | |
133 | + return string.replace('{', '{{').replace('}', '}}') | |
134 | + | |
135 | +with open('../resources/NKJP1M/NKJP1M-tagged-frequency.tab') as inp: | |
136 | + with open('freq_with_rules.tab', 'w+') as out: | |
137 | + for line in inp: | |
138 | + line = line.strip() | |
139 | + data = line.split('\t') | |
140 | + if len(data) != 8: # column count of TAGGED frequency list | |
141 | + print('Skipped line in the list: '+line) | |
142 | + continue | |
143 | + | |
144 | + # The following was added to work on partially done tagged frequency, to get rid of the | |
145 | + # previous COMPOS classification. Otherwise we'd want to use something like this: | |
146 | + # fmt = esccurl(line) + '\t{0}' # simple format string, applicable to raw frequency list | |
147 | + # previous COMPOS column is in data[4], so we skip it below | |
148 | + fmt = esccurl('\t'.join(data[0:4])) + '\t{0}\t' + esccurl('\t'.join(data[5:])) | |
149 | + | |
150 | + rl_class = is_recognizable((data[0], data[1], data[2]), rtable) | |
151 | + if rl_class == '': | |
152 | + print(fmt.format('COMPOS'), file=out) | |
153 | + elif rl_class != False: | |
154 | + print(fmt.format('COMPOS-'+rl_class), file=out) | |
155 | + else: | |
156 | + # Try again, with lowered lemma and word form. | |
157 | + rl_class_low = is_recognizable((data[0].lower(), data[1].lower(), data[2]), | |
158 | + rtable) | |
159 | + if rl_class_low == '': | |
160 | + print(fmt.format('COMPOS-LWR'), file=out) | |
161 | + elif rl_class_low != False: | |
162 | + print(fmt.format('COMPOS-LWR-'+rl_class_low), file=out) | |
163 | + else: | |
164 | + print(fmt.format('NCOMPOS'), file=out) | |
... | ... |
morphology/compare_morphosyn.py
0 → 100644
1 | +# Blame Szymon Rutkowski - szymon@szymonrutkowski.pl - Oct 2016. | |
2 | +# | |
3 | +# Given a frequency list and groundtruth dictionary, tag the entries on the frequency list with some | |
4 | +# automatic tags (can be seen at the end of this file). | |
5 | +# | |
6 | +# Run from Python3, with -i (inspect option), eg. `python3 -i compare_morphosyn.py`. | |
7 | +# Then invoke something like (with # representing Python prompt): | |
8 | +# # sgjp = load_sgjp('../../NLP resources/sgjp-20160724.tab') | |
9 | +# # nkjp = load_nkjp('../resources/NKJP1M/NKJP1M-frequency.tab') | |
10 | +# # notmatching(nkjp, sgjp, liberal_tagcomp, 'raw_tagged_frequency.tab') # (may take a while) | |
11 | +# # ^D # Ctrl-D when done | |
12 | +# The last argument points the result file, liberal_tagcomp is the most sane tag comparing function. | |
13 | + | |
14 | +import functools | |
15 | +import re | |
16 | +import unicodedata | |
17 | + | |
18 | +def load_sgjp(fname): | |
19 | + sgjp = dict() | |
20 | + with open(fname) as inp: | |
21 | + for line in inp: | |
22 | + data = line.strip().split('\t') | |
23 | + | |
24 | + if len(data) < 3: | |
25 | + print('Skipped line: ' + line.strip()) | |
26 | + continue | |
27 | + | |
28 | + word_form = data[0] | |
29 | + | |
30 | + lemma = '' | |
31 | + lemma_sub = '' | |
32 | + if data[1] == ':': | |
33 | + lemma = [':'] | |
34 | + else: | |
35 | + lemma = data[1].split(":")[0] # lemma subidentifier | |
36 | + if len(data[1].split(":")) > 1: | |
37 | + lemma_sub = data[1].split(":")[1] | |
38 | + if word_form.find('_') == -1: | |
39 | + lemma = lemma.replace('_', ' ') | |
40 | + | |
41 | + tags = data[2] | |
42 | + | |
43 | + notes = '' | |
44 | + if len(data) == 4: | |
45 | + notes = data[3] | |
46 | + | |
47 | + if lemma in sgjp: | |
48 | + sgjp[lemma].append([word_form, tags, notes]) | |
49 | + else: | |
50 | + sgjp[lemma] = [ [word_form, tags, notes, lemma_sub] ] | |
51 | + return sgjp | |
52 | + | |
53 | + | |
54 | +def load_nkjp(fname): | |
55 | + nkjp = [] | |
56 | + with open(fname) as inp: | |
57 | + nkjp = inp.read().split('\n') | |
58 | + for (n, line) in enumerate(nkjp): | |
59 | + nkjp[n] = nkjp[n].split('\t') # word_form, lemma, tags, freq | |
60 | + if len(nkjp[n]) != 5: | |
61 | + print('Skipped line: ' + str(n)) | |
62 | + del nkjp[n] | |
63 | + return nkjp | |
64 | + | |
65 | +def naive_tagcomp(tag1, tag2): | |
66 | + return (tag1 == tag2) | |
67 | + | |
68 | +def strict_tagcomp(tag1, tag2): | |
69 | + tag1_items = tag1.split(':') | |
70 | + tag2_items = tag2.split(':') | |
71 | + | |
72 | + if (tag1_items[0] != tag2_items[0] # POS | |
73 | + or len(tag1_items) != len(tag2_items)): | |
74 | + return False | |
75 | + | |
76 | + for (i, item) in enumerate(tag1_items): | |
77 | + if not item in tag2_items[i].split('.'): | |
78 | + return False | |
79 | + | |
80 | + return True | |
81 | + | |
82 | +def liberal_tagcomp(tag1, tag2): | |
83 | + tag1_items = tag1.split(':') | |
84 | + tag2_items = tag2.split(':') | |
85 | + | |
86 | + if (tag1_items[0] != tag2_items[0] # POS | |
87 | + or len(tag1_items) != len(tag2_items)): | |
88 | + return False | |
89 | + | |
90 | + for (i, item) in enumerate(tag1_items): | |
91 | + # remove tags n1, f1... | |
92 | + item = re.sub(r'(n1|n2|n3)', 'n', item) | |
93 | + model = re.sub(r'(n1|n2|n3|p2|p3)', 'n', tag2_items[i]).split('.') | |
94 | + if not item in model and model[0] != '_': # underscore as a catchall | |
95 | + return False | |
96 | + | |
97 | + return True | |
98 | + | |
99 | +def compare_entries(nkjp_entry, sgjp_forms, tagcomp_func): | |
100 | + found = False | |
101 | + case1 = False | |
102 | + case2 = False | |
103 | + case3 = False | |
104 | + for (s, sgjp_form) in enumerate(sgjp_forms): | |
105 | + nkjp_word = nkjp_entry[0] | |
106 | + nkjp_tag = re.sub(r':$', '', nkjp_entry[2]) | |
107 | + if nkjp_tag != nkjp_entry[2]: | |
108 | + print("Corrected tag %s for %s %s" % (nkjp_entry[2], nkjp_entry[0], nkjp_entry[1])) | |
109 | + sgjp_word = sgjp_form[0] | |
110 | + sgjp_tag = sgjp_form[1] | |
111 | + | |
112 | + tag_match = tagcomp_func(nkjp_tag, sgjp_tag) # do it once | |
113 | + | |
114 | + if sgjp_word == nkjp_word and tag_match: # word_nkjp_word & tag | |
115 | + found = True | |
116 | + break | |
117 | + | |
118 | + elif tag_match: # tag okay, try with other letter cases | |
119 | + if len(nkjp_word) > 1 and nkjp_word.lower().capitalize() == nkjp_word: # Aaaa -> aaaa | |
120 | + if sgjp_word == nkjp_word.lower(): | |
121 | + case1 = True | |
122 | + if not case1 and nkjp_word.lower() != nkjp_word: | |
123 | + if sgjp_word == nkjp_word.capitalize(): # AAAA -> Aaaa | |
124 | + case2 = True | |
125 | + elif sgjp_word == nkjp_word.lower(): # AAAA -> aaaa, A -> a | |
126 | + case3 = True | |
127 | + return (found, case1, case2, case3) | |
128 | + | |
129 | +def tab_format(collection, label): | |
130 | + "Convert a collection used by notmatching() function to a string of tabbed entries." | |
131 | + fmt = '' | |
132 | + for etr in collection: | |
133 | + fmt = fmt + '\t'.join(etr)+ '\t' + label + '\n' | |
134 | + #print("formatted for "+label+", "+str(len(fmt)) + " bytes") | |
135 | + return fmt | |
136 | + | |
137 | +def nonalphab(string): | |
138 | + for char in string: | |
139 | + if unicodedata.category(char)[0] == 'L': # 'letter' | |
140 | + return False | |
141 | + return True | |
142 | + | |
143 | +def notmatching(nkjp, sgjp, tagcomp_func, result_file): | |
144 | + notmatching = [] | |
145 | + matching = [] | |
146 | + case1_notmatching = [] # Aaaa -> aaaa | |
147 | + case2_notmatching = [] # AAAA -> Aaaa | |
148 | + case3_notmatching = [] # AAAA -> aaaa, A -> a | |
149 | + lower_matching = [] # matching with form and lemma converted to lowercase | |
150 | + symbols = [] | |
151 | + notmatching_numeric = [] | |
152 | + | |
153 | + for (n, nkjp_entry) in enumerate(nkjp): | |
154 | + | |
155 | + lemma = nkjp_entry[1].strip() | |
156 | + form = nkjp_entry[0].strip() | |
157 | + # Warn about stripped whitespaces. | |
158 | + if lemma != nkjp_entry[1]: | |
159 | + print("Stripped whitespaces in lemma: %s" % nkjp_entry[1]) | |
160 | + if form != nkjp_entry[0]: | |
161 | + print("Stripped whitespaces in form: %s" % nkjp_entry[0]) | |
162 | + | |
163 | + # Abbreviations are automatically classified as symbols. | |
164 | + if nkjp_entry[2][:4] == 'brev': | |
165 | + symbols.append(nkjp_entry) | |
166 | + continue | |
167 | + | |
168 | + sgjp_forms = [] | |
169 | + lowered_lemma = False # indicates if lemma was converted to lowercase | |
170 | + if lemma in sgjp: # lemma matching | |
171 | + sgjp_forms = sgjp[lemma] | |
172 | + else: | |
173 | + if lemma.lower() in sgjp: | |
174 | + lowered_lemma = True | |
175 | + sgjp_forms = sgjp[lemma.lower()] | |
176 | + else: | |
177 | + # Continue when we can't find even lowered lemma in SGJP. | |
178 | + if nonalphab(form) and nonalphab(lemma): | |
179 | + symbols.append(nkjp_entry) | |
180 | + elif re.match(r"^[123456789]", form, flags=re.L) != None: | |
181 | + notmatching_numeric.append(nkjp_entry) | |
182 | + continue | |
183 | + else: | |
184 | + notmatching.append(nkjp_entry) | |
185 | + continue | |
186 | + | |
187 | + # The following is executed only if the lemma (maybe in lowercase) was found in SGJP. | |
188 | + | |
189 | + # Go through the entry if it wasn't found in SGJP | |
190 | + found, case1, case2, case3 = 0, 1, 2, 3 # indices in boolean tuple below | |
191 | + case = compare_entries(nkjp_entry, sgjp_forms, tagcomp_func) | |
192 | + | |
193 | + # one more desperate attempt at lowering the lemma, if nothing was found | |
194 | + if (not lowered_lemma) and not True in case: | |
195 | + if lemma.lower() in sgjp: | |
196 | + sgjp_forms = sgjp[lemma.lower()] | |
197 | + case = compare_entries(nkjp_entry, sgjp_forms, tagcomp_func) | |
198 | + if True in case: | |
199 | + lowered_lemma = True | |
200 | + else: # revert for consistency | |
201 | + sgjp_forms = sgjp[lemma] | |
202 | + | |
203 | + if lowered_lemma and (case[found] or case[case1] or case[case2] or case[case3]): | |
204 | + lower_matching.append(nkjp_entry) | |
205 | + continue | |
206 | + | |
207 | + if case[found]: | |
208 | + matching.append(nkjp_entry) | |
209 | + continue | |
210 | + | |
211 | + if nonalphab(form) and nonalphab(lemma): | |
212 | + symbols.append(nkjp_entry) | |
213 | + continue | |
214 | + if re.match(r"^[123456789]", form, flags=re.L) != None: | |
215 | + notmatching_numeric.append(nkjp_entry) | |
216 | + continue | |
217 | + | |
218 | + if case[case1]: | |
219 | + case1_notmatching.append(nkjp_entry) | |
220 | + continue | |
221 | + if case[case2]: | |
222 | + case2_notmatching.append(nkjp_entry) | |
223 | + continue | |
224 | + if case[case3]: | |
225 | + case3_notmatching.append(nkjp_entry) | |
226 | + continue | |
227 | + | |
228 | + # when everything failed: | |
229 | + notmatching.append(nkjp_entry) | |
230 | + | |
231 | + collections = [nkjp, matching, case1_notmatching, case2_notmatching, case3_notmatching, | |
232 | + lower_matching, symbols, notmatching_numeric, notmatching] | |
233 | + # sort the entries in collections by frequency | |
234 | + collections = list(map((lambda coll: sorted(coll, reverse=True, key=(lambda etr: int(etr[3])))), | |
235 | + collections)) | |
236 | + freqs = list(map(lambda coll: functools.reduce((lambda x, y: x+y), | |
237 | + [int(etr[3]) for etr in coll]), # sum of sets' frequencies | |
238 | + collections)) | |
239 | + descs = ["Total:", | |
240 | + "Found:", | |
241 | + "Found when Aaa -> aaa (lemma):", | |
242 | + "Found when AAA -> Aaa (lemma):", | |
243 | + "Found when AAA -> aaa (lemma):", | |
244 | + "Found when word form and lemma are converted to lowercase:", | |
245 | + "Symbols:", | |
246 | + "Not found, numeric:", | |
247 | + "Not found, other:"] | |
248 | + | |
249 | + for (i, _) in enumerate(collections): | |
250 | + info = (len(collections[i]), 100.0*(len(collections[i])/len(collections[0])), | |
251 | + freqs[i], 100.0*(freqs[i]/freqs[0])) | |
252 | + print((descs[i]+" %d entries (%.2f%%), %d occurences (%.2f%%)") % info) | |
253 | + | |
254 | + # below we skip nkjp, which contains everything | |
255 | + labels = ['SGJP-EXACT\tNCH\tCORR', 'SGJP-LMM-UNCAPITAL\tNCH\tCORR', | |
256 | + 'SGJP-LMM-CAPITAL\tNCH\tCORR', 'SGJP-LMM-LOWER\tNCH\tCORR', | |
257 | + 'SGJP-BTH-LOWER\tNCH\tCORR', 'NON-SGJP\tSYMB\tCORR', | |
258 | + 'NON-SGJP\tLATEK\tCORR', 'NON-SGJP\tCW\tCORR'] | |
259 | + with open(result_file, 'w+') as out: | |
260 | + for (c, coll) in enumerate(collections[1:]): | |
261 | + print(tab_format(coll, labels[c]), file=out) | |
... | ... |
morphology/compos_alt.py
0 → 100644
1 | +# Blame Szymon Rutkowski - szymon@szymonrutkowski.pl - Nov 2016. | |
2 | +# This file is intended to check the (partially tagged) NKJP1M frequency list against list of exce- | |
3 | +# ptions from morphological rules derived from SGJP. | |
4 | +# If you want to use this, review the end of this file (filenames, column structure) and run with python3. | |
5 | + | |
6 | +import re | |
7 | + | |
8 | +# just ripped from compare_morphosyn.py, guess it'll be better to keep those scripts self-contained | |
9 | +# note that liberal_tagcomp is mainly suitable for checking NKJP against SGJP, when checking | |
10 | +# a resource obeying more SJGP'ish tagging convention the strict_tagcomp will be better | |
11 | +def strict_tagcomp(tag1, tag2): | |
12 | + tag1_items = tag1.split(':') | |
13 | + tag2_items = tag2.split(':') | |
14 | + | |
15 | + if (tag1_items[0] != tag2_items[0] # POS | |
16 | + or len(tag1_items) != len(tag2_items)): | |
17 | + return False | |
18 | + | |
19 | + for (i, item) in enumerate(tag1_items): | |
20 | + if not item in tag2_items[i].split('.'): | |
21 | + return False | |
22 | + | |
23 | + return True | |
24 | + | |
25 | +def liberal_tagcomp(tag1, tag2): | |
26 | + tag1_items = tag1.split(':') | |
27 | + tag2_items = tag2.split(':') | |
28 | + | |
29 | + if (tag1_items[0] != tag2_items[0] # POS | |
30 | + or len(tag1_items) != len(tag2_items)): | |
31 | + return False | |
32 | + | |
33 | + for (i, item) in enumerate(tag1_items): | |
34 | + # remove tags n1, f1... | |
35 | + item = re.sub(r'(n1|n2|n3)', 'n', item) | |
36 | + model = re.sub(r'(n1|n2|n3|p2|p3)', 'n', tag2_items[i]).split('.') | |
37 | + if not item in model and model[0] != '_': # underscore as a catchall | |
38 | + return False | |
39 | + | |
40 | + return True | |
41 | + | |
42 | +# the bulk of the following ripped from check_rule_compos.py | |
43 | +def esccurl(string) : | |
44 | + "Escape the curly brackets in the string, for using it with the string formatter." | |
45 | + return string.replace('{', '{{').replace('}', '}}') | |
46 | + | |
47 | +alt_idx = dict() # indexed by data[0] - word form | |
48 | + | |
49 | +with open('../resources/SGJP/alt.tab') as alt_src: | |
50 | + for line in alt_src: | |
51 | + line = line.strip() | |
52 | + data = line.split('\t') | |
53 | + if len(data) != 3: | |
54 | + print('Skipped line in the alt list: '+line) | |
55 | + continue | |
56 | + # handle lemmas with subclassification after colon | |
57 | + if data[1].find(':') != -1 and data[1] != ':': | |
58 | + data[1] = data[1][: data[1].find(':')] | |
59 | + # each entry consists of 0 - list of lemmas, 1 - list of tags | |
60 | + if not data[0] in alt_idx: | |
61 | + alt_idx[data[0]] = [[data[1]], [data[2]]] | |
62 | + else: | |
63 | + alt_idx[data[0]][0].append(data[1]) | |
64 | + alt_idx[data[0]][1].append(data[2]) | |
65 | + | |
66 | +with open('../resources/NKJP1M/NKJP1M-tagged-frequency.tab') as inp: | |
67 | + with open('freq_with_alt.tab', 'w+') as out: | |
68 | + for line in inp: | |
69 | + line = line.strip() | |
70 | + data = line.split('\t') | |
71 | + if len(data) != 8: # column count of TAGGED frequency list | |
72 | + print('Skipped line in the list: '+line) | |
73 | + continue | |
74 | + | |
75 | + # The following was added to work on partially done tagged frequency, to get rid of the | |
76 | + # previous COMPOS classification. Otherwise we'd want to use something like this: | |
77 | + # fmt = esccurl(line) + '\t{0}' # simple format string, applicable to raw frequency list | |
78 | + # previous COMPOS column is in data[4], so we skip it below | |
79 | + fmt = esccurl('\t'.join(data[0:4])) + '\t{0}\t' + esccurl('\t'.join(data[5:])) | |
80 | + | |
81 | + matched = False | |
82 | + if data[0] in alt_idx: | |
83 | + tagcomps = list(map(lambda x: liberal_tagcomp(data[2], x), alt_idx[data[0]][1])) | |
84 | + tagnum = True in tagcomps and tagcomps.index(True) | |
85 | + # (make sure that if lemma is matching, it belongs to the matching tag) | |
86 | + if tagnum != -1 and tagnum != False and alt_idx[data[0]][0][tagnum] == data[1]: | |
87 | + print(fmt.format('COMPOS-ALT'), file=out) | |
88 | + matched = True | |
89 | + # try again with lowering word form and lemma: | |
90 | + if not matched and data[0].lower() in alt_idx: | |
91 | + tagcomps = list(map(lambda x: liberal_tagcomp(data[2], x), # data[2] - tag stays the same | |
92 | + alt_idx[data[0].lower()][1])) | |
93 | + tagnum = True in tagcomps and tagcomps.index(True) | |
94 | + if tagnum != -1 and tagnum != False and alt_idx[data[0].lower()][0][tagnum] == data[1].lower(): | |
95 | + print(fmt.format('COMPOS-LWR-ALT'), file=out) | |
96 | + matched = True | |
97 | + if not matched: | |
98 | + print(line, file=out) | |
... | ... |
morphology/data/alternations.dic
... | ... | @@ -1078,3 +1078,222 @@ |
1078 | 1078 | zetrz star |
1079 | 1079 | źr ziar |
1080 | 1080 | |
1081 | +@obce_funkcjonalnie_twarde_a | |
1082 | + ac ak ac | |
1083 | + aq ak aq | |
1084 | + ay ay ay | |
1085 | + c c c | |
1086 | + c k c | |
1087 | + ch ch ch | |
1088 | + cq k cq | |
1089 | + dok dk dok | |
1090 | + du du du | |
1091 | + dź dź dź | |
1092 | + ec ek ec | |
1093 | + ey ey ey | |
1094 | + gh dz gh | |
1095 | + gh gh gh | |
1096 | + gi gi gi | |
1097 | + gn gn gn | |
1098 | + gy gy gy | |
1099 | + ic ik ic | |
1100 | + kie k kie | |
1101 | + lj lj lj | |
1102 | + ly ly ly | |
1103 | + m m m | |
1104 | + nc nk nc | |
1105 | + oc ok oc | |
1106 | + oy oy oy | |
1107 | + que k que | |
1108 | + ri ri ri | |
1109 | + shu shu shu | |
1110 | + st sti st | |
1111 | + szu szu szu | |
1112 | + ti ti ti | |
1113 | + tu tu tu | |
1114 | + use uz use | |
1115 | + v v v | |
1116 | + x ksi x | |
1117 | + x x x | |
1118 | + z z z | |
1119 | + | |
1120 | +@obce_funkcjonalnie_twarde_e | |
1121 | + ac ak ac | |
1122 | + ac aki ac | |
1123 | + aq ak aq | |
1124 | + aq aki aq | |
1125 | + ay ay ay | |
1126 | + c c c | |
1127 | + c ce c | |
1128 | + co ki c | |
1129 | + cq k cq | |
1130 | + cq ki cq | |
1131 | + cques ki cques | |
1132 | + dok dk dok | |
1133 | + dok dki dok | |
1134 | + dź dź dź | |
1135 | + ec ek ec | |
1136 | + ec eki ec | |
1137 | + ey ey ey | |
1138 | + gh dz gh | |
1139 | + gh gh gh | |
1140 | + gh ghi gh | |
1141 | + gi gi gi | |
1142 | + gn gn gn | |
1143 | + gue gi gue | |
1144 | + gues gi gues | |
1145 | + gy gy gy | |
1146 | + ic iki ic | |
1147 | + je j je | |
1148 | + ke ki ke | |
1149 | + kie k kie | |
1150 | + kie ki kie | |
1151 | + lj lj lj | |
1152 | + ly ly ly | |
1153 | + nc nk nc | |
1154 | + nc nki nc | |
1155 | + oc ok oc | |
1156 | + oc oki oc | |
1157 | + ov ov ov | |
1158 | + oy oy oy | |
1159 | + que k que | |
1160 | + que ki que | |
1161 | + ques ki ques | |
1162 | + ri ri ri | |
1163 | + st sti st | |
1164 | + ti ti ti | |
1165 | + use uz use | |
1166 | + x ksi x | |
1167 | +* ng n ng | |
1168 | + | |
1169 | +@obce_funkcjonalnie_twarde_ie | |
1170 | + ais aisi ais | |
1171 | + bes bi bes | |
1172 | + ce si ce | |
1173 | + ct kci ct | |
1174 | + de dzi de | |
1175 | + dh dz dh | |
1176 | + dh dzi dh | |
1177 | + fe fi fe | |
1178 | + h dz h | |
1179 | + m mi m | |
1180 | + me mi me | |
1181 | + ne ni ne | |
1182 | + nes ni nes | |
1183 | + nh ni nh | |
1184 | + ph fi ph | |
1185 | + phe fi phe | |
1186 | + re rz re | |
1187 | + res rz res | |
1188 | + rh rz rh | |
1189 | + rs rz rs | |
1190 | + se si se | |
1191 | + sne śni sne | |
1192 | + ste ści ste | |
1193 | + stes ści stes | |
1194 | + te ci te | |
1195 | + tes ci tes | |
1196 | + th ci th | |
1197 | + th si th | |
1198 | + the ci the | |
1199 | + thes ci thes | |
1200 | + use uzi use | |
1201 | + v vi v | |
1202 | + ve vi ve | |
1203 | + x ksi x | |
1204 | +* ng ni ng | |
1205 | + | |
1206 | +@obce_funkcjonalnie_twarde_iy | |
1207 | + ai ai ai | |
1208 | + bee bee bee | |
1209 | + co ki c | |
1210 | + cques ki cques | |
1211 | + dhi dhi dhi | |
1212 | + die die die | |
1213 | + dy dy dy | |
1214 | + dí dí dí | |
1215 | + eu eu eu | |
1216 | + ghi ghi ghi | |
1217 | + gie gie gie | |
1218 | + gue gi gue | |
1219 | + gues gi gues | |
1220 | + ji ji ji | |
1221 | + ke ki ke | |
1222 | + kie kie kie | |
1223 | + ky c ky | |
1224 | + ky ki ky | |
1225 | + ky ky ky | |
1226 | + lye lye lye | |
1227 | + nii ni nii | |
1228 | + nii nii nii | |
1229 | + nyi nyi nyi | |
1230 | + pi pi pi | |
1231 | + pie pie pie | |
1232 | + que ki que | |
1233 | + ques ki ques | |
1234 | + re ry re | |
1235 | + rii ri rii | |
1236 | + rii rii rii | |
1237 | + ssy ssy ssy | |
1238 | + szky scy szky | |
1239 | + szky ski szky | |
1240 | + szky szky szky | |
1241 | + thy thy thy | |
1242 | + tie tie tie | |
1243 | + zo zi zo | |
1244 | + ři ři ři | |
1245 | +* ng ngy ng | |
1246 | + | |
1247 | +@obce_funkcjonalnie_miekkie_ii_wyglos | |
1248 | + ay ai ay | |
1249 | + ci cyj ci | |
1250 | + czi czyj czi | |
1251 | + oy oi oy | |
1252 | +* ay ai ay | |
1253 | +* oy oi oy | |
1254 | + | |
1255 | +@obce_ais | |
1256 | + ais ais ais | |
1257 | + | |
1258 | +@obce_apostrof | |
1259 | + bes bes bes | |
1260 | + by by by | |
1261 | + ce ce ce | |
1262 | + cy cy cy | |
1263 | + de de de | |
1264 | + dy dy dy | |
1265 | + es es es | |
1266 | + fe fe fe | |
1267 | + ge ge ge | |
1268 | + ges ges ges | |
1269 | + gues gues gues | |
1270 | + ke ke ke | |
1271 | + kes kes kes | |
1272 | + ky ky ky | |
1273 | + le le le | |
1274 | + les les les | |
1275 | + ly ly ly | |
1276 | + ly ly ly | |
1277 | + me me me | |
1278 | + my my my | |
1279 | + ne ne ne | |
1280 | + nes nes nes | |
1281 | + pe pe pe | |
1282 | + phe phe phe | |
1283 | + phy phy phy | |
1284 | + ques ques ques | |
1285 | + re re re | |
1286 | + res res res | |
1287 | + ry ry ry | |
1288 | + se se se | |
1289 | + ses ses ses | |
1290 | + sy sy sy | |
1291 | + te te te | |
1292 | + tes tes tes | |
1293 | + the the the | |
1294 | + thes thes thes | |
1295 | + thy thy thy | |
1296 | + uy uy uy | |
1297 | + ve ve ve | |
1298 | + ze ze ze | |
1299 | +* es e es | |
... | ... |
morphology/data/interps_general.tab
0 → 100644
1 | +adj-sup adj:sg:nom.voc:n1.n2:sup Ca | |
2 | +adj-sup adj:sg:nom.voc:m1.m2.m3:sup Cb | |
3 | +adj-sup adj:sg:nom.voc:f:sup Cc | |
4 | +adj-sup adj:sg:loc:m1.m2.m3.n1.n2:sup Cd | |
5 | +adj-sup adj:sg:loc:f:sup Ce | |
6 | +adj-sup adj:sg:inst:m1.m2.m3.n1.n2:sup Cf | |
7 | +adj-sup adj:sg:inst:f:sup Cg | |
8 | +adj-sup adj:sg:gen:m1.m2.m3.n1.n2:sup Ch | |
9 | +adj-sup adj:sg:gen:f:sup Ci | |
10 | +adj-sup adj:sg:dat:m1.m2.m3.n1.n2:sup Cj | |
11 | +adj-sup adj:sg:dat:f:sup Ck | |
12 | +adj-sup adj:sg:acc:n1.n2:sup Cl | |
13 | +adj-sup adj:sg:acc:m3:sup Cm | |
14 | +adj-sup adj:sg:acc:m1.m2:sup Cn | |
15 | +adj-sup adj:sg:acc:f:sup Co | |
16 | +adj-sup adj:pl:nom.voc:m2.m3.f.n1.n2.p2.p3:sup Cp | |
17 | +adj-sup adj:pl:nom.voc:m1.p1:sup Cq | |
18 | +adj-sup adj:pl:loc:m1.m2.m3.f.n1.n2.p1.p2.p3:sup Cr | |
19 | +adj-sup adj:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:sup Cs | |
20 | +adj-sup adj:pl:gen:m1.m2.m3.f.n1.n2.p1.p2.p3:sup Ct | |
21 | +adj-sup adj:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:sup Cu | |
22 | +adj-sup adj:pl:acc:m2.m3.f.n1.n2.p2.p3:sup Cv | |
23 | +adj-sup adj:pl:acc:m1.p1:sup Cw | |
24 | +verb-neg ppas:sg:nom.voc:m1.m2.m3:perf:neg Ua | |
25 | +verb-neg ppas:sg:nom.voc:m1.m2.m3:imperf:neg Ua | |
26 | +verb-neg ppas:sg:nom.voc:m1.m2.m3:imperf.perf:neg Ua | |
27 | +verb-neg ppas:sg:nom.voc:f:perf:neg Ub | |
28 | +verb-neg ppas:sg:nom.voc:f:imperf:neg Ub | |
29 | +verb-neg ppas:sg:nom.voc:f:imperf.perf:neg Ub | |
30 | +verb-neg ppas:sg:nom.acc.voc:n1.n2:perf:neg Uc | |
31 | +verb-neg ppas:sg:nom.acc.voc:n1.n2:imperf:neg Uc | |
32 | +verb-neg ppas:sg:nom.acc.voc:n1.n2:imperf.perf:neg Uc | |
33 | +verb-neg ppas:sg:inst.loc:m1.m2.m3.n1.n2:perf:neg Ud | |
34 | +verb-neg ppas:sg:inst.loc:m1.m2.m3.n1.n2:imperf:neg Ud | |
35 | +verb-neg ppas:sg:inst.loc:m1.m2.m3.n1.n2:imperf.perf:neg Ud | |
36 | +verb-neg ppas:sg:gen:m1.m2.m3.n1.n2:perf:neg Ue | |
37 | +verb-neg ppas:sg:gen:m1.m2.m3.n1.n2:imperf:neg Ue | |
38 | +verb-neg ppas:sg:gen:m1.m2.m3.n1.n2:imperf.perf:neg Ue | |
39 | +verb-neg ppas:sg:gen.dat.loc:f:perf:neg Uf | |
40 | +verb-neg ppas:sg:gen.dat.loc:f:imperf:neg Uf | |
41 | +verb-neg ppas:sg:gen.dat.loc:f:imperf.perf:neg Uf | |
42 | +verb-neg ppas:sg:dat:m1.m2.m3.n1.n2:perf:neg Ug | |
43 | +verb-neg ppas:sg:dat:m1.m2.m3.n1.n2:imperf:neg Ug | |
44 | +verb-neg ppas:sg:dat:m1.m2.m3.n1.n2:imperf.perf:neg Ug | |
45 | +verb-neg ppas:sg:acc:m3:perf:neg Uh | |
46 | +verb-neg ppas:sg:acc:m3:imperf:neg Uh | |
47 | +verb-neg ppas:sg:acc:m3:imperf.perf:neg Uh | |
48 | +verb-neg ppas:sg:acc:m1.m2:perf:neg Ui | |
49 | +verb-neg ppas:sg:acc:m1.m2:imperf:neg Ui | |
50 | +verb-neg ppas:sg:acc:m1.m2:imperf.perf:neg Ui | |
51 | +verb-neg ppas:sg:acc.inst:f:perf:neg Uj | |
52 | +verb-neg ppas:sg:acc.inst:f:imperf:neg Uj | |
53 | +verb-neg ppas:sg:acc.inst:f:imperf.perf:neg Uj | |
54 | +verb-neg ppas:pl:nom.voc:m1.p1:perf:neg Uk | |
55 | +verb-neg ppas:pl:nom.voc:m1.p1:imperf:neg Uk | |
56 | +verb-neg ppas:pl:nom.voc:m1.p1:imperf.perf:neg Uk | |
57 | +verb-neg ppas:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:perf:neg Ul | |
58 | +verb-neg ppas:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf:neg Ul | |
59 | +verb-neg ppas:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf.perf:neg Ul | |
60 | +verb-neg ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:perf:neg Um | |
61 | +verb-neg ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg Um | |
62 | +verb-neg ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg Um | |
63 | +verb-neg ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:perf:neg Un | |
64 | +verb-neg ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg Un | |
65 | +verb-neg ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg Un | |
66 | +verb-neg ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:perf:neg Uo | |
67 | +verb-neg ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg Uo | |
68 | +verb-neg ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg Uo | |
69 | +verb-neg ppas:pl:acc:m1.p1:perf:neg Up | |
70 | +verb-neg ppas:pl:acc:m1.p1:imperf:neg Up | |
71 | +verb-neg ppas:pl:acc:m1.p1:imperf.perf:neg Up | |
72 | +verb-neg pact:sg:nom.voc:m1.m2.m3:imperf:neg Va | |
73 | +verb-neg pact:sg:nom.voc:m1.m2.m3:imperf.perf:neg Va | |
74 | +verb-neg pact:sg:nom.voc:f:imperf:neg Vb | |
75 | +verb-neg pact:sg:nom.voc:f:imperf.perf:neg Vb | |
76 | +verb-neg pact:sg:nom.acc.voc:n1.n2:imperf:neg Vc | |
77 | +verb-neg pact:sg:nom.acc.voc:n1.n2:imperf.perf:neg Vc | |
78 | +verb-neg pact:sg:inst.loc:m1.m2.m3.n1.n2:imperf:neg Vd | |
79 | +verb-neg pact:sg:inst.loc:m1.m2.m3.n1.n2:imperf.perf:neg Vd | |
80 | +verb-neg pact:sg:gen:m1.m2.m3.n1.n2:imperf:neg Ve | |
81 | +verb-neg pact:sg:gen:m1.m2.m3.n1.n2:imperf.perf:neg Ve | |
82 | +verb-neg pact:sg:gen.dat.loc:f:imperf:neg Vf | |
83 | +verb-neg pact:sg:gen.dat.loc:f:imperf.perf:neg Vf | |
84 | +verb-neg pact:sg:dat:m1.m2.m3.n1.n2:imperf:neg Vg | |
85 | +verb-neg pact:sg:dat:m1.m2.m3.n1.n2:imperf.perf:neg Vg | |
86 | +verb-neg pact:sg:acc:m3:imperf:neg Vh | |
87 | +verb-neg pact:sg:acc:m3:imperf.perf:neg Vh | |
88 | +verb-neg pact:sg:acc:m1.m2:imperf:neg Vi | |
89 | +verb-neg pact:sg:acc:m1.m2:imperf.perf:neg Vi | |
90 | +verb-neg pact:sg:acc.inst:f:imperf:neg Vj | |
91 | +verb-neg pact:sg:acc.inst:f:imperf.perf:neg Vj | |
92 | +verb-neg pact:pl:nom.voc:m1.p1:imperf:neg Vk | |
93 | +verb-neg pact:pl:nom.voc:m1.p1:imperf.perf:neg Vk | |
94 | +verb-neg pact:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf:neg Vl | |
95 | +verb-neg pact:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf.perf:neg Vl | |
96 | +verb-neg pact:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg Vm | |
97 | +verb-neg pact:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg Vm | |
98 | +verb-neg pact:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg Vn | |
99 | +verb-neg pact:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg Vn | |
100 | +verb-neg pact:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg Vo | |
101 | +verb-neg pact:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg Vo | |
102 | +verb-neg pact:pl:acc:m1.p1:imperf:neg Vp | |
103 | +verb-neg pact:pl:acc:m1.p1:imperf.perf:neg Vp | |
104 | +verb-neg ger:sg:nom.acc:n2:perf:neg Ta | |
105 | +verb-neg ger:sg:nom.acc:n2:imperf:neg Ta | |
106 | +verb-neg ger:sg:nom.acc:n2:imperf.perf:neg Ta | |
107 | +verb-neg ger:sg:inst:n2:perf:neg Tb | |
108 | +verb-neg ger:sg:inst:n2:imperf:neg Tb | |
109 | +verb-neg ger:sg:inst:n2:imperf.perf:neg Tb | |
110 | +verb-neg ger:sg:gen:n2:perf:neg Tc | |
111 | +verb-neg ger:sg:gen:n2:imperf:neg Tc | |
112 | +verb-neg ger:sg:gen:n2:imperf.perf:neg Tc | |
113 | +verb-neg ger:sg:dat.loc:n2:perf:neg Td | |
114 | +verb-neg ger:sg:dat.loc:n2:imperf:neg Td | |
115 | +verb-neg ger:sg:dat.loc:n2:imperf.perf:neg Td | |
116 | +verb-neg ger:pl:nom.acc:n2:perf:neg Te | |
117 | +verb-neg ger:pl:nom.acc:n2:imperf:neg Te | |
118 | +verb-neg ger:pl:nom.acc:n2:imperf.perf:neg Te | |
119 | +verb-neg ger:pl:loc:n2:perf:neg Tf | |
120 | +verb-neg ger:pl:loc:n2:imperf:neg Tf | |
121 | +verb-neg ger:pl:loc:n2:imperf.perf:neg Tf | |
122 | +verb-neg ger:pl:inst:n2:perf:neg Tg | |
123 | +verb-neg ger:pl:inst:n2:imperf:neg Tg | |
124 | +verb-neg ger:pl:inst:n2:imperf.perf:neg Tg | |
125 | +verb-neg ger:pl:gen:n2:perf:neg Th | |
126 | +verb-neg ger:pl:gen:n2:imperf:neg Th | |
127 | +verb-neg ger:pl:gen:n2:imperf.perf:neg Th | |
128 | +verb-neg ger:pl:dat:n2:perf:neg Ti | |
129 | +verb-neg ger:pl:dat:n2:imperf:neg Ti | |
130 | +verb-neg ger:pl:dat:n2:imperf.perf:neg Ti | |
131 | +other winien:sg:n1.n2:ter:imperf W | |
132 | +other winien:sg:n1.n2:sec:imperf W | |
133 | +other winien:sg:n1.n2:pri:imperf W | |
134 | +other winien:sg:n1.n2:imperf W | |
135 | +other winien:sg:m1.m2.m3:ter:imperf W | |
136 | +other winien:sg:m1.m2.m3:sec:imperf W | |
137 | +other winien:sg:m1.m2.m3:pri:imperf W | |
138 | +other winien:sg:m1.m2.m3:imperf W | |
139 | +other winien:sg:f:ter:imperf W | |
140 | +other winien:sg:f:sec:imperf W | |
141 | +other winien:sg:f:pri:imperf W | |
142 | +other winien:sg:f:imperf W | |
143 | +other winien:pl:m2.m3.f.n1.n2.p2.p3:ter:imperf W | |
144 | +other winien:pl:m2.m3.f.n1.n2.p2.p3:sec:imperf W | |
145 | +other winien:pl:m2.m3.f.n1.n2.p2.p3:imperf W | |
146 | +other winien:pl:m1.p1:ter:imperf W | |
147 | +other winien:pl:m1.p1:sec:imperf W | |
148 | +other winien:pl:m1.p1:pri:imperf W | |
149 | +other winien:pl:m1.p1:imperf W | |
150 | +noun subst:sg:voc:n2 Gva | |
151 | +noun subst:sg:voc:n1 Gvb | |
152 | +noun subst:sg:voc:m3 Gvc | |
153 | +noun subst:sg:voc:m2 Gvd | |
154 | +noun subst:sg:voc:m1 Gve | |
155 | +noun subst:sg:voc:f Gvf | |
156 | +noun subst:sg:nom:n2 Gna | |
157 | +noun subst:sg:nom:n1 Gnb | |
158 | +noun subst:sg:nom:m3 Gnc | |
159 | +noun subst:sg:nom:m2 Gnd | |
160 | +noun subst:sg:nom:m1 Gne | |
161 | +noun subst:sg:nom:f Gnf | |
162 | +noun subst:sg:loc:n2 Gla | |
163 | +noun subst:sg:loc:n1 Glb | |
164 | +noun subst:sg:loc:m3 Glc | |
165 | +noun subst:sg:loc:m2 Gld | |
166 | +noun subst:sg:loc:m1 Gle | |
167 | +noun subst:sg:loc:f Glf | |
168 | +noun subst:sg:inst:n2 Gia | |
169 | +noun subst:sg:inst:n1 Gib | |
170 | +noun subst:sg:inst:m3 Gic | |
171 | +noun subst:sg:inst:m2 Gid | |
172 | +noun subst:sg:inst:m1 Gie | |
173 | +noun subst:sg:inst:f Gif | |
174 | +noun subst:sg:gen:n2 Gga | |
175 | +noun subst:sg:gen:n1 Ggb | |
176 | +noun subst:sg:gen:m3 Ggc | |
177 | +noun subst:sg:gen:m2 Ggd | |
178 | +noun subst:sg:gen:m1 Gge | |
179 | +noun subst:sg:gen:f Ggf | |
180 | +noun subst:sg:dat:n2 Gda | |
181 | +noun subst:sg:dat:n1 Gdb | |
182 | +noun subst:sg:dat:m3 Gdc | |
183 | +noun subst:sg:dat:m2 Gdd | |
184 | +noun subst:sg:dat:m1 Gde | |
185 | +noun subst:sg:dat:f Gdf | |
186 | +noun subst:sg:acc:n2 Gaa | |
187 | +noun subst:sg:acc:n1 Gab | |
188 | +noun subst:sg:acc:m3 Gac | |
189 | +noun subst:sg:acc:m2 Gad | |
190 | +noun subst:sg:acc:m1 Gae | |
191 | +noun subst:sg:acc:f Gaf | |
192 | +noun subst:pl:voc:p3 Yvp | |
193 | +noun subst:pl:voc:p2 Yvq | |
194 | +noun subst:pl:voc:p1 Yvr | |
195 | +noun subst:pl:voc:n2 Yva | |
196 | +noun subst:pl:voc:n1 Yvb | |
197 | +noun subst:pl:voc:m3 Yvc | |
198 | +noun subst:pl:voc:m2 Yvd | |
199 | +noun subst:pl:voc:m1 Yve | |
200 | +noun subst:pl:voc:f Yvf | |
201 | +noun subst:pl:nom:p3 Ynp | |
202 | +noun subst:pl:nom:p2 Ynq | |
203 | +noun subst:pl:nom:p1 Ynr | |
204 | +noun subst:pl:nom:n2 Yna | |
205 | +noun subst:pl:nom:n1 Ynb | |
206 | +noun subst:pl:nom:m3 Ync | |
207 | +noun subst:pl:nom:m2 Ynd | |
208 | +noun subst:pl:nom:m1 Yne | |
209 | +noun subst:pl:nom:f Ynf | |
210 | +noun subst:pl:loc:p3 Ylp | |
211 | +noun subst:pl:loc:p2 Ylq | |
212 | +noun subst:pl:loc:p1 Ylr | |
213 | +noun subst:pl:loc:n2 Yla | |
214 | +noun subst:pl:loc:n1 Ylb | |
215 | +noun subst:pl:loc:m3 Ylc | |
216 | +noun subst:pl:loc:m2 Yld | |
217 | +noun subst:pl:loc:m1 Yle | |
218 | +noun subst:pl:loc:f Ylf | |
219 | +noun subst:pl:inst:p3 Yip | |
220 | +noun subst:pl:inst:p2 Yiq | |
221 | +noun subst:pl:inst:p1 Yir | |
222 | +noun subst:pl:inst:n2 Yia | |
223 | +noun subst:pl:inst:n1 Yib | |
224 | +noun subst:pl:inst:m3 Yic | |
225 | +noun subst:pl:inst:m2 Yid | |
226 | +noun subst:pl:inst:m1 Yie | |
227 | +noun subst:pl:inst:f Yif | |
228 | +noun subst:pl:gen:p3 Ygp | |
229 | +noun subst:pl:gen:p2 Ygq | |
230 | +noun subst:pl:gen:p1 Ygr | |
231 | +noun subst:pl:gen:n2 Yga | |
232 | +noun subst:pl:gen:n1 Ygb | |
233 | +noun subst:pl:gen:m3 Ygc | |
234 | +noun subst:pl:gen:m2 Ygd | |
235 | +noun subst:pl:gen:m1 Yge | |
236 | +noun subst:pl:gen:f Ygf | |
237 | +noun subst:pl:dat:p3 Ydp | |
238 | +noun subst:pl:dat:p2 Ydq | |
239 | +noun subst:pl:dat:p1 Ydr | |
240 | +noun subst:pl:dat:n2 Yda | |
241 | +noun subst:pl:dat:n1 Ydb | |
242 | +noun subst:pl:dat:m3 Ydc | |
243 | +noun subst:pl:dat:m2 Ydd | |
244 | +noun subst:pl:dat:m1 Yde | |
245 | +noun subst:pl:dat:f Ydf | |
246 | +noun subst:pl:acc:p3 Yap | |
247 | +noun subst:pl:acc:p2 Yaq | |
248 | +noun subst:pl:acc:p1 Yar | |
249 | +noun subst:pl:acc:n2 Yaa | |
250 | +noun subst:pl:acc:n1 Yab | |
251 | +noun subst:pl:acc:m3 Yac | |
252 | +noun subst:pl:acc:m2 Yad | |
253 | +noun subst:pl:acc:m1 Yae | |
254 | +noun subst:pl:acc:f Yaf | |
255 | +other qub W | |
256 | +other prep:nom W | |
257 | +other prep:loc:wok W | |
258 | +other prep:loc:nwok W | |
259 | +other prep:loc W | |
260 | +other prep:inst:wok W | |
261 | +other prep:inst:nwok W | |
262 | +other prep:inst W | |
263 | +other prep:gen:wok W | |
264 | +other prep:gen:nwok W | |
265 | +other prep:gen W | |
266 | +other prep:dat W | |
267 | +other prep:acc:wok W | |
268 | +other prep:acc:nwok W | |
269 | +other prep:acc W | |
270 | +other pred W | |
271 | +verb praet:sg:n1.n2:ter:perf Ja | |
272 | +verb praet:sg:n1.n2:ter:imperf.perf Ja | |
273 | +verb praet:sg:n1.n2:ter:imperf Ja | |
274 | +verb praet:sg:n1.n2:sec:perf Jb | |
275 | +verb praet:sg:n1.n2:sec:imperf.perf Jb | |
276 | +verb praet:sg:n1.n2:sec:imperf Jb | |
277 | +verb praet:sg:n1.n2:pri:perf Jc | |
278 | +verb praet:sg:n1.n2:pri:imperf.perf Jc | |
279 | +verb praet:sg:n1.n2:pri:imperf Jc | |
280 | +verb praet:sg:n1.n2:perf Jd | |
281 | +verb praet:sg:n1.n2:imperf.perf Jd | |
282 | +verb praet:sg:n1.n2:imperf Jd | |
283 | +verb praet:sg:m1.m2.m3:ter:perf Je | |
284 | +verb praet:sg:m1.m2.m3:ter:imperf.perf Je | |
285 | +verb praet:sg:m1.m2.m3:ter:imperf Je | |
286 | +verb praet:sg:m1.m2.m3:sec:perf Jf | |
287 | +verb praet:sg:m1.m2.m3:sec:imperf.perf Jf | |
288 | +verb praet:sg:m1.m2.m3:sec:imperf Jf | |
289 | +verb praet:sg:m1.m2.m3:pri:perf Jg | |
290 | +verb praet:sg:m1.m2.m3:pri:imperf.perf Jg | |
291 | +verb praet:sg:m1.m2.m3:pri:imperf Jg | |
292 | +verb praet:sg:m1.m2.m3:perf:nagl.agl Jh | |
293 | +verb praet:sg:m1.m2.m3:imperf:nagl.agl Jh | |
294 | +verb praet:sg:m1.m2.m3:imperf.perf Jh | |
295 | +verb praet:sg:f:ter:perf Ji | |
296 | +verb praet:sg:f:ter:imperf Ji | |
297 | +verb praet:sg:f:sec:perf Jj | |
298 | +verb praet:sg:f:sec:imperf Jj | |
299 | +verb praet:sg:f:pri:perf Jk | |
300 | +verb praet:sg:f:pri:imperf Jk | |
301 | +verb praet:sg:f:perf Jl | |
302 | +verb praet:sg:f:imperf Jl | |
303 | +verb praet:pl:m2.m3.f.n1.n2.p2.p3:ter:perf Jm | |
304 | +verb praet:pl:m2.m3.f.n1.n2.p2.p3:ter:imperf.perf Jm | |
305 | +verb praet:pl:m2.m3.f.n1.n2.p2.p3:ter:imperf Jm | |
306 | +verb praet:pl:m2.m3.f.n1.n2.p2.p3:sec:perf Jn | |
307 | +verb praet:pl:m2.m3.f.n1.n2.p2.p3:sec:imperf.perf Jn | |
308 | +verb praet:pl:m2.m3.f.n1.n2.p2.p3:sec:imperf Jn | |
309 | +verb praet:pl:m2.m3.f.n1.n2.p2.p3:pri:perf Jo | |
310 | +verb praet:pl:m2.m3.f.n1.n2.p2.p3:pri:imperf.perf Jo | |
311 | +verb praet:pl:m2.m3.f.n1.n2.p2.p3:pri:imperf Jo | |
312 | +verb praet:pl:m2.m3.f.n1.n2.p2.p3:perf Jp | |
313 | +verb praet:pl:m2.m3.f.n1.n2.p2.p3:imperf.perf Jp | |
314 | +verb praet:pl:m2.m3.f.n1.n2.p2.p3:imperf Jp | |
315 | +verb praet:pl:m1.p1:ter:perf Jq | |
316 | +verb praet:pl:m1.p1:ter:imperf Jq | |
317 | +verb praet:pl:m1.p1:sec:perf Jr | |
318 | +verb praet:pl:m1.p1:sec:imperf Jr | |
319 | +verb praet:pl:m1.p1:pri:perf Js | |
320 | +verb praet:pl:m1.p1:pri:imperf Js | |
321 | +verb praet:pl:m1.p1:perf Jt | |
322 | +verb praet:pl:m1.p1:imperf Jt | |
323 | +other ppron3:sg:nom:n1.n2:ter:akc.nakc:praep.npraep W | |
324 | +other ppron3:sg:nom:m1.m2.m3:ter:akc.nakc:praep.npraep W | |
325 | +other ppron3:sg:nom:f:ter:akc.nakc:praep.npraep W | |
326 | +other ppron3:sg:loc:n1.n2:ter:akc.nakc:praep.npraep W | |
327 | +other ppron3:sg:loc:m1.m2.m3:ter:akc.nakc:praep.npraep W | |
328 | +other ppron3:sg:loc:f:ter:akc.nakc:praep.npraep W | |
329 | +other ppron3:sg:inst:n1.n2:ter:akc.nakc:praep.npraep W | |
330 | +other ppron3:sg:inst:m1.m2.m3:ter:akc.nakc:praep.npraep W | |
331 | +other ppron3:sg:inst:f:ter:akc.nakc:praep.npraep W | |
332 | +other ppron3:sg:gen:n1.n2:ter:nakc:npraep W | |
333 | +other ppron3:sg:gen:n1.n2:ter:akc:npraep W | |
334 | +other ppron3:sg:gen:n1.n2:ter:akc.nakc:praep W | |
335 | +other ppron3:sg:gen:m1.m2.m3:ter:nakc:praep W | |
336 | +other ppron3:sg:gen:m1.m2.m3:ter:nakc:npraep W | |
337 | +other ppron3:sg:gen:m1.m2.m3:ter:akc:praep W | |
338 | +other ppron3:sg:gen:m1.m2.m3:ter:akc:npraep W | |
339 | +other ppron3:sg:gen:f:ter:akc.nakc:praep W | |
340 | +other ppron3:sg:gen:f:ter:akc.nakc:npraep W | |
341 | +other ppron3:sg:dat:n1.n2:ter:nakc:npraep W | |
342 | +other ppron3:sg:dat:n1.n2:ter:akc:npraep W | |
343 | +other ppron3:sg:dat:n1.n2:ter:akc.nakc:praep W | |
344 | +other ppron3:sg:dat:m1.m2.m3:ter:nakc:npraep W | |
345 | +other ppron3:sg:dat:m1.m2.m3:ter:akc:npraep W | |
346 | +other ppron3:sg:dat:m1.m2.m3:ter:akc.nakc:praep W | |
347 | +other ppron3:sg:dat:f:ter:akc.nakc:praep W | |
348 | +other ppron3:sg:dat:f:ter:akc.nakc:npraep W | |
349 | +other ppron3:sg:acc:n1.n2:ter:akc.nakc:praep W | |
350 | +other ppron3:sg:acc:n1.n2:ter:akc.nakc:npraep W | |
351 | +other ppron3:sg:acc:m1.m2.m3:ter:nakc:praep W | |
352 | +other ppron3:sg:acc:m1.m2.m3:ter:nakc:npraep W | |
353 | +other ppron3:sg:acc:m1.m2.m3:ter:akc:praep W | |
354 | +other ppron3:sg:acc:m1.m2.m3:ter:akc:npraep W | |
355 | +other ppron3:sg:acc:f:ter:akc.nakc:praep W | |
356 | +other ppron3:sg:acc:f:ter:akc.nakc:npraep W | |
357 | +other ppron3:pl:nom:m2.m3.f.n1.n2.p2.p3:ter:akc.nakc:praep.npraep W | |
358 | +other ppron3:pl:nom:m1.p1:ter:akc.nakc:praep.npraep W | |
359 | +other ppron3:pl:loc:_:ter:akc.nakc:praep.npraep W | |
360 | +other ppron3:pl:inst:_:ter:akc.nakc:praep.npraep W | |
361 | +other ppron3:pl:gen:_:ter:akc.nakc:praep W | |
362 | +other ppron3:pl:gen:_:ter:akc.nakc:npraep W | |
363 | +other ppron3:pl:dat:_:ter:akc.nakc:praep W | |
364 | +other ppron3:pl:dat:_:ter:akc.nakc:npraep W | |
365 | +other ppron3:pl:acc:m2.m3.f.n1.n2.p2.p3:ter:akc.nakc:praep W | |
366 | +other ppron3:pl:acc:m2.m3.f.n1.n2.p2.p3:ter:akc.nakc:npraep W | |
367 | +other ppron3:pl:acc:m1.p1:ter:akc.nakc:praep W | |
368 | +other ppron3:pl:acc:m1.p1:ter:akc.nakc:npraep W | |
369 | +other ppron12:sg:voc:m1.m2.m3.f.n1.n2:sec W | |
370 | +other ppron12:sg:voc:m1.m2.m3.f.n1.n2:pri W | |
371 | +other ppron12:sg:nom:m1.m2.m3.f.n1.n2:sec W | |
372 | +other ppron12:sg:nom:m1.m2.m3.f.n1.n2:pri W | |
373 | +other ppron12:sg:loc:m1.m2.m3.f.n1.n2:sec W | |
374 | +other ppron12:sg:loc:m1.m2.m3.f.n1.n2:pri W | |
375 | +other ppron12:sg:inst:m1.m2.m3.f.n1.n2:sec W | |
376 | +other ppron12:sg:inst:m1.m2.m3.f.n1.n2:pri W | |
377 | +other ppron12:sg:gen:m1.m2.m3.f.n1.n2:sec:nakc W | |
378 | +other ppron12:sg:gen:m1.m2.m3.f.n1.n2:sec:akc W | |
379 | +other ppron12:sg:gen:m1.m2.m3.f.n1.n2:pri:nakc W | |
380 | +other ppron12:sg:gen:m1.m2.m3.f.n1.n2:pri:akc W | |
381 | +other ppron12:sg:dat:m1.m2.m3.f.n1.n2:sec:nakc W | |
382 | +other ppron12:sg:dat:m1.m2.m3.f.n1.n2:sec:akc W | |
383 | +other ppron12:sg:dat:m1.m2.m3.f.n1.n2:pri:nakc W | |
384 | +other ppron12:sg:dat:m1.m2.m3.f.n1.n2:pri:akc W | |
385 | +other ppron12:sg:acc:m1.m2.m3.f.n1.n2:sec:nakc W | |
386 | +other ppron12:sg:acc:m1.m2.m3.f.n1.n2:sec:akc W | |
387 | +other ppron12:sg:acc:m1.m2.m3.f.n1.n2:pri:nakc W | |
388 | +other ppron12:sg:acc:m1.m2.m3.f.n1.n2:pri:akc W | |
389 | +other ppron12:pl:voc:_:sec W | |
390 | +other ppron12:pl:voc:_:pri W | |
391 | +other ppron12:pl:nom:_:sec W | |
392 | +other ppron12:pl:nom:_:pri W | |
393 | +other ppron12:pl:loc:_:sec W | |
394 | +other ppron12:pl:loc:_:pri W | |
395 | +other ppron12:pl:inst:_:sec W | |
396 | +other ppron12:pl:inst:_:pri W | |
397 | +other ppron12:pl:gen:_:sec W | |
398 | +other ppron12:pl:gen:_:pri W | |
399 | +other ppron12:pl:dat:_:sec W | |
400 | +other ppron12:pl:dat:_:pri W | |
401 | +other ppron12:pl:acc:_:sec W | |
402 | +other ppron12:pl:acc:_:pri W | |
403 | +verb ppas:sg:nom.voc:m1.m2.m3:perf:aff Ra | |
404 | +verb ppas:sg:nom.voc:m1.m2.m3:imperf:aff Ra | |
405 | +verb ppas:sg:nom.voc:m1.m2.m3:imperf.perf:aff Ra | |
406 | +verb ppas:sg:nom.voc:f:perf:aff Rb | |
407 | +verb ppas:sg:nom.voc:f:imperf:aff Rb | |
408 | +verb ppas:sg:nom.voc:f:imperf.perf:aff Rb | |
409 | +verb ppas:sg:nom.acc.voc:n1.n2:perf:aff Rc | |
410 | +verb ppas:sg:nom.acc.voc:n1.n2:imperf:aff Rc | |
411 | +verb ppas:sg:nom.acc.voc:n1.n2:imperf.perf:aff Rc | |
412 | +verb ppas:sg:inst.loc:m1.m2.m3.n1.n2:perf:aff Rd | |
413 | +verb ppas:sg:inst.loc:m1.m2.m3.n1.n2:imperf:aff Rd | |
414 | +verb ppas:sg:inst.loc:m1.m2.m3.n1.n2:imperf.perf:aff Rd | |
415 | +verb ppas:sg:gen:m1.m2.m3.n1.n2:perf:aff Re | |
416 | +verb ppas:sg:gen:m1.m2.m3.n1.n2:imperf:aff Re | |
417 | +verb ppas:sg:gen:m1.m2.m3.n1.n2:imperf.perf:aff Re | |
418 | +verb ppas:sg:gen.dat.loc:f:perf:aff Rf | |
419 | +verb ppas:sg:gen.dat.loc:f:imperf:aff Rf | |
420 | +verb ppas:sg:gen.dat.loc:f:imperf.perf:aff Rf | |
421 | +verb ppas:sg:dat:m1.m2.m3.n1.n2:perf:aff Rg | |
422 | +verb ppas:sg:dat:m1.m2.m3.n1.n2:imperf:aff Rg | |
423 | +verb ppas:sg:dat:m1.m2.m3.n1.n2:imperf.perf:aff Rg | |
424 | +verb ppas:sg:acc:m3:perf:aff Rh | |
425 | +verb ppas:sg:acc:m3:imperf:aff Rh | |
426 | +verb ppas:sg:acc:m3:imperf.perf:aff Rh | |
427 | +verb ppas:sg:acc:m1.m2:perf:aff Ri | |
428 | +verb ppas:sg:acc:m1.m2:imperf:aff Ri | |
429 | +verb ppas:sg:acc:m1.m2:imperf.perf:aff Ri | |
430 | +verb ppas:sg:acc.inst:f:perf:aff Rj | |
431 | +verb ppas:sg:acc.inst:f:imperf:aff Rj | |
432 | +verb ppas:sg:acc.inst:f:imperf.perf:aff Rj | |
433 | +verb ppas:pl:nom.voc:m1.p1:perf:aff Rk | |
434 | +verb ppas:pl:nom.voc:m1.p1:imperf:aff Rk | |
435 | +verb ppas:pl:nom.voc:m1.p1:imperf.perf:aff Rk | |
436 | +verb ppas:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:perf:aff Rl | |
437 | +verb ppas:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf:aff Rl | |
438 | +verb ppas:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf.perf:aff Rl | |
439 | +verb ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:perf:aff Rm | |
440 | +verb ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff Rm | |
441 | +verb ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:aff Rm | |
442 | +verb ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:perf:aff Rn | |
443 | +verb ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff Rn | |
444 | +verb ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:aff Rn | |
445 | +verb ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:perf:aff Ro | |
446 | +verb ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff Ro | |
447 | +verb ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:aff Ro | |
448 | +verb ppas:pl:acc:m1.p1:perf:aff Rp | |
449 | +verb ppas:pl:acc:m1.p1:imperf:aff Rp | |
450 | +verb ppas:pl:acc:m1.p1:imperf.perf:aff Rp | |
451 | +verb pcon:imperf O | |
452 | +verb pant:perf P | |
453 | +verb pact:sg:nom.voc:m1.m2.m3:imperf:aff Qa | |
454 | +verb pact:sg:nom.voc:m1.m2.m3:imperf.perf:aff Qa | |
455 | +verb pact:sg:nom.voc:f:imperf:aff Qb | |
456 | +verb pact:sg:nom.voc:f:imperf.perf:aff Qb | |
457 | +verb pact:sg:nom.acc.voc:n1.n2:imperf:aff Qc | |
458 | +verb pact:sg:nom.acc.voc:n1.n2:imperf.perf:aff Qc | |
459 | +verb pact:sg:inst.loc:m1.m2.m3.n1.n2:imperf:aff Qd | |
460 | +verb pact:sg:inst.loc:m1.m2.m3.n1.n2:imperf.perf:aff Qd | |
461 | +verb pact:sg:gen:m1.m2.m3.n1.n2:imperf:aff Qe | |
462 | +verb pact:sg:gen:m1.m2.m3.n1.n2:imperf.perf:aff Qe | |
463 | +verb pact:sg:gen.dat.loc:f:imperf:aff Qf | |
464 | +verb pact:sg:gen.dat.loc:f:imperf.perf:aff Qf | |
465 | +verb pact:sg:dat:m1.m2.m3.n1.n2:imperf:aff Qg | |
466 | +verb pact:sg:dat:m1.m2.m3.n1.n2:imperf.perf:aff Qg | |
467 | +verb pact:sg:acc:m3:imperf:aff Qh | |
468 | +verb pact:sg:acc:m3:imperf.perf:aff Qh | |
469 | +verb pact:sg:acc:m1.m2:imperf:aff Qi | |
470 | +verb pact:sg:acc:m1.m2:imperf.perf:aff Qi | |
471 | +verb pact:sg:acc.inst:f:imperf:aff Qj | |
472 | +verb pact:sg:acc.inst:f:imperf.perf:aff Qj | |
473 | +verb pact:pl:nom.voc:m1.p1:imperf:aff Qk | |
474 | +verb pact:pl:nom.voc:m1.p1:imperf.perf:aff Qk | |
475 | +verb pact:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf:aff Ql | |
476 | +verb pact:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf.perf:aff Ql | |
477 | +verb pact:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff Qm | |
478 | +verb pact:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:aff Qm | |
479 | +verb pact:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff Qn | |
480 | +verb pact:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:aff Qn | |
481 | +verb pact:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff Qo | |
482 | +verb pact:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:aff Qo | |
483 | +verb pact:pl:acc:m1.p1:imperf:aff Qp | |
484 | +verb pact:pl:acc:m1.p1:imperf.perf:aff Qp | |
485 | +other num:sg:nom.gen.dat.inst.acc.loc.voc:m1.m2.m3.n1.n2:rec W | |
486 | +other num:sg:nom.gen.dat.inst.acc.loc.voc:m1.m2.m3.f.n1.n2:rec W | |
487 | +other num:sg:nom.gen.dat.inst.acc.loc.voc:f:rec W | |
488 | +other num:sg:nom.acc:m1.m2.m3.f.n1.n2:rec W | |
489 | +other num:sg.pl:nom.acc:m1.m2.m3.f.n1.n2.p1.p2:rec W | |
490 | +other num:pl:nom.voc:m1:rec W | |
491 | +other num:pl:nom.voc:m1:congr W | |
492 | +other num:pl:nom.gen.dat.inst.acc.loc.voc:m1.m2.m3.f.n1.n2.p1.p2:rec W | |
493 | +other num:pl:nom.acc:m1.m2.m3.f.n1.n2.p1.p2:rec W | |
494 | +other num:pl:nom.acc.voc:n1.p1.p2:rec W | |
495 | +other num:pl:nom.acc.voc:m2.m3.n2:congr W | |
496 | +other num:pl:nom.acc.voc:m2.m3.n2.f:congr W | |
497 | +other num:pl:nom.acc.voc:m2.m3.f.n2:rec W | |
498 | +other num:pl:nom.acc.voc:m2.m3.f.n1.n2.p1.p2:rec W | |
499 | +other num:pl:nom.acc.voc:m1:rec W | |
500 | +other num:pl:nom.acc.voc:f:congr W | |
501 | +other num:pl:inst:n1.p1.p2:rec W | |
502 | +other num:pl:inst:m1.m2.m3.n2:congr W | |
503 | +other num:pl:inst:m1.m2.m3.n2.f:congr W | |
504 | +other num:pl:inst:m1.m2.m3.f.n2:congr W | |
505 | +other num:pl:inst:m1.m2.m3.f.n1.n2.p1.p2:congr W | |
506 | +other num:pl:inst:f:congr W | |
507 | +other num:pl:gen:n1.p1.p2:rec W | |
508 | +other num:pl:gen.loc:m1.m2.m3.n2.f:congr W | |
509 | +other num:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2:congr W | |
510 | +other num:pl:gen.dat.loc:m1.m2.m3.n2.f:congr W | |
511 | +other num:pl:gen.dat.inst.loc:m1.m2.m3.f.n2:congr W | |
512 | +other num:pl:gen.dat.inst.loc:m1.m2.m3.f.n1.n2.p1.p2:congr W | |
513 | +other num:pl:dat:m1.m2.m3.n2.f:congr W | |
514 | +other num:pl:dat.loc:n1.p1.p2:congr.rec W | |
515 | +other num:pl:acc:m1:rec W | |
516 | +other num:comp W | |
517 | +other interj W | |
518 | +verb inf:perf I | |
519 | +verb inf:imperf.perf I | |
520 | +verb inf:imperf I | |
521 | +verb impt:sg:sec:perf Ma | |
522 | +verb impt:sg:sec:imperf.perf Ma | |
523 | +verb impt:sg:sec:imperf Ma | |
524 | +verb impt:pl:sec:perf Mb | |
525 | +verb impt:pl:sec:imperf.perf Mb | |
526 | +verb impt:pl:sec:imperf Mb | |
527 | +verb impt:pl:pri:perf Mc | |
528 | +verb impt:pl:pri:imperf.perf Mc | |
529 | +verb impt:pl:pri:imperf Mc | |
530 | +verb imps:perf N | |
531 | +verb imps:imperf.perf N | |
532 | +verb imps:imperf N | |
533 | +verb ger:sg:nom.acc:n2:perf:aff Xa | |
534 | +verb ger:sg:nom.acc:n2:imperf:aff Xa | |
535 | +verb ger:sg:nom.acc:n2:imperf.perf:aff Xa | |
536 | +verb ger:sg:inst:n2:perf:aff Xb | |
537 | +verb ger:sg:inst:n2:imperf:aff Xb | |
538 | +verb ger:sg:inst:n2:imperf.perf:aff Xb | |
539 | +verb ger:sg:gen:n2:perf:aff Xc | |
540 | +verb ger:sg:gen:n2:imperf:aff Xc | |
541 | +verb ger:sg:gen:n2:imperf.perf:aff Xc | |
542 | +verb ger:sg:dat.loc:n2:perf:aff Xd | |
543 | +verb ger:sg:dat.loc:n2:imperf:aff Xd | |
544 | +verb ger:sg:dat.loc:n2:imperf.perf:aff Xd | |
545 | +verb ger:pl:nom.acc:n2:perf:aff Xe | |
546 | +verb ger:pl:nom.acc:n2:imperf:aff Xe | |
547 | +verb ger:pl:nom.acc:n2:imperf.perf:aff Xe | |
548 | +verb ger:pl:loc:n2:perf:aff Xf | |
549 | +verb ger:pl:loc:n2:imperf:aff Xf | |
550 | +verb ger:pl:loc:n2:imperf.perf:aff Xf | |
551 | +verb ger:pl:inst:n2:perf:aff Xg | |
552 | +verb ger:pl:inst:n2:imperf:aff Xg | |
553 | +verb ger:pl:inst:n2:imperf.perf:aff Xg | |
554 | +verb ger:pl:gen:n2:perf:aff Xh | |
555 | +verb ger:pl:gen:n2:imperf:aff Xh | |
556 | +verb ger:pl:gen:n2:imperf.perf:aff Xh | |
557 | +verb ger:pl:dat:n2:perf:aff Xi | |
558 | +verb ger:pl:dat:n2:imperf:aff Xi | |
559 | +verb ger:pl:dat:n2:imperf.perf:aff Xi | |
560 | +verb fin:sg:ter:perf La | |
561 | +verb fin:sg:ter:imperf.perf La | |
562 | +verb fin:sg:ter:imperf La | |
563 | +verb fin:sg:sec:perf Lb | |
564 | +verb fin:sg:sec:imperf.perf Lb | |
565 | +verb fin:sg:sec:imperf Lb | |
566 | +verb fin:sg:pri:perf Lc | |
567 | +verb fin:sg:pri:imperf.perf Lc | |
568 | +verb fin:sg:pri:imperf Lc | |
569 | +verb fin:pl:ter:perf Ld | |
570 | +verb fin:pl:ter:imperf.perf Ld | |
571 | +verb fin:pl:ter:imperf Ld | |
572 | +verb fin:pl:sec:perf Le | |
573 | +verb fin:pl:sec:imperf.perf Le | |
574 | +verb fin:pl:sec:imperf Le | |
575 | +verb fin:pl:pri:perf Lf | |
576 | +verb fin:pl:pri:imperf.perf Lf | |
577 | +verb fin:pl:pri:imperf Lf | |
578 | +noun depr:pl:voc:m2 Hv | |
579 | +noun depr:pl:nom:m2 Hn | |
580 | +other conj W | |
581 | +verb cond:sg:n1.n2:ter:perf Ka | |
582 | +verb cond:sg:n1.n2:ter:imperf.perf Ka | |
583 | +verb cond:sg:n1.n2:ter:imperf Ka | |
584 | +verb cond:sg:n1.n2:sec:perf Kb | |
585 | +verb cond:sg:n1.n2:sec:imperf.perf Kb | |
586 | +verb cond:sg:n1.n2:sec:imperf Kb | |
587 | +verb cond:sg:n1.n2:pri:perf Kc | |
588 | +verb cond:sg:n1.n2:pri:imperf.perf Kc | |
589 | +verb cond:sg:n1.n2:pri:imperf Kc | |
590 | +verb cond:sg:n1.n2:perf Kd | |
591 | +verb cond:sg:n1.n2:imperf.perf Kd | |
592 | +verb cond:sg:n1.n2:imperf Kd | |
593 | +verb cond:sg:m1.m2.m3:ter:perf Ke | |
594 | +verb cond:sg:m1.m2.m3:ter:imperf.perf Ke | |
595 | +verb cond:sg:m1.m2.m3:ter:imperf Ke | |
596 | +verb cond:sg:m1.m2.m3:sec:perf Kf | |
597 | +verb cond:sg:m1.m2.m3:sec:imperf.perf Kf | |
598 | +verb cond:sg:m1.m2.m3:sec:imperf Kf | |
599 | +verb cond:sg:m1.m2.m3:pri:perf Kg | |
600 | +verb cond:sg:m1.m2.m3:pri:imperf.perf Kg | |
601 | +verb cond:sg:m1.m2.m3:pri:imperf Kg | |
602 | +verb cond:sg:f:ter:perf Kh | |
603 | +verb cond:sg:f:ter:imperf.perf Kh | |
604 | +verb cond:sg:f:ter:imperf Kh | |
605 | +verb cond:sg:f:sec:perf Ki | |
606 | +verb cond:sg:f:sec:imperf.perf Ki | |
607 | +verb cond:sg:f:sec:imperf Ki | |
608 | +verb cond:sg:f:pri:perf Kj | |
609 | +verb cond:sg:f:pri:imperf.perf Kj | |
610 | +verb cond:sg:f:pri:imperf Kj | |
611 | +verb cond:pl:m2.m3.f.n1.n2.p2.p3:ter:perf Kk | |
612 | +verb cond:pl:m2.m3.f.n1.n2.p2.p3:ter:imperf.perf Kk | |
613 | +verb cond:pl:m2.m3.f.n1.n2.p2.p3:ter:imperf Kk | |
614 | +verb cond:pl:m2.m3.f.n1.n2.p2.p3:sec:perf Kl | |
615 | +verb cond:pl:m2.m3.f.n1.n2.p2.p3:sec:imperf.perf Kl | |
616 | +verb cond:pl:m2.m3.f.n1.n2.p2.p3:sec:imperf Kl | |
617 | +verb cond:pl:m2.m3.f.n1.n2.p2.p3:pri:perf Km | |
618 | +verb cond:pl:m2.m3.f.n1.n2.p2.p3:pri:imperf.perf Km | |
619 | +verb cond:pl:m2.m3.f.n1.n2.p2.p3:pri:imperf Km | |
620 | +verb cond:pl:m1.p1:ter:perf Kn | |
621 | +verb cond:pl:m1.p1:ter:imperf.perf Kn | |
622 | +verb cond:pl:m1.p1:ter:imperf Kn | |
623 | +verb cond:pl:m1.p1:sec:perf Ko | |
624 | +verb cond:pl:m1.p1:sec:imperf.perf Ko | |
625 | +verb cond:pl:m1.p1:sec:imperf Ko | |
626 | +verb cond:pl:m1.p1:pri:perf Kp | |
627 | +verb cond:pl:m1.p1:pri:imperf.perf Kp | |
628 | +verb cond:pl:m1.p1:pri:imperf Kp | |
629 | +other comp W | |
630 | +other burk W | |
631 | +other brev:pun W | |
632 | +other brev:npun W | |
633 | +other bedzie:sg:ter:imperf W | |
634 | +other bedzie:sg:sec:imperf W | |
635 | +other bedzie:sg:pri:imperf W | |
636 | +other bedzie:pl:ter:imperf W | |
637 | +other bedzie:pl:sec:imperf W | |
638 | +other bedzie:pl:pri:imperf W | |
639 | +other aglt:sg:sec:imperf:wok W | |
640 | +other aglt:sg:sec:imperf:nwok W | |
641 | +other aglt:sg:pri:imperf:wok W | |
642 | +other aglt:sg:pri:imperf:nwok W | |
643 | +other aglt:pl:sec:imperf:wok W | |
644 | +other aglt:pl:sec:imperf:nwok W | |
645 | +other aglt:pl:pri:imperf:wok W | |
646 | +other aglt:pl:pri:imperf:nwok W | |
647 | +adv-sup adv:sup W | |
648 | +adv adv:pos W | |
649 | +adv-com adv:com W | |
650 | +adv adv W | |
651 | +adj adjp F | |
652 | +adj adjc E | |
653 | +adj adja D | |
654 | +adj adj:sg:nom.voc:n1.n2:pos Aa | |
655 | +adj-com adj:sg:nom.voc:n1.n2:com Ba | |
656 | +adj adj:sg:nom.voc:m1.m2.m3:pos Ab | |
657 | +adj-com adj:sg:nom.voc:m1.m2.m3:com Bb | |
658 | +adj adj:sg:nom.voc:f:pos Ac | |
659 | +adj-com adj:sg:nom.voc:f:com Bc | |
660 | +adj adj:sg:loc:m1.m2.m3.n1.n2:pos Ad | |
661 | +adj-com adj:sg:loc:m1.m2.m3.n1.n2:com Bd | |
662 | +adj adj:sg:loc:f:pos Ae | |
663 | +adj-com adj:sg:loc:f:com Be | |
664 | +adj adj:sg:inst:m1.m2.m3.n1.n2:pos Af | |
665 | +adj-com adj:sg:inst:m1.m2.m3.n1.n2:com Bf | |
666 | +adj adj:sg:inst:f:pos Ag | |
667 | +adj-com adj:sg:inst:f:com Bg | |
668 | +adj adj:sg:gen:m1.m2.m3.n1.n2:pos Ah | |
669 | +adj-com adj:sg:gen:m1.m2.m3.n1.n2:com Bh | |
670 | +adj adj:sg:gen:f:pos Ai | |
671 | +adj-com adj:sg:gen:f:com Bi | |
672 | +adj adj:sg:dat:m1.m2.m3.n1.n2:pos Aj | |
673 | +adj-com adj:sg:dat:m1.m2.m3.n1.n2:com Bj | |
674 | +adj adj:sg:dat:f:pos Ak | |
675 | +adj-com adj:sg:dat:f:com Bk | |
676 | +adj adj:sg:acc:n1.n2:pos Al | |
677 | +adj-com adj:sg:acc:n1.n2:com Bl | |
678 | +adj adj:sg:acc:m3:pos Am | |
679 | +adj-com adj:sg:acc:m3:com Bm | |
680 | +adj adj:sg:acc:m1.m2:pos An | |
681 | +adj-com adj:sg:acc:m1.m2:com Bn | |
682 | +adj adj:sg:acc:f:pos Ao | |
683 | +adj-com adj:sg:acc:f:com Bo | |
684 | +adj adj:pl:nom.voc:m2.m3.f.n1.n2.p2.p3:pos Ap | |
685 | +adj-com adj:pl:nom.voc:m2.m3.f.n1.n2.p2.p3:com Bp | |
686 | +adj adj:pl:nom.voc:m1.p1:pos Aq | |
687 | +adj-com adj:pl:nom.voc:m1.p1:com Bq | |
688 | +adj adj:pl:loc:m1.m2.m3.f.n1.n2.p1.p2.p3:pos Ar | |
689 | +adj-com adj:pl:loc:m1.m2.m3.f.n1.n2.p1.p2.p3:com Br | |
690 | +adj adj:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:pos As | |
691 | +adj-com adj:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:com Bs | |
692 | +adj adj:pl:gen:m1.m2.m3.f.n1.n2.p1.p2.p3:pos At | |
693 | +adj-com adj:pl:gen:m1.m2.m3.f.n1.n2.p1.p2.p3:com Bt | |
694 | +adj adj:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:pos Au | |
695 | +adj-com adj:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:com Bu | |
696 | +adj adj:pl:acc:m2.m3.f.n1.n2.p2.p3:pos Av | |
697 | +adj-com adj:pl:acc:m2.m3.f.n1.n2.p2.p3:com Bv | |
698 | +adj adj:pl:acc:m1.p1:pos Aw | |
699 | +adj-com adj:pl:acc:m1.p1:com Bw | |
700 | +verb pacta XYZ | |
... | ... |
morphology/data/rev_rules.dic
... | ... | @@ -32,6 +32,7 @@ |
32 | 32 | @NOUN-LEMMA-CAP |
33 | 33 | kapitaliki_wyglos lemma=ε |
34 | 34 | kapitaliki_wyglos A lemma=A |
35 | +# lemma=końcówka_lematu | |
35 | 36 | |
36 | 37 | @VERB-LEMMA-PATAL |
37 | 38 | funkcjonalnie_miekkie_iy ć lemma=ć |
... | ... | @@ -57,3 +58,14 @@ |
57 | 58 | dowolne yć lemma=yć |
58 | 59 | inf_e eć lemma=eć |
59 | 60 | |
61 | +@NOUN-LEMMA-FOREIGN | |
62 | + obce_funkcjonalnie_twarde_a a lemma=a | |
63 | +* obce_funkcjonalnie_twarde_a ah lemma=ah | |
64 | + obce_funkcjonalnie_twarde_e lemma=ε | |
65 | + obce_funkcjonalnie_twarde_ie lemma=ε | |
66 | + obce_funkcjonalnie_twarde_ie a lemma=a | |
67 | + obce_funkcjonalnie_twarde_iy lemma=y | |
68 | + obce_funkcjonalnie_miekkie_ii_wyglos a lemma=a | |
69 | + obce_funkcjonalnie_miekkie_ii_wyglos lemma=ε | |
70 | + obce_ais lemma=ais | |
71 | + obce_apostrof lemma=' | |
... | ... |
morphology/data/rules.dic
... | ... | @@ -312,3 +312,67 @@ |
312 | 312 | verb_j e group=e cat=verb |
313 | 313 | funkcjonalnie_twarde_i e group=ie cat=verb |
314 | 314 | |
315 | +@NOUN-FLEX-FOREIGN | |
316 | + obce_funkcjonalnie_twarde_iy flex=y2 con cat=noun | |
317 | + obce_funkcjonalnie_twarde_iy ch flex=ych con cat=noun | |
318 | + obce_funkcjonalnie_twarde_iy m flex=ym con cat=noun | |
319 | + obce_funkcjonalnie_twarde_iy mi flex=ymi con cat=noun | |
320 | +* obce_funkcjonalnie_twarde_iy es flex=s con cat=noun | |
321 | +* obce_funkcjonalnie_twarde_iy s flex=s con cat=noun | |
322 | + obce_funkcjonalnie_twarde_e e flex=e1 con cat=noun | |
323 | + obce_funkcjonalnie_twarde_ie e flex=ie con cat=noun | |
324 | + obce_funkcjonalnie_twarde_e ego flex=ego con cat=noun | |
325 | + obce_funkcjonalnie_twarde_e ej flex=ej con cat=noun | |
326 | + obce_funkcjonalnie_twarde_e em flex=em2 con cat=noun | |
327 | + obce_funkcjonalnie_twarde_e emu flex=emu con cat=noun | |
328 | + obce_funkcjonalnie_twarde_a a flex=a1 con cat=noun | |
329 | + obce_funkcjonalnie_twarde_a ach flex=ach con cat=noun | |
330 | + obce_funkcjonalnie_twarde_a ami flex=ami1 con cat=noun | |
331 | + obce_funkcjonalnie_twarde_a ą flex=ą con cat=noun | |
332 | + obce_funkcjonalnie_twarde_a ę flex=ę con cat=noun | |
333 | + obce_funkcjonalnie_twarde_a o flex=o2 con cat=noun | |
334 | + obce_funkcjonalnie_twarde_a om flex=om con cat=noun | |
335 | + obce_funkcjonalnie_twarde_a ów flex=ów con cat=noun | |
336 | + obce_funkcjonalnie_twarde_a owie flex=owie con cat=noun | |
337 | + obce_funkcjonalnie_twarde_a u flex=u1 con cat=noun | |
338 | + obce_funkcjonalnie_twarde_a i flex=i2 con cat=noun | |
339 | + obce_funkcjonalnie_twarde_a y flex=i2 con cat=noun | |
340 | + obce_funkcjonalnie_miekkie_ii_wyglos flex=ε3 con cat=noun | |
341 | + | |
342 | +@NOUN-FLEX-APOSTROF | |
343 | + obce_ais ’go flex=ego con cat=noun | |
344 | + obce_ais ’mu flex=emu con cat=noun | |
345 | + obce_ais ’m flex=em con cat=noun | |
346 | + obce_ais 'go flex=ego con cat=noun | |
347 | + obce_ais 'mu flex=emu con cat=noun | |
348 | + obce_ais 'm flex=em con cat=noun | |
349 | + obce_ais flex=ε con cat=noun | |
350 | + obce_apostrof 'a flex='a con cat=noun | |
351 | + obce_apostrof ’u flex='u con cat=noun | |
352 | + obce_apostrof ’owi flex='owi con cat=noun | |
353 | + obce_apostrof ’em flex='em con cat=noun | |
354 | + obce_apostrof ’ie flex='ie con cat=noun | |
355 | + obce_apostrof ’u flex='u con cat=noun | |
356 | + obce_apostrof ’y flex='y con cat=noun | |
357 | + obce_apostrof ’owie flex='owie con cat=noun | |
358 | + obce_apostrof ’e flex='e con cat=noun | |
359 | + obce_apostrof ’ów flex='ów con cat=noun | |
360 | + obce_apostrof ’i flex='i con cat=noun | |
361 | + obce_apostrof ’om flex='om con cat=noun | |
362 | + obce_apostrof ’ami flex='ami con cat=noun | |
363 | + obce_apostrof ’ach flex='ach con cat=noun | |
364 | + obce_apostrof 'a flex='a con cat=noun | |
365 | + obce_apostrof 'u flex='u con cat=noun | |
366 | + obce_apostrof 'owi flex='owi con cat=noun | |
367 | + obce_apostrof 'em flex='em con cat=noun | |
368 | + obce_apostrof 'ie flex='ie con cat=noun | |
369 | + obce_apostrof 'u flex='u con cat=noun | |
370 | + obce_apostrof 'y flex='y con cat=noun | |
371 | + obce_apostrof 'owie flex='owie con cat=noun | |
372 | + obce_apostrof 'e flex='e con cat=noun | |
373 | + obce_apostrof 'ów flex='ów con cat=noun | |
374 | + obce_apostrof 'i flex='i con cat=noun | |
375 | + obce_apostrof 'om flex='om con cat=noun | |
376 | + obce_apostrof 'ami flex='ami con cat=noun | |
377 | + obce_apostrof 'ach flex='ach con cat=noun | |
378 | + obce_apostrof flex=ε con cat=noun | |
... | ... |
morphology/data/schemata.dic
... | ... | @@ -27,3 +27,5 @@ PREF-ε FIN-FLEX-J VERB-FLEX2-J VERB-GROUP-J-NĄ VERB-LEMMA-NĄ |
27 | 27 | PREF-NIE GER-FLEX VERB-FLEX2-J VERB-GROUP-J-NĄ VERB-LEMMA-NĄ |
28 | 28 | PREF-NIE PACT-FLEX VERB-FLEX2-J VERB-GROUP-J-NĄ VERB-LEMMA-NĄ |
29 | 29 | PREF-NIE PPAS-FLEX VERB-FLEX2-J VERB-GROUP-J-NĄ VERB-LEMMA-NĄ |
30 | +NOUN-FLEX-FOREIGN NOUN-LEMMA-FOREIGN | |
31 | +NOUN-FLEX-APOSTROF NOUN-LEMMA-FOREIGN | |
... | ... |
morphology/doc/decyzje-scalanie.txt
0 → 100644
1 | +Plik freqListInterps.ml wykorzystuje plik data/interps_general.tab, skąd usunięte zostały niektóre wpisy, | |
2 | +głównie dotyczące adj (powodujące dwuznaczności przy scalaniu). | |
3 | + | |
4 | +Pozostawione intepretacje: | |
5 | +-siebie, numcol (nieobecne w SGJP) pozostają jak są | |
6 | +-ppron12, ppron3 pozostają jak są | |
7 | +(tzn. nie ma znalezionych odpowiedników, są pomijane) | |
8 | + | |
9 | +W przypadku praet, imps, imp, fin, inf, ger, pact, ppas wybrana została interpretacja najbardziej podobna | |
10 | +do oryginalnej licząc od końca, co rozwiązuje problemy perf.impef, neg.aff | |
11 | + | |
12 | +Przekształcenia: | |
13 | +-qub:wok, qub:nwok -> qub | |
14 | + | |
15 | +Co do num spoza SGJP: | |
16 | +-jeżeli forma składa się wyłącznie z cyfr arabskich i rzymskich, wybierana jest najdłuższa interpretacja | |
17 | +Zapewne w rzeczywistości powinny wtedy obejmować wszystkie możliwe tagi. | |
18 | +-w przeciwnym wypadku wybierana jest najkrótsza (najwęższa) interpretacja | |
... | ... |
morphology/doc/model.pdf
No preview for this file type
morphology/doc/model.tex
... | ... | @@ -6,89 +6,162 @@ |
6 | 6 | \usepackage[polish]{babel} |
7 | 7 | % \usepackage{tikz} |
8 | 8 | % \usetikzlibrary{conceptgraph} |
9 | +\usepackage{amsthm} | |
9 | 10 | |
10 | 11 | \parindent 0pt |
11 | 12 | \parskip 4pt |
12 | 13 | |
13 | -% \newcommand{\tensor}{\otimes} | |
14 | -% \newcommand{\forward}{\operatorname{/}} | |
15 | -% \newcommand{\backward}{\operatorname{\backslash}} | |
16 | -% \newcommand{\both}{\mid} | |
17 | -% \newcommand{\plus}{\oplus} | |
18 | -% \newcommand{\zero}{0} | |
19 | -% \newcommand{\one}{1} | |
20 | -% \newcommand{\letin}[2]{{\bf let}\;#1\;{\bf in}\;#2} | |
21 | -% \newcommand{\caseof}[2]{{\bf case}\;#1\;{\bf of}\;#2} | |
22 | -% \newcommand{\emp}{{\bf emp}} | |
23 | -% \newcommand{\inl}{{\bf inl}} | |
24 | -% \newcommand{\inr}{{\bf inr}} | |
25 | -% \newcommand{\coord}[1]{{#1}^\star} | |
26 | -% \newcommand{\map}[2]{{\bf map}\;#1\;#2} | |
27 | -% \newcommand{\concat}[1]{{\bf concat}\;#1} | |
28 | -% \newcommand{\makeset}[1]{{\bf makeset}\;#1} | |
29 | -% \newcommand{\maketerm}[1]{{\bf maketerm}\;#1} | |
30 | -% \newcommand{\addlist}[2]{{\bf add}\;#1\;#2} | |
31 | -% \newcommand{\ana}[1]{{\bf ana}(#1)} | |
32 | -% \newcommand{\One}{\bullet} | |
33 | - | |
34 | - | |
35 | -\title{Model probabilistyczny guessera dla języka polskiego} | |
14 | +\newcommand{\form}{{\it form}} | |
15 | +\newcommand{\lemma}{{\it lemma}} | |
16 | +\newcommand{\cat}{{\it cat}} | |
17 | +\newcommand{\interp}{{\it interp}} | |
18 | +\newcommand{\fsuf}{{\it fsuf}} | |
19 | +\newcommand{\lsuf}{{\it lsuf}} | |
20 | + | |
21 | +\newtheorem{task}{Zadanie} | |
22 | +\newtheorem{answer}{Odpowiedź} | |
23 | + | |
24 | +\title{Model probabilistyczny fleksji języka polskiego} | |
36 | 25 | \author{Wojciech Jaworski} |
37 | 26 | %\date{} |
38 | 27 | |
39 | 28 | \begin{document} |
40 | 29 | \maketitle |
41 | 30 | |
42 | -Zakładamy, że język jest rozkładem probabilistycznym na czwórkach (form,lemma,cat,interp), | |
31 | +Zakładamy, że język jest rozkładem probabilistycznym na czwórkach (\form,\lemma,\cat,\interp), | |
43 | 32 | czyli, że wystąpienia kolejnych słów w tekście są od siebie niezależne. |
44 | -Interpretacja interp jest zbiorem tagów zgodnym a tagsetem SGJP. | |
45 | -Kategoria $cat \in \{ noun, adj, adv, verb, other \}$ | |
33 | +Interpretacja \interp{} jest zbiorem tagów zgodnym a tagsetem SGJP. | |
34 | +Kategoria $\cat \in \{ {\rm noun}, {\rm adj}, {\rm adv}, {\rm verb}, {\rm other} \}$ | |
46 | 35 | Zakładamy też, że język jest poprawny, tzn. nie ma literówek, ani błędów gramatycznych. |
47 | 36 | |
48 | 37 | Dysponujemy następującymi danymi: |
49 | 38 | \begin{itemize} |
50 | 39 | \item słownikiem gramatycznym S, czyli zbiorem czwórek, o których wiemy, że należą do języka; |
51 | -\item zbiorem reguł, czyli zbiorem czwórek (fsuf,lsuf,cat,interp) | |
40 | +\item zbiorem reguł, czyli zbiorem czwórek (\fsuf,\lsuf,\cat,\interp) | |
52 | 41 | \item zbiorem wyjątków, czyli zbiorem czwórek, o których wiemy, że należą do języka, które nie są opisywane przez reguły |
53 | 42 | \item otagowaną listą frekwencyjną. |
54 | 43 | \end{itemize} |
55 | -Reguła przyłożona do formy ucina fsuf i przykleja lsuf. | |
44 | +Reguła przyłożona do formy ucina \fsuf{} i przykleja \lsuf. | |
45 | + | |
46 | +Lista frekwencyjna wytworzona jest na podstawie NKJP1M. Usunięte zostały z niej symbole | |
47 | +(formy do których odczytania nie wystarczy znajomość reguł wymowy takie, jak liczby zapisane cyframi, oznaczenia godzin i lat, | |
48 | +znaki interpunkcyjne, skróty, emotikony). Usunięte zostały również formy odmienialne z użyciem myślnika i apostrofu | |
49 | +(np. odmienione akronimy i nazwiska obce, formy takie jak ,,12-latek``). | |
50 | +Interpretacje na liście frekwencyjnej zostały skonwertowane do postaci takiej jaka występuje w SGJP, | |
51 | +łączącej interpretacje form identycznych. Na przykład interpretacje adj:pl:nom:m1:pos, adj:pl:voc:m1:pos, adj:pl:nom:p1:pos i adj:pl:voc:p1:pos | |
52 | +zostały złączone w adj:pl:nom.voc:m1.p1:pos, a frekwencje form zsumowane. | |
53 | + | |
54 | +Celem jest aproksymacja wartości P(\lemma,\cat,\interp|\form). | |
55 | + | |
56 | +%Jakość aproksymacji mierzymy licząc jak często wśród $k$ najbardziej prawdopodobnych trójek $\lemma,\cat,\interp$ | |
57 | +%wskazanych przez model dla zadanej formy znajduje się trójka poprawna. Wyniki dla poszczególnych form agregujemy | |
58 | +%za pomocą średniej ważonej po ich częstościach. | |
59 | + | |
60 | +%Pytanie 0: Ile wynosi powyższa miara liczona z użyciem p-stw wziętych z listy frekwencyjnej? (To jest ograniczenie górne dla modelu) | |
61 | + | |
62 | +%Pytanie 0': Ile wynosi powyższa miara liczona z użyciem częstości wziętych ze zbioru reguł? (To jest ograniczenie dolne dla modelu) | |
63 | + | |
64 | +Pierwszym kryterium jest przynależność formy do słownika S. | |
65 | +Jeśli forma należy do S zakładamy, że jedno z haseł S zawierające tę formę | |
66 | +poprawnie opisuje jej lemat, kategorię i interpretację. | |
67 | + | |
68 | +\begin{task} | |
69 | +Jakie jest prawdopodobieństwo trafienia na formę, której lemat, kategoria i interpretacja należy do słownika, czyli | |
70 | +\[P((\form,\lemma,\cat,\interp) \in S)\] | |
71 | +Jakie jest prawdopodobieństwo trafienia na formę, która należy do słownika, ale jej lemat, kategoria lub interpretacja należy do słownika, czyli | |
72 | +\[P((\form,\lemma,\cat,\interp) \not\in S \wedge \form \in S)\] | |
73 | +\end{task} | |
74 | + | |
75 | +\begin{answer} | |
76 | +Prawdopodobieństwo natrafienia na formę należącą do słownika wynosi 95,67\%, zaś natrafienia na formę należącą do SGJP bez odpowiedniej | |
77 | +interpretacji -- 3,92\% (lista tych form znajduje się w pliku traps.txt). | |
78 | +\end{answer} | |
79 | + | |
80 | +W przypadku form należących do słownika różnorodność interpretacji będzie niewielka, | |
81 | +natomiast istotne będzie prawdopodobieństwo wystąpienia danego lematu. | |
82 | +Zaś w przypadku form nie należących do słownika prawdopodobieństwo wystąpienia lematu | |
83 | +będzie zawsze małe. | |
56 | 84 | |
57 | -Celem jest aproksymacja wartości P(lemma,cat,interp|form). | |
85 | +Dzielimy teraz listę frekwencyjną na część należącą do S i nie należącą do S. | |
86 | +Od tej pory budujemy model osobno dla każdej z części. | |
58 | 87 | |
59 | -Pytanie 1: $P((form,lemma,cat,interp) \in S)$ | |
88 | +W przypadku cześci należącej do S zauważamy, że \[P(\lemma,\cat,\interp|\form)=P(\form|\lemma,\cat,\interp)\frac{P(\lemma,\cat,\interp)}{P(\form)}\] | |
60 | 89 | |
61 | -Pytanie 2: $P((form,lemma,cat,interp) \not\in S \wedge form \in S)$ | |
90 | +Zakładamy, że \interp{} jest niezależne od \lemma, pod warunkiem określonego \cat | |
91 | +\[P(\lemma,\cat,\interp)=P(\lemma,\cat)P(\interp|\lemma,\cat)=P(\lemma,\cat)P(\interp|\cat)\] | |
62 | 92 | |
63 | -Załóżmy, że reguły i wyjątki mają postać taką, że do danej formy można zaaplikować tylko jedną z nich | |
64 | -(dla żadnej reguły sufix nie jest podciągiem innego sufixu). Wtedy | |
65 | -\[P(lemma,cat,interp|form)\approx P(rule|form)=P(rule|fsuf)\] | |
66 | -(W powyższym drzewie sufixowym w każdym węźle mamy dowiązania do sufixów o jeden znak dłuższych oraz kategorię pozostałe traktową łącznie | |
93 | +$P(\form)$, $P(\lemma,\cat)$ i $P(\interp|\cat)$ szacujemy na podstawie listy frekwencyjnej, | |
94 | +w przypadku pierwszych dwu stosując wygładzanie. Wyliczenie $P(\form)$ zawiera uogólniona lista frekwencyjna | |
95 | +(ścieżka {\tt resources/NKJP1M/NKJP1M-generalized-frequency.tab} w repozytorium ENIAM), $P(\lemma,\cat)$ -- plik | |
96 | + {\tt prob\_lemmacat.txt}, zaś $P(\interp|\cat)$ -- {\tt prob\_itp\_givencat.txt} (oba zawarte w katalogu {\tt morphology/doc}). | |
67 | 97 | |
68 | -Pytanie 3: Czy faktycznie zachodzi powyższa zależność? Jak zmierzyć podobieństwo? | |
98 | +$P(\form|\lemma,\cat,\interp)$ wynosi 0, gdy w S nie ma krotki postaci (\form,\lemma,\cat,\interp); | |
99 | +1, gdy jest dokładnie jedna krotka z (\lemma,\cat,\interp). Gdy jest ich więcej oznacza to, że | |
100 | +lemat ma przynajmniej dwa warianty odmiany. Są to przypadki rzadkie. Przypisujemy każdej z możliwości | |
101 | +prawdopodobieństwo 1. | |
69 | 102 | |
70 | -Problem tu jest taki, że lista frekwencyjna jest zbyt mała by precyzyjnie określić p-stwo ok. 40000 reguł | |
103 | +\begin{task} | |
104 | +Przejrzeć SGJP i znaleźć wszystkie przykłady, w których dla ustalonego lematu, kategorii i interpretacji | |
105 | +jest więcej niż jedna forma. Znaleźć wystąpienia tych krotek na liście frekwencyjnej. | |
106 | +\end{task} | |
71 | 107 | |
72 | -\[P(rule|fsuf)=P(lsuf,cat,interp|fsuf)=P(fsuf|lsuf,cat,interp)\frac{P(lsuf,cat,interp)}{P(fsuf)}\] | |
108 | +\begin{answer} | |
109 | +Lista takich form znajduje się w pliku multi\_forms.txt. | |
110 | +\end{answer} | |
73 | 111 | |
74 | -$P(fsuf)$ jest prawdopodobieństwem tego, że do języka należy słowo o zadanym sufixie. | |
112 | +Teraz zanalizujemy drugą część listy frekwencyjnej. | |
113 | +Załóżmy, że reguły mają postać taką, że sufiks żadnej reguły nie jest podciągiem sufixu innej z nich. | |
114 | +Sufiksy reguł tworzą drzewo, które w każdym węźle ma dowiązania do sufixów o jeden znak dłuższych oraz kategorię pozostałe traktową łącznie. | |
115 | +Przyjmujemy następujące założenie modelowe: | |
116 | +\[P(\lemma,\cat,\interp|\form)\approx P(rule|\form)=P(rule|\fsuf)\] | |
117 | +Wynika ono z tego, że mając nieznaną formę musimy oprzeć się na ogólnych regułach | |
118 | +odmiany i nie możemy korzystać z tego że ma ona jakieś konkretne brzmienie. | |
119 | +Korzystamy tutaj tylko z reguł oznaczonych jako produktywne. | |
120 | + | |
121 | +Problem tu jest taki, że lista frekwencyjna jest zbyt mała by precyzyjnie określić p-stwo ok. 40000 reguł. | |
122 | +Dlatego znowu stosujemy zabieg z prawdopodobieństwem warunkowym. | |
123 | + | |
124 | +\[P(rule|\fsuf)=P(\lsuf,\cat,\interp|\fsuf)=P(\fsuf|\lsuf,\cat,\interp)\frac{P(\lsuf,\cat,\interp)}{P(\fsuf)}\] | |
125 | + | |
126 | +$P(\fsuf)$ jest prawdopodobieństwem tego, że do języka należy słowo o zadanym sufixie. | |
75 | 127 | Można je oszacować za pomocą listy frekwencyjnej. |
76 | 128 | |
77 | -Zakładamy, że interp jest niezależne od lsuf, pod warunkiem określonego cat | |
78 | -$P(lsuf,cat,interp)=P(lsuf,cat)P(interp|lsuf,cat)=P(lsuf,cat)P(interp|cat)$ | |
129 | +Zakładamy, że \interp{} jest niezależne od \lsuf, pod warunkiem określonego \cat | |
130 | +\[P(\lsuf,\cat,\interp)=P(\lsuf,\cat)P(\interp|\lsuf,\cat)=P(\lsuf,\cat)P(\interp|\cat)\] | |
79 | 131 | |
80 | -$P(lsuf,cat)$ i $P(interp|cat)$ można oszacować na podstawie listy frekwencyjnej. | |
132 | +$P(\lsuf,\cat)$ i $P(\interp|\cat)$ można oszacować na podstawie listy frekwencyjnej. | |
81 | 133 | |
82 | -$P(fsuf|lsuf,cat,interp)$ wynosi 0, gdy nie ma reguły postaci (fsuf,lsuf,cat,interp); | |
83 | -1, gdy jest dokładnie jedna reguła z (lsuf,cat,interp), a gdy jest ich więcej trzeba | |
84 | -oszacować z listy frekwencyjnej. | |
134 | +\begin{task} | |
135 | +Oszacować $P(\fsuf)$ i $P(\lsuf,\cat)$ na podstawie listy frekwencyjnej. | |
136 | +Sprawdzić dla jakich sufiksów próbka jest mała albo nie ma jej wcale. | |
137 | +\end{task} | |
85 | 138 | |
86 | -Pytanie 4: Czy powyższe przybliżenie jest poprawne, jak często jest więcej niż jedna reguła i ile wynoszą wówczas p-stwa? | |
139 | +% w razie gdyby był problem można próbować dzielić sufiksy na części i założyć niezależność tych części | |
140 | + | |
141 | +$P(\fsuf|\lsuf,\cat,\interp)$ wynosi 0, gdy nie ma reguły postaci (\fsuf,\lsuf,\cat,\interp); | |
142 | +1, gdy jest dokładnie jedna reguła z (\fsuf,\lsuf,\cat,\interp). Ustawiamy produktywność reguł tak | |
143 | +by nie pojawiało się więcej pasujących reguł. | |
144 | + | |
145 | +\begin{task} | |
146 | +Określić produktywność reguł i sprawdzić, czy nie ma niejednoznacznych dopasowań. | |
147 | +\end{task} | |
87 | 148 | |
88 | -Pytanie 5: Co zrobić z niejednoznacznymi interpretacjami? | |
149 | +\begin{task} | |
150 | +Określić jakość modelu. | |
151 | +\end{task} | |
89 | 152 | |
90 | -Zadania poboczne: wytworzenie otagowanej listy frekwencyjnej, wytworzenie zbioru reguł, wskazanie, które reguły opisują sytuacje wyjątkowe. | |
153 | +\begin{answer} | |
154 | +Wyliczona jakość modelu (stopień pokrycia listy frekwencyjnej przez co najmniej 95\% najbardziej prawdopodobnych interpretacji wg modelu) wyniosła 79,90\%. | |
155 | +\end{answer} | |
156 | + | |
157 | +%czasowniki produktywne to te z lematem ać ować ywać, ić, yć, (nąć) | |
158 | + | |
159 | +Pytanie 4: Czy powyższe przybliżenie jest poprawne, jak często jest więcej niż jedna reguła i ile wynoszą wówczas p-stwa? | |
160 | + | |
161 | +Zadania poboczne: wytworzenie otagowanej listy frekwencyjnej, wytworzenie (uzupełnienie) zbioru reguł na podstawie SGJP i listy frekwencyjnej, wskazanie, które reguły opisują sytuacje wyjątkowe. | |
91 | 162 | |
92 | 163 | Zadanie na przyszłość: reguły słowotwórstwa i ich interpretacja semantyczna. |
93 | 164 | |
94 | -\end{document} | |
95 | 165 | \ No newline at end of file |
166 | +Do powyższego modelu trzeba jeszcze dodać prefixy nie i naj. | |
167 | + | |
168 | +\end{document} | |
... | ... |