Commit dd02bfeb64ff545df3950a4d8a4910a09b4c712d

Authored by Wojciech Jaworski
2 parents 66045a35 ec2ccc69

Merge branch 'integration' into dep_trees

Showing 98 changed files with 512041 additions and 494159 deletions

Too many changes to show.

To preserve performance only 49 of 98 files are displayed.

LCGlexicon/ENIAM_LCGlexicon.ml
... ... @@ -71,6 +71,30 @@ let assign_quantifiers (selectors,rule,weight) =
71 71 let quant = merge_quant categories quant in
72 72 selectors, (bracket,quant,syntax),(rule,weight)
73 73  
  74 +let rec check_quantifiers_int_rec (selectors,syntax) quants = function
  75 + Atom x -> ()
  76 + | AVar "schema" -> ()
  77 + | AVar x ->
  78 + if not (SelectorSet.mem quants (selector_of_string x))
  79 + then failwith ("Variable '" ^ x ^ "' is not quantified in rule " ^ string_of_selectors selectors ^ ": " ^ ENIAM_LCGstringOf.grammar_symbol 0 syntax)
  80 + | With l -> Xlist.iter l (check_quantifiers_int_rec (selectors,syntax) quants)
  81 + | Zero -> ()
  82 + | Top -> ()
  83 +
  84 +let rec check_quantifiers_rec rule quants = function
  85 + Tensor l -> Xlist.iter l (check_quantifiers_int_rec rule quants)
  86 + | Plus l -> Xlist.iter l (check_quantifiers_rec rule quants)
  87 + | Imp(s,d,t) -> check_quantifiers_rec rule quants s; check_quantifiers_rec rule quants t
  88 + | One -> ()
  89 + | ImpSet(s,l) -> check_quantifiers_rec rule quants s; Xlist.iter l (fun (_,t) -> check_quantifiers_rec rule quants t)
  90 + | Star s -> check_quantifiers_rec rule quants s
  91 + | Maybe s -> check_quantifiers_rec rule quants s
  92 + | _ -> failwith "check_quantifiers_rec"
  93 +
  94 +let check_quantifiers (selectors,(bracket,quant,syntax),_) =
  95 + let quants = Xlist.fold quant SelectorSet.empty (fun quants (q,_) -> SelectorSet.add quants q) in
  96 + check_quantifiers_rec (selectors,syntax) quants syntax
  97 +
74 98 let assign_semantics (selectors,(bracket,quant,syntax),(rule,weight)) =
75 99 let semantics = try
76 100 let raised,rule = get_raised [] rule in
... ... @@ -135,6 +159,7 @@ let dict_of_grammar grammar =
135 159 let make_rules x_flag filename =
136 160 let lexicon = ENIAM_LCGlexiconParser.load_lexicon filename in
137 161 let lexicon = List.rev (Xlist.rev_map lexicon assign_quantifiers) in
  162 + Xlist.iter lexicon check_quantifiers;
138 163 let lexicon = List.rev (Xlist.rev_map lexicon assign_semantics) in
139 164 let lexicon = if x_flag then List.rev (Xlist.rev_map lexicon add_x_args) else lexicon in
140 165 dict_of_grammar lexicon
... ...
LCGlexicon/ENIAM_LCGlexiconTypes.ml
... ... @@ -36,6 +36,7 @@ module OrderedSelector = struct
36 36 end
37 37  
38 38 module SelectorMap=Xmap.Make(OrderedSelector)
  39 +module SelectorSet=Xset.Make(OrderedSelector)
39 40  
40 41 type rule =
41 42 Bracket
... ...
LCGlexicon/ENIAMcategoriesPL.ml
... ... @@ -36,7 +36,7 @@ let selector_values = Xlist.fold [
36 36 "match-result";"url";"email";"obj-id";"adj";"adjc";"adjp";"adja";
37 37 "adv";"ger";"pact";"ppas";"fin";"bedzie";"praet";"winien";"impt";
38 38 "imps";"pred";"aglt";"inf";"pcon";"pant";"qub";"part";"comp";"conj";"interj";
39   - "sinterj";"burk";"interp";"unk";"html-tag"];
  39 + "sinterj";"burk";"interp";"xxx";"unk";"html-tag"];
40 40 Pos2, [];
41 41 Cat, [];
42 42 Number, all_numbers;
... ... @@ -365,6 +365,8 @@ let clarify_categories proper cat = function
365 365 | lemma,"interp",[] -> [{empty_cats with lemma=lemma; pos="interp"; pos2="interp"}]
366 366 | lemma,"unk",[] ->
367 367 [{empty_cats with lemma=lemma; pos="unk"; pos2="noun"; numbers=all_numbers; cases=all_cases; genders=all_genders; persons=["ter"]}]
  368 + | lemma,"xxx",[] ->
  369 + [{empty_cats with lemma=lemma; pos="xxx"; pos2="noun"; numbers=all_numbers; cases=all_cases; genders=all_genders; persons=["ter"]}]
368 370 | lemma,"html-tag",[] -> [{empty_cats with lemma=lemma; pos="html-tag"; pos2="html-tag"}]
369 371 | lemma,c,l -> failwith ("clarify_categories: " ^ lemma ^ ":" ^ c ^ ":" ^ (String.concat ":" (Xlist.map l (String.concat "."))))
370 372  
... ... @@ -557,9 +559,9 @@ let pos_categories = Xlist.fold [
557 559 "imps",[Lemma;(*NewLemma;*)Cat;Number;Gender;Person;Aspect;Negation;Mood;Tense;];
558 560 "pred",[Lemma;(*NewLemma;*)Cat;Number;Gender;Person;Aspect;Negation;Mood;Tense;];
559 561 "aglt",[Lemma;Number;Person;Aspect;];
560   - "inf",[Lemma;(*NewLemma;*)Cat;Aspect;];
561   - "pcon",[Lemma;(*NewLemma;*)Cat;Aspect;];
562   - "pant",[Lemma;(*NewLemma;*)Cat;Aspect;];
  562 + "inf",[Lemma;(*NewLemma;*)Cat;Aspect;Negation;];
  563 + "pcon",[Lemma;(*NewLemma;*)Cat;Aspect;Negation;];
  564 + "pant",[Lemma;(*NewLemma;*)Cat;Aspect;Negation;];
563 565 "qub",[Lemma;];
564 566 "part",[Lemma;];
565 567 "comp",[Lemma;];(* ctype *)
... ... @@ -569,5 +571,6 @@ let pos_categories = Xlist.fold [
569 571 "burk",[Lemma;];
570 572 "interp",[Lemma;];
571 573 "unk",[Lemma;Number;Case;Gender;Person;];
  574 + "xxx",[Lemma;Number;Case;Gender;Person;];
572 575 "html-tag",[Lemma;];
573 576 ] StringMap.empty (fun map (k,l) -> StringMap.add map k l)
... ...
LCGlexicon/TODO
  1 +- dodac prepncp
  2 +
1 3 - dodać podniesione comprepy
2 4 Pod jakim tytułem brykasz?
3 5 Niezależnie od kogo brykasz?
... ...
LCGlexicon/resources/lexicon-pl.dic
... ... @@ -104,11 +104,14 @@ pos=subst,case=gen,nsem=measure:
104 104 measure*sg*case*n2*person{\num*number*case*gender*person*rec}{schema}{\(1+qub),/(1+inclusion)}: measure_weight; # UWAGA: number "sg" i gender "n2", żeby uzgadniać z podmiotem czasownika
105 105  
106 106 # frazy przyimkowe
  107 +#lemma!=temu,pos=prep: prepnp*lemma*case{\(1+advp*T),/np*T*case*T*T}{\(1+qub),/(1+inclusion)};
  108 +#lemma!=temu,pos=prep: prepadjp*lemma*case{\(1+advp*T),/adjp*T*case*T}{\(1+qub),/(1+inclusion)};
107 109 pos=prep: prepnp*lemma*case{\(1+advp*T),/np*T*case*T*T}{\(1+qub),/(1+inclusion)};
108 110 pos=prep: prepadjp*lemma*case{\(1+advp*T),/adjp*T*case*T}{\(1+qub),/(1+inclusion)};
109 111 lemma=po,pos=prep: QUANT[case=postp] prepadjp*lemma*case{\(1+advp*T),/(adjp*sg*dat*m1+adjp*T*postp*T)}{\(1+qub),/(1+inclusion)}; # po polsku, po kreciemu
110 112 lemma=z,pos=prep: QUANT[case=postp] prepadjp*lemma*case{\(1+advp*T),/adjp*sg*nom*f}{\(1+qub),/(1+inclusion)}; # z bliska
111 113 lemma=na,pos=prep: QUANT[case=postp] prepadjp*lemma*case{\(1+advp*T),/advp*T}{\(1+qub),/(1+inclusion)}; # na lewo
  114 +lemma=temu,pos=prep: prepnp*lemma*case\np*T*case*T*T; # chwilę temu
112 115  
113 116 # przimkowe określenia czasu
114 117 lemma=z,pos=prep,case=gen: prepnp*lemma*case{\(1+advp*T),/(day-month+day+year+date+hour+hour-minute)}{\(1+qub),/(1+inclusion)};
... ... @@ -292,6 +295,7 @@ lemma=[,pos=interp: (inclusion/rparen2)/(np*T*T*T*T+ip*T*T*T+adjp*T*T*T+pr
292 295 lemma=),pos=interp: rparen;
293 296 lemma=],pos=interp: rparen2;
294 297 pos=unk: np*number*case*gender*person;
  298 +pos=xxx: np*number*case*gender*person;
295 299  
296 300 lemma=<conll_root>,pos=interp: <conll_root>/(ip*T*T*T+cp*int*T+np*sg*voc*T*T+interj);
297 301  
... ... @@ -302,7 +306,7 @@ lemma=&lt;sentence&gt;,pos=interp: BRACKET &lt;root&gt;/s;
302 306  
303 307 lemma=:,pos=interp: BRACKET or;
304 308 lemma=:s,pos=interp: BRACKET <colon>\<speaker>;
305   -lemma=:s,pos=interp: BRACKET (<colon>\<speaker>)/<squery>;
  309 +lemma=:s,pos=interp: BRACKET (<colon>\<speaker>)/<squery>; #FIXME <squery> nie jest nigdzie generowane
306 310 lemma=<or-sentence>,pos=interp: BRACKET <root>/s;
307 311 lemma=<or-sentence>,pos=interp: BRACKET ((<root>/<speaker-end>)/(ip*T*T*T/or))/or2 SEM[λxλyλz.NODE(yx,z)];
308 312 lemma=</or-sentence>,pos=interp: BRACKET or2\?(ip*T*T*T+cp*int*T+np*sg*voc*T*T+interj);
... ...
LCGlexicon/test.ml
... ... @@ -19,6 +19,7 @@
19 19  
20 20 open ENIAM_LCGlexiconTypes
21 21 open ENIAM_LCGtypes
  22 +open Xstd
22 23  
23 24 let rules = ENIAM_LCGlexicon.make_rules false ENIAM_LCGlexiconTypes.rules_filename
24 25 (* let rules = ENIAM_LCGlexicon.make_rules false "resources/lexicon-pl.dic" *)
... ... @@ -97,19 +98,32 @@ let create_chart valence tokens last =
97 98 ENIAM_LCGrenderer.reset_variable_names ();
98 99 ENIAM_LCGrenderer.add_variable_numbers ();
99 100 let cats = ENIAMcategoriesPL.clarify_categories proper ["X"] (lemma,pos,interp) in
100   - let l = ENIAM_LCGlexicon.create_entries rules id orth cats valence in
  101 + let l = ENIAM_LCGlexicon.create_entries rules id orth cats valence [] in
101 102 ENIAM_LCGchart.add_inc_list chart lnode rnode l 0) in
102 103 chart
103 104  
  105 +let create_text_fragments tokens last =
  106 + let text_fragments = Array.make last IntMap.empty in
  107 + Xlist.iter tokens (fun (id,lnode,rnode,orth,lemma,pos,interp,proper) ->
  108 + text_fragments.(lnode) <- IntMap.add text_fragments.(lnode) rnode orth);
  109 + Int.iter_down 0 (last - 1) (fun i ->
  110 + let map = IntMap.fold text_fragments.(i) text_fragments.(i) (fun map j orth ->
  111 + if j = last then map else
  112 + IntMap.fold text_fragments.(j) map (fun map k orth2 ->
  113 + IntMap.add map k (orth ^ " " ^ orth2))) in
  114 + text_fragments.(i) <- map);
  115 + text_fragments
  116 +
104 117 let test_example valence (name,tokens,last) =
105 118 ENIAM_LCGreductions.reset_variant_label ();
106 119 let chart = create_chart valence tokens last in
107   - ENIAM_LCGlatexOf.print_chart "results/" (name^"1_chart") "a1" chart;
  120 + let text_fragments = create_text_fragments tokens last in
  121 + ENIAM_LCGlatexOf.print_chart "results/" (name^"1_chart") "a1" text_fragments chart;
108 122 let chart,references = ENIAM_LCGchart.lazify chart in
109   - ENIAM_LCGlatexOf.print_chart "results/" (name^"2_chart") "a4" chart;
  123 + ENIAM_LCGlatexOf.print_chart "results/" (name^"2_chart") "a4" text_fragments chart;
110 124 ENIAM_LCGlatexOf.print_references "results/" (name^"2_references") "a4" references;
111 125 let chart = ENIAM_LCGchart.parse chart references 30. Sys.time in (* uwaga: niejawna zmiana imperatywna w references *)
112   - ENIAM_LCGlatexOf.print_chart "results/" (name^"3_chart") "a4" chart;
  126 + ENIAM_LCGlatexOf.print_chart "results/" (name^"3_chart") "a4" text_fragments chart;
113 127 ENIAM_LCGlatexOf.print_references "results/" (name^"3_references") "a4" references;
114 128 if ENIAM_LCGchart.is_parsed chart then (
115 129 let term = ENIAM_LCGchart.get_parsed_term chart in
... ...
LCGparser/ENIAM_LCGgraphOf.ml
... ... @@ -25,6 +25,7 @@ let escape_string s =
25 25 match String.sub s i 1 with
26 26 "<" -> t ^ "〈"
27 27 | ">" -> t ^ "〉"
  28 + | "\"" -> t ^ "\\\""
28 29 | c -> t ^ c)
29 30  
30 31 let string_of_node t =
... ... @@ -58,6 +59,10 @@ let rec print_dependency_tree_rec file edge upper id = function
58 59 | Val s ->
59 60 fprintf file " %s [shape=box,label=\"%s\"]\n" id s;
60 61 print_edge file edge upper id
  62 + | SetAttr(a,s,t) ->
  63 + fprintf file " %s [shape=box,label=\"SetAttr(%s,%s)\"]\n" id a (ENIAM_LCGstringOf.linear_term 0 s);
  64 + print_edge file edge upper id;
  65 + print_dependency_tree_rec2 file "" id t
61 66 | Dot -> ()
62 67 (* fprintf file " %s [shape=box,label=\"Dot\"]\n" id;
63 68 print_edge file edge upper id*)
... ... @@ -87,17 +92,26 @@ let rec print_simplified_dependency_tree_rec2 file edge upper = function
87 92 Xlist.iter l (fun (i,t) -> print_simplified_dependency_tree_rec2 file i e t)
88 93 | Dot -> ()
89 94 | Ref i -> print_edge file edge upper ("x" ^ string_of_int i)
90   - | t -> failwith ("print_simplified_dependency_tree_rec: " ^ ENIAM_LCGstringOf.linear_term 0 t)
  95 + | SetAttr(a,s,t) -> ()
  96 + (* fprintf file " %s [shape=box,label=\"SetAttr(%s,%s)\"]\n" id a (ENIAM_LCGstringOf.linear_term 0 s);
  97 + print_edge file edge upper id; *)
  98 + (* print_simplified_dependency_tree_rec2 file "" id t *)
  99 + | Node t -> ()
  100 + | t -> failwith ("print_simplified_dependency_tree_rec2: " ^ ENIAM_LCGstringOf.linear_term 0 t)
91 101  
92 102 let rec print_simplified_dependency_tree_rec file edge upper id = function
93 103 Node t ->
94   - fprintf file " %s [label=\"%s\\n%s:%s\\n%s\\n%f\"]\n" id t.orth t.lemma t.pos (ENIAM_LCGstringOf.linear_term 0 t.symbol) t.weight;
  104 + fprintf file " %s [label=\"%s\\n%s:%s\\n%s\\n%f\"]\n" id (escape_string t.orth) (escape_string t.lemma) t.pos (escape_string (ENIAM_LCGstringOf.linear_term 0 t.symbol)) t.weight;
95 105 print_edge file edge upper id;
96 106 print_simplified_dependency_tree_rec2 file "" id t.args
97 107 | Variant(e,l) ->
98 108 fprintf file " %s [shape=diamond,label=\"%s\"]\n" id e;
99 109 print_edge file edge upper id;
100 110 Xlist.iter l (fun (i,t) -> print_simplified_dependency_tree_rec file i id (id ^ "y" ^ i) t)
  111 + (* | SetAttr(a,s,t) ->
  112 + fprintf file " %s [shape=box,label=\"SetAttr(%s,%s)\"]\n" id a (ENIAM_LCGstringOf.linear_term 0 s);
  113 + print_edge file edge upper id; *)
  114 + (* print_simplified_dependency_tree_rec2 file "" id t *)
101 115 | Dot -> ()
102 116 | t -> failwith ("print_simplified_dependency_tree_rec: " ^ ENIAM_LCGstringOf.linear_term 0 t)
103 117  
... ...
LCGparser/ENIAM_LCGlatexOf.ml
... ... @@ -199,7 +199,7 @@ let rec grammar_symbol c = function
199 199 | BracketSet d -> "{\\bf BracketSet}(" ^ direction d ^ ")"
200 200 | Maybe s -> "?" ^ grammar_symbol 2 s
201 201  
202   -let chart page g =
  202 +let chart page text_fragments g =
203 203 let layers = ENIAM_LCGchart.fold g IntMap.empty (fun layers (symbol,node1,node2,sem,layer) ->
204 204 let nodes = try IntMap.find layers layer with Not_found -> IntMap.empty in
205 205 let content = node2, grammar_symbol 0 symbol, linear_term 0 sem in
... ... @@ -207,16 +207,17 @@ let chart page g =
207 207 let nodes = IntMap.add_inc nodes node1 [content] (fun l -> content :: l) in
208 208 IntMap.add layers layer nodes) in
209 209 let n = match page with "a4" -> "10" | "a1" -> "40" | _ -> "20" in
210   - "\\begin{longtable}{|l|l|l|p{" ^ n ^ "cm}|}\n\\hline\n" ^
  210 + "\\begin{longtable}{|l|l|l|l|p{" ^ n ^ "cm}|}\n\\hline\n" ^
211 211 String.concat "" (List.rev (IntMap.fold layers [] (fun l layer nodes ->
212 212 IntMap.fold nodes l (fun l node1 contents ->
213 213 Xlist.fold contents l (fun l (node2,symbol,sem) ->
214   - (Printf.sprintf "%d & %d--%d & $\\begin{array}{l}%s\\end{array}$ & $%s$\\\\\n\\hline\n" layer node1 node2 symbol sem) :: l))))) ^
  214 + let s = try IntMap.find text_fragments.(node1) node2 with Not_found -> failwith (Printf.sprintf "chart: text_fragment not found %d-%d" node1 node2) in
  215 + (Printf.sprintf "%d & %d--%d & %s & $\\begin{array}{l}%s\\end{array}$ & $%s$\\\\\n\\hline\n" layer node1 node2 s symbol sem) :: l))))) ^
215 216 "\\end{longtable}"
216 217  
217   -let print_chart path name page g =
  218 +let print_chart path name page text_fragments g =
218 219 Xlatex.latex_file_out path name page false (fun file ->
219   - Printf.fprintf file "%s\n" (chart page g));
  220 + Printf.fprintf file "%s\n" (chart page text_fragments g));
220 221 Xlatex.latex_compile_and_clean path name
221 222  
222 223  
... ...
LCGparser/test.ml
... ... @@ -18,6 +18,7 @@
18 18 *)
19 19  
20 20 open ENIAM_LCGtypes
  21 +open Xstd
21 22  
22 23 type entry =
23 24 Basic of grammar_symbol
... ... @@ -90,15 +91,28 @@ let create_chart tokens last =
90 91 ENIAM_LCGchart.add chart lnode rnode (Bracket(lf,rf,syntax),semantics) 0) in
91 92 chart
92 93  
  94 +let create_text_fragments tokens last =
  95 + let text_fragments = Array.make last IntMap.empty in
  96 + Xlist.iter tokens (fun (lnode,rnode,orth,lemma,pos,entry) ->
  97 + text_fragments.(lnode) <- IntMap.add text_fragments.(lnode) rnode orth);
  98 + Int.iter_down 0 (last - 1) (fun i ->
  99 + let map = IntMap.fold text_fragments.(i) text_fragments.(i) (fun map j orth ->
  100 + if j = last then map else
  101 + IntMap.fold text_fragments.(j) map (fun map k orth2 ->
  102 + IntMap.add map k (orth ^ " " ^ orth2))) in
  103 + text_fragments.(i) <- map);
  104 + text_fragments
  105 +
93 106 let test_example (name,tokens,last) =
94 107 ENIAM_LCGreductions.reset_variant_label ();
95 108 let chart = create_chart tokens last in
96   - ENIAM_LCGlatexOf.print_chart "results/" (name^"1_chart") "a3" chart;
  109 + let text_fragments = create_text_fragments tokens last in
  110 + ENIAM_LCGlatexOf.print_chart "results/" (name^"1_chart") "a3" text_fragments chart;
97 111 let chart,references = ENIAM_LCGchart.lazify chart in
98   - ENIAM_LCGlatexOf.print_chart "results/" (name^"2_chart") "a4" chart;
  112 + ENIAM_LCGlatexOf.print_chart "results/" (name^"2_chart") "a4" text_fragments chart;
99 113 ENIAM_LCGlatexOf.print_references "results/" (name^"2_references") "a4" references;
100 114 let chart = ENIAM_LCGchart.parse chart references 30. Sys.time in (* uwaga: niejawna zmiana imperatywna w references *)
101   - ENIAM_LCGlatexOf.print_chart "results/" (name^"3_chart") "a4" chart;
  115 + ENIAM_LCGlatexOf.print_chart "results/" (name^"3_chart") "a4" text_fragments chart;
102 116 ENIAM_LCGlatexOf.print_references "results/" (name^"3_references") "a4" references;
103 117 if ENIAM_LCGchart.is_parsed chart then (
104 118 let term = ENIAM_LCGchart.get_parsed_term chart in
... ...
NKJP2/ENIAM_NKJP.ml
... ... @@ -330,6 +330,42 @@ let rec split_front rev n p =
330 330 if n = 0 then List.rev rev, p else
331 331 split_front (List.hd p :: rev) (n-1) (List.tl p)
332 332  
  333 +let rec combine_three = function
  334 + [],[],[] -> []
  335 + | x1 :: l1, x2 :: l2, x3 :: l3 -> (x1,x2,x3) :: combine_three (l1,l2,l3)
  336 + | _ -> failwith "combine_three"
  337 +
  338 +type split = Single of string | Split of (string * string * string * string list) list | Correct
  339 +
  340 +type err = Err | ErrTagE | TagE | TErr | DErr | CErr | Corr
  341 +
  342 +let parse_err = function
  343 + "ERR" -> Err
  344 + | "ERR-TAGE" -> ErrTagE
  345 + | "TAGE" -> TagE
  346 + | "TERR" -> TErr
  347 + | "DERR" -> DErr
  348 + | "CERR" -> CErr
  349 + | s -> failwith ("parser_err: " ^ s)
  350 +
  351 +let load_err_corr err_corr_filename =
  352 + File.fold_tab err_corr_filename StringMap.empty (fun err_corr -> function
  353 + [real_orth;lemma;orth;interp;freq;compos;sgjp;common;err] ->
  354 + StringMap.add_inc err_corr (real_orth^"\t"^lemma^"\t"^interp) (Single orth,parse_err err) (fun _ -> failwith "load_err_corr")
  355 + | [real_orth;lemma;interp;split_orth;split_lemma;split_interp;freq;compos;sgjp;common;err] ->
  356 + let l = combine_three (Xstring.split "|" split_orth,Xstring.split "|" split_lemma,Xstring.split "|" split_interp) in
  357 + let l = Xlist.map l (fun (orth,lemma,interp) ->
  358 + match Xstring.split ":" interp with
  359 + cat :: interp -> orth,lemma,cat,interp
  360 + | _ -> failwith "load_err_corr") in
  361 + StringMap.add_inc err_corr (real_orth^"\t"^lemma^"\t"^interp) (Split l,parse_err err) (fun _ -> failwith "load_err_corr")
  362 + | l -> print_endline ("load_err_corr: " ^ String.concat "\t" l); err_corr)
  363 + (* oooo o o interj 1 NCOMPOS NON-SGJP CW DERR
  364 + o opylać opyla fin:sg:ter:imperf 1 NCOMPOS NON-SGJP CW ERR *)
  365 + (* napewno napewno qub na|pewno na|pewno prep:acc|adv:pos 2 NCOMPOS NON-SGJP CW ERR *)
  366 +
  367 +let err_corr = load_err_corr "../resources/NKJP1M/NKJP1M-frequency-with-corrections.tab"
  368 +
333 369 let match_tokens name id_p s sentences =
334 370 let p = Xunicode.utf8_chars_of_utf8_string s in
335 371 let len = Xlist.size p in
... ... @@ -340,8 +376,43 @@ let match_tokens name id_p s sentences =
340 376 (* if no_spaces>0 && (nps || i=0) then Printf.printf "match_tokens spaces: %s %n i=%d beg=%d len=%d\n" name id_p i beg len; *)
341 377 let i = i+no_spaces in
342 378 let real_orth,p = split_front [] len p in
343   - if beg = i then i+len, p, (beg,len,no_spaces,String.concat "" real_orth,orth,lemma,cat,interp) :: tokens else
344   - failwith (Printf.sprintf "match_tokens 1: %s %n i=%d beg=%d len=%d" name id_p i beg len)) in
  379 + let split,err = try StringMap.find err_corr (orth ^ "\t" ^ lemma ^ "\t" ^ String.concat ":" (cat :: interp)) with Not_found -> Correct,Corr in
  380 + if beg <> i then failwith (Printf.sprintf "match_tokens 1: %s %n i=%d beg=%d len=%d" name id_p i beg len) else (
  381 + (* if err <> Corr then Printf.printf "match_tokens err: orth=%s lemma=%s cat=%s\n" orth lemma cat; *)
  382 + match split with
  383 + Correct ->
  384 + i+len, p, (beg,len,no_spaces,String.concat "" real_orth,orth,lemma,cat,interp) :: tokens
  385 + | Single new_orth ->
  386 + let new_orth = if err = TErr then orth else new_orth in
  387 + i+len, p, (beg,len,no_spaces,String.concat "" real_orth,new_orth,lemma,cat,interp) :: tokens
  388 + | Split["w",lemma1,cat1,interp1;"ogóle",lemma2,cat2,interp2] ->
  389 + if "wogole" = String.concat "" real_orth then
  390 + i+len, p, (beg+1,len-1,0,"ogole","ogóle",lemma2,cat2,interp2) :: (beg,1,no_spaces,"w","w",lemma1,cat1,interp1) :: tokens else
  391 + if "wogóle" <> String.concat "" real_orth then failwith (Printf.sprintf "match_tokens 3: wogole") else
  392 + i+len, p, (beg+1,len-1,0,"ogóle","ogóle",lemma2,cat2,interp2) :: (beg,1,no_spaces,"w","w",lemma1,cat1,interp1) :: tokens
  393 + | Split["z",lemma1,cat1,interp1;"pewnością",lemma2,cat2,interp2] ->
  394 + if "spewnością" <> String.concat "" real_orth then failwith (Printf.sprintf "match_tokens 3: spewnością") else
  395 + i+len, p, (beg+1,len-1,0,"pewnością","pewnością",lemma2,cat2,interp2) :: (beg,1,no_spaces,"s","z",lemma1,cat1,interp1) :: tokens
  396 + | Split["z",lemma1,cat1,interp1;"powrotem",lemma2,cat2,interp2] ->
  397 + if "spowrotem" <> String.concat "" real_orth then failwith (Printf.sprintf "match_tokens 3: spowrotem") else
  398 + i+len, p, (beg+1,len-1,0,"powrotem","powrotem",lemma2,cat2,interp2) :: (beg,1,no_spaces,"s","z",lemma1,cat1,interp1) :: tokens
  399 + | Split["Słyszała",lemma1,cat1,interp1;"m",lemma2,cat2,interp2] ->
  400 + if "Słyszalam" <> String.concat "" real_orth then failwith (Printf.sprintf "match_tokens 3: Słyszalam") else
  401 + i+len, p, (beg+len-1,1,0,"m","m",lemma2,cat2,interp2) :: (beg,len-1,no_spaces,"Słyszala","Słyszała",lemma1,cat1,interp1) :: tokens
  402 + | Split[orth1,lemma1,cat1,interp1;orth2,lemma2,cat2,interp2] ->
  403 + if orth1 ^ orth2 <> String.concat "" real_orth then failwith (Printf.sprintf "match_tokens 3: %s|%s <> %s" orth1 orth2 (String.concat "" real_orth)) else
  404 + let len1 = Xlist.size (Xunicode.utf8_chars_of_utf8_string orth1) in
  405 + let len2 = Xlist.size (Xunicode.utf8_chars_of_utf8_string orth2) in
  406 + if len1 + len2 <> len then failwith "match_tokens 4" else
  407 + i+len, p, (beg+len1,len2,0,orth2,orth2,lemma2,cat2,interp2) :: (beg,len1,no_spaces,orth1,orth1,lemma1,cat1,interp1) :: tokens
  408 + | Split[orth1,lemma1,cat1,interp1;orth2,lemma2,cat2,interp2;orth3,lemma3,cat3,interp3] ->
  409 + if orth1 ^ orth2 ^ orth3 <> String.concat "" real_orth then failwith (Printf.sprintf "match_tokens 5: %s|%s|%s <> %s" orth1 orth2 orth3 (String.concat "" real_orth)) else
  410 + let len1 = Xlist.size (Xunicode.utf8_chars_of_utf8_string orth1) in
  411 + let len2 = Xlist.size (Xunicode.utf8_chars_of_utf8_string orth2) in
  412 + let len3 = Xlist.size (Xunicode.utf8_chars_of_utf8_string orth3) in
  413 + if len1 + len2 + len3 <> len then failwith "match_tokens 6" else
  414 + i+len, p, (beg+len1+len2,len3,0,orth3,orth3,lemma3,cat3,interp3) :: (beg+len1,len2,0,orth2,orth2,lemma2,cat2,interp2) :: (beg,len1,no_spaces,orth1,orth1,lemma1,cat1,interp1) :: tokens
  415 + | Split _ -> failwith "match_tokens: ni")) in
345 416 i,p,(id_s,List.rev tokens,named_tokens) :: sentences) in
346 417 let no_spaces,p = get_spaces 0 p in
347 418 if i+no_spaces <> len then failwith (Printf.sprintf "match_tokens 2: %s %n i=%d len=%d p='%s'" name id_p i len (String.concat "" p))
... ...
NKJP2/spelling.ml
... ... @@ -34,7 +34,7 @@ let generate_error_sentences sentences =
34 34 let no_tokens = Xlist.size tokens in
35 35 let tokens,prev_orth,prev_cat = Xlist.fold tokens ([],prev_orth,prev_cat) (fun (tokens,prev_orth,prev_cat) (_,_,no_spaces,real_orth,orth,_,cat,_) ->
36 36 let tokens = Int.fold 1 no_spaces tokens (fun tokens _ -> xml_space :: tokens) in
37   - let tokens = if no_spaces = 0 && ValidateTokenizer.is_space_required prev_orth prev_cat orth cat then xml_err_space:: tokens else tokens in
  37 + let tokens = if no_spaces = 0 && ValidateTokenizer.is_space_required prev_orth prev_cat orth cat then xml_err_space :: tokens else tokens in
38 38 (make_xml_token real_orth orth) :: tokens, orth, cat) in
39 39 Xml.Element("s",["id",id_s;"length",string_of_int no_tokens],merge_pcdata (List.rev tokens)) :: sentences,prev_orth,prev_cat) in
40 40 Xml.Element("p",[],List.rev sentences)
... ...
exec/ENIAMexec.ml
... ... @@ -81,9 +81,25 @@ let create_chart rules tokens lex_sems paths last =
81 81 ENIAM_LCGchart.add_inc_list chart lnode rnode l 0) in
82 82 chart
83 83  
  84 +let create_text_fragments tokens paths last =
  85 + let text_fragments = Array.make last IntMap.empty in
  86 + Xlist.iter paths (fun (id,lnode,rnode) ->
  87 + let t = ExtArray.get tokens id in
  88 + let orth = if t.ENIAMtokenizerTypes.beg + t.ENIAMtokenizerTypes.len = t.ENIAMtokenizerTypes.next
  89 + then t.ENIAMtokenizerTypes.orth else t.ENIAMtokenizerTypes.orth ^ " " in
  90 + text_fragments.(lnode) <- IntMap.add text_fragments.(lnode) rnode orth);
  91 + Int.iter_down 0 (last - 1) (fun i ->
  92 + let map = IntMap.fold text_fragments.(i) text_fragments.(i) (fun map j orth ->
  93 + if j = last then map else
  94 + IntMap.fold text_fragments.(j) map (fun map k orth2 ->
  95 + IntMap.add map k (orth ^ orth2))) in
  96 + text_fragments.(i) <- map);
  97 + text_fragments
  98 +
84 99 let eniam_parse_sentence timeout verbosity rules tokens lex_sems paths last =
85 100 ENIAM_LCGreductions.reset_variant_label ();
86 101 let result = {empty_eniam_parse_result with paths_size = Xlist.size paths} in
  102 + let result = if verbosity = 0 then result else {result with text_fragments=create_text_fragments tokens paths last} in
87 103 let time1 = time_fun () in
88 104 try
89 105 let chart = create_chart rules tokens lex_sems paths last in
... ...
exec/ENIAMexecTypes.ml
... ... @@ -18,6 +18,7 @@
18 18 *)
19 19  
20 20 open ENIAM_LCGtypes
  21 +open Xstd
21 22  
22 23 type status = Idle | PreprocessingError | LexiconError | ParseError | ParseTimeout | Parsed | TooManyNodes | NotParsed | NotReduced | ReductionError | SemError | NotTranslated
23 24  
... ... @@ -41,6 +42,7 @@ type eniam_parse_result = {
41 42 dependency_tree4: linear_term array;
42 43 dependency_tree5: linear_term array;
43 44 dependency_tree6: linear_term array;
  45 + text_fragments: string IntMap.t array;
44 46 }
45 47 (*
46 48 type conll_parse_result = {
... ... @@ -175,6 +177,7 @@ let empty_eniam_parse_result = {
175 177 dependency_tree4=[| |];
176 178 dependency_tree5=[| |];
177 179 dependency_tree6=[| |];
  180 + text_fragments=[| |];
178 181 }
179 182  
180 183 (*
... ...
exec/ENIAMsemLexicon.ml 0 → 100644
  1 +(*
  2 + * ENIAM_LCGlexicon is a library that provides LCG lexicon form Polish
  3 + * Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
  4 + * Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
  5 + *
  6 + * This library is free software: you can redistribute it and/or modify
  7 + * it under the terms of the GNU Lesser General Public License as published by
  8 + * the Free Software Foundation, either version 3 of the License, or
  9 + * (at your option) any later version.
  10 + *
  11 + * This library is distributed in the hope that it will be useful,
  12 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14 + * GNU Lesser General Public License for more details.
  15 + *
  16 + * You should have received a copy of the GNU Lesser General Public License
  17 + * along with this program. If not, see <http://www.gnu.org/licenses/>.
  18 + *)
  19 +
  20 +open Xstd
  21 +open ENIAM_LCGtypes
  22 +open Lexer
  23 +open ENIAMwalTypes
  24 +open ENIAMlexSemanticsTypes
  25 +
  26 +let remove_comments line =
  27 + try
  28 + let n = String.index line '#' in
  29 + String.sub line 0 n
  30 + with Not_found -> line
  31 +
  32 +let rec manage_tokens = function
  33 + [arg;[T role]] -> [arg,role]
  34 + | arg :: (T role :: arg2) :: tokens -> (arg,role) :: manage_tokens (arg2 :: tokens)
  35 + | _ -> failwith "manage_tokens"
  36 +
  37 +let parse_dir p = function
  38 + T "/" :: tokens -> tokens, {p with dir=Forward_}
  39 + | T "\\" :: tokens -> tokens, {p with dir=Backward_}
  40 + | T "|" :: tokens -> tokens, {p with dir=Both_}
  41 + | tokens -> failwith ("parse_dir: " ^ Lexer.string_of_token_list tokens)
  42 +
  43 +let parse_multi p = function
  44 + T "?" :: tokens -> tokens, {p with is_necessary=Multi}
  45 + | tokens -> tokens,p
  46 +
  47 +let parse_morf p = function
  48 + [T "1"] -> {p with is_necessary=Opt}
  49 + | tokens ->
  50 + let l = Xlist.map (Lexer.split_symbol (T "*") [] tokens) (function
  51 + [T s] -> Atom s
  52 + | tokens -> failwith ("parse_morf: " ^ Lexer.string_of_token_list tokens)) in
  53 + {p with morfs=LCG (Tensor l) :: p.morfs}
  54 +
  55 +let parse_arg tokens p =
  56 + (* Printf.printf "parse_arg: %s\n" (Lexer.string_of_token_list tokens); *)
  57 + let tokens,p = parse_dir p tokens in
  58 + let tokens,p = parse_multi p tokens in
  59 + match Lexer.find_brackets ["(",")"] [] tokens with
  60 + [B("(",")",tokens)] -> Xlist.fold (Lexer.split_symbol (T "+") [] tokens) p parse_morf
  61 + | tokens -> parse_morf p tokens
  62 +
  63 +
  64 +let parse_role p = function
  65 + "adjunct" -> {p with gf=ADJUNCT}
  66 + | "unk" -> {p with role="unk"}
  67 + | "nosem" -> {p with gf=NOSEM}
  68 + | "Count" -> {p with role="Count"}
  69 + | "Measure" -> {p with role="Measure"}
  70 + | s -> failwith ("parse_role: " ^ s)
  71 +
  72 +let parse_entry = function
  73 + [T symbol; T ":"; T "null"] -> symbol,[]
  74 + | T symbol :: T ":" :: tokens ->
  75 + (* Printf.printf "parse_entry: %s\n" (Lexer.string_of_token_list tokens); *)
  76 + let tokens = Lexer.split_symbol (T ":") [] tokens in
  77 + let tokens = manage_tokens tokens in
  78 + let positions = Xlist.map tokens (fun (arg,role) ->
  79 + parse_arg arg (parse_role {empty_position with is_necessary=Req} role)) in
  80 + symbol,positions
  81 + | tokens -> failwith ("parse_entry: " ^ Lexer.string_of_token_list tokens)
  82 +
  83 +let load_lexicon filename =
  84 + let lines = File.load_lines filename in
  85 + let lines = List.rev (Xlist.rev_map lines remove_comments) in
  86 + let tokens = List.flatten (List.rev (Xlist.rev_map lines (Lexer.split "\\]\\| \\|\t\\|\r\\|\\?\\|:\\|;\\|&\\|!\\|=\\|}\\|{\\|,\\|\\*\\|/\\|\\+\\|)\\|(\\||\\|\\[\\|\\"))) in
  87 + let tokens = List.rev (Xlist.fold tokens [] (fun tokens -> function
  88 + T " " -> tokens
  89 + | T "\t" -> tokens
  90 + | T "\r" -> tokens
  91 + | t -> t :: tokens)) in
  92 + let entries = Lexer.split_symbol (T ";") [] tokens in
  93 + Xlist.fold entries StringMap.empty (fun map entry ->
  94 + let symbol,args = parse_entry entry in
  95 + StringMap.add_inc map symbol args (fun _ -> failwith ("load_lexicon: " ^ symbol)))
  96 +
  97 +let sem_lexicon = load_lexicon "resources/lexicon-pl.dic"
  98 +
  99 +let extend_frame symbol frame =
  100 + try
  101 + let positions = StringMap.find sem_lexicon symbol in
  102 + {frame with positions=positions @ frame.positions}
  103 + with Not_found -> failwith ("extend_frame: " ^ symbol)
... ...
exec/ENIAMsemValence.ml 0 → 100644
  1 +(*
  2 + * ENIAMexec implements ENIAM processing stream
  3 + * Copyright (C) 2016-2017 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
  4 + * Copyright (C) 2016-2017 Institute of Computer Science Polish Academy of Sciences
  5 + *
  6 + * This library is free software: you can redistribute it and/or modify
  7 + * it under the terms of the GNU Lesser General Public License as published by
  8 + * the Free Software Foundation, either version 3 of the License, or
  9 + * (at your option) any later version.
  10 + *
  11 + * This library is distributed in the hope that it will be useful,
  12 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14 + * GNU Lesser General Public License for more details.
  15 + *
  16 + * You should have received a copy of the GNU Lesser General Public License
  17 + * along with this program. If not, see <http://www.gnu.org/licenses/>.
  18 + *)
  19 +
  20 +open ENIAMexecTypes
  21 +open ENIAM_LCGtypes
  22 +open ENIAM_LCGlexiconTypes
  23 +open ENIAMlexSemanticsTypes
  24 +open Xstd
  25 +
  26 +type pos = {role: linear_term; role_attr: linear_term; selprefs: linear_term; gf: ENIAMwalTypes.gf;
  27 + cr: string list; ce: string list;
  28 + is_necessary: bool; is_pro: bool; is_prong: bool; is_multi: bool; dir: string; morfs: StringSet.t}
  29 +
  30 +let match_value v2 = function
  31 + Val v -> if v = v2 then Val v else raise Not_found
  32 + | _ -> failwith "match_value"
  33 +
  34 +let match_neg_value vals = function
  35 + Val v -> if Xlist.mem vals v then raise Not_found else Val v
  36 + | _ -> failwith "match_neg_value"
  37 +
  38 +let rec apply_selector v2 = function
  39 + (sel,[]) -> failwith ("apply_selector: " ^ ENIAMcategoriesPL.string_of_selector sel)
  40 + | Negation,("NEGATION",v) :: l -> ("NEGATION",match_value v2 v) :: l
  41 + | Aspect,("ASPECT",v) :: l -> ("ASPECT",match_value v2 v) :: l
  42 + | Mood,("MOOD",v) :: l -> ("MOOD",match_value v2 v) :: l
  43 + | Nsyn,("NSYN",v) :: l -> ("NSYN",match_value v2 v) :: l
  44 + | Nsem,("NSEM",v) :: l -> ("NSEM",match_value v2 v) :: l
  45 + | Case,("CASE",v) :: l -> ("CASE",match_value v2 v) :: l
  46 + | Mode,("MODE",v) :: l -> ("MODE",match_value v2 v) :: l
  47 + | sel,(attr,v) :: l -> print_endline ("apply_selector: " ^ ENIAMcategoriesPL.string_of_selector sel ^ " " ^ attr); (attr,v) :: (apply_selector v2 (sel,l))
  48 +
  49 +let rec apply_neg_selector vals = function
  50 + (sel,[]) -> failwith ("apply_neg_selector: " ^ ENIAMcategoriesPL.string_of_selector sel)
  51 + | Nsem,("NSEM",v) :: l -> ("NSEM",match_neg_value vals v) :: l
  52 + | Case,("CASE",v) :: l -> ("CASE",match_neg_value vals v) :: l
  53 + | sel,(attr,v) :: l -> (*print_endline ("apply_neg_selector: " ^ ENIAMcategoriesPL.string_of_selector sel ^ " " ^ attr);*) (attr,v) :: (apply_neg_selector vals (sel,l))
  54 +
  55 +let rec apply_selectors attrs = function
  56 + [] -> attrs
  57 + | (sel,Eq,[v]) :: l -> apply_selectors (apply_selector v (sel,attrs)) l
  58 + | (sel,Neq,vals) :: l -> apply_selectors (apply_neg_selector vals (sel,attrs)) l
  59 + | _ -> failwith "apply_selectors"
  60 +
  61 +module OrderedStringDir =
  62 + struct
  63 + type t = string * string
  64 + let compare = compare
  65 + end
  66 +
  67 +module StringDirMap = Xmap.Make(OrderedStringDir)
  68 +
  69 +let rec get_arg_symbols_variant arg_symbols = function
  70 + Ref i ->
  71 + let l,dir = arg_symbols.(i) in
  72 + Xlist.map l (fun s -> (s,dir),Ref i)
  73 + | Variant(e,l) ->
  74 + let map = Xlist.fold l StringDirMap.empty (fun map (i,t) ->
  75 + Xlist.fold (get_arg_symbols_variant arg_symbols t) map (fun map (arg_symbol,t) ->
  76 + StringDirMap.add_inc map arg_symbol [i,t] (fun l -> (i,t) :: l))) in
  77 + StringDirMap.fold map [] (fun found arg_symbol l -> (arg_symbol,Variant(e,l)) :: found)
  78 + | t -> failwith ("get_arg_symbols_variant: " ^ ENIAM_LCGstringOf.linear_term 0 t)
  79 +
  80 +let rec get_arg_symbols_tuple arg_symbols rev = function
  81 + Dot -> rev
  82 + | Tuple l -> Xlist.fold l rev (get_arg_symbols_tuple arg_symbols)
  83 + | t -> (get_arg_symbols_variant arg_symbols t) :: rev
  84 +
  85 +let string_of_argdir = function
  86 + "forward" -> "/"
  87 + | "backward" -> "\\"
  88 + | "both" -> "|"
  89 + | _ -> failwith "string_of_argdir"
  90 +
  91 +let string_of_arg arg =
  92 + String.concat ", " (Xlist.map arg (fun ((arg_symbol,dir),t) -> (string_of_argdir dir) ^ arg_symbol ^ ":" ^ ENIAM_LCGstringOf.linear_term 0 t))
  93 +
  94 +let string_of_position p =
  95 + (string_of_argdir p.dir) ^ String.concat "+" (StringSet.to_list p.morfs)
  96 +
  97 +let rec match_arg_positions arg rev = function
  98 + p :: positions ->
  99 + Printf.printf "match_arg_positions: arg=%s rev=[%s] positions=%s :: [%s]\n%!" (string_of_arg arg) (String.concat "; " (Xlist.map rev string_of_position)) (string_of_position p) (String.concat "; " (Xlist.map positions string_of_position));
  100 + let l = Xlist.fold arg [] (fun l ((arg_symbol,dir),t) ->
  101 + if StringSet.mem p.morfs arg_symbol && p.dir = dir then t :: l else l) in
  102 + (match l with
  103 + [] -> print_endline "match_arg_positions: not matched"; match_arg_positions arg (p :: rev) positions
  104 + | [t] ->
  105 + let t = if p.gf = ENIAMwalTypes.SUBJ || p.gf = ENIAMwalTypes.OBJ || p.gf = ENIAMwalTypes.ARG then
  106 + SetAttr("role",p.role,SetAttr("role_attr",p.role_attr,SetAttr("selprefs",p.selprefs,t)))
  107 + else if p.gf = ENIAMwalTypes.ADJUNCT then t else failwith "match_arg_positions: ni 2" in
  108 + let t = SetAttr("gf",Val (ENIAMwalStringOf.gf p.gf),t) in
  109 + let t = Xlist.fold p.cr t (fun t cr -> SetAttr("controller",Val cr,t)) in
  110 + let t = Xlist.fold p.ce t (fun t ce -> SetAttr("controllee",Val ce,t)) in
  111 + if p.is_multi then (t, rev @ (p :: positions)) :: (match_arg_positions arg (p :: rev) positions)
  112 + else (t, rev @ positions) :: (match_arg_positions arg (p :: rev) positions)
  113 + | _ -> failwith "match_arg_positions: ni")
  114 + | [] -> Printf.printf "match_arg_positions: arg=%s rev=[%s] positions=[]\n%!" (string_of_arg arg) (String.concat "; " (Xlist.map rev string_of_position)); []
  115 +
  116 +(* Jeśli ta funkcja zwróci pustą listę, oznacza to, że argumentów nie dało się dopasować do pozycji *)
  117 +let rec match_args_positions_rec positions = function
  118 + arg :: args ->
  119 + Printf.printf "match_args_positions_rec: args=%s :: [%s] positions=[%s]\n%!" (string_of_arg arg) (String.concat "; " (Xlist.map args string_of_arg)) (String.concat "; " (Xlist.map positions string_of_position));
  120 + Xlist.fold (match_arg_positions arg [] positions) [] (fun found (arg_pos,positions) ->
  121 + Xlist.fold (match_args_positions_rec positions args) found (fun found l -> (arg_pos :: l) :: found))
  122 + | [] ->
  123 + Printf.printf "match_args_positions_rec: args=[] positions=[%s]\n%!" (String.concat "; " (Xlist.map positions string_of_position));
  124 + let b = Xlist.fold positions false (fun b p -> p.is_necessary || b) in
  125 + if b then print_endline "match_args_positions: not matched";
  126 + if b then [] else
  127 + [Xlist.fold positions [] (fun found p ->
  128 + if not p.is_pro then found else
  129 + let attrs = ["role",p.role; "role_attr",p.role_attr; "selprefs",p.selprefs; "gf",Val (ENIAMwalStringOf.gf p.gf)] in
  130 + let attrs = if p.is_prong then attrs else attrs in (* FIXME: dodać number, gender *)
  131 + let attrs = Xlist.fold p.cr attrs (fun attrs cr -> ("controller",Val cr) :: attrs) in
  132 + let attrs = Xlist.fold p.ce attrs (fun attrs ce -> ("controllee",Val ce) :: attrs) in
  133 + Node{ENIAM_LCGrenderer.empty_node with lemma="pro"; pos="pro"; attrs=attrs} :: found)]
  134 +
  135 +(* FIXME: opcjonalność podrzędników argumentów zleksykalizowanych *)
  136 +
  137 +(* Jeśli ta funkcja zwróci pustą listę, oznacza to, że argumentów nie dało się dopasować do pozycji *)
  138 +let match_args_positions args positions =
  139 + Printf.printf "match_args_positions: args=[%s] positions=[%s]\n%!" (String.concat "; " (Xlist.map args string_of_arg)) (String.concat "; " (Xlist.map positions string_of_position));
  140 + Xlist.rev_map (match_args_positions_rec positions args) (function
  141 + [] -> Dot
  142 + | [t] -> t
  143 + | l -> Tuple l)
  144 +
  145 +let translate_selprefs = function
  146 + ENIAMwalTypes.SynsetId _ -> failwith "translate_selprefs"
  147 + | ENIAMwalTypes.Predef _ -> failwith "translate_selprefs"
  148 + | ENIAMwalTypes.SynsetName s -> s
  149 + | ENIAMwalTypes.RelationRole _ -> "ALL"
  150 +
  151 +let string_of_internal_morf = function
  152 + Atom s -> s
  153 + | AVar s -> s
  154 + | Top -> "T"
  155 + | t -> failwith ("string_of_internal_morf: " ^ ENIAM_LCGstringOf.internal_grammar_symbol_prime t)
  156 +
  157 +
  158 +let string_of_morf = function
  159 + ENIAMwalTypes.LCG Tensor l -> String.concat "*" (Xlist.map l string_of_internal_morf)
  160 + | ENIAMwalTypes.LCG t -> failwith ("string_of_morf: " ^ ENIAM_LCGstringOf.grammar_symbol_prime t)
  161 + | _ -> failwith "string_of_morf"
  162 +
  163 +let rec string_of_arg_symbol = function
  164 + Dot -> ""
  165 + | Val s -> s
  166 + | Tuple l -> String.concat "*" (Xlist.map l string_of_arg_symbol)
  167 + | t -> failwith ("string_of_arg_symbol: " ^ ENIAM_LCGstringOf.linear_term 0 t)
  168 +
  169 +let translate_dir = function
  170 + ENIAMwalTypes.Both_ -> "both"
  171 + | ENIAMwalTypes.Forward_ -> "forward"
  172 + | ENIAMwalTypes.Backward_ -> "backward"
  173 +
  174 +let translate_position id p =
  175 + {role = Val p.ENIAMwalTypes.role;
  176 + role_attr = Val p.ENIAMwalTypes.role_attr;
  177 + selprefs = (match Xlist.map p.ENIAMwalTypes.sel_prefs translate_selprefs with
  178 + [] -> Dot
  179 + | [s] -> Val s
  180 + | l -> Tuple(Xlist.rev_map l (fun s -> Val s)));
  181 + gf=p.ENIAMwalTypes.gf;
  182 + cr=Xlist.map p.ENIAMwalTypes.cr (fun cr -> id ^ "-" ^ cr);
  183 + ce=Xlist.map p.ENIAMwalTypes.ce (fun ce -> id ^ "-" ^ ce);
  184 + is_necessary = p.ENIAMwalTypes.is_necessary = ENIAMwalTypes.Req(*Xlist.fold p.ENIAMwalTypes.morfs true (fun b -> function ENIAMwalTypes.LCG One -> false | _ -> b)*);
  185 + is_pro = p.ENIAMwalTypes.is_necessary = ENIAMwalTypes.Pro || p.ENIAMwalTypes.is_necessary = ENIAMwalTypes.ProNG;
  186 + is_prong = p.ENIAMwalTypes.is_necessary = ENIAMwalTypes.ProNG;
  187 + is_multi = p.ENIAMwalTypes.is_necessary = ENIAMwalTypes.Multi;
  188 + dir= translate_dir p.ENIAMwalTypes.dir;
  189 + morfs = Xlist.fold p.ENIAMwalTypes.morfs StringSet.empty (fun morfs morf ->
  190 + if morf = ENIAMwalTypes.LCG One then (Printf.printf "translate_position: One%!"; morfs) else
  191 + StringSet.add morfs (string_of_morf morf))}
  192 +
  193 +let get_phrase_symbol = function
  194 + Tuple[Val "lex";Val "się";Val "qub"] -> "lex-się-qub"
  195 + | Tuple(Val s :: _) -> s
  196 + | Val s -> s
  197 + (* | Dot -> "dot" *)
  198 + | t -> failwith ("get_phrase_symbol: " ^ ENIAM_LCGstringOf.linear_term 0 t)
  199 +
  200 +(* let extend_frame symbol = function *)
  201 +
  202 +let rec assign_frames_rec tokens lex_sems tree arg_symbols visited = function
  203 + Ref i ->
  204 + if IntSet.mem visited i then Ref i,visited else
  205 + let t,visited = assign_frames_rec tokens lex_sems tree arg_symbols (IntSet.add visited i) tree.(i) in
  206 + tree.(i) <- t;
  207 + Ref i,visited
  208 + | Node t ->
  209 + let args,visited = assign_frames_rec tokens lex_sems tree arg_symbols visited t.args in
  210 + let t = {t with args=args} in
  211 + if t.symbol = Dot then Node t,visited else
  212 + let args = get_arg_symbols_tuple arg_symbols [] args in
  213 + let s = ExtArray.get lex_sems t.id in
  214 + let symbol = get_phrase_symbol t.symbol in
  215 + let frames = Xlist.fold s.ENIAMlexSemanticsTypes.frames [] (fun frames frame ->
  216 + print_endline ("selectors: " ^ ENIAMcategoriesPL.string_of_selectors frame.selectors);
  217 + try
  218 + let attrs = apply_selectors t.attrs frame.selectors in
  219 + let frame = ENIAMsemLexicon.extend_frame symbol frame in
  220 + print_endline "passed";
  221 + (attrs,frame,Xlist.rev_map frame.positions (translate_position (string_of_int t.id))) :: frames
  222 + with Not_found -> print_endline "rejected"; frames) in
  223 + if frames = [] then failwith "assign_frames_rec: no frame" else
  224 + let e = ENIAM_LCGreductions.get_variant_label () in
  225 + let l,_ = Xlist.fold frames ([],1) (fun (l,n) (attrs,frame,positions) ->
  226 + Printf.printf "assign_frames_rec: lemma=%s args=[%s] positions=[%s]\n%!" t.lemma (String.concat "; " (Xlist.map args string_of_arg)) (String.concat "; " (Xlist.map positions string_of_position));
  227 + if frame.meanings = [] then failwith "assign_frames_rec: no meanings" else
  228 + Xlist.fold (match_args_positions args positions) (l,n) (fun (l,n) args ->
  229 + Xlist.fold frame.meanings (l,n) (fun (l,n) (meaning,hipero,weight) ->
  230 + (string_of_int n, Node{t with attrs=
  231 + ("meaning",Val meaning) ::
  232 + ("hipero",ENIAM_LCGrules.make_variant (Xlist.map hipero (fun (h,n) -> Tuple[Val h;Val(string_of_int n)]))) ::
  233 + ("arole",Val frame.arole) ::
  234 + ("arole-attr",Val frame.arole_attr) ::
  235 + ("arev",Val (if frame.arev then "+" else "-")) ::
  236 + ("fopinion",Val (ENIAMwalStringOf.opinion frame.fopinion)) ::
  237 + ("sopinion",Val (ENIAMwalStringOf.opinion frame.sopinion)) :: t.attrs; args=args}) ::
  238 + l,n+1))) in
  239 + if l = [] then failwith ("assign_frames_rec: no frame assingment found for " ^ t.lemma ^ " " ^ ENIAM_LCGstringOf.linear_term 0 t.symbol) else
  240 + Variant(e,l),visited
  241 + | Variant(e,l) ->
  242 + let l,visited = Xlist.fold l ([],visited) (fun (l,visited) (i,t) ->
  243 + let t,visited = assign_frames_rec tokens lex_sems tree arg_symbols visited t in
  244 + (i,t) :: l, visited) in
  245 + Variant(e,List.rev l),visited
  246 + | Tuple l ->
  247 + let l,visited = Xlist.fold l ([],visited) (fun (l,visited) t ->
  248 + let t,visited = assign_frames_rec tokens lex_sems tree arg_symbols visited t in
  249 + t :: l, visited) in
  250 + Tuple(List.rev l),visited
  251 + | Dot -> Dot,visited
  252 + | t -> failwith ("assign_frames_rec: " ^ ENIAM_LCGstringOf.linear_term 0 t)
  253 +
  254 +let rec get_arg_symbols = function
  255 + Node{arg_symbol=Tuple([Val "cp"; Val "T"; Val "T"]);
  256 + symbol=Tuple([Val "cp"; ctype; comp]); arg_dir=dir} ->
  257 + [string_of_arg_symbol (Tuple([Val "cp"; Val "T"; Val "T"]));
  258 + string_of_arg_symbol (Tuple([Val "cp"; ctype; comp]))],dir
  259 + | Node{arg_symbol=Tuple([Val "ncp"; Val "T"; Val arg_case; Val "T"; Val "T"; Val "T"; Val "T"]);
  260 + symbol=Tuple([Val "ncp"; number; case; gender; person; ctype; comp]); arg_dir=dir} ->
  261 + [string_of_arg_symbol (Tuple([Val "ncp"; Val "T"; Val arg_case; Val "T"; Val "T"; Val "T"; Val "T"]));
  262 + string_of_arg_symbol (Tuple([Val "ncp"; Val "T"; Val arg_case; Val "T"; Val "T"; ctype; comp]))],dir
  263 + | Node{arg_symbol=Tuple([Val "prepncp"; Val arg_prep; Val arg_case; Val "T"; Val "T"]);
  264 + symbol=Tuple([Val "prepncp"; prep; case; ctype; comp]); arg_dir=dir} ->
  265 + [string_of_arg_symbol (Tuple([Val "prepncp"; Val arg_prep; Val arg_case; Val "T"; Val "T"]));
  266 + string_of_arg_symbol (Tuple([Val "prepncp"; prep; case; ctype; comp]))],dir
  267 + | Node t -> [string_of_arg_symbol t.arg_symbol], t.arg_dir
  268 + | t -> failwith ("get_arg_symbols: " ^ ENIAM_LCGstringOf.linear_term 0 t)
  269 +
  270 +let assign_frames tokens lex_sems tree =
  271 + print_endline "assign_frames";
  272 + let tree = Array.copy tree in
  273 + let arg_symbols = Array.make (Array.length tree) ([],"") in
  274 + Int.iter 0 (Array.length tree - 1) (fun i ->
  275 + arg_symbols.(i) <- get_arg_symbols tree.(i));
  276 + let _ = assign_frames_rec tokens lex_sems tree arg_symbols IntSet.empty (Ref 0) in
  277 + tree
  278 +
  279 +let assign tokens lex_sems text =
  280 + map_text Struct (fun mode -> function
  281 + ENIAMSentence result ->
  282 + if result.status <> Parsed then ENIAMSentence result else
  283 + ENIAMSentence {result with dependency_tree6=assign_frames tokens lex_sems result.dependency_tree6}
  284 + | t -> t) text
  285 +
  286 +let rec cut_nodes result_tree = function
  287 + | Node t ->
  288 + let i = ExtArray.add result_tree (Node t) in
  289 + Ref i
  290 + | Variant(e,l) ->
  291 + let l = Xlist.rev_map l (fun (i,t) -> i, cut_nodes result_tree t) in
  292 + Variant(e,List.rev l)
  293 + | Tuple l ->
  294 + let l = Xlist.rev_map l (cut_nodes result_tree) in
  295 + Tuple(List.rev l)
  296 + | Dot -> Dot
  297 + | t -> failwith ("cut_nodes: " ^ ENIAM_LCGstringOf.linear_term 0 t)
  298 +
  299 +let rec reduce_set_attr attr v = function
  300 + Node t -> Node{t with attrs=(attr,v) :: t.attrs}
  301 + | Variant(e,l) ->
  302 + Variant(e,List.rev (Xlist.rev_map l (fun (i,t) ->
  303 + i, reduce_set_attr attr v t)))
  304 + | t -> failwith ("reduce_set_attr: " ^ ENIAM_LCGstringOf.linear_term 0 t)
  305 +
  306 +let rec reduce_tree_rec tokens lex_sems result_tree mid_tree orig_tree = function
  307 + Ref i ->
  308 + if mid_tree.(i) <> Dot then mid_tree.(i) else
  309 + let t = reduce_tree_rec tokens lex_sems result_tree mid_tree orig_tree orig_tree.(i) in
  310 + mid_tree.(i) <- t;
  311 + t
  312 + | Node t ->
  313 + let args = reduce_tree_rec tokens lex_sems result_tree mid_tree orig_tree t.args in
  314 + print_endline ("reduce_tree_rec 1: " ^ ENIAM_LCGstringOf.linear_term 0 args);
  315 + let args = cut_nodes result_tree args in
  316 + print_endline ("reduce_tree_rec 2: " ^ ENIAM_LCGstringOf.linear_term 0 args);
  317 + let id =
  318 + if t.id = 0 then
  319 + let id = ExtArray.add tokens {ENIAMtokenizerTypes.empty_token_env with ENIAMtokenizerTypes.token=ENIAMtokenizerTypes.Lemma("pro","pro",[[]])} in
  320 + let _ = ExtArray.add lex_sems empty_lex_sem in
  321 + id
  322 + else t.id in
  323 + Node{t with args=args; id=id}
  324 + | Variant(e,l) ->
  325 + let l = Xlist.rev_map l (fun (i,t) -> i, reduce_tree_rec tokens lex_sems result_tree mid_tree orig_tree t) in
  326 + Variant(e,List.rev l)
  327 + | Tuple l ->
  328 + let l = Xlist.rev_map l (reduce_tree_rec tokens lex_sems result_tree mid_tree orig_tree) in
  329 + Tuple(List.rev l)
  330 + | Dot -> Dot
  331 + | SetAttr(attr,v,t) ->
  332 + let t = reduce_tree_rec tokens lex_sems result_tree mid_tree orig_tree t in
  333 + reduce_set_attr attr v t
  334 + | t -> failwith ("reduce_tree_rec: " ^ ENIAM_LCGstringOf.linear_term 0 t)
  335 +
  336 +let reduce_tree tokens lex_sems orig_tree =
  337 + print_endline "reduce_tree";
  338 + let mid_tree = Array.make (Array.length orig_tree) Dot in
  339 + let result_tree = ExtArray.make (Array.length orig_tree) Dot in
  340 + let _ = ExtArray.add result_tree Dot in
  341 + let t = reduce_tree_rec tokens lex_sems result_tree mid_tree orig_tree orig_tree.(0) in
  342 + ExtArray.set result_tree 0 t;
  343 + ExtArray.to_array result_tree
  344 +
  345 +let reduce tokens lex_sems text =
  346 + map_text Struct (fun mode -> function
  347 + ENIAMSentence result ->
  348 + if result.status <> Parsed then ENIAMSentence result else
  349 + ENIAMSentence {result with dependency_tree6=reduce_tree tokens lex_sems result.dependency_tree6}
  350 + | t -> t) text
... ...
exec/ENIAMvisualization.ml
... ... @@ -774,8 +774,8 @@ let html_of_eniam_sentence path file_prefix img verbosity tokens (result : eniam
774 774 | LexiconError -> sprintf "error_lex: %s paths_size=%d\n" result.msg result.paths_size
775 775 | ParseError ->
776 776 if verbosity = 0 then () else (
777   - ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_1_chart") "a1" result.chart1;
778   - ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_2_chart") "a4" result.chart2;
  777 + ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_1_chart") "a1" result.text_fragments result.chart1;
  778 + ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_2_chart") "a4" result.text_fragments result.chart2;
779 779 ENIAM_LCGlatexOf.print_references path (file_prefix ^ "_2_references") "a0" result.references2);
780 780 sprintf "error_parse: %s paths_size=%d\n" result.msg result.paths_size ^
781 781 (if verbosity = 0 then "" else
... ... @@ -785,10 +785,10 @@ let html_of_eniam_sentence path file_prefix img verbosity tokens (result : eniam
785 785 ""
786 786 | ParseTimeout ->
787 787 if verbosity < 2 then () else (
788   - ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_1_chart") "a1" result.chart1;
  788 + ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_1_chart") "a1" result.text_fragments result.chart1;
789 789 ENIAM_LCGlatexOf.print_references path (file_prefix ^ "_2_references") "a0" result.references2);
790 790 if verbosity = 0 then () else (
791   - ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_2_chart") "a4" result.chart2);
  791 + ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_2_chart") "a4" result.text_fragments result.chart2);
792 792 sprintf "timeout: %s paths_size=%d\n" result.msg result.paths_size ^
793 793 (if verbosity < 2 then "" else
794 794 sprintf "<BR><A HREF=\"%s_1_chart.pdf\">Chart 1</A>\n" file_prefix ^
... ... @@ -798,13 +798,13 @@ let html_of_eniam_sentence path file_prefix img verbosity tokens (result : eniam
798 798 ""
799 799 | NotParsed ->
800 800 if verbosity = 0 then () else (
801   - ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_1_chart") "a1" result.chart1);
  801 + ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_1_chart") "a1" result.text_fragments result.chart1);
802 802 if verbosity < 2 then () else (
803   - ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_2_chart") "a4" result.chart2;
  803 + ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_2_chart") "a4" result.text_fragments result.chart2;
804 804 ENIAM_LCGlatexOf.print_references path (file_prefix ^ "_2_references") "a0" result.references2;
805 805 ENIAM_LCGlatexOf.print_references path (file_prefix ^ "_3_references") "a0" result.references3);
806 806 if verbosity = 0 then () else (
807   - ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_3_chart") "a4" result.chart3);
  807 + ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_3_chart") "a4" result.text_fragments result.chart3);
808 808 sprintf "not_parsed: paths_size=%d chart_size=%d\n" result.paths_size result.chart_size ^
809 809 (if verbosity = 0 then "" else
810 810 sprintf "<BR><A HREF=\"%s_1_chart.pdf\">Chart 1</A>\n" file_prefix) ^
... ... @@ -817,11 +817,11 @@ let html_of_eniam_sentence path file_prefix img verbosity tokens (result : eniam
817 817 ""
818 818 | ReductionError ->
819 819 if verbosity < 2 then () else (
820   - ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_2_chart") "a4" result.chart2;
  820 + ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_2_chart") "a4" result.text_fragments result.chart2;
821 821 ENIAM_LCGlatexOf.print_references path (file_prefix ^ "_2_references") "a0" result.references2;
822   - ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_3_chart") "a4" result.chart3);
  822 + ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_3_chart") "a4" result.text_fragments result.chart3);
823 823 if verbosity = 0 then () else (
824   - ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_1_chart") "a1" result.chart1;
  824 + ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_1_chart") "a1" result.text_fragments result.chart1;
825 825 ENIAM_LCGlatexOf.print_references path (file_prefix ^ "_3_references") "a0" result.references3);
826 826 (if verbosity < 2 then "" else
827 827 sprintf "error_reduction: %s paths_size=%d chart_size=%d\n" result.msg result.paths_size result.chart_size ^
... ... @@ -834,10 +834,10 @@ let html_of_eniam_sentence path file_prefix img verbosity tokens (result : eniam
834 834 ""
835 835 | TooManyNodes ->
836 836 if verbosity < 2 then () else (
837   - ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_1_chart") "a1" result.chart1;
838   - ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_2_chart") "a4" result.chart2;
  837 + ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_1_chart") "a1" result.text_fragments result.chart1;
  838 + ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_2_chart") "a4" result.text_fragments result.chart2;
839 839 ENIAM_LCGlatexOf.print_references path (file_prefix ^ "_2_references") "a0" result.references2;
840   - ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_3_chart") "a4" result.chart3;
  840 + ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_3_chart") "a4" result.text_fragments result.chart3;
841 841 ENIAM_LCGlatexOf.print_references path (file_prefix ^ "_3_references") "a0" result.references3);
842 842 sprintf "to_many_nodes: paths_size=%d chart_size=%d\n" result.paths_size result.chart_size ^
843 843 (if verbosity < 2 then "" else
... ... @@ -849,10 +849,10 @@ let html_of_eniam_sentence path file_prefix img verbosity tokens (result : eniam
849 849 ""
850 850 | NotReduced ->
851 851 if verbosity < 2 then () else (
852   - ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_1_chart") "a1" result.chart1;
853   - ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_2_chart") "a4" result.chart2;
  852 + ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_1_chart") "a1" result.text_fragments result.chart1;
  853 + ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_2_chart") "a4" result.text_fragments result.chart2;
854 854 ENIAM_LCGlatexOf.print_references path (file_prefix ^ "_2_references") "a0" result.references2;
855   - ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_3_chart") "a4" result.chart3);
  855 + ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_3_chart") "a4" result.text_fragments result.chart3);
856 856 if verbosity = 0 then () else (
857 857 ENIAM_LCGlatexOf.print_references path (file_prefix ^ "_3_references") "a0" result.references3;
858 858 Xlatex.latex_file_out path (file_prefix ^ "_4_term") "a4" false (fun file ->
... ... @@ -872,10 +872,10 @@ let html_of_eniam_sentence path file_prefix img verbosity tokens (result : eniam
872 872 ""
873 873 | SemError ->
874 874 if verbosity < 2 then () else (
875   - ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_1_chart") "a1" result.chart1;
876   - ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_2_chart") "a4" result.chart2;
  875 + ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_1_chart") "a1" result.text_fragments result.chart1;
  876 + ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_2_chart") "a4" result.text_fragments result.chart2;
877 877 ENIAM_LCGlatexOf.print_references path (file_prefix ^ "_2_references") "a0" result.references2;
878   - ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_3_chart") "a4" result.chart3);
  878 + ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_3_chart") "a4" result.text_fragments result.chart3);
879 879 if verbosity = 0 then () else (
880 880 ENIAM_LCGlatexOf.print_references path (file_prefix ^ "_3_references") "a0" result.references3;
881 881 Xlatex.latex_file_out path (file_prefix ^ "_4_term") "a4" false (fun file ->
... ... @@ -895,10 +895,10 @@ let html_of_eniam_sentence path file_prefix img verbosity tokens (result : eniam
895 895 ""
896 896 | Parsed ->
897 897 if verbosity < 2 then () else (
898   - ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_1_chart") "a1" result.chart1;
899   - ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_2_chart") "a4" result.chart2;
  898 + ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_1_chart") "a1" result.text_fragments result.chart1;
  899 + ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_2_chart") "a4" result.text_fragments result.chart2;
900 900 ENIAM_LCGlatexOf.print_references path (file_prefix ^ "_2_references") "a0" result.references2;
901   - ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_3_chart") "a4" result.chart3;
  901 + ENIAM_LCGlatexOf.print_chart path (file_prefix ^ "_3_chart") "a4" result.text_fragments result.chart3;
902 902 ENIAM_LCGlatexOf.print_references path (file_prefix ^ "_3_references") "a0" result.references3;
903 903 Xlatex.latex_file_out path (file_prefix ^ "_4_term") "a4" false (fun file ->
904 904 Printf.fprintf file "\\[%s\\]\n" (ENIAM_LCGlatexOf.linear_term 0 result.term4));
... ...
exec/TODO
... ... @@ -4,3 +4,5 @@ przetwarzanie biogramu do końca
4 4 przetwarzanie dialogów
5 5 przechwytywanie błędów subsyntax itp w parserze i semparserze
6 6 interfejs dla clarin
  7 +
  8 +przetwarzanie kontroli jako dodawanie pro/koreferencji, oraz uzgadnianie przypadków
... ...
exec/makefile
... ... @@ -9,15 +9,15 @@ OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa b
9 9 eniam-lexSemantics.cmxa #eniam-exec.cmxa
10 10 INSTALLDIR=`ocamlc -where`/eniam
11 11  
12   -SOURCES= ENIAMexecTypes.ml ENIAMexec.ml ENIAMselectSent.ml ENIAMvisualization.ml
  12 +SOURCES= ENIAMexecTypes.ml ENIAMexec.ml ENIAMselectSent.ml ENIAMsemLexicon.ml ENIAMsemValence.ml ENIAMvisualization.ml
13 13  
14 14 all: eniam-exec.cma eniam-exec.cmxa
15 15  
16 16 install: all
17 17 mkdir -p $(INSTALLDIR)
18 18 cp eniam-exec.cmxa eniam-exec.a eniam-exec.cma $(INSTALLDIR)
19   - cp ENIAMexecTypes.cmi ENIAMexec.cmi ENIAMselectSent.cmi ENIAMvisualization.cmi $(INSTALLDIR)
20   - cp ENIAMexecTypes.cmx ENIAMexec.cmx ENIAMselectSent.cmx ENIAMvisualization.cmx $(INSTALLDIR)
  19 + cp ENIAMexecTypes.cmi ENIAMexec.cmi ENIAMselectSent.cmi ENIAMsemLexicon.cmi ENIAMsemValence.cmi ENIAMvisualization.cmi $(INSTALLDIR)
  20 + cp ENIAMexecTypes.cmx ENIAMexec.cmx ENIAMselectSent.cmx ENIAMsemLexicon.cmx ENIAMsemValence.cmx ENIAMvisualization.cmx $(INSTALLDIR)
21 21  
22 22  
23 23 eniam-exec.cma: $(SOURCES)
... ...
exec/parser.ml
... ... @@ -30,6 +30,7 @@ let img = ref 1
30 30 let timeout = ref 30.
31 31 let select_sentence_modes_flag = ref false
32 32 let select_sentences_flag = ref true
  33 +let assign_semantic_valence_flag = ref true
33 34 let output_dir = ref "results/"
34 35 let spec_list = [
35 36 (* "-s", Arg.Unit (fun () -> sentence_split:=true), "Split input into sentences (default)";
... ... @@ -56,6 +57,8 @@ let spec_list = [
56 57 "--no_sel_modes", Arg.Unit (fun () -> select_sentence_modes_flag:=false), "Do not select sencence modes (default)";
57 58 "--sel_sent", Arg.Unit (fun () -> select_sentences_flag:=true), "Select parsed sentences (default)";
58 59 "--no_sel_sent", Arg.Unit (fun () -> select_sentences_flag:=false), "Do not select parsed sentences";
  60 + "--sem_valence", Arg.Unit (fun () -> assign_semantic_valence_flag:=true), "Assign semantic valence (default)";
  61 + "--no_sem_valence", Arg.Unit (fun () -> assign_semantic_valence_flag:=false), "Do not assign semantic valence";
59 62 ]
60 63  
61 64 let usage_msg =
... ... @@ -93,6 +96,8 @@ let rec main_loop sub_in sub_out =
93 96 let text = ENIAMexec.parse !timeout !verbosity rules tokens lex_sems text in
94 97 let text = if !select_sentence_modes_flag then ENIAMselectSent.select_sentence_modes_text text else text in
95 98 let text = if !select_sentences_flag then ENIAMselectSent.select_sentences_text ENIAMexecTypes.Struct text else text in
  99 + let text = if !assign_semantic_valence_flag then ENIAMsemValence.assign tokens lex_sems text else text in
  100 + let text = if !assign_semantic_valence_flag then ENIAMsemValence.reduce tokens lex_sems text else text in
96 101 ENIAMvisualization.print_html_text !output_dir "parsed_text" text !img !verbosity tokens);
97 102 prerr_endline "Done!";
98 103 main_loop sub_in sub_out)
... ... @@ -106,10 +111,7 @@ let _ =
106 111 prerr_endline message;
107 112 ENIAMcategoriesPL.initialize ();
108 113 Arg.parse spec_list anon_fun usage_msg;
109   - if !lexSemantics_built_in then (
110   - ENIAMsubsyntax.initialize ();
111   - ENIAMwalParser.initialize ();
112   - ENIAMwalReduce.initialize ());
  114 + if !lexSemantics_built_in then ENIAMlexSemantics.initialize ();
113 115 Gc.compact ();
114 116 let sub_in,sub_out =
115 117 if !lexSemantics_built_in then stdin,stdout
... ...
exec/resources/lexicon-pl.dic 0 → 100644
  1 +
  2 +day-lex: /(date+day+day-month):unk;
  3 +date: /(1+year-lex):unk;
  4 +day: /month-lex:unk;
  5 +day-interval: /month-lex:unk;
  6 +day-month: /(1+year-lex):unk;
  7 +year-lex: |(1+adjp*number*case*gender):unk;
  8 +month-lex: /(1+year+np*T*gen*T*T):unk;
  9 +
  10 +date-interval: null;
  11 +day-month-interval: null;
  12 +month-interval: null;
  13 +year: null;
  14 +year-interval: null;
  15 +roman: null;
  16 +roman-interval: null;
  17 +hour-minute: null;
  18 +hour: null;
  19 +hour-minute-interval: null;
  20 +hour-interval: null;
  21 +obj-id: null;
  22 +match-result: null;
  23 +url: null;
  24 +email: null;
  25 +
  26 +np:
  27 + \(1+num*number*case*gender*person*congr+num*number*case*gender*person*rec):Count \(1+qub):adjunct /(1+inclusion):adjunct
  28 + \(1+measure*unumber*ucase*ugender*uperson):Measure
  29 + /(1+date+day+day-month+date-interval+day-interval+day-month-interval+year+year-interval+hour+hour-minute+hour-interval+hour-minute-interval):unk
  30 + |(1+year):unk /(1+obj-id):unk;
  31 +
  32 +num: \(1+qub):adjunct /(1+inclusion):adjunct;
  33 +
  34 +measure:
  35 + \(1+num*number*case*gender*person*congr+num*number*case*gender*person*rec):Count \(1+qub):adjunct /(1+inclusion):adjunct;
  36 +
  37 +prepnp: \(1+advp*T):adjunct /(np*T*case*T*T+day-month+day+year+date+hour+hour-minute):unk \(1+qub):adjunct /(1+inclusion):adjunct;
  38 +prepadjp: \(1+advp*T):adjunct /(adjp*T*case*T+adjp*sg*dat*m1+adjp*T*postp*T+adjp*sg*nom*f+advp*T):unk \(1+qub):adjunct /(1+inclusion):adjunct;
  39 +compar: \(1+advp*T):adjunct /(np*T*case*T*T+prepnp*T*T+prepadjp*T*T):unk \(1+qub):adjunct /(1+inclusion):adjunct;
  40 +
  41 +adjp: \(1+qub):adjunct /(1+inclusion):adjunct \(1+adja):unk;
  42 +
  43 +adja: /hyphen:nosem;
  44 +
  45 +advp: \(1+qub):adjunct /(1+inclusion):adjunct \(1+adja):unk;
  46 +
  47 +#FIXME: sprawdzić czy 'or' czy 'or2'
  48 +ip: /(1+int):unk \(1+qub):adjunct /(1+inclusion):adjunct \(1+nie):nosem |(1+aux-imp):nosem
  49 + |(1+aux-fut*number*gender*person+aux-past*number*gender*person):nosem |(1+aglt*number*person):nosem |(1+by):nosem /(1+or):adjunct;
  50 +
  51 +aux-fut: null;
  52 +aux-past: null;
  53 +aglt: null;
  54 +
  55 +infp: \(1+qub):adjunct /(1+inclusion):adjunct \(1+nie):nosem;
  56 +padvp: \(1+qub):adjunct /(1+inclusion):adjunct \(1+nie):nosem;
  57 +
  58 +cp: /ip*T*T*T:unk;
  59 +ncp: \(1+qub):adjunct /(1+inclusion):adjunct /cp*ctype*plemma:unk;
  60 +
  61 +#lemma=i|lub|czy|bądź,pos=conj:
  62 +# QUANT[number=all_numbers,gender=all_genders,person=all_persons]
  63 +# (ip*number*gender*person/ip*T*T*T)\ip*T*T*T;
  64 +#lemma=,|i|lub|czy|bądź,pos=conj: (advp*mod/prepnp*T*T)\prepnp*T*T;
  65 +#lemma=,|i|lub|czy|bądź,pos=conj: QUANT[mode=0] (advp*mode/advp*mode)\prepnp*T*T;
  66 +#lemma=,|i|lub|czy|bądź,pos=conj: QUANT[mode=0] (advp*mode/prepnp*T*T)\advp*mode;
  67 +#lemma=,|i|lub|czy|bądź,pos=conj: (advp*mod/advp*T)\advp*T; #FIXME: przydałaby się wersja zachowująca mode
  68 +#lemma=,|i|lub|czy|bądź,pos=conj:
  69 +# QUANT[plemma=0,case=all_cases]
  70 +# (prepnp*plemma*case/prepnp*plemma*case)\prepnp*plemma*case;
  71 +#lemma=,|i|lub|czy|bądź,pos=conj:
  72 +# QUANT[number=all_numbers,case=all_cases,gender=all_genders,person=all_persons]
  73 +# (np*number*case*gender*person/np*T*case*T*T)\np*T*case*T*T;
  74 +#lemma=,|i|lub|czy|bądź,pos=conj:
  75 +# QUANT[number=all_numbers,case=all_cases,gender=all_genders]
  76 +# (adjp*number*case*gender/adjp*number*case*gender)\adjp*number*case*gender;
  77 +
  78 +lex-się-qub: null;
  79 +nie: null;
  80 +by: null;
  81 +aux-imp: null;
  82 +qub: null;
  83 +interj: null;
  84 +hyphen: null;
  85 +int: null;
  86 +#lemma=„,pos=interp: QUANT[number=0,case=0,gender=0,person=0] (np*number*case*gender*person/rquot)/np*number*case*gender*person; #SetAttr("QUOT",Val "+",Var "x"
  87 +#lemma=«,pos=interp: QUANT[number=0,case=0,gender=0,person=0] (np*number*case*gender*person/rquot2)/np*number*case*gender*person; #SetAttr("QUOT",Val "+",Var "x"
  88 +#lemma=»,pos=interp: QUANT[number=0,case=0,gender=0,person=0] (np*number*case*gender*person/rquot3)/np*number*case*gender*person; #SetAttr("QUOT",Val "+",Var "x"
  89 +rquot: null;
  90 +rquot2: null;
  91 +rquot3: null;
  92 +#lemma=(,pos=interp: (inclusion/rparen)/(np*T*T*T*T+ip*T*T*T+adjp*T*T*T+prepnp*T*T); #SetAttr("INCLUSION",Val "+",
  93 +#lemma=[,pos=interp: (inclusion/rparen2)/(np*T*T*T*T+ip*T*T*T+adjp*T*T*T+prepnp*T*T); #SetAttr("INCLUSION",Val "+",
  94 +rparen: null;
  95 +rparen2: null;
  96 +
  97 +<conll_root>: /(ip*T*T*T+cp*int*T+np*sg*voc*T*T+interj):unk;
  98 +s: \?(ip*T*T*T+cp*int*T+np*sg*voc*T*T+interj):unk;
  99 +<root>: /(1+s):unk /(1+<speaker-end>):unk /(1+or):unk /(1+np*T*nom*T*T):unk /(1+ip*T*T*T):unk;
  100 +
  101 +or: null;
  102 +<colon>: \<speaker>:unk /(1+<squery>):unk;
  103 +or2: \?(ip*T*T*T+cp*int*T+np*sg*voc*T*T+interj):unk;
  104 +<speaker-end>: null
... ...
lexSemantics/ENIAMadjuncts.ml
... ... @@ -268,6 +268,27 @@ let add_adjuncts preps compreps compars pos2 (selectors,schema) =
268 268 | "adv" -> [selectors,schema @ ENIAMwalRenderer.adv_adjuncts_simp @ compars]
269 269 | _ -> []
270 270  
  271 +open ENIAMlexSemanticsTypes
  272 +
  273 +let add_subj_cr cr positions =
  274 + Xlist.map positions (fun p ->
  275 + if p.gf = SUBJ then {p with cr=cr :: p.cr} else p)
  276 +
  277 +let add_connected_adjuncts preps compreps compars pos2 frame =
  278 + let compreps = Xlist.rev_map compreps ENIAMwalRenderer.render_connected_comprep in
  279 + let prepnps = Xlist.rev_map preps (fun (prep,cases) -> ENIAMwalRenderer.render_connected_prepnp prep cases) in
  280 + let prepadjps = Xlist.rev_map preps (fun (prep,cases) -> ENIAMwalRenderer.render_connected_prepadjp prep cases) in
  281 + let compars = Xlist.rev_map compars ENIAMwalRenderer.render_connected_compar in
  282 + match pos2 with
  283 + "verb" -> [{frame with positions=(add_subj_cr "3" frame.positions) @ ENIAMwalRenderer.verb_connected_adjuncts_simp @ prepnps @ prepadjps @ compreps @ compars}]
  284 + | "noun" -> [
  285 + {frame with selectors=[Nsyn,Eq,["proper"]] @ frame.selectors; positions=ENIAMwalRenderer.proper_noun_connected_adjuncts_simp @ prepnps @ compreps @ compars};
  286 + {frame with selectors=[Nsyn,Eq,["common"];Nsem,Eq,["measure"]] @ frame.selectors; positions=ENIAMwalRenderer.measure_noun_connected_adjuncts_simp @ prepnps @ compreps @ compars};
  287 + {frame with selectors=[Nsyn,Eq,["common"];Nsem,Neq,["measure"]] @ frame.selectors; positions=frame.positions @ ENIAMwalRenderer.common_noun_connected_adjuncts_simp @ prepnps @ compreps @ compars}]
  288 + | "adj" -> [{frame with positions=frame.positions @ ENIAMwalRenderer.adj_connected_adjuncts_simp @ compars}]
  289 + | "adv" -> [{frame with positions=frame.positions @ ENIAMwalRenderer.adv_connected_adjuncts_simp @ compars}]
  290 + | _ -> []
  291 +
271 292 (* let _ =
272 293 let schemata,entries = ENIAMvalence.prepare_all_valence ENIAMwalParser.phrases ENIAMwalParser.schemata ENIAMwalParser.entries in
273 294 let _ = Entries.map2 schemata (fun pos lemma schemata -> simplify_schemata pos (ENIAMvalence.simplify_pos pos) lemma schemata) in
... ...
lexSemantics/ENIAMlexSemantics.ml
... ... @@ -23,12 +23,61 @@ open ENIAMlexSemanticsTypes
23 23 open ENIAMwalTypes
24 24 open Xstd
25 25  
26   -(*let find_senses t = (* FIXME: sensy zawierające 'się' *)
27   - match t.token with
28   - Lemma(lemma,pos,_) -> ENIAMplWordnet.find_senses lemma pos
29   - | Proper(_,_,_,senses) -> ENIAMplWordnet.find_proper_senses senses
30   - | _ -> []
31   -*)
  26 +let find_meaning m =
  27 + try
  28 + ENIAMplWordnet.find_meaning m.plwnluid
  29 + with Not_found ->
  30 + m.name ^ "-" ^ m.variant, [], unknown_meaning_weight
  31 +
  32 +let find_prep_meaning lemma hipero =
  33 + let hipero = match hipero with
  34 + [Predef hipero] -> hipero
  35 + | _ -> failwith "find_prep_meaning" in
  36 + if hipero = "ALL" then lemma, [hipero,0], unknown_meaning_weight else
  37 + let syn_id = StringMap.find !ENIAMplWordnet.predef hipero in
  38 + let hipero = IntMap.fold (ENIAMplWordnet.get_hipero syn_id) [] (fun hipero syn_id cost -> (ENIAMplWordnet.synset_name syn_id, cost) :: hipero) in
  39 + lemma, hipero, unknown_meaning_weight
  40 +
  41 +let lex_sie = LCG (ENIAMwalRenderer.render_morf (SimpleLexArg("się",QUB)))
  42 +
  43 +(* FIXME: naiwnie wierzymy, że jeśli leksem jest opisany semantycznie w walentym to zawiera ramy dla wszystkich sensów *)
  44 +let find_senses t s =
  45 + (*let set = Xlist.fold s.frames StringSet.empty (fun set frame ->
  46 + Xlist.fold frame.meanings set (fun set (name,hipero,weight) ->
  47 + StringSet.add set name)) in*)
  48 + let senses = match t.token with
  49 + Lemma(lemma,pos,_) -> ENIAMplWordnet.find_senses lemma pos
  50 + | Proper(_,_,_,senses) -> ENIAMplWordnet.find_proper_senses senses
  51 + | _ -> [] in
  52 + (* let senses =
  53 + Xlist.fold senses [] (fun senses (name,hipero,weight) ->
  54 + if StringSet.mem set name then senses else (name,hipero,weight) :: senses) in *)
  55 + let senses_sie = match t.token with
  56 + Lemma(lemma,pos,_) -> ENIAMplWordnet.find_senses (lemma ^ " się") pos
  57 + | Proper(_,_,_,senses) -> []
  58 + | _ -> [] in
  59 +(* let senses_sie = Xlist.fold senses_sie [] (fun senses_sie (name,hipero,weight) ->
  60 + if StringSet.mem set name then senses_sie else (name,hipero,weight) :: senses_sie) in
  61 + let frames = if senses = [] then s.frames else {empty_frame with meanings=senses} :: s.frames in
  62 + let frames = if senses_sie = [] then frames else {empty_frame with meanings=senses_sie;
  63 + positions=[{empty_position with role="Lemma"; mode=["lemma"]; morfs=[lex_sie]; is_necessary=Req}]} :: frames in*) (* FIXME: czy to nie usuwa elementów z ramy? *)
  64 + let frames = Xlist.fold s.frames [] (fun frames f ->
  65 + if f.meanings <> [] then f :: frames else
  66 + if senses = [] && senses_sie = [] then {f with meanings=[ENIAMtokens.get_lemma t.token, [], unknown_meaning_weight]} :: frames else
  67 + (if senses_sie = [] then [] else [{f with meanings=senses_sie; positions={empty_position with role="Lemma"; mode=["lemma"]; morfs=[lex_sie]; is_necessary=Req} :: f.positions}]) @
  68 + [{f with meanings=senses}] @ frames) in
  69 + {s with frames=frames}
  70 +
  71 +let find_selprefs schema = (* FIXME: RelationRole *)
  72 + Xlist.map schema (fun p ->
  73 + let l = Xlist.fold p.sel_prefs [] (fun l -> function
  74 + SynsetId id -> (try ENIAMplWordnet.synset_name id :: l with ENIAMplWordnet.SynsetNotFound -> l)
  75 + | Predef s -> s :: l
  76 + | SynsetName _ -> failwith "find_selprefs"
  77 + | RelationRole _ -> l) in
  78 + let l = if l = [] then ["ALL"] else l in
  79 + {p with sel_prefs=Xlist.map l (fun s -> SynsetName s)})
  80 +
32 81 let rec find a l i =
33 82 if a.(i) = max_int then (
34 83 a.(i) <- i;
... ... @@ -102,6 +151,33 @@ let get_preps tokens group = (* FIXME: To nie zadziała przy kilku wystąpieniac
102 151 | _ -> preps,compars) in
103 152 StringMap.fold preps [] (fun l prep v -> (prep, StringSet.to_list v) :: l), StringSet.to_list compars
104 153  
  154 +let make_unique schemata =
  155 + let map = Xlist.fold schemata StringMap.empty (fun map (selectors,schema) ->
  156 + let s = "[" ^ ENIAMcategoriesPL.string_of_selectors selectors ^ "] {" ^ ENIAMwalStringOf.schema schema ^ "}" in
  157 + StringMap.add map s (selectors,schema)) in
  158 + StringMap.fold map [] (fun l _ (selectors,schema) -> (selectors,schema) :: l)
  159 +
  160 +let semantize lemma pos (selectors,schema) =
  161 + let schema = Xlist.rev_map schema (fun p ->
  162 + {p with role="Arg"; sel_prefs=[Predef "X"]}) in (* FIXME: zaślepka, żeby preferować znane argumenty *)
  163 + Xlist.rev_map (ENIAMvalence.get_aroles schema lemma pos) (fun (sel,arole,arole_attr,arev) ->
  164 + {empty_frame with selectors=sel @ selectors; positions=schema;
  165 + arole=arole; arole_attr=arole_attr; arev=arev})
  166 +
  167 +let assign_prep_semantics lemma =
  168 + let roles = try StringMap.find ENIAMlexSemanticsData.prep_roles lemma with Not_found -> [] in
  169 + Printf.printf "assign_prep_semantics: |roles|=%d\n%!" (Xlist.size roles);
  170 + Xlist.map roles (function (case,arole,arole_attr,hipero,sel_prefs) ->
  171 + Printf.printf "assign_prep_semantics: case=%s arole=%s arole_attr=%s\n%!" case arole arole_attr;
  172 + let meaning = find_prep_meaning lemma hipero in (* FIXME: zaślepka dla meaning i weight *)
  173 + print_endline "assign_prep_semantics 1";
  174 + let positions = [{empty_position with
  175 + sel_prefs=sel_prefs; dir=if lemma="temu" then Backward_ else Forward_;
  176 + morfs=ENIAMwalRenderer.assing_pref_morfs (lemma,case); is_necessary=Req}] in
  177 + print_endline "assign_prep_semantics 2";
  178 + {empty_frame with selectors=[ENIAM_LCGlexiconTypes.Case,ENIAM_LCGlexiconTypes.Eq,[case]]; meanings=[meaning]; positions=find_selprefs positions;
  179 + arole=arole; arole_attr=arole_attr; arev=false})
  180 +
105 181 let assign_valence tokens lex_sems group =
106 182 let lexemes = Xlist.fold group StringSet.empty (fun lexemes id ->
107 183 let lemma = ENIAMtokens.get_lemma (ExtArray.get tokens id).token in
... ... @@ -118,10 +194,10 @@ let assign_valence tokens lex_sems group =
118 194 (* Printf.printf "A %s %s %s |schemata|=%d\n" lemma pos pos2 (Xlist.size schemata); *)
119 195 let entries = Entries.find entries pos lemma in
120 196 let connected = Entries.find connected pos2 lemma in
121   - let schemata = List.flatten (Xlist.map schemata (fun (opinion,neg,pred,aspect,schema) ->
122   - ENIAMvalence.transform_entry pos lemma neg pred aspect schema)) in (* FIXME: gubię opinię *)
  197 + let schemata1 = List.flatten (Xlist.map schemata (fun (opinion,neg,pred,aspect,schema) ->
  198 + ENIAMvalence.transform_entry pos lemma neg pred aspect schema)) in (* gubię opinię *)
123 199 (* Printf.printf "B %s |schemata|=%d\n" lemma (Xlist.size schemata); *)
124   - let schemata = ENIAMadjuncts.simplify_schemata lexemes pos pos2 lemma schemata in
  200 + let schemata = ENIAMadjuncts.simplify_schemata lexemes pos pos2 lemma schemata1 in
125 201 (* Printf.printf "C %s |schemata|=%d\n" lemma (Xlist.size schemata); *)
126 202 let schemata = Xlist.rev_map schemata (fun (selectors,schema) ->
127 203 selectors,ENIAMwalRenderer.render_simple_schema schema) in
... ... @@ -130,13 +206,31 @@ let assign_valence tokens lex_sems group =
130 206 let entries = List.flatten (Xlist.rev_map entries (ENIAMvalence.transform_lex_entry pos lemma)) in
131 207 let entries = Xlist.map entries (fun (selectors,entry) ->
132 208 selectors,ENIAMwalRenderer.render_lex_entry entry) in
133   - let connected = List.flatten (Xlist.map connected (fun (sopinion,fopinion,meanings,neg,pred,aspect,schema) ->
134   - Xlist.rev_map (ENIAMvalence.transform_entry pos lemma neg pred aspect schema) (fun (selectors,schema) ->
135   - selectors,meanings,schema))) in (* FIXME: gubię opinię *)
136   - let connected = Xlist.fold connected [] (fun connected (selectors,meanings,schema) ->
137   - if ENIAMadjuncts.check_selector_lex_constraints lexemes pos selectors then (selectors,meanings,schema) :: connected else connected) in
138   - let connected = Xlist.rev_map connected (fun (selectors,meanings,schema) ->
139   - selectors,meanings,ENIAMwalRenderer.render_connected_schema schema) in
  209 + let connected = List.flatten (Xlist.map connected (fun (sopinion,fopinion,meanings,neg,pred,aspect,schema1) ->
  210 + List.flatten (Xlist.rev_map (ENIAMvalence.transform_entry pos lemma neg pred aspect schema1) (fun (selectors,schema) ->
  211 + Xlist.rev_map (ENIAMvalence.get_aroles schema1 lemma pos) (fun (sel,arole,arole_attr,arev) ->
  212 + {selectors=sel @ selectors; meanings=Xlist.map meanings find_meaning; positions=schema;
  213 + arole=arole; arole_attr=arole_attr; arev=arev; sopinion=sopinion; fopinion=fopinion}))))) in
  214 + (* Printf.printf "E %s |connected|=%d\n" lemma (Xlist.size connected); *)
  215 + let connected = if connected = [] then List.flatten (Xlist.rev_map (make_unique schemata1) (semantize lemma pos)) else connected in
  216 + (* Printf.printf "F %s |connected|=%d\n" lemma (Xlist.size connected); *)
  217 + let connected = Xlist.fold connected [] (fun connected frame ->
  218 + if ENIAMadjuncts.check_selector_lex_constraints lexemes pos frame.selectors then frame :: connected else connected) in
  219 + (* Printf.printf "G %s |connected|=%d\n" lemma (Xlist.size connected); *)
  220 + let connected = Xlist.rev_map connected (fun frame ->
  221 + {frame with
  222 + positions = find_selprefs (ENIAMwalRenderer.render_connected_schema (ENIAMwalReduce.set_necessary frame.positions))}) in
  223 + (* Printf.printf "H %s |connected|=%d\n" lemma (Xlist.size connected); *)
  224 + let connected = List.flatten (Xlist.rev_map connected (ENIAMadjuncts.add_connected_adjuncts preps compreps compars pos2)) in
  225 + (* Printf.printf "I %s |connected|=%d\n" lemma (Xlist.size connected); *)
  226 + let connected = if pos = "prep" then
  227 + if connected <> [] then failwith "assign_valence" else
  228 + assign_prep_semantics lemma else connected in
  229 + (* Printf.printf "J %s |connected|=%d\n" lemma (Xlist.size connected); *)
  230 + let connected = if connected = [] then
  231 + Xlist.rev_map (ENIAMvalence.get_aroles [] lemma pos) (fun (sel,arole,arole_attr,arev) ->
  232 + {empty_frame with selectors=sel; arole=arole; arole_attr=arole_attr; arev=arev}) else connected in
  233 + (* Printf.printf "K %s |connected|=%d\n" lemma (Xlist.size connected); *)
140 234 ExtArray.set lex_sems id {(ExtArray.get lex_sems id) with
141 235 schemata=schemata; lex_entries=entries; frames=connected})
142 236  
... ... @@ -177,60 +271,54 @@ let assign_valence tokens lex_sems group =
177 271 (if pos = "subst" || pos = "depr" then "p" ^ pos else pos)) (fun frame -> 0,frame) with Not_found -> [](*failwith ("assign_valence: Proper(" ^ lemma ^ "," ^ pos ^ ")")*))};
178 272 ExtArray.set tokens id {(ExtArray.get tokens id) with token=Lemma(lemma,pos,interp)}
179 273 | _ -> ())
  274 +*)
180 275  
181   -let get_prefs_schema prefs schema =
182   - Xlist.fold schema prefs (fun prefs t ->
183   - Xlist.fold t.sel_prefs prefs StringSet.add)
184   -
185   -let map_prefs_schema senses schema =
186   - Xlist.map schema (fun t ->
187   - if Xlist.mem t.morfs (Phrase Pro) || Xlist.mem t.morfs (Phrase ProNG) then t else
188   - {t with sel_prefs = Xlist.fold t.sel_prefs [] (fun l s ->
189   - if StringSet.mem senses s then s :: l else l)})
190 276  
191 277 let disambiguate_senses lex_sems group =
192 278 let prefs = Xlist.fold group (StringSet.singleton "ALL") (fun prefs id ->
193   - Xlist.fold (ExtArray.get lex_sems id).valence prefs (fun prefs -> function
194   - _,Frame(_,schema) -> get_prefs_schema prefs schema
195   - | _,LexFrame(_,_,_,schema) -> get_prefs_schema prefs schema
196   - | _,ComprepFrame(_,_,_,schema) -> get_prefs_schema prefs schema)) in
197   - let hipero = Xlist.fold group (StringSet.singleton "ALL") (fun hipero id ->
  279 + Xlist.fold (ExtArray.get lex_sems id).frames prefs (fun prefs frame ->
  280 + Xlist.fold frame.positions prefs (fun prefs t ->
  281 + Xlist.fold t.sel_prefs prefs (fun prefs -> function
  282 + SynsetName s -> StringSet.add prefs s
  283 + | _ -> failwith "disambiguate_senses")))) in
  284 + (*let hipero = Xlist.fold group (StringSet.singleton "ALL") (fun hipero id ->
198 285 Xlist.fold (ExtArray.get lex_sems id).senses hipero (fun hipero (_,l,_) ->
199 286 Xlist.fold l hipero StringSet.add)) in
200 287 let senses = StringSet.intersection prefs hipero in
201 288 let is_zero = StringSet.mem hipero "0" in
202   - let senses = if is_zero then StringSet.add senses "0" else senses in
  289 + let senses = if is_zero then StringSet.add senses "0" else senses in*)
203 290 Xlist.iter group (fun id ->
204 291 let t = ExtArray.get lex_sems id in
205   - ExtArray.set lex_sems id {t with valence = if is_zero then t.valence else
206   - Xlist.map t.valence (function
207   - n,Frame(a,schema) -> n,Frame(a,map_prefs_schema senses schema)
208   - | n,LexFrame(s,p,r,schema) -> n,LexFrame(s,p,r,map_prefs_schema senses schema)
209   - | n,ComprepFrame(s,p,r,schema) -> n,ComprepFrame(s,p,r,map_prefs_schema senses schema));
210   - senses = Xlist.map t.senses (fun (s,l,w) ->
211   - s, List.rev (Xlist.fold l [] (fun l s -> if StringSet.mem senses s then s :: l else l)),w)})
212   -
213   -*)
  292 + ExtArray.set lex_sems id {t with frames=Xlist.map t.frames (fun frame ->
  293 + let meanings = Xlist.map frame.meanings (fun (name,hipero,weight) ->
  294 + let hipero = Xlist.fold hipero ["ALL",0] (fun hipero (name,cost) ->
  295 + if StringSet.mem prefs name then (name,cost) :: hipero else hipero) in
  296 + name,hipero,weight) in
  297 + {frame with meanings=meanings})})
214 298  
  299 +let remove_unused_tokens tokens groups =
  300 + let set = Xlist.fold groups IntSet.empty (fun set group ->
  301 + Xlist.fold group set IntSet.add) in
  302 + Int.iter 1 (ExtArray.size tokens - 1) (fun i ->
  303 + if IntSet.mem set i then () else
  304 + ExtArray.set tokens i ENIAMtokenizerTypes.empty_token_env)
215 305  
216 306 let assign tokens text =
217 307 let lex_sems = ExtArray.make (ExtArray.size tokens) empty_lex_sem in
218 308 let _ = ExtArray.add lex_sems empty_lex_sem in
219 309 Int.iter 1 (ExtArray.size tokens - 1) (fun i ->
220   - (* let token = ExtArray.get tokens i in
221   - (* ExtArray.set tokens i token; *)
222   - let senses = find_senses token in *)
223   - let lex_sem = {empty_lex_sem with senses=[](*senses*)} in
224   - let _ = ExtArray.add lex_sems lex_sem in
225   - ());
  310 + ignore (ExtArray.add lex_sems empty_lex_sem));
226 311 let groups = split_tokens_into_groups (ExtArray.size tokens) text in
227 312 (* Xlist.iter groups (fun group -> print_endline (String.concat " " (Xlist.map group string_of_int))); *)
  313 + remove_unused_tokens tokens groups;
228 314 Xlist.iter groups (fun group -> assign_valence tokens lex_sems group);
229   - (* Xlist.iter groups (fun group -> assign_valence tokens lex_sems group);
  315 + Int.iter 1 (ExtArray.size tokens - 1) (fun i ->
  316 + let token = ExtArray.get tokens i in
  317 + let lex_sem = ExtArray.get lex_sems i in
  318 + let lex_sem = find_senses token lex_sem in
  319 + ExtArray.set lex_sems i lex_sem);
230 320 Xlist.iter groups (fun group -> disambiguate_senses lex_sems group);
231   - Xlist.iter groups (fun group -> assign_simplified_valence tokens lex_sems group);
232   - Xlist.iter groups (fun group -> assign_very_simplified_valence tokens lex_sems group);
233   - Xlist.iter groups (fun group -> ENIAMlexSemanticsData.assign_semantics tokens lex_sems group); *)
  321 + (*Xlist.iter groups (fun group -> ENIAMlexSemanticsData.assign_semantics tokens lex_sems group); *)
234 322 lex_sems
235 323  
236 324 let catch_assign tokens text =
... ... @@ -239,3 +327,11 @@ let catch_assign tokens text =
239 327 with e ->
240 328 ExtArray.make 0 empty_lex_sem,
241 329 Printexc.to_string e
  330 +
  331 +let initialize () =
  332 + ENIAMsubsyntax.initialize ();
  333 + ENIAMwalParser.initialize ();
  334 + ENIAMwalReduce.initialize ();
  335 + ENIAMplWordnet.initialize ();
  336 + ENIAMcategoriesPL.initialize ();
  337 + ()
... ...
lexSemantics/ENIAMlexSemanticsData.ml
... ... @@ -21,13 +21,13 @@ open ENIAMtokenizerTypes
21 21 open ENIAMlexSemanticsTypes
22 22 open Xstd
23 23  
24   -let subst_inst_roles = Xlist.fold [
25   - "wiosna", "Time","";
26   - "lato", "Time","";
27   - "jesień", "Time","";
28   - "zima", "Time","";
29   - "wieczór", "Time","";
30   - ] StringMap.empty (fun map (k,r,a) -> StringMap.add map k (r,a))
  24 +let subst_inst_time = StringSet.of_list [
  25 + "wiosna";
  26 + "lato";
  27 + "jesień";
  28 + "zima";
  29 + "wieczór";
  30 + ]
31 31  
32 32 let adj_roles = Xlist.fold [
33 33 "ten", "Apoz","";
... ... @@ -59,7 +59,7 @@ let adj_roles = Xlist.fold [
59 59 "taki", "Attribute","";
60 60 "czyj", "Possesive","";
61 61 "który", "Attribute","";
62   - ] StringMap.empty (fun map (k,r,a) -> StringMap.add map k (r,a))
  62 + ] StringMap.empty (fun map (k,r,a) -> StringMap.add_inc map k [r,a] (fun l -> (r,a) :: l))
63 63  
64 64 let adv_roles = Xlist.fold [ (* FIXME: problem z podwójnymi przypisaniami *)
65 65 (* operators: nielokalnie zmieniaja formułe logiczna *)
... ... @@ -80,7 +80,7 @@ let adv_roles = Xlist.fold [ (* FIXME: problem z podwójnymi przypisaniami *)
80 80 "dlatego", "Condition",""; (* odniesieniem argumentu jest sytuacji/kontekst *)
81 81 "tak", "Manner",""; (* odniesieniem argumentu jest sytuacji/kontekst, byc może deiktyczny *)
82 82  
83   - "skąd", "Location","Source";
  83 +(* "skąd", "Location","Source";
84 84 "skądkolwiek", "Location","Source";
85 85 "skądś", "Location","Source";
86 86 "skądże", "Location","Source";
... ... @@ -209,8 +209,8 @@ let adv_roles = Xlist.fold [ (* FIXME: problem z podwójnymi przypisaniami *)
209 209 "ongi", "Time","";
210 210 "ongiś", "Time","";
211 211 "wczas", "Time","";
212   - "wonczas", "Time","";
213   - ] StringMap.empty (fun map (k,r,a) -> StringMap.add map k (r,a))
  212 + "wonczas", "Time","";*)
  213 + ] StringMap.empty (fun map (k,r,a) -> StringMap.add_inc map k [r,a] (fun l -> (r,a) :: l))
214 214  
215 215 let qub_roles = Xlist.fold [
216 216 "tylko", "Quantifier","";
... ... @@ -236,10 +236,10 @@ let qub_roles = Xlist.fold [
236 236 "ponad", "Mod","";
237 237 "prawie", "Mod","";
238 238 "przynajmniej", "Mod","";
239   - ] StringMap.empty (fun map (k,r,a) -> StringMap.add map k (r,a))
  239 + ] StringMap.empty (fun map (k,r,a) -> StringMap.add_inc map k [r,a] (fun l -> (r,a) :: l))
240 240  
241 241  
242   -let prep_roles = Xlist.fold [ (* lemma,case,role,role_attr,meaning/hipero,sel_prefs *)(* FIXME: problem z podwójnymi przypisaniami *)
  242 +let prep_roles = Xlist.fold [ (* lemma,case,role,role_attr,meaning/hipero,sel_prefs *)
243 243 "od","gen", "Location","Source",["POŁOŻENIE"],["POŁOŻENIE"];
244 244 "spod","gen", "Location","Source",["POŁOŻENIE"],["POŁOŻENIE"];
245 245 "spomiędzy","gen", "Location","Source",["POŁOŻENIE"],["POŁOŻENIE"];
... ... @@ -249,12 +249,14 @@ let prep_roles = Xlist.fold [ (* lemma,case,role,role_attr,meaning/hipero,sel_pr
249 249 "spoza","gen", "Location","Source",["POŁOŻENIE"],["POŁOŻENIE"];
250 250 "sprzed","gen", "Location","Source",["POŁOŻENIE"],["POŁOŻENIE"];
251 251 "z","gen", "Location","Source",["POŁOŻENIE"],["POŁOŻENIE"];
  252 + "z","postp", "Location","Source",["POŁOŻENIE"],["POŁOŻENIE"];
252 253 "znad","gen", "Location","Source",["POŁOŻENIE"],["POŁOŻENIE"];
253 254 "zza","gen", "Location","Source",["POŁOŻENIE"],["POŁOŻENIE"];
254 255 "do","gen", "Location","Goal",["POŁOŻENIE"],["POŁOŻENIE"];
255 256 "ku","dat", "Location","Goal",["POŁOŻENIE"],["POŁOŻENIE"];
256 257 "między","acc", "Location","Goal",["POŁOŻENIE"],["POŁOŻENIE"];
257 258 "na","acc", "Location","Goal",["POŁOŻENIE"],["POŁOŻENIE"];
  259 + "na","postp", "Location","Goal",["POŁOŻENIE"],["POŁOŻENIE"];
258 260 "nad","acc", "Location","Goal",["POŁOŻENIE"],["POŁOŻENIE"];
259 261 "nieopodal","gen", "Location","Goal",["POŁOŻENIE"],["POŁOŻENIE"];
260 262 "opodal","gen", "Location","Goal",["POŁOŻENIE"],["POŁOŻENIE"];
... ... @@ -267,6 +269,7 @@ let prep_roles = Xlist.fold [ (* lemma,case,role,role_attr,meaning/hipero,sel_pr
267 269 "za","acc", "Location","Goal",["POŁOŻENIE"],["POŁOŻENIE"];
268 270 "dzięki","dat", "Condition","",["CZEMU"],[];
269 271 "na","acc", "Condition","",["CZEMU"],[];
  272 + "na","postp", "Condition","",["CZEMU"],[];
270 273 "od","gen", "Condition","",["CZEMU"],[];
271 274 "przez","acc", "Condition","",["CZEMU"],[];
272 275 "wskutek","gen", "Condition","",["CZEMU"],[];
... ... @@ -275,6 +278,7 @@ let prep_roles = Xlist.fold [ (* lemma,case,role,role_attr,meaning/hipero,sel_pr
275 278 "do","gen", "Purpose","",["CZEMU"],[];
276 279 "ku","dat", "Purpose","",["CZEMU"],[];
277 280 "na","acc", "Purpose","",["CZEMU"],[];
  281 + "na","postp", "Purpose","",["CZEMU"],[];
278 282 "po","acc", "Purpose","",["CZEMU"],[];
279 283 "do","gen", "Duration","",["CZAS"],["CZAS"];
280 284 "od","gen", "Duration","",["CZAS"],["CZAS"];
... ... @@ -285,6 +289,7 @@ let prep_roles = Xlist.fold [ (* lemma,case,role,role_attr,meaning/hipero,sel_pr
285 289 "między","inst", "Location","",["POŁOŻENIE"],["POŁOŻENIE"];
286 290 "nad","inst", "Location","",["POŁOŻENIE"],["POŁOŻENIE"];
287 291 "na","loc", "Location","",["POŁOŻENIE"],["POŁOŻENIE"];
  292 + "na","postp", "Location","",["POŁOŻENIE"],["POŁOŻENIE"];
288 293 "naokoło","gen", "Location","",["POŁOŻENIE"],["POŁOŻENIE"];
289 294 "naprzeciw","gen", "Location","",["POŁOŻENIE"],["POŁOŻENIE"];
290 295 "naprzeciwko","gen", "Location","",["POŁOŻENIE"],["POŁOŻENIE"];
... ... @@ -327,6 +332,7 @@ let prep_roles = Xlist.fold [ (* lemma,case,role,role_attr,meaning/hipero,sel_pr
327 332 "jak","str", "Manner","",[],[];*)
328 333 "pod","acc", "Manner","",["ALL"],[];
329 334 "z","inst", "Manner","",["ALL"],[];
  335 + "z","postp", "Manner","",["ALL"],[];
330 336 "dokoła","gen", "Path","",["POŁOŻENIE"],["POŁOŻENIE"];
331 337 "dookoła","gen", "Path","",["POŁOŻENIE"],["POŁOŻENIE"];
332 338 "koło","gen", "Path","",["POŁOŻENIE"],["POŁOŻENIE"];
... ... @@ -363,10 +369,10 @@ let prep_roles = Xlist.fold [ (* lemma,case,role,role_attr,meaning/hipero,sel_pr
363 369 "temu","acc", "Time","",["CZAS"],["CZAS"]; (* dodane *)
364 370 "za","gen", "Time","",["CZAS"],["CZAS"]; (* dodane *)
365 371 ] StringMap.empty (fun map (lemma,case,role,role_attr,hipero,sel_prefs) ->
366   - let hipero = Xlist.fold hipero StringSet.empty ENIAMplWordnet.get_hipero_rec in
367   - let map2 = try StringMap.find map lemma with Not_found -> StringMap.empty in
368   - let map2 = StringMap.add_inc map2 case [case,role,role_attr,hipero,sel_prefs] (fun l -> (case,role,role_attr,hipero,sel_prefs) :: l) in
369   - StringMap.add map lemma map2)
  372 + let hipero = Xlist.map hipero (fun hipero -> ENIAMwalTypes.Predef hipero) in
  373 + let sel_prefs = Xlist.map sel_prefs (fun sel_prefs -> ENIAMwalTypes.Predef sel_prefs) in
  374 + StringMap.add_inc map lemma [case,role,role_attr,hipero,sel_prefs]
  375 + (fun l -> (case,role,role_attr,hipero,sel_prefs) :: l))
370 376 (* "przeciwko","dat","Dat";
371 377 "przeciw","dat","Dat";
372 378 "o","acc","Theme";
... ... @@ -374,14 +380,7 @@ let prep_roles = Xlist.fold [ (* lemma,case,role,role_attr,meaning/hipero,sel_pr
374 380 "według","gen","Manr";
375 381 "wobec","gen","Dat";*)
376 382  
377   -let assign_prep_semantics lemma cases t =
378   - try
379   - let map = StringMap.find prep_roles lemma in
380   - let l = List.flatten (Xlist.map cases (fun case ->
381   - try StringMap.find map case with Not_found -> [])) in
382   - if l = [] then Normal else PrepSemantics l
383   - with Not_found -> Normal
384   -
  383 +(*
385 384 let subst_special_lexemes = Xlist.fold [
386 385 "jutro", ["indexical"];(*"dzień"*)
387 386 "pojutrze", ["indexical"];(*"dzień"*)
... ... @@ -553,3 +552,4 @@ let assign_semantics tokens lex_sems group =
553 552 {t with semantics=assign_prep_semantics lemma (StringSet.to_list cases) t}
554 553 | _ -> t in
555 554 ExtArray.set lex_sems id t)
  555 +*)
... ...
lexSemantics/ENIAMlexSemanticsHTMLof.ml
... ... @@ -63,10 +63,10 @@ let html_of_lex_sems tokens lex_sems =
63 63 let schemata = Xlist.map t.schemata (fun (selectors,l) ->
64 64 "&emsp;&emsp;[" ^ ENIAMcategoriesPL.string_of_selectors selectors ^ "] {" ^ String.concat ", " (Xlist.map l (fun (d,s) ->
65 65 ENIAM_LCGstringOf.direction d ^ ENIAM_LCGstringOf.grammar_symbol 0 s)) ^ "}") in
66   - let frames = Xlist.map t.frames (fun (selectors,meanings,schema) ->
  66 + (* let frames = Xlist.map t.frames (fun (selectors,meanings,schema) -> FIXME
67 67 "&emsp;&emsp;[" ^ ENIAMcategoriesPL.string_of_selectors selectors ^ "] {" ^ ENIAMwalStringOf.schema schema ^ "} " ^
68   - String.concat ", " (Xlist.map meanings (fun m -> ENIAMwalStringOf.meaning m))) in
69   - (String.concat "<br>\n " ([core] @ schemata @ frames @ lex_entries)) :: l))) ^
  68 + String.concat ", " (Xlist.map meanings (fun m -> ENIAMwalStringOf.meaning m))) in *)
  69 + (String.concat "<br>\n " ([core] @ schemata (*@ frames*) @ lex_entries)) :: l))) ^
70 70 "</P>"
71 71  
72 72 (* schemata: ((ENIAM_LCGlexiconTypes.selector * ENIAM_LCGlexiconTypes.selector_relation * string list) list *
... ...
lexSemantics/ENIAMlexSemanticsStringOf.ml
... ... @@ -27,6 +27,11 @@ let lex_sems t =
27 27 let t2 = ExtArray.get t id in
28 28 (Printf.sprintf "%3d %s" id (lex_sem t2)) :: l)))*)
29 29  
  30 +let arole f =
  31 + (if f.arole = "" then "" else "," ^ f.arole) ^
  32 + (if f.arole_attr = "" then "" else "," ^ f.arole_attr) ^
  33 + (if f.arev then ",rev" else "")
  34 +
30 35 let string_of_lex_sems tokens lex_sems =
31 36 String.concat "\n" (List.rev (Int.fold 0 (ExtArray.size lex_sems - 1) [] (fun l id ->
32 37 let t = ExtArray.get lex_sems id in
... ... @@ -35,18 +40,22 @@ let string_of_lex_sems tokens lex_sems =
35 40 let lemma = ENIAMtokens.string_of_token t2.ENIAMtokenizerTypes.token in
36 41 let core = Printf.sprintf "%3d %s %s" id orth lemma in
37 42 let lex_entries = Xlist.map t.lex_entries (fun (selectors,s) ->
38   - "[" ^ ENIAMcategoriesPL.string_of_selectors selectors ^ "] " ^ ENIAM_LCGstringOf.grammar_symbol 0 s) in
  43 + "&[" ^ ENIAMcategoriesPL.string_of_selectors selectors ^ "] " ^ ENIAM_LCGstringOf.grammar_symbol 0 s) in
39 44 let schemata = Xlist.map t.schemata (fun (selectors,l) ->
40 45 "[" ^ ENIAMcategoriesPL.string_of_selectors selectors ^ "] {" ^ String.concat "," (Xlist.map l (fun (d,s) ->
41 46 ENIAM_LCGstringOf.direction d ^ ENIAM_LCGstringOf.grammar_symbol 0 s)) ^ "}") in
42   - let frames = Xlist.map t.frames (fun (selectors,meanings,schema) ->
43   - "[" ^ ENIAMcategoriesPL.string_of_selectors selectors ^ "] {" ^ ENIAMwalStringOf.schema schema ^ "} " ^
  47 +(* let frames = Xlist.map t.frames (fun (selectors,meanings,schema) ->
  48 + "*[" ^ ENIAMcategoriesPL.string_of_selectors selectors ^ "] {" ^ ENIAMwalStringOf.schema schema ^ "} " ^
44 49 String.concat "," (Xlist.map meanings (fun m -> ENIAMwalStringOf.meaning m))) in
45   - (String.concat "\n " ([core] @ schemata @ frames @ lex_entries)) :: l)))
  50 + let senses = Xlist.map t.senses (fun (sense,hipero,weight) ->
  51 + Printf.sprintf "%s[%s]%.2f" sense (String.concat "," (Xlist.map hipero (fun (s,n) -> s ^ " " ^ string_of_int n))) weight) in*)
  52 + let frames = Xlist.map t.frames (fun f ->
  53 + "*" ^ arole f ^ "[" ^ ENIAMcategoriesPL.string_of_selectors f.selectors ^ "] {" ^ ENIAMwalStringOf.schema f.positions ^ "} " ^
  54 + String.concat "," (Xlist.map f.meanings (fun (sense,hipero,weight) ->
  55 + Printf.sprintf "%s[%s]%.2f" sense (String.concat "," (Xlist.map hipero (fun (s,n) -> s ^ " " ^ string_of_int n))) weight))) in
  56 + (String.concat "\n " ([core] @ (*senses @*) schemata @ frames @ lex_entries)) :: l)))
46 57 (* let lroles = if snd t.lroles = "" then fst t.lroles else fst t.lroles ^ " " ^ snd t.lroles in
47 58 let core = Printf.sprintf "%3d %s %s %s" id orth lemma lroles in
48   - let senses = Xlist.map t.senses (fun (sense,hipero,weight) ->
49   - Printf.sprintf "%s[%s]%.2f" sense (String.concat "," hipero) weight) in
50 59 let valence = Xlist.map t.valence (ENIAMwalStringOf.fnum_frame "") in
51 60 let simple_valence = Xlist.map t.simple_valence (ENIAMwalStringOf.fnum_frame "") in
52 61 (* let semantics = *)
... ...
lexSemantics/ENIAMlexSemanticsTypes.ml
... ... @@ -36,20 +36,33 @@ type semantics =
36 36 | SpecialMod of string * (type_arg list * type_term)*)
37 37 | PrepSemantics of (string * string * string * StringSet.t * string list) list (* case,role,role_attr,hipero,sel_prefs *)
38 38  
  39 +type frame = {
  40 + selectors: (ENIAM_LCGlexiconTypes.selector * ENIAM_LCGlexiconTypes.selector_relation * string list) list;
  41 + meanings: ((*ENIAMwalTypes.meaning **) string * (string * int) list * float) list;
  42 + positions: ENIAMwalTypes.position list;
  43 + arole: string;
  44 + arole_attr: string;
  45 + arev: bool;
  46 + sopinion: ENIAMwalTypes.opinion;
  47 + fopinion: ENIAMwalTypes.opinion;
  48 + }
  49 +
  50 +let empty_frame = {selectors=[]; meanings=[]; positions=[]; arole=""; arole_attr=""; arev=false;
  51 + sopinion=ENIAMwalTypes.Nieokreslony; fopinion=ENIAMwalTypes.Nieokreslony}
  52 +
39 53 type lex_sem = {
40 54 schemata: ((ENIAM_LCGlexiconTypes.selector * ENIAM_LCGlexiconTypes.selector_relation * string list) list *
41 55 (ENIAM_LCGtypes.direction * ENIAM_LCGtypes.grammar_symbol) list) list;
42 56 lex_entries: ((ENIAM_LCGlexiconTypes.selector * ENIAM_LCGlexiconTypes.selector_relation * string list) list *
43 57 ENIAM_LCGtypes.grammar_symbol) list;
44   - frames: ((ENIAM_LCGlexiconTypes.selector * ENIAM_LCGlexiconTypes.selector_relation * string list) list *
45   - ENIAMwalTypes.meaning list * ENIAMwalTypes.position list) list;
  58 + frames: frame list;
46 59 cats: string list;
47 60 (* e: labels; *)
48 61 (* valence: (int * ENIAMwalTypes.frame) list;
49 62 simple_valence: (int * ENIAMwalTypes.frame) list;
50 63 very_simple_valence: ((ENIAM_LCGgrammarPLtypes.cat * ENIAM_LCGgrammarPLtypes.selector_relation * string list) list * ENIAM_LCGtypes.grammar_symbol) list; *)
51   - senses: (string * string list * float) list;
52   - lroles: string * string;
  64 + (* senses: (string * (string * int) list * float) list; *)
  65 + (* lroles: string * string; *)
53 66 semantics: semantics;
54 67 }
55 68  
... ... @@ -63,8 +76,8 @@ type lex_sem = {
63 76  
64 77 let empty_lex_sem = {
65 78 schemata=[]; lex_entries=[]; frames=[]; cats=["X"];
66   - (*e=empty_labels;*) (*valence=[]; simple_valence=[]; very_simple_valence=[];*) senses=[];
67   - lroles="",""; semantics=Normal}
  79 + (*e=empty_labels;*) (*valence=[]; simple_valence=[]; very_simple_valence=[];*) (*senses=[];*)
  80 + (*lroles="","";*) semantics=Normal}
68 81  
69 82 (* FIXME: poprawić katalog *)
70 83 (*let subst_uncountable_lexemes_filename = resource_path ^ "/lexSemantics/subst_uncountable.dat"
... ... @@ -77,3 +90,13 @@ let subst_uncountable_lexemes_filename2 = resource_path ^ &quot;/Walenty/subst_uncoun
77 90 let subst_container_lexemes_filename = resource_path ^ "/Walenty/subst_container.dat"
78 91 let subst_numeral_lexemes_filename = resource_path ^ "/Walenty/subst_numeral.dat"
79 92 let subst_time_lexemes_filename = resource_path ^ "/Walenty/subst_time.dat" *)
  93 +
  94 +let hipero_threshold = 3
  95 +let unknown_meaning_weight = -1.
  96 +
  97 +let lu_filename = resource_path ^ "/plWordnet/lu.tab"
  98 +let ex_hipo_filename = resource_path ^ "/plWordnet/ex_hipo.tab"
  99 +let syn_filename = resource_path ^ "/plWordnet/syn.tab"
  100 +
  101 +let predef_filename = resource_path ^ "/lexSemantics/predef_prefs.tab"
  102 +let proper_classes_filename = resource_path ^ "/lexSemantics/proper_classes.tab"
... ...
lexSemantics/ENIAMplWordnet.ml 0 → 100644
  1 +(*
  2 + * ENIAMlexSemantics is a library that assigns tokens with lexicosemantic information.
  3 + * Copyright (C) 2016-2017 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
  4 + * Copyright (C) 2016-2017 Institute of Computer Science Polish Academy of Sciences
  5 + *
  6 + * This library is free software: you can redistribute it and/or modify
  7 + * it under the terms of the GNU Lesser General Public License as published by
  8 + * the Free Software Foundation, either version 3 of the License, or
  9 + * (at your option) any later version.
  10 + *
  11 + * This library is distributed in the hope that it will be useful,
  12 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14 + * GNU Lesser General Public License for more details.
  15 + *
  16 + * You should have received a copy of the GNU Lesser General Public License
  17 + * along with this program. If not, see <http://www.gnu.org/licenses/>.
  18 + *)
  19 +
  20 +open Xstd
  21 +open ENIAMlexSemanticsTypes
  22 +
  23 +let lu_names = ref IntMap.empty
  24 +let lumap = ref StringMap.empty
  25 +let synmap = ref IntMap.empty
  26 +let ex_hipo = ref IntMap.empty
  27 +let predef_names = ref IntMap.empty
  28 +let proper_classes = ref StringMap.empty
  29 +let predef = ref StringMap.empty
  30 +
  31 +let load_lu filename =
  32 + File.fold_tab filename (IntMap.empty,StringMap.empty) (fun (lu_names,lumap) -> function
  33 + [lu_id; lemma; variant; syn_id] ->
  34 + let v = variant,int_of_string syn_id in
  35 + IntMap.add lu_names (int_of_string lu_id) (lemma,variant,int_of_string syn_id),
  36 + StringMap.add_inc lumap lemma [v] (fun l -> v :: l)
  37 + | l -> failwith ("load_lu: " ^ String.concat "\t" l))
  38 +
  39 +let load_syn filename =
  40 + File.fold_tab filename IntMap.empty (fun synmap -> function
  41 + syn_id :: pos :: lu_ids ->
  42 + let lu_ids = Xlist.map lu_ids int_of_string in
  43 + IntMap.add synmap (int_of_string syn_id) (pos,lu_ids)
  44 + | l -> failwith ("load_syn: " ^ String.concat "\t" l))
  45 +
  46 +let load_ex_hipo filename =
  47 + File.fold_tab filename IntMap.empty (fun ex_hipo -> function
  48 + [parent; child; cost] ->
  49 + let parent = int_of_string parent in
  50 + let child = int_of_string child in
  51 + let cost = int_of_string cost in
  52 + let children = try IntMap.find ex_hipo parent with Not_found -> IntMap.empty in
  53 + let children = IntMap.add_inc children child cost (fun _ -> failwith "load_ex_hipo") in
  54 + IntMap.add ex_hipo parent children
  55 + | l -> failwith ("load_ex_hipo: " ^ String.concat "\t" l))
  56 +
  57 +let syn_id_of_sense sense =
  58 + let lemma,variant =
  59 + match List.rev (Xstring.split " " sense) with
  60 + variant :: l -> String.concat " " (List.rev l), variant
  61 + | _ -> failwith "syn_id_of_sense 1" in
  62 + let l = Xlist.fold (try StringMap.find !lumap lemma with Not_found -> failwith ("syn_id_of_sense: " ^ lemma)) [] (fun l (variant2,syn_id) ->
  63 + if variant = variant2 then syn_id :: l else l) in
  64 + match l with
  65 + [syn_id] -> syn_id
  66 + | _ -> failwith ("syn_id_of_sense 2: " ^ lemma)
  67 +
  68 +let load_predef ex_hipo filename =
  69 + let ex_hipo,predef_names,predef,_ =
  70 + File.fold_tab filename (ex_hipo,IntMap.empty,StringMap.empty,-1) (fun (ex_hipo,predef_names,predef,id) -> function
  71 + name :: senses ->
  72 + let ex_hipo = Xlist.fold senses ex_hipo (fun ex_hipo sense ->
  73 + let hipo_id = try StringMap.find predef sense with Not_found -> syn_id_of_sense sense in
  74 + let children = try IntMap.find ex_hipo hipo_id with Not_found -> IntMap.empty in
  75 + let children = IntMap.add_inc children id 0 (fun _ -> failwith "load_predef 1") in
  76 + IntMap.add ex_hipo hipo_id children) in
  77 + let predef_names = IntMap.add predef_names id name in
  78 + let predef = StringMap.add_inc predef name id (fun _ -> failwith "load_predef 2") in
  79 + ex_hipo, predef_names, predef, id-1
  80 + | l -> failwith ("load_predef: " ^ String.concat "\t" l)) in
  81 + ex_hipo,predef_names,predef
  82 +
  83 +let rec get_hipero_rec found ex_hipo id cost =
  84 + let cost2 = try IntMap.find found id with Not_found -> max_int in
  85 + if cost2 <= cost || cost > hipero_threshold then found else
  86 + let found = IntMap.add found id cost in
  87 + let map = try IntMap.find ex_hipo id with Not_found -> IntMap.empty in
  88 + IntMap.fold map found (fun found id2 cost2 ->
  89 + get_hipero_rec found ex_hipo id2 (cost + cost2))
  90 +
  91 +let get_hipero syn_id =
  92 + get_hipero_rec IntMap.empty !ex_hipo syn_id 0
  93 +
  94 +exception SynsetNotFound
  95 +
  96 +let synset_name syn_id =
  97 + if IntMap.mem !predef_names syn_id then IntMap.find !predef_names syn_id else
  98 + let lemma,variant,_ =
  99 + try IntMap.find !lu_names (List.hd (snd (IntMap.find !synmap syn_id)))
  100 + with Not_found -> raise SynsetNotFound (*failwith ("synset_name: " ^ string_of_int syn_id)*) in
  101 + lemma ^ "-" ^ variant
  102 +
  103 +let load_proper_classes filename =
  104 + File.fold_tab filename StringMap.empty (fun map -> function
  105 + id :: senses ->
  106 + let senses = Xlist.map senses (fun sense ->
  107 + match List.rev (Str.split (Str.regexp " ") sense) with
  108 + weight :: l -> String.concat " " (List.rev l), (try float_of_string weight with _ -> failwith "load_proper_classes 2")
  109 + | _ -> failwith "load_proper_classes 4") in
  110 + let senses = Xlist.map senses (fun (sense,weight) ->
  111 + (* let sense = if sense = "antroponim 1" then "nazwa własna 1" else sense in
  112 + let sense = if sense = "godzina 4" then "godzina 3" else sense in *)
  113 +(* print_endline sense; *)
  114 + let syn_id = syn_id_of_sense sense in
  115 + sense,IntMap.fold (get_hipero syn_id) [] (fun hipero syn_id cost -> (synset_name syn_id, cost) :: hipero),weight) in
  116 + StringMap.add_inc map id senses (fun _ -> failwith ("load_proper_classes 3: " ^ id))
  117 + | l -> failwith ("load_proper_classes: " ^ String.concat "\t" l))
  118 +
  119 +let simplify_pos = function
  120 + "subst" -> "noun"
  121 + | "depr" -> "noun"
  122 + | "adj" -> "adj"
  123 + | "adja" -> "adj"
  124 + | "adjc" -> "adj"
  125 + | "adjp" -> "adj"
  126 + | "ger" -> "verb"
  127 + | "pact" -> "verb"
  128 + | "ppas" -> "verb"
  129 + | "fin" -> "verb"
  130 + | "bedzie" -> "verb"
  131 + | "praet" -> "verb"
  132 + | "winien" -> "verb"
  133 + | "impt" -> "verb"
  134 + | "imps" -> "verb"
  135 + | "inf" -> "verb"
  136 + | "pcon" -> "verb"
  137 + | "pant" -> "verb"
  138 + | "pred" -> "verb"
  139 + | s -> s
  140 +
  141 +let find_senses lemma pos =
  142 +(*if pos = "ppron12" || pos = "ppron3" || pos = "siebie" then {t with senses=[lemma,["0"],0.]} else*) (* FIXME: ustalić co z zaimkami *)
  143 + let l = try StringMap.find !lumap lemma with Not_found -> [] in
  144 + let pos = simplify_pos pos in
  145 + Xlist.fold l [] (fun l (variant,syn_id) ->
  146 + let pos2,_ = try IntMap.find !synmap syn_id with Not_found -> failwith "find_senses" in
  147 + if pos <> pos2 then l else
  148 + (lemma ^ "-" ^ variant,
  149 + IntMap.fold (get_hipero syn_id) [] (fun hipero syn_id cost -> (synset_name syn_id, cost) :: hipero),
  150 + log10 (1. /. (try float_of_string variant with _ -> 3.))) :: l)
  151 +
  152 +let find_proper_senses senses =
  153 + List.flatten (Xlist.rev_map senses (fun sense ->
  154 + try StringMap.find !proper_classes sense with Not_found -> failwith ("find_proper_senses: " ^ sense)))
  155 +
  156 +let find_meaning lu_id =
  157 + let lemma,variant,syn_id = IntMap.find !lu_names lu_id in
  158 + lemma ^ "-" ^ variant,
  159 + IntMap.fold (get_hipero syn_id) [] (fun hipero syn_id cost -> (synset_name syn_id, cost) :: hipero),
  160 + log10 (1. /. (try float_of_string variant with _ -> 3.))
  161 +
  162 +let initialize () =
  163 + let a,b = load_lu lu_filename in
  164 + lu_names := a;
  165 + lumap := b;
  166 + synmap := load_syn syn_filename;
  167 + ex_hipo := load_ex_hipo ex_hipo_filename;
  168 + let a,b,c = load_predef !ex_hipo predef_filename in
  169 + ex_hipo := a;
  170 + predef_names := b;
  171 + predef := c;
  172 + proper_classes := load_proper_classes proper_classes_filename;
  173 + ()
... ...
lexSemantics/ENIAMvalence.ml
... ... @@ -613,3 +613,34 @@ let get_default_valence = function
613 613 | "adj" -> [Nieokreslony,NegationUndef,PredFalse,AspectUndef,[]]
614 614 | "adv" -> [Nieokreslony,NegationUndef,PredFalse,AspectUndef,[]]
615 615 | _ -> []
  616 +
  617 +open ENIAMcategoriesPL
  618 +
  619 +let get_aroles schema lemma = function
  620 + "pact" -> [Xlist.fold schema ([],"Arg","",true) (fun (sel,arole,arole_attr,arev) p ->
  621 + if p.gf = SUBJ then sel,p.role,p.role_attr,arev else sel,arole,arole_attr,arev)]
  622 + | "ppas" -> [Xlist.fold schema ([],"Arg","",true) (fun (sel,arole,arole_attr,arev) p ->
  623 + if p.gf = OBJ then sel,p.role,p.role_attr,arev else sel,arole,arole_attr,arev)]
  624 + | "subst" -> [
  625 + [Case,Eq,["dat"]],"Recipent","",false;
  626 + [Case,Eq,["inst"]],(if StringSet.mem ENIAMlexSemanticsData.subst_inst_time lemma then "Time" else "Instrument"),"",false;
  627 + [Case,Neq,["dat";"inst"]],"","",false]
  628 + | "adj" | "adjc" | "adjp" -> (* FIXME czy adjc i adjp mogą być adjunctami? *)
  629 + let l = try StringMap.find ENIAMlexSemanticsData.adj_roles lemma with Not_found -> ["Attribute",""] in
  630 + Xlist.map l (fun (role,role_attr) -> [],role,role_attr,false)
  631 + | "adv" ->
  632 + let modes = ENIAMcategoriesPL.adv_mode lemma in
  633 + let roles = try StringMap.find ENIAMlexSemanticsData.adv_roles lemma with Not_found -> ["Manner",""] in
  634 + Xlist.fold modes [] (fun l -> function
  635 + "mod" -> Xlist.fold roles l (fun l (role,role_attr) -> ([Mode,Eq,["mod"]],role,role_attr,false) :: l)
  636 + | "abl" -> ([Mode,Eq,["abl"]],"Location","Souce",false) :: l
  637 + | "adl" -> ([Mode,Eq,["adl"]],"Location","Goal",false) :: l
  638 + | "locat" -> ([Mode,Eq,["locat"]],"Location","",false) :: l
  639 + | "perl" -> ([Mode,Eq,["perl"]],"Path","",false) :: l
  640 + | "dur" -> ([Mode,Eq,["dur"]],"Duration","",false) :: l
  641 + | "temp" -> ([Mode,Eq,["temp"]],"Time","",false) :: l
  642 + | _ -> failwith "get_aroles")
  643 + | "qub" ->
  644 + let l = try StringMap.find ENIAMlexSemanticsData.qub_roles lemma with Not_found -> ["Arg",""] in
  645 + Xlist.map l (fun (role,role_attr) -> [],role,role_attr,false)
  646 + | _ -> [[],"","",false]
... ...
lexSemantics/ENIAMwalFrames.ml deleted
1   -(*
2   - * ENIAMlexSemantics is a library that assigns tokens with lexicosemantic information.
3   - * Copyright (C) 2016-2017 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
4   - * Copyright (C) 2016-2017 Institute of Computer Science Polish Academy of Sciences
5   - *
6   - * This library is free software: you can redistribute it and/or modify
7   - * it under the terms of the GNU Lesser General Public License as published by
8   - * the Free Software Foundation, either version 3 of the License, or
9   - * (at your option) any later version.
10   - *
11   - * This library is distributed in the hope that it will be useful,
12   - * but WITHOUT ANY WARRANTY; without even the implied warranty of
13   - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14   - * GNU Lesser General Public License for more details.
15   - *
16   - * You should have received a copy of the GNU Lesser General Public License
17   - * along with this program. If not, see <http://www.gnu.org/licenses/>.
18   - *)
19   -
20   -open ENIAMwalTypes
21   -open Xstd
22   -
23   -let expands,compreps,comprep_reqs,subtypes,equivs = ENIAMwalParser.load_realizations ()
24   -(*let verb_frames = ENIAMwalParser.load_frames (Paths.walenty_path ^ Paths.verb_filename)
25   -let noun_frames = ENIAMwalParser.load_frames (Paths.walenty_path ^ Paths.noun_filename)
26   -let adj_frames = ENIAMwalParser.load_frames (Paths.walenty_path ^ Paths.adj_filename)
27   -let adv_frames = ENIAMwalParser.load_frames (Paths.walenty_path ^ Paths.adv_filename) *)
28   -
29   -let walenty = (*StringMap.empty*)ENIAMwalTEI.load_walenty2 ()
30   -
31   -(*let _ = StringMap.iter walenty (fun pos map ->
32   - StringMap.iter map (fun lexeme frames ->
33   - Printf.printf "%s %s %d\n%!" pos lexeme (Xlist.size frames)))*)
34   -
35   -(*let all_frames =
36   - ["subst",noun_frames;
37   - "adj",adj_frames;
38   - "adv",adv_frames;
39   - "ger",verb_frames;
40   - "pact",verb_frames;
41   - "ppas",verb_frames;
42   - "fin",verb_frames;
43   - "praet",verb_frames;
44   - "impt",verb_frames;
45   - "imps",verb_frames;
46   - "inf",verb_frames;
47   - "pcon",verb_frames]*)
48   -
49   -let rec get_role_and_sense = function
50   - Phrase(Lex "się") -> "Theme","", []
51   - | PhraseAbbr(Xp "abl",_) -> "Location","Source", []
52   - | PhraseAbbr(Xp "adl",_) -> "Location","Goal", []
53   - | PhraseAbbr(Xp "caus",_) -> "Condition","", []
54   - | PhraseAbbr(Xp "dest",_) -> "Purpose","", []
55   - | PhraseAbbr(Xp "dur",_) -> "Duration","", []
56   - | PhraseAbbr(Xp "instr",_) -> "Instrument","", []
57   - | PhraseAbbr(Xp "locat",_) -> "Location","", []
58   - | PhraseAbbr(Xp "mod",_) -> "Manner","", []
59   - | PhraseAbbr(Xp "perl",_) -> "Path","", []
60   - | PhraseAbbr(Xp "temp",_) -> "Time","", []
61   - | PhraseAbbr(Advp "abl",_) -> "Location","Source", []
62   - | PhraseAbbr(Advp "adl",_) -> "Location","Goal", []
63   - | PhraseAbbr(Advp "dur",_) -> "Duration","", []
64   - | PhraseAbbr(Advp "locat",_) -> "Location","", []
65   - | PhraseAbbr(Advp "mod",_) -> "Manner","", []
66   - | PhraseAbbr(Advp "perl",_) -> "Path","", []
67   - | PhraseAbbr(Advp "temp",_) -> "Time","", []
68   -(* | PhraseAbbr(Advp "pron",_) -> "Arg","", []
69   - | PhraseAbbr(Advp "misc",_) -> "Arg","", []*)
70   - | PhraseAbbr(Distrp,_) -> "Distributive","", [] (* FIXME: to jest kwantyfikator *)
71   - | PhraseAbbr(Possp,_) -> "Possesive","", []
72   - | LexPhraseMode("abl",_,_) -> "Location","Source", []
73   - | LexPhraseMode("adl",_,_) -> "Location","Goal", []
74   - | LexPhraseMode("caus",_,_) -> "Condition","", []
75   - | LexPhraseMode("dest",_,_) -> "Purpose","", []
76   - | LexPhraseMode("dur",_,_) -> "Duration","", []
77   - | LexPhraseMode("instr",_,_) -> "Instrument","", []
78   - | LexPhraseMode("locat",_,_) -> "Location","", []
79   - | LexPhraseMode("mod",_,_) -> "Manner","", []
80   - | LexPhraseMode("perl",_,_) -> "Path","", []
81   - | LexPhraseMode("temp",_,_) -> "Time","", []
82   - | _ -> "Arg","", []
83   -
84   -
85   -(*let rec get_gf_role = function
86   - [],Phrase(NP case) -> "C", "", ["T"]
87   - | [],Phrase(AdjP case) -> "R", "", ["T"]
88   - | [],Phrase(NumP(case,_)) -> "C", "", ["T"]
89   - | [],Phrase(PrepNP _) -> "C", "", ["T"]
90   - | [],Phrase(PrepAdjP _) -> "C", "", ["T"]
91   - | [],Phrase(PrepNumP _) -> "C", "", ["T"]
92   - | [],Phrase(ComprepNP _) -> "C", "", ["T"]
93   - | [],Phrase(ComparP _) -> "C", "", ["T"]
94   - | [],Phrase(CP _) -> "C", "", ["T"]
95   - | [],Phrase(NCP(case,_,_)) -> "C", "", ["T"]
96   - | [],Phrase(PrepNCP _) -> "C", "", ["T"]
97   - | [],Phrase(InfP _) -> "C", "", ["T"]
98   - | [],Phrase(FixedP _) -> "C", "", ["T"]
99   - | [],Phrase Or -> "C", "", ["T"] (* FIXME: zbadać w walentym faktyczne użycia or, bo to nie tylko zdania, ale też np(nom) w cudzysłowach *)
100   - | [],Phrase(Lex "się") -> "C", "Ptnt", ["T"]
101   - | [],PhraseAbbr(Xp mode,_) -> "C", mode, ["T"]
102   - | [],PhraseAbbr(Advp "pron",_) -> "R", "", ["T"]
103   - | [],PhraseAbbr(Advp "misc",_) -> "R", "", ["T"]
104   - | [],PhraseAbbr(Advp mode,_) -> "C", mode, ["T"]
105   - | [],PhraseAbbr(Nonch,_) -> "C", "", ["T"]
106   - | [],PhraseAbbr(Distrp,_) -> "C", "Distr", ["T"]
107   - | [],PhraseAbbr(Possp,_) -> "C", "Poss", ["T"]
108   - | [],LexPhraseMode(mode,_,_) -> "C", mode, ["T"]
109   - | [],LexPhrase((SUBST(_,case),_) :: _,_) -> "C", "", ["T"]
110   - | [],LexPhrase((PREP _,_) :: _,_) -> "C", "", ["T"]
111   - | [],LexPhrase((NUM(case,_,_),_) :: _,_) -> "C", "", ["T"]
112   - | [],LexPhrase((ADJ(_,case,_,_),_) :: _,_) -> "C", "", ["T"]
113   - | [],LexPhrase((ADV _,_) :: _,_) -> "C", "", ["T"]
114   - | [],LexPhrase((GER(_,case,_,_,_,_),_) :: _,_) -> "C", "", ["T"]
115   - | [],LexPhrase((PACT(_,case,_,_,_,_),_) :: _,_) -> "C", "", ["T"]
116   - | [],LexPhrase((PPAS(_,case,_,_,_),_) :: _,_) -> "C", "", ["T"]
117   - | [],LexPhrase((INF _,_) :: _,_) -> "C", "", ["T"]
118   - | [],LexPhrase((QUB,_) :: _,_) -> "C", "", ["T"]
119   - | [],LexPhrase((COMPAR,_) :: _,_) -> "C", "", ["T"]
120   - | [],LexPhrase((COMP _,_) :: _,_) -> "C", "", ["T"]
121   - | [],morf -> print_endline(*failwith*) ("get_gf: []," ^ ENIAMwalStringOf.morf morf);"","",[]
122   - | _,Phrase(InfP _) -> "X", "", ["T"]
123   - | _,Phrase(CP _) -> "X", "", ["T"] (* zwykle możliwa koordynacja z infp *)
124   - | _,Phrase _ -> "X", "", ["T"]
125   - | _,PhraseAbbr _ -> "X", "", ["T"]
126   - | _,LexPhraseMode _ -> "X", "", ["T"]
127   - | _,LexPhrase((INF _,_) :: _,_) -> "X", "", ["T"]
128   - | _,LexPhrase _ -> "X", "", ["T"]
129   - | _,morf -> failwith ("get_gf: _," ^ ENIAMwalStringOf.morf morf)*)
130   -
131   -(*let gf_rank = Xlist.fold [
132   - "",1;
133   - ] StringMap.empty (fun gf_rank (gf,v) -> StringMap.add gf_rank gf v)*)
134   -
135   -(*let agregate_gfs s gfs_roles =
136   -(* fst (Xlist.fold gfs ("",0) (fun (best_gf,best_rank) gf ->
137   - let rank = try StringMap.find gf_rank gf with Not_found -> failwith ("agregate_gfs: " ^ gf) in
138   - if rank > best_rank then gf, rank else best_gf, best_rank))*)
139   -(* let gfs,roles = List.split gfs_roles in
140   - let gfs = StringSet.to_list (Xlist.fold gfs StringSet.empty StringSet.add) in
141   - if Xlist.size gfs > 1 then print_endline ("agregate_gfs: " ^ String.concat " " gfs);
142   - if Xlist.size roles > 1 then print_endline ("agregate_gfs: " ^ String.concat " " roles);*)
143   - let gf,role,prefs = List.hd gfs_roles in
144   - {s with gf=gf; role=role; prefs=prefs}
145   -
146   -let rec make_gfs schema =
147   - let schema = Xlist.map schema (function
148   - {gf="subj"} as s -> {s with gf="SUBJ"; role="Agnt"; prefs=["T"]; morfs=make_gfs_morfs s.morfs}
149   - | {gf="obj"} as s -> {s with gf="OBJ"; role="Ptnt"; prefs=["T"]; morfs=make_gfs_morfs s.morfs}
150   - | {gf=""} as s -> agregate_gfs {s with morfs=make_gfs_morfs s.morfs} (Xlist.map s.morfs (fun morf -> get_gf_role (s.ce,morf)))
151   - | {gf=t} -> failwith ("make_gfs: " ^ t)) in
152   -(* let schema = List.rev (fst (Xlist.fold schema ([],StringMap.empty) (fun (schema,map) s ->
153   - try
154   - let n = StringMap.find map s.gf in
155   - {s with gf=s.gf ^ string_of_int (n+1)} :: schema,
156   - StringMap.add map s.gf (n+1)
157   - with Not_found ->
158   - s :: schema, StringMap.add map s.gf 1))) in*)
159   - schema
160   -
161   -and make_gfs_morfs morfs =
162   - List.flatten (Xlist.map morfs (function
163   - Phrase _ as morf -> [morf]
164   - | PhraseAbbr(Advp _,[]) -> [Phrase AdvP]
165   - | PhraseAbbr(_,[]) -> failwith "make_gfs_morfs"
166   - | PhraseAbbr(_,morfs) -> make_gfs_morfs morfs
167   - | LexPhrase(pos_lex,(restr,schema)) -> [LexPhrase(pos_lex,(restr,make_gfs schema))]
168   - | LexPhraseMode(_,pos_lex,(restr,schema)) -> [LexPhrase(pos_lex,(restr,make_gfs schema))]
169   - | _ -> failwith "make_gfs_morfs"))*)
170   -
171   -let mark_nosem_morfs morfs =
172   - Xlist.map morfs (function
173   - Phrase(PrepNP(_,prep,c)) -> Phrase(PrepNP(NoSem,prep,c))
174   - | Phrase(PrepAdjP(_,prep,c)) -> Phrase(PrepAdjP(NoSem,prep,c))
175   - | Phrase(PrepNumP(_,prep,c)) -> Phrase(PrepNumP(NoSem,prep,c))
176   -(* | Phrase(ComprepNP(_,prep)) -> Phrase(ComprepNP(NoSem,prep)) *) (* FIXME: na razie ComprepNP są zawsze semantyczne *)
177   -(* | Phrase(ComparNP(_,prep,c)) -> Phrase(ComparNP(NoSem,prep,c)) (* FIXME: pomijam niesemantyczny compar *)
178   - | Phrase(ComparPP(_,prep)) -> Phrase(ComparPP(NoSem,prep))*)
179   - | Phrase(PrepNCP(_,prep,c,ct,co)) -> Phrase(PrepNCP(NoSem,prep,c,ct,co))
180   - | t -> t)
181   -
182   -
183   -let agregate_role_and_sense s l =
184   - let roles,senses = Xlist.fold l (StringSet.empty,StringSet.empty) (fun (roles,senses) (role,role_attr,sense) ->
185   - StringSet.add roles (role ^ " " ^ role_attr),
186   - Xlist.fold sense senses StringSet.add) in
187   - let roles = if StringSet.size roles = 1 then roles else StringSet.remove roles "Arg " in
188   - let role,role_attr =
189   - match Str.split (Str.regexp " ") (StringSet.min_elt roles) with
190   - [r;a] -> r,a
191   - | [r] -> r,""
192   - | _ -> failwith "agregate_role_and_sense" in
193   - {s with role=role; role_attr=role_attr(*; sel_prefs=StringSet.to_list senses*)}
194   -
195   -let rec assign_role_and_sense schema =
196   - Xlist.map schema (function
197   - {gf=SUBJ} as s ->
198   - if s.role = "" then {s with role="Initiator"; sel_prefs=["ALL"]; morfs=assign_role_and_sense_morfs s.morfs}
199   - else {s with morfs=assign_role_and_sense_morfs (mark_nosem_morfs s.morfs)}
200   - | {gf=OBJ} as s ->
201   - if s.role = "" then {s with role="Theme"; sel_prefs=["ALL"]; morfs=assign_role_and_sense_morfs s.morfs}
202   - else {s with morfs=assign_role_and_sense_morfs (mark_nosem_morfs s.morfs)}
203   - | {gf=ARG} as s ->
204   - if s.role = "" then agregate_role_and_sense {s with sel_prefs=["ALL"]; morfs=assign_role_and_sense_morfs s.morfs}
205   - (Xlist.map s.morfs (fun morf -> get_role_and_sense morf))
206   - else {s with morfs=assign_role_and_sense_morfs (mark_nosem_morfs s.morfs)}
207   - | _ -> failwith "assign_role_and_sense")
208   -
209   -and assign_role_and_sense_morfs morfs =
210   - List.flatten (Xlist.map morfs (function
211   - Phrase _ as morf -> [morf]
212   - | E _ as morf -> [morf]
213   - | PhraseAbbr(Advp _,[]) -> [Phrase AdvP]
214   - | PhraseAbbr(_,[]) -> failwith "assign_role_and_sense_morfs"
215   - | PhraseAbbr(_,morfs) -> assign_role_and_sense_morfs morfs
216   - | LexPhrase(pos_lex,(restr,schema)) -> [LexPhrase(pos_lex,(restr,assign_role_and_sense schema))]
217   - | LexPhraseMode(_,pos_lex,(restr,schema)) -> [LexPhrase(pos_lex,(restr,assign_role_and_sense schema))]
218   - | _ -> failwith "assign_role_and_sense_morfs"))
219   -
220   -(*let _ =
221   - Xlist.iter walenty_filenames (fun filename ->
222   - print_endline filename;
223   - let frames = load_frames (walenty_path ^ filename) in
224   - StringMap.iter frames (fun _ l ->
225   - Xlist.iter l (fun (refl,opinion,negation,pred,aspect,schema) ->
226   - ignore (process_opinion opinion);
227   - ignore (process_negation [Text negation]);
228   - ignore (process_pred [Text pred]);
229   - ignore (process_aspect [Text aspect]);
230   - ignore (assign_pro_args (make_gfs (process_schema expands subtypes equivs schema))))))*)
231   -
232   -let remove_trivial_args schema =
233   - Xlist.fold schema [] (fun l (_,_,_,morfs) ->
234   - let morfs = Xlist.fold morfs [] (fun morfs -> function
235   - Phrase(AdjP _) -> morfs
236   - | Phrase(NP(Case "gen")) -> morfs
237   - | Phrase(NCP(Case "gen",_,_)) -> morfs
238   - | Phrase(PrepNP _) -> morfs
239   - | Phrase(FixedP _) -> morfs
240   - | LexPhrase([ADJ _,_],_) -> morfs
241   - | LexPhrase([PPAS _,_],_) -> morfs
242   - | LexPhrase([PACT _,_],_) -> morfs
243   - | LexPhrase([SUBST(_,Case "gen"),_],_) -> morfs
244   - | LexPhrase([PREP _,_;_],_) -> morfs
245   - | morf -> morf :: morfs) in
246   - if morfs = [] then l else morfs :: l)
247   -
248   -(* leksykalizacje do zmiany struktury
249   -lex([PREP(gen),'z';SUBST(sg,gen),'nazwa'],atr1[OBL{lex([QUB,'tylko'],natr[])}])
250   -lex([PREP(loc),'na';SUBST(sg,loc),'papier'],atr1[OBL{lex([QUB,'tylko'],natr[])}])
251   -lex([PREP(acc),'w';SUBST(pl,acc),'oko'],atr1[OBL-MOD{lex([ADV(pos),'prosto'],natr[])}])
252   -lex([PREP(gen),'z';SUBST(sg,gen),'most'],ratr1[OBL-MOD{lex([ADV(pos),'prosto'],natr[])}])
253   -lex([PREP(acc),'w';SUBST(pl,acc),'oko'],atr1[OBL-MOD{lex([ADV(pos),'prosto'],natr[])}])
254   -lex([PREP(gen),'z';SUBST(sg,gen),'most'],ratr1[OBL-MOD{lex([ADV(pos),'prosto'],natr[])}])
255   -lex([PREP(acc),'w';SUBST(pl,acc),'oko'],atr1[OBL-MOD{lex([ADV(pos),'prosto'],natr[])}])
256   -lex([PREP(acc),'w';SUBST(pl,acc),'oko'],atr1[OBL-MOD{lex([ADV(pos),'prosto'],natr[])}])
257   -lex([PREP(acc),'w';SUBST(pl,acc),'oko'],atr1[OBL-MOD{lex([ADV(pos),'prosto'],natr[])}])
258   -lex([PREP(acc),'w';SUBST(pl,acc),'oko'],atr1[OBL-MOD{lex([ADV(pos),'prosto'],natr[])}])
259   -lex([PREP(acc),'w';SUBST(pl,acc),'oko'],atr1[OBL-MOD{lex([ADV(pos),'prosto'],natr[])}])
260   -*)
261   -
262   -let num_arg_schema_field morfs =
263   - {gf=CORE; role="QUANT-ARG"; role_attr=""; sel_prefs=["ALL"]; cr=[]; ce=[]; dir=Forward; morfs=morfs}
264   -
265   -let std_arg_schema_field dir morfs =
266   - {gf=ARG; role="Arg"; role_attr=""; sel_prefs=["ALL"]; cr=[]; ce=[]; dir=dir; morfs=morfs}
267   -
268   -let simple_arg_schema_field morfs =
269   - {gf=ARG; role=""; role_attr=""; sel_prefs=["ALL"]; cr=[]; ce=[]; dir=Both; morfs=morfs}
270   -
271   -let nosem_refl_schema_field =
272   - {gf=NOSEM; role=""; role_attr=""; sel_prefs=["ALL"]; cr=[]; ce=[]; dir=Both; morfs=[Phrase(Lex "się")]}
273   -
274   -
275   -let expand_lexicalizations = function
276   - Frame(atrs,schema) -> Frame(atrs,expand_lexicalizations_schema schema)
277   -(* ComprepFrame(s,morfs) -> ComprepFrame(atrs,expand_lexicalizations_morfs morfs) *)
278   - | _ -> failwith "expand_lexicalizations"
279   -
280   -
281   -let prepare_schema_comprep expands subtypes equivs schema =
282   - assign_pro_args (assign_role_and_sense (ENIAMwalParser.expand_equivs_schema equivs (ENIAMwalParser.expand_subtypes subtypes (ENIAMwalParser.expand_schema expands schema))))
283   -
284   -let prepare_schema expands subtypes equivs schema =
285   - prepare_schema_comprep expands subtypes equivs (ENIAMwalParser.parse_schema schema)
286   -
287   -let prepare_schema_sem expands subtypes equivs schema =
288   - prepare_schema_comprep expands subtypes equivs schema
289   -
290   -
291   -let convert_frame expands subtypes equivs lexemes valence lexeme pos (refl,opinion,negation,pred,aspect,schema) =
292   -(* Printf.printf "convert_frame %s %s\n" lexeme pos; *)
293   - try
294   - if refl = "się" && not (StringMap.mem lexemes "się") then raise ImpossibleSchema else
295   - let frame =
296   - try StringMap.find default_frames refl (* w refl jest przekazywana informacja o typie domyślnej ramki *)
297   - with Not_found ->
298   - Frame(DefaultAtrs([],ENIAMwalParser.parse_refl [Text refl],
299   - ENIAMwalParser.parse_opinion opinion,
300   - ENIAMwalParser.parse_negation [Text negation],
301   - ENIAMwalParser.parse_pred [Text pred],
302   - ENIAMwalParser.parse_aspect [Text aspect]),
303   - prepare_schema expands subtypes equivs schema) in
304   - let frame = if StringMap.is_empty lexemes then frame else reduce_schema_frame lexemes frame in
305   - let frame = expand_lexicalizations frame in
306   - Xlist.fold (extract_lex_frames lexeme pos [] frame) valence (fun valence -> function
307   - lexeme,pos,Frame(atrs,schema) ->
308   - let schemas = simplify_lex (split_xor (split_or_coord schema)) in
309   - Xlist.fold schemas valence (fun valence schema ->
310   - let poss = try StringMap.find valence lexeme with Not_found -> StringMap.empty in
311   - let poss = StringMap.add_inc poss pos [Frame(atrs,schema)] (fun l -> Frame(atrs,schema) :: l) in
312   - StringMap.add valence lexeme poss)
313   - | lexeme,pos,LexFrame(id,pos2,restr,schema) ->
314   - let schemas = simplify_lex (split_xor (split_or_coord schema)) in
315   - Xlist.fold schemas valence (fun valence schema ->
316   - let poss = try StringMap.find valence lexeme with Not_found -> StringMap.empty in
317   - let poss = StringMap.add_inc poss pos [LexFrame(id,pos2,restr,schema)] (fun l -> LexFrame(id,pos2,restr,schema) :: l) in
318   - StringMap.add valence lexeme poss)
319   - | _ -> failwith "convert_frame")
320   - with ImpossibleSchema -> valence
321   -
322   -let convert_frame_sem expands subtypes equivs lexemes valence lexeme pos = function
323   - Frame(DefaultAtrs(meanings,refl,opinion,negation,pred,aspect),positions) ->
324   -(* Printf.printf "convert_frame_sem %s\n" (ENIAMwalStringOf.frame lexeme (Frame(DefaultAtrs(meanings,refl,opinion,negation,pred,aspect),positions))); *)
325   - (try
326   - if refl = ReflSie && not (StringMap.mem lexemes "się") then raise ImpossibleSchema else
327   - let frame =
328   - Frame(DefaultAtrs(meanings,refl,opinion,negation,pred,aspect),
329   - prepare_schema_sem expands subtypes equivs positions) in
330   - let frame = if StringMap.is_empty lexemes then frame else reduce_schema_frame lexemes frame in
331   - let frame = expand_lexicalizations frame in
332   - Xlist.fold (extract_lex_frames lexeme pos [] frame) valence (fun valence -> function
333   - lexeme,pos,Frame(atrs,schema) ->
334   - let schemas = simplify_lex (split_xor (split_or_coord schema)) in
335   - Xlist.fold schemas valence (fun valence schema ->
336   - let poss = try StringMap.find valence lexeme with Not_found -> StringMap.empty in
337   - let poss = StringMap.add_inc poss pos [Frame(atrs,schema)] (fun l -> Frame(atrs,schema) :: l) in
338   - StringMap.add valence lexeme poss)
339   - | lexeme,pos,LexFrame(id,pos2,restr,schema) ->
340   - let schemas = simplify_lex (split_xor (split_or_coord schema)) in
341   - Xlist.fold schemas valence (fun valence schema ->
342   - let poss = try StringMap.find valence lexeme with Not_found -> StringMap.empty in
343   - let poss = StringMap.add_inc poss pos [LexFrame(id,pos2,restr,schema)] (fun l -> LexFrame(id,pos2,restr,schema) :: l) in
344   - StringMap.add valence lexeme poss)
345   - | _ -> failwith "convert_frame_sem")
346   - with ImpossibleSchema -> valence)
347   - | _ -> failwith "convert_frame_sem"
348   -
349   -let make_comprep_frames_of_schema s = function
350   - [{cr=[];ce=[]; morfs=[LexPhrase([pos,Lexeme lex],(restr,schema))]}] ->
351   - lex,
352   - (match get_pos lex pos with [pos] -> pos | _ -> failwith "make_comprep_frame_of_schema 2"),
353   - ComprepFrame(s,pos,restr,schema)
354   - | schema -> failwith ("make_comprep_frame_of_schema: " ^ ENIAMwalStringOf.schema schema)
355   -
356   -let convert_comprep_frame expands subtypes equivs lexemes valence lexeme pos (s,morf) =
357   - try
358   - let schema = prepare_schema_comprep expands subtypes equivs [simple_arg_schema_field [morf]] in
359   - let schema = if StringMap.is_empty lexemes then schema else reduce_schema lexemes schema in
360   - let schema = expand_lexicalizations_schema schema in
361   - let lexeme,pos,frame = make_comprep_frames_of_schema s schema in
362   - Xlist.fold (extract_lex_frames lexeme pos [] frame) valence (fun valence -> function
363   - lexeme,pos,ComprepFrame(s,pos2,restr,schema) ->
364   - let schemas = simplify_lex (split_xor (split_or_coord schema)) in
365   - Xlist.fold schemas valence (fun valence schema ->
366   - let poss = try StringMap.find valence lexeme with Not_found -> StringMap.empty in
367   - let poss = StringMap.add_inc poss pos [ComprepFrame(s,pos2,restr,schema)] (fun l -> ComprepFrame(s,pos2,restr,schema) :: l) in
368   - StringMap.add valence lexeme poss)
369   - | lexeme,pos,LexFrame(id,pos2,restr,schema) ->
370   - let schemas = simplify_lex (split_xor (split_or_coord schema)) in
371   - Xlist.fold schemas valence (fun valence schema ->
372   - let poss = try StringMap.find valence lexeme with Not_found -> StringMap.empty in
373   - let poss = StringMap.add_inc poss pos [LexFrame(id,pos2,restr,schema)] (fun l -> LexFrame(id,pos2,restr,schema) :: l) in
374   - StringMap.add valence lexeme poss)
375   - | _ -> failwith "convert_comprep_frame")
376   - with ImpossibleSchema -> valence
377   -
378   -let remove_pro_args schema = (* FIXME: sprawdzić czy Pro i Null są zawsze na początku *)
379   - List.rev (Xlist.fold schema [] (fun schema -> function
380   - {morfs=[Phrase Pro]} -> schema
381   - | {morfs=(Phrase Pro) :: morfs} as s -> {s with morfs=morfs} :: schema
382   - | {morfs=[Phrase Null]} -> schema
383   - | {morfs=(Phrase Null) :: morfs} as s -> {s with morfs=morfs} :: schema
384   - | s -> s :: schema))
385   -
386   -
387   -
388   -(*let _ =
389   - let valence = Xlist.fold all_frames StringMap.empty (fun valence (pos,frame_map) ->
390   - print_endline pos;
391   - StringMap.fold frame_map valence (fun valence lexeme frames ->
392   - Xlist.fold frames valence (fun valence frame ->
393   -(* print_endline (ENIAMwalStringOf.unparsed_frame lexeme frame); *)
394   - convert_frame expands subtypes equivs StringMap.empty valence lexeme pos frame))) in
395   - print_endline "comprepnp";
396   - let valence = StringMap.fold compreps valence (fun valence lexeme frames ->
397   - Xlist.fold frames valence (fun valence (pos,frame) ->
398   - convert_comprep_frame expands subtypes equivs StringMap.empty valence lexeme pos frame)) in
399   - print_endline "expand_restr";
400   - let valence = StringMap.mapi valence (fun lexeme poss ->
401   - StringMap.mapi poss (fun pos frames ->
402   - List.flatten (Xlist.map frames (expand_restr valence lexeme pos)))) in
403   - print_endline "transform_frame";
404   - let _ = StringMap.mapi valence (fun lexeme poss ->
405   - StringMap.mapi poss (fun pos frames ->
406   -(* print_endline lexeme; *)
407   - List.flatten (Xlist.map frames (transform_frame lexeme pos)))) in
408   - print_endline "done";
409   - ()*)
410   -(* StringMap.iter valence (fun lexeme poss ->
411   - StringMap.iter poss (fun pos frames ->
412   - Xlist.iter frames (fun frame -> print_endline (ENIAMwalStringOf.frame lexeme frame))))*)
lexSemantics/ENIAMwalReduce.ml
... ... @@ -90,36 +90,21 @@ let select_comprep_adjuncts lexemes =
90 90 not (StringSet.is_empty (StringSet.intersection reqs lexemes)) then s :: l else l)
91 91 with Not_found -> l)
92 92  
93   -
94   -
95   -(* let rec assign_pro_args schema =
96   - Xlist.map schema (fun s ->
97   - let morfs = match s.morfs with
98   - (E p) :: l -> E Pro :: (E p) :: l
99   - | [LexPhrase _] as morfs -> morfs
100   - | [Phrase(FixedP _)] as morfs -> morfs
101   - | [Phrase(Lex _)] as morfs -> morfs
102   - (* | [Phrase Refl] as morfs -> morfs
103   - | [Phrase Recip] as morfs -> morfs*)
104   - | Phrase Null :: _ as morfs -> morfs
105   - | Phrase Pro :: _ as morfs -> morfs
106   - | morfs -> if s.gf <> SUBJ && s.cr = [] && s.ce = [] then (Phrase Null) :: morfs else (Phrase Pro) :: morfs in (* FIXME: ustalić czy są inne przypadki uzgodnienia *)
107   - (* let morfs = assign_pro_args_lex morfs in *) (* bez pro wewnątrz leksykalizacji *)
108   - {s with morfs=morfs}) *)
109   -
110   -(*let assign_pro_args_lex morfs =
111   - Xlist.map morfs (function
112   - Lex(morf,specs,lex,restr) -> LexN(morf,specs,lex,assign_pro_args_restr restr)
113   - | LexNum(morf,lex1,lex2,restr) -> LexNum(morf,lex1,lex2,assign_pro_args_restr restr)
114   - | LexCompar(morf,l) -> LexCompar(morf,make_gfs_lex l)
115   - | morf -> morf)
116   -
117   - and assign_pro_args_restr = function
118   - Natr -> Natr
119   - | Ratr1 schema -> Ratr1(assign_pro_args schema)
120   - | Atr1 schema -> Atr1(assign_pro_args schema)
121   - | Ratr schema -> Ratr(assign_pro_args schema)
122   - | Atr schema -> Atr(assign_pro_args schema)*)
  93 +let set_necessary schema =
  94 + Xlist.map schema (fun p ->
  95 + let nec =
  96 + if p.gf = ADJUNCT then Opt else
  97 + if Xlist.fold p.morfs false (fun b -> function
  98 + SimpleLexArg _ -> true
  99 + | LexArg _ -> true
  100 + | FixedP _ -> true
  101 + | _ -> b) then Req else
  102 + if p.gf <> SUBJ && p.cr = [] && p.ce = [] then Opt else
  103 + if Xlist.fold p.morfs false (fun b -> function
  104 + NP NomAgr -> true
  105 + | NCP(NomAgr,_,_) -> true
  106 + | _ -> b) then ProNG else Pro in
  107 + {p with is_necessary=nec})
123 108  
124 109 exception ImpossibleSchema
125 110  
... ...
lexSemantics/ENIAMwalRenderer.ml
... ... @@ -217,20 +217,39 @@ let render_lex_entry = function
217 217 (* Printf.printf "%s %s %s\n" pos lemma (ENIAMwalStringOf.schema schema); *)
218 218 selectors,render_lex_entry entry) *)
219 219  
  220 +let adjunct morfs = {empty_position with gf=ADJUNCT; is_necessary=Opt; morfs=Xlist.map morfs (fun morf -> LCG morf)}
  221 +let adjunct_multi dir morfs = {empty_position with gf=ADJUNCT; is_necessary=Multi; dir=dir; morfs=Xlist.map morfs (fun morf -> LCG morf)}
  222 +let adjunct_dir dir morfs = {empty_position with gf=ADJUNCT; is_necessary=Opt; dir=dir; morfs=Xlist.map morfs (fun morf -> LCG morf)}
  223 +let adjunct_ce ce morfs = {empty_position with gf=ADJUNCT; ce=[ce]; is_necessary=Opt; morfs=Xlist.map morfs (fun morf -> LCG morf)}
  224 +
220 225 let render_comprep prep = Both,Plus[One;Tensor[Atom "comprepnp"; Atom prep]]
221 226  
  227 +let render_connected_comprep prep = adjunct [Tensor[Atom "comprepnp"; Atom prep]]
  228 +
222 229 let render_prepnp prep cases =
223 230 Both,Plus(One :: List.flatten (Xlist.map cases (fun case ->
224 231 [Tensor[Atom "prepnp"; Atom prep; Atom case];
225 232 Tensor[Atom "prepncp"; Atom prep; Atom case; Top; Top]])))
226 233  
  234 +let render_connected_prepnp prep cases =
  235 + adjunct (List.flatten (Xlist.map cases (fun case ->
  236 + [Tensor[Atom "prepnp"; Atom prep; Atom case];
  237 + Tensor[Atom "prepncp"; Atom prep; Atom case; Top; Top]])))
  238 +
227 239 let render_prepadjp prep cases =
228 240 let postp = if prep = "z" || prep = "po" || prep = "na" then [Tensor[Atom "prepadjp"; Atom prep; Atom "postp"]] else [] in
229 241 Both,Plus(One :: postp @ (Xlist.map cases (fun case ->
230 242 Tensor[Atom "prepadjp"; Atom prep; Atom case])))
231 243  
  244 +let render_connected_prepadjp prep cases =
  245 + let postp = if prep = "z" || prep = "po" || prep = "na" then [Tensor[Atom "prepadjp"; Atom prep; Atom "postp"]] else [] in
  246 + adjunct (postp @ (Xlist.map cases (fun case ->
  247 + Tensor[Atom "prepadjp"; Atom prep; Atom case])))
  248 +
232 249 let render_compar prep = Both,Plus[One;Tensor[Atom "compar"; Atom prep; Top]]
233 250  
  251 +let render_connected_compar prep = adjunct [Tensor[Atom "compar"; Atom prep; Top]]
  252 +
234 253 let verb_adjuncts_simp = [
235 254 Both, Plus[One;Tensor[Atom "advp"; Atom "pron"]];
236 255 Both, Plus[One;Tensor[Atom "advp"; Atom "locat"]];
... ... @@ -249,6 +268,24 @@ let verb_adjuncts_simp = [
249 268 Both, Plus[One;Tensor[Atom "padvp"]];
250 269 ]
251 270  
  271 +let verb_connected_adjuncts_simp = [
  272 + adjunct [Tensor[Atom "advp"; Atom "pron"]];
  273 + adjunct [Tensor[Atom "advp"; Atom "locat"]];
  274 + adjunct [Tensor[Atom "advp"; Atom "abl"]];
  275 + adjunct [Tensor[Atom "advp"; Atom "adl"]];
  276 + adjunct [Tensor[Atom "advp"; Atom "perl"]];
  277 + adjunct [Tensor[Atom "advp"; Atom "temp"]];
  278 + adjunct [Tensor[Atom "advp"; Atom "dur"]];
  279 + adjunct [Tensor[Atom "advp"; Atom "mod"]];
  280 + adjunct [Tensor[Atom "np";Top;Atom "dat"; Top; Top];Tensor[Atom "ncp"; Top; Atom "dat"; Top; Top; Top; Top]];
  281 + adjunct [Tensor[Atom "np";Top;Atom "inst"; Top; Top];Tensor[Atom "ncp"; Top; Atom "inst"; Top; Top; Top; Top]];
  282 + adjunct [Tensor[Atom "date"];Tensor[Atom "day-lex"];Tensor[Atom "day-month"];Tensor[Atom "day"]];
  283 + adjunct_dir Forward_ [Tensor[Atom "cp";Top; Top]];
  284 + adjunct [Tensor[Atom "or"]];
  285 + adjunct [Tensor[Atom "lex";Atom "się";Atom "qub"]];
  286 + adjunct_ce "3" [Tensor[Atom "padvp"]];
  287 +]
  288 +
252 289 let proper_noun_adjuncts_simp = [
253 290 Both, Plus[One;Tensor[Atom "np";Top;Atom "gen"; Top; Top];Tensor[Atom "ncp"; Top; Atom "gen"; Top; Top; Top; Top]];
254 291 Forward, Plus[One;Tensor[Atom "np";Top;Atom "nom"; Top; Top];Tensor[Atom "np";Top;AVar "case"; Top; Top]];
... ... @@ -256,6 +293,13 @@ let proper_noun_adjuncts_simp = [
256 293 Forward, Plus[One;Tensor[Atom "adjp"; AVar "number"; AVar "case"; AVar "gender"]];
257 294 ]
258 295  
  296 +let proper_noun_connected_adjuncts_simp = [
  297 + adjunct [Tensor[Atom "np";Top;Atom "gen"; Top; Top];Tensor[Atom "ncp"; Top; Atom "gen"; Top; Top; Top; Top]];
  298 + adjunct_dir Forward_ [Tensor[Atom "np";Top;Atom "nom"; Top; Top];Tensor[Atom "np";Top;AVar "case"; Top; Top]];
  299 + adjunct_multi Backward_ [Tensor[Atom "adjp"; AVar "number"; AVar "case"; AVar "gender"]];
  300 + adjunct_dir Forward_ [Tensor[Atom "adjp"; AVar "number"; AVar "case"; AVar "gender"]];
  301 +]
  302 +
259 303 let common_noun_adjuncts_simp = [
260 304 Both, Plus[One;Tensor[Atom "np";Top;Atom "gen"; Top; Top];Tensor[Atom "ncp"; Top; Atom "gen"; Top; Top; Top; Top]];
261 305 Forward, Plus[One;Tensor[Atom "np";Top;Atom "nom"; Top; Top];Tensor[Atom "np";Top;AVar "case"; Top; Top]];
... ... @@ -263,15 +307,45 @@ let common_noun_adjuncts_simp = [
263 307 Forward, Plus[One;Tensor[Atom "adjp"; AVar "number"; AVar "case"; AVar "gender"]];
264 308 ]
265 309  
  310 +let common_noun_connected_adjuncts_simp = [
  311 + adjunct [Tensor[Atom "np";Top;Atom "gen"; Top; Top];Tensor[Atom "ncp"; Top; Atom "gen"; Top; Top; Top; Top]];
  312 + adjunct_dir Forward_ [Tensor[Atom "np";Top;Atom "nom"; Top; Top];Tensor[Atom "np";Top;AVar "case"; Top; Top]];
  313 + adjunct_multi Backward_ [Tensor[Atom "adjp"; AVar "number"; AVar "case"; AVar "gender"]];
  314 + adjunct_dir Forward_ [Tensor[Atom "adjp"; AVar "number"; AVar "case"; AVar "gender"]];
  315 +]
  316 +
266 317 let measure_noun_adjuncts_simp = [
267 318 Backward, Maybe(Tensor[Atom "adjp"; AVar "number"; AVar "case"; AVar "gender"]);
268 319 Forward, Plus[One;Tensor[Atom "adjp"; AVar "number"; AVar "case"; AVar "gender"]];
269 320 ]
270 321  
  322 +let measure_noun_connected_adjuncts_simp = [
  323 + adjunct_multi Backward_ [Tensor[Atom "adjp"; AVar "number"; AVar "case"; AVar "gender"]];
  324 + adjunct_dir Forward_ [Tensor[Atom "adjp"; AVar "number"; AVar "case"; AVar "gender"]];
  325 +]
  326 +
271 327 let adj_adjuncts_simp = [
272 328 Both, Plus[One;Tensor[Atom "advp"; Top]];
273 329 ]
274 330  
  331 +let adj_connected_adjuncts_simp = [
  332 + adjunct [Tensor[Atom "advp"; Top]];
  333 +]
  334 +
275 335 let adv_adjuncts_simp = [
276 336 Both, Plus[One;Tensor[Atom "advp"; Top]];
277 337 ]
  338 +
  339 +let adv_connected_adjuncts_simp = [
  340 + adjunct [Tensor[Atom "advp"; Top]];
  341 + ]
  342 +
  343 +let assing_pref_morfs = function
  344 + "po","postp" -> [
  345 + LCG(Tensor[Atom "adjp"; Atom "sg"; Atom "dat"; Atom "m1"]);
  346 + LCG(Tensor[Atom "adjp"; Top; Atom "postp"; Top])]
  347 + | "z","postp" -> [LCG(Tensor[Atom "adjp"; Atom "sg"; Atom "nom"; Atom "f"])]
  348 + | "na","postp" -> [LCG(Tensor[Atom "advp"; Top])]
  349 + | _,case -> [
  350 + LCG(Tensor[Atom "np"; Top; Atom case; Top; Top]);
  351 + LCG(Tensor[Atom "adjp"; Top; Atom case; Top])]
... ...
lexSemantics/ENIAMwalStringOf.ml
... ... @@ -101,6 +101,9 @@ let gf = function
101 101 SUBJ -> "subj"
102 102 | OBJ -> "obj"
103 103 | ARG -> "arg"(*""*)
  104 + | ADJUNCT -> "adjunct"
  105 + | CORE -> "core"
  106 + | NOSEM -> "nosem"
104 107  
105 108 let pos = function
106 109 SUBST(n,c) -> "SUBST(" ^ number n ^ "," ^ case c ^ ")"
... ... @@ -173,13 +176,26 @@ let controllees l =
173 176 "1" -> "controllee"
174 177 | n -> "controllee" ^ n)
175 178  
  179 +let necessary = function
  180 + Opt -> "opt"
  181 + | Req -> "req"
  182 + | Pro -> "pro"
  183 + | ProNG -> "prong"
  184 + | Multi -> "multi"
  185 +
  186 +let dir = function
  187 + Both_ -> ""
  188 + | Forward_ -> "/"
  189 + | Backward_ -> "\\"
  190 +
176 191 let rec schema schema =
177 192 String.concat "+" (Xlist.map schema (fun s ->
178 193 String.concat "," (
179 194 (if s.gf = ARG then [] else [gf s.gf])@s.mode@
  195 + (if s.is_necessary = Opt then [] else [necessary s.is_necessary])@
180 196 (if s.role = "" then [] else [s.role])@
181 197 (if s.role_attr = "" then [] else [s.role_attr])@
182   - (*s.sel_prefs@*)(controllers s.cr)@(controllees s.ce)) ^ "{" ^ String.concat ";" (Xlist.map s.morfs phrase) ^ "}"))
  198 + (*s.sel_prefs@*)(controllers s.cr)@(controllees s.ce)) ^ (dir s.dir) ^ "{" ^ String.concat ";" (Xlist.map s.morfs phrase) ^ "}"))
183 199 (*
184 200 and morf = function
185 201 Phrase p -> phrase p
... ... @@ -222,7 +238,7 @@ let rec connected_schema schema =
222 238 *)
223 239  
224 240 let meaning m =
225   - m.name ^ "-" ^ m.variant
  241 + m.name ^ "-" ^ m.variant
226 242  
227 243 let lex_entry = function
228 244 SimpleLexEntry(le,p) ->
... ...
lexSemantics/ENIAMwalTypes.ml
... ... @@ -39,7 +39,7 @@ type aux = NoAux | PastAux | FutAux | ImpAux
39 39  
40 40 type nsem = Common of string | Time*)
41 41  
42   -type gf = SUBJ | OBJ | ARG
  42 +type gf = SUBJ | OBJ | ARG | ADJUNCT | CORE | NOSEM
43 43  
44 44 type pos =
45 45 SUBST of number * case
... ... @@ -99,13 +99,19 @@ type restr = Natr | Ratr | Ratrs | Ratr1 | Atr | Atr1 | NoRestr
99 99 type sel_prefs =
100 100 SynsetId of int
101 101 | Predef of string
102   - | RelationRole of string * string * string (* relacji * rola * atrybut roli *)
  102 + | SynsetName of string
  103 + | RelationRole of string * string * string (* relacja * rola * atrybut roli *)
  104 +
  105 +type necessary = Req | Opt | Pro | ProNG | Multi
  106 +
  107 +type direction = Both_ | Forward_ | Backward_
103 108  
104 109 type position = {psn_id: int; gf: gf; role: string; role_attr: string; sel_prefs: sel_prefs list;
105   - mode: string list; cr: string list; ce: string list; morfs: phrase list}
  110 + mode: string list; cr: string list; ce: string list; morfs: phrase list;
  111 + dir: direction; is_necessary: necessary}
106 112  
107 113 let empty_position =
108   - {psn_id=(-1); gf=ARG; role=""; role_attr=""; mode=[]; sel_prefs=[]; cr=[]; ce=[]; morfs=[]}
  114 + {psn_id=(-1); gf=ARG; role=""; role_attr=""; mode=[]; sel_prefs=[]; cr=[]; ce=[]; dir=Both_; morfs=[]; is_necessary=Opt}
109 115  
110 116 type meaning = {mng_id: int;
111 117 name: string;
... ...
lexSemantics/interface.ml
... ... @@ -83,10 +83,7 @@ let rec main_loop in_chan out_chan =
83 83 let _ =
84 84 prerr_endline message;
85 85 Arg.parse spec_list anon_fun usage_msg;
86   - ENIAMsubsyntax.initialize ();
87   - ENIAMcategoriesPL.initialize ();
88   - ENIAMwalParser.initialize ();
89   - ENIAMwalReduce.initialize ();
  86 + ENIAMlexSemantics.initialize ();
90 87 Gc.compact ();
91 88 prerr_endline "Ready!";
92 89 if !comm_stdio then main_loop stdin stdout
... ...
lexSemantics/makefile
... ... @@ -3,34 +3,33 @@ OCAMLOPT=ocamlopt
3 3 OCAMLDEP=ocamldep
4 4 INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I +eniam
5 5 OCAMLFLAGS=$(INCLUDES) -g
6   -#OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa eniam-lcg-parser.cmxa eniam-lcg-lexicon.cmxa #eniam-plWordnet.cmxa #eniam-lexSemantics.cmxa
7 6 OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa eniam-lcg-parser.cmxa eniam-lcg-lexicon.cmxa eniam-lexSemantics.cmxa
8 7 INSTALLDIR=`ocamlc -where`/eniam
9 8  
10   -SOURCES= entries.ml ENIAMwalTypes.ml ENIAMwalStringOf.ml ENIAMwalParser.ml ENIAMwalReduce.ml ENIAMvalence.ml ENIAMwalRenderer.ml ENIAMadjuncts.ml \
11   - ENIAMlexSemanticsTypes.ml ENIAMlexSemanticsStringOf.ml ENIAMlexSemanticsHTMLof.ml ENIAMlexSemanticsXMLof.ml ENIAMlexSemantics.ml #ENIAMlexSemanticsData.ml
  9 +SOURCES= entries.ml ENIAMwalTypes.ml ENIAMwalStringOf.ml ENIAMwalParser.ml ENIAMwalReduce.ml ENIAMlexSemanticsTypes.ml ENIAMlexSemanticsData.ml ENIAMvalence.ml ENIAMwalRenderer.ml ENIAMadjuncts.ml \
  10 + ENIAMlexSemanticsStringOf.ml ENIAMlexSemanticsHTMLof.ml ENIAMlexSemanticsXMLof.ml ENIAMplWordnet.ml ENIAMlexSemantics.ml
12 11  
13 12 all: eniam-lexSemantics.cma eniam-lexSemantics.cmxa
14 13  
15 14 install: all
16 15 mkdir -p $(INSTALLDIR)
17 16 cp eniam-lexSemantics.cmxa eniam-lexSemantics.a eniam-lexSemantics.cma $(INSTALLDIR)
18   - cp entries.cmi ENIAMwalTypes.cmi ENIAMwalStringOf.cmi ENIAMwalParser.cmi ENIAMwalReduce.cmi ENIAMvalence.cmi ENIAMwalRenderer.cmi ENIAMadjuncts.cmi ENIAMlexSemanticsTypes.cmi ENIAMlexSemanticsStringOf.cmi ENIAMlexSemanticsHTMLof.cmi ENIAMlexSemanticsXMLof.cmi ENIAMlexSemantics.cmi $(INSTALLDIR)
19   - cp entries.cmx ENIAMwalTypes.cmx ENIAMwalStringOf.cmx ENIAMwalParser.cmx ENIAMwalReduce.cmx ENIAMvalence.cmx ENIAMwalRenderer.cmx ENIAMadjuncts.cmx ENIAMlexSemanticsTypes.cmx ENIAMlexSemanticsStringOf.cmx ENIAMlexSemanticsHTMLof.cmx ENIAMlexSemanticsXMLof.cmx ENIAMlexSemantics.cmx $(INSTALLDIR)
  17 + cp entries.cmi ENIAMwalTypes.cmi ENIAMwalStringOf.cmi ENIAMwalParser.cmi ENIAMwalReduce.cmi ENIAMlexSemanticsData.cmi ENIAMvalence.cmi ENIAMwalRenderer.cmi ENIAMadjuncts.cmi ENIAMlexSemanticsTypes.cmi ENIAMlexSemanticsStringOf.cmi ENIAMlexSemanticsHTMLof.cmi ENIAMlexSemanticsXMLof.cmi ENIAMplWordnet.cmi ENIAMlexSemantics.cmi $(INSTALLDIR)
  18 + cp entries.cmx ENIAMwalTypes.cmx ENIAMwalStringOf.cmx ENIAMwalParser.cmx ENIAMwalReduce.cmx ENIAMlexSemanticsData.cmx ENIAMvalence.cmx ENIAMwalRenderer.cmx ENIAMadjuncts.cmx ENIAMlexSemanticsTypes.cmx ENIAMlexSemanticsStringOf.cmx ENIAMlexSemanticsHTMLof.cmx ENIAMlexSemanticsXMLof.cmx ENIAMplWordnet.cmx ENIAMlexSemantics.cmx $(INSTALLDIR)
20 19 mkdir -p /usr/share/eniam/lexSemantics
21   -# cp resources/* /usr/share/eniam/lexSemantics
22   - ln -s /usr/share/eniam/lexSemantics/proper_names_20160104.tab /usr/share/eniam/lexSemantics/proper_names.tab
23   - ln -s /usr/share/eniam/lexSemantics/proper_names_sgjp_polimorf_20151020.tab /usr/share/eniam/lexSemantics/proper_names_sgjp_polimorf.tab
  20 + cp resources/* /usr/share/eniam/lexSemantics
  21 +# ln -s /usr/share/eniam/lexSemantics/proper_names_20160104.tab /usr/share/eniam/lexSemantics/proper_names.tab
  22 +# ln -s /usr/share/eniam/lexSemantics/proper_names_sgjp_polimorf_20151020.tab /usr/share/eniam/lexSemantics/proper_names_sgjp_polimorf.tab
24 23  
25 24 install-local: all
26 25 mkdir -p $(INSTALLDIR)
27 26 cp eniam-lexSemantics.cmxa eniam-lexSemantics.a eniam-lexSemantics.cma $(INSTALLDIR)
28   - cp entries.cmi ENIAMwalTypes.cmi ENIAMwalStringOf.cmi ENIAMwalParser.cmi ENIAMwalReduce.cmi ENIAMvalence.cmi ENIAMwalRenderer.cmi ENIAMadjuncts.cmi ENIAMlexSemanticsTypes.cmi ENIAMlexSemantics.cmi $(INSTALLDIR)
29   - cp entries.cmx ENIAMwalTypes.cmx ENIAMwalStringOf.cmx ENIAMwalParser.cmx ENIAMwalReduce.cmx ENIAMvalence.cmx ENIAMwalRenderer.cmx ENIAMadjuncts.cmx ENIAMlexSemanticsTypes.cmx ENIAMlexSemantics.cmx $(INSTALLDIR)
  27 + cp entries.cmi ENIAMwalTypes.cmi ENIAMwalStringOf.cmi ENIAMwalParser.cmi ENIAMwalReduce.cmi ENIAMlexSemanticsData.cmi ENIAMvalence.cmi ENIAMwalRenderer.cmi ENIAMadjuncts.cmi ENIAMlexSemanticsTypes.cmi ENIAMlexSemanticsStringOf.cmi ENIAMlexSemanticsHTMLof.cmi ENIAMlexSemanticsXMLof.cmi ENIAMplWordnet.cmi ENIAMlexSemantics.cmi $(INSTALLDIR)
  28 + cp entries.cmx ENIAMwalTypes.cmx ENIAMwalStringOf.cmx ENIAMwalParser.cmx ENIAMwalReduce.cmx ENIAMlexSemanticsData.cmx ENIAMvalence.cmx ENIAMwalRenderer.cmx ENIAMadjuncts.cmx ENIAMlexSemanticsTypes.cmx ENIAMlexSemanticsStringOf.cmx ENIAMlexSemanticsHTMLof.cmx ENIAMlexSemanticsXMLof.cmx ENIAMplWordnet.cmx ENIAMlexSemantics.cmx $(INSTALLDIR)
30 29 mkdir -p /usr/local/share/eniam/lexSemantics
31   -# cp resources/* /usr/local/share/eniam/lexSemantics
32   - ln -s /usr/local/share/eniam/lexSemantics/proper_names_20160104.tab /usr/local/share/eniam/lexSemantics/proper_names.tab
33   - ln -s /usr/local/share/eniam/lexSemantics/proper_names_sgjp_polimorf_20151020.tab /usr/local/share/eniam/lexSemantics/proper_names_sgjp_polimorf.tab
  30 + cp resources/* /usr/local/share/eniam/lexSemantics
  31 +# ln -s /usr/local/share/eniam/lexSemantics/proper_names_20160104.tab /usr/local/share/eniam/lexSemantics/proper_names.tab
  32 +# ln -s /usr/local/share/eniam/lexSemantics/proper_names_sgjp_polimorf_20151020.tab /usr/local/share/eniam/lexSemantics/proper_names_sgjp_polimorf.tab
34 33  
35 34 eniam-lexSemantics.cma: $(SOURCES)
36 35 ocamlc -linkall -a -o eniam-lexSemantics.cma $(OCAMLFLAGS) $^
... ... @@ -38,7 +37,7 @@ eniam-lexSemantics.cma: $(SOURCES)
38 37 eniam-lexSemantics.cmxa: $(SOURCES)
39 38 ocamlopt -linkall -a -o eniam-lexSemantics.cmxa $(INCLUDES) $^
40 39  
41   -test: $(SOURCES) test.ml
  40 +test: test.ml
42 41 $(OCAMLOPT) -o test $(OCAMLOPTFLAGS) $^
43 42  
44 43 interface: interface.ml
... ...
plWordnet/resources/predef_prefs.tab renamed to lexSemantics/resources/predef_prefs.tab
1   -LUDZIE osoba 1 grupa ludzi 1
  1 +LUDZIE osoba 1 ludzie 1
2 2 ISTOTY LUDZIE osoba 1 istota żywa 1 grupa istot 1
3 3 PODMIOTY LUDZIE podmiot 3
4 4 JADŁO pokarm 1 napój 1
... ... @@ -17,4 +17,4 @@ SYTUACJA CZYNNOŚĆ czynność 1 zdarzenie 2 okoliczność 1 ciąg zdarzeń 1
17 17 KIEDY CZAS SYTUACJA
18 18 CZEMU CECHA SYTUACJA LUDZIE
19 19 ILOŚĆ ilość 1 rozmiar 1 rozmiar 2 jednostka 4 wielkość 6
20   -ALL PODMIOTY ISTOTY DOBRA SYTUACJA
  20 +#ALL PODMIOTY ISTOTY DOBRA SYTUACJA
... ...
plWordnet/resources/proper_classes.tab renamed to lexSemantics/resources/proper_classes.tab
lexSemantics/test.ml
... ... @@ -23,6 +23,7 @@ let test_strings = [
23 23 (* "Kot miauczy w październiku."; *)
24 24 (* "Arabia Saudyjska biegnie."; *)
25 25 "Chłopcy mają ulicę kwiatami.";
  26 + "Kot miałczy.";
26 27 (* "Np. Ala.";
27 28 "Kot np. miauczy.";
28 29 "Szpak frunie. Kot miauczy.";
... ... @@ -61,6 +62,7 @@ let test_strings = [
61 62 ]
62 63  
63 64 let _ =
  65 + ENIAMlexSemantics.initialize ();
64 66 print_endline "Testy wbudowane";
65 67 Xlist.iter test_strings (fun s ->
66 68 print_endline ("\nTEST: " ^ s);
... ...
morphology/check_rule_compos.py 0 → 100644
  1 +# Blame Szymon Rutkowski - szymon@szymonrutkowski.pl - Oct 2016.
  2 +# This file is intended to check the NKJP1M frequency list against rules derived from SGJP.
  3 +# If you want to use this, review the end of this file (filenames, column structure) and run with python3.
  4 +
  5 +import re
  6 +
  7 +def load_rules_file(fname):
  8 + rule_list = []
  9 + contents = ''
  10 +
  11 + with open(fname) as inp:
  12 + contents = inp.read()
  13 +
  14 + contents = contents.split('\n')
  15 +
  16 + for line in contents:
  17 + data = line.split('\t')
  18 + if len(data) != 7:
  19 + print('Skipped line in rules: '+line)
  20 + rule_list.append(tuple(data))
  21 +
  22 + return rule_list
  23 +
  24 +def make_rules_table(rule_list):
  25 + "Given rule_list as list of tuples (name, freq, classification, prefix, suffix, stem ending, \
  26 + tag), create a dictionary: ending -> list of applicable rules, also as tuples. Indices are \
  27 + prefixes followed by - (hyphen) and suffixes preced by -, up to three characters; longer \
  28 + affixes are included in the lists for their outermost three-character parts. If both empty \
  29 + affixes are empty, rule gets listed under '-'."
  30 +
  31 + rtable = dict()
  32 +
  33 + for rl in rule_list:
  34 + if len(rl) != 7:
  35 + print("Skipped invalid rule: "+str(rl))
  36 + continue
  37 +
  38 + index = '-'
  39 +
  40 + if rl[3] != '':
  41 + index = rl[3] + '-'
  42 + elif rl[4] != '':
  43 + index = '-' + rl[4]
  44 +
  45 + if len(index) > 4:
  46 + if index[0] == '-': # suffix
  47 + index = '-' + index[-3:]
  48 + else: # prefix
  49 + index = index[:3] + '-'
  50 +
  51 + if index in rtable:
  52 + rtable[index].append(rl)
  53 + else:
  54 + rtable[index] = [ rl ]
  55 +
  56 + return rtable
  57 +
  58 +# just ripped from compare_morphosyn.py, guess it'll be better to keep those scripts self-contained
  59 +# note that liberal_tagcomp is mainly suitable for checking NKJP against SGJP, when checking
  60 +# a resource obeying more SJGP'ish tagging convention the strict_tagcomp will be better
  61 +def strict_tagcomp(tag1, tag2):
  62 + tag1_items = tag1.split(':')
  63 + tag2_items = tag2.split(':')
  64 +
  65 + if (tag1_items[0] != tag2_items[0] # POS
  66 + or len(tag1_items) != len(tag2_items)):
  67 + return False
  68 +
  69 + for (i, item) in enumerate(tag1_items):
  70 + if not item in tag2_items[i].split('.'):
  71 + return False
  72 +
  73 + return True
  74 +
  75 +def liberal_tagcomp(tag1, tag2):
  76 + tag1_items = tag1.split(':')
  77 + tag2_items = tag2.split(':')
  78 +
  79 + if (tag1_items[0] != tag2_items[0] # POS
  80 + or len(tag1_items) != len(tag2_items)):
  81 + return False
  82 +
  83 + for (i, item) in enumerate(tag1_items):
  84 + # remove tags n1, f1...
  85 + item = re.sub(r'(n1|n2|n3)', 'n', item)
  86 + model = re.sub(r'(n1|n2|n3|p2|p3)', 'n', tag2_items[i]).split('.')
  87 + if not item in model and model[0] != '_': # underscore as a catchall
  88 + return False
  89 +
  90 + return True
  91 +
  92 +def is_recognizable(entry, rules_table):
  93 + "Check whether entry, given as triple (word_form, lemma, tags) is recognizable using \
  94 + rules_table as obtained from make_rules_table() function. Return the rule's class \
  95 + (third column, usually empty string)."
  96 +
  97 + for chunk_size in range(3, -1, -1):
  98 + if len(entry[0]) < chunk_size:
  99 + continue
  100 +
  101 + rule_candidates = []
  102 +
  103 + pref_ind = entry[0][:chunk_size]+'-'
  104 + suf_ind = '-'+entry[0][-chunk_size:]
  105 + if pref_ind in rules_table:
  106 + rule_candidates += rules_table[ pref_ind ]
  107 + if suf_ind in rules_table:
  108 + rule_candidates += rules_table[ suf_ind ]
  109 +
  110 + if len(rule_candidates) == 0:
  111 + continue
  112 + for rl in rule_candidates:
  113 + # check first the prefix and suffix (the above code just finds rules that are
  114 + # potentially relevant), and tag; then proceed to reconstructing the lemma
  115 + if (entry[0][:len(rl[3])] == rl[3] and
  116 + # check for empty suffix, since string[-0:] returns the string unchanged
  117 + (len(rl[4]) == 0 or entry[0][-len(rl[4]):] == rl[4]) and
  118 + liberal_tagcomp(entry[2], rl[6])):
  119 + # trim the prefix and suffix, and glue the ending suggested by the rule;
  120 + # compare with the original lemma
  121 + if (entry[0][len(rl[3]):-len(rl[4])]+rl[5] == entry[1]
  122 + # another corner case, str[:-0] would be ''
  123 + or (len(rl[4]) == 0 and entry[0][len(rl[3]):]+rl[5] == entry[1])):
  124 + return rl[2]
  125 +
  126 + return False
  127 +
  128 +rlist = load_rules_file('../resources/SGJP/freq_rules.tab')
  129 +rtable = make_rules_table(rlist)
  130 +
  131 +def esccurl(string) :
  132 + "Escape the curly brackets in the string, for using it with the string formatter."
  133 + return string.replace('{', '{{').replace('}', '}}')
  134 +
  135 +with open('../resources/NKJP1M/NKJP1M-tagged-frequency.tab') as inp:
  136 + with open('freq_with_rules.tab', 'w+') as out:
  137 + for line in inp:
  138 + line = line.strip()
  139 + data = line.split('\t')
  140 + if len(data) != 8: # column count of TAGGED frequency list
  141 + print('Skipped line in the list: '+line)
  142 + continue
  143 +
  144 + # The following was added to work on partially done tagged frequency, to get rid of the
  145 + # previous COMPOS classification. Otherwise we'd want to use something like this:
  146 + # fmt = esccurl(line) + '\t{0}' # simple format string, applicable to raw frequency list
  147 + # previous COMPOS column is in data[4], so we skip it below
  148 + fmt = esccurl('\t'.join(data[0:4])) + '\t{0}\t' + esccurl('\t'.join(data[5:]))
  149 +
  150 + rl_class = is_recognizable((data[0], data[1], data[2]), rtable)
  151 + if rl_class == '':
  152 + print(fmt.format('COMPOS'), file=out)
  153 + elif rl_class != False:
  154 + print(fmt.format('COMPOS-'+rl_class), file=out)
  155 + else:
  156 + # Try again, with lowered lemma and word form.
  157 + rl_class_low = is_recognizable((data[0].lower(), data[1].lower(), data[2]),
  158 + rtable)
  159 + if rl_class_low == '':
  160 + print(fmt.format('COMPOS-LWR'), file=out)
  161 + elif rl_class_low != False:
  162 + print(fmt.format('COMPOS-LWR-'+rl_class_low), file=out)
  163 + else:
  164 + print(fmt.format('NCOMPOS'), file=out)
... ...
morphology/compare_morphosyn.py 0 → 100644
  1 +# Blame Szymon Rutkowski - szymon@szymonrutkowski.pl - Oct 2016.
  2 +#
  3 +# Given a frequency list and groundtruth dictionary, tag the entries on the frequency list with some
  4 +# automatic tags (can be seen at the end of this file).
  5 +#
  6 +# Run from Python3, with -i (inspect option), eg. `python3 -i compare_morphosyn.py`.
  7 +# Then invoke something like (with # representing Python prompt):
  8 +# # sgjp = load_sgjp('../../NLP resources/sgjp-20160724.tab')
  9 +# # nkjp = load_nkjp('../resources/NKJP1M/NKJP1M-frequency.tab')
  10 +# # notmatching(nkjp, sgjp, liberal_tagcomp, 'raw_tagged_frequency.tab') # (may take a while)
  11 +# # ^D # Ctrl-D when done
  12 +# The last argument points the result file, liberal_tagcomp is the most sane tag comparing function.
  13 +
  14 +import functools
  15 +import re
  16 +import unicodedata
  17 +
  18 +def load_sgjp(fname):
  19 + sgjp = dict()
  20 + with open(fname) as inp:
  21 + for line in inp:
  22 + data = line.strip().split('\t')
  23 +
  24 + if len(data) < 3:
  25 + print('Skipped line: ' + line.strip())
  26 + continue
  27 +
  28 + word_form = data[0]
  29 +
  30 + lemma = ''
  31 + lemma_sub = ''
  32 + if data[1] == ':':
  33 + lemma = [':']
  34 + else:
  35 + lemma = data[1].split(":")[0] # lemma subidentifier
  36 + if len(data[1].split(":")) > 1:
  37 + lemma_sub = data[1].split(":")[1]
  38 + if word_form.find('_') == -1:
  39 + lemma = lemma.replace('_', ' ')
  40 +
  41 + tags = data[2]
  42 +
  43 + notes = ''
  44 + if len(data) == 4:
  45 + notes = data[3]
  46 +
  47 + if lemma in sgjp:
  48 + sgjp[lemma].append([word_form, tags, notes])
  49 + else:
  50 + sgjp[lemma] = [ [word_form, tags, notes, lemma_sub] ]
  51 + return sgjp
  52 +
  53 +
  54 +def load_nkjp(fname):
  55 + nkjp = []
  56 + with open(fname) as inp:
  57 + nkjp = inp.read().split('\n')
  58 + for (n, line) in enumerate(nkjp):
  59 + nkjp[n] = nkjp[n].split('\t') # word_form, lemma, tags, freq
  60 + if len(nkjp[n]) != 5:
  61 + print('Skipped line: ' + str(n))
  62 + del nkjp[n]
  63 + return nkjp
  64 +
  65 +def naive_tagcomp(tag1, tag2):
  66 + return (tag1 == tag2)
  67 +
  68 +def strict_tagcomp(tag1, tag2):
  69 + tag1_items = tag1.split(':')
  70 + tag2_items = tag2.split(':')
  71 +
  72 + if (tag1_items[0] != tag2_items[0] # POS
  73 + or len(tag1_items) != len(tag2_items)):
  74 + return False
  75 +
  76 + for (i, item) in enumerate(tag1_items):
  77 + if not item in tag2_items[i].split('.'):
  78 + return False
  79 +
  80 + return True
  81 +
  82 +def liberal_tagcomp(tag1, tag2):
  83 + tag1_items = tag1.split(':')
  84 + tag2_items = tag2.split(':')
  85 +
  86 + if (tag1_items[0] != tag2_items[0] # POS
  87 + or len(tag1_items) != len(tag2_items)):
  88 + return False
  89 +
  90 + for (i, item) in enumerate(tag1_items):
  91 + # remove tags n1, f1...
  92 + item = re.sub(r'(n1|n2|n3)', 'n', item)
  93 + model = re.sub(r'(n1|n2|n3|p2|p3)', 'n', tag2_items[i]).split('.')
  94 + if not item in model and model[0] != '_': # underscore as a catchall
  95 + return False
  96 +
  97 + return True
  98 +
  99 +def compare_entries(nkjp_entry, sgjp_forms, tagcomp_func):
  100 + found = False
  101 + case1 = False
  102 + case2 = False
  103 + case3 = False
  104 + for (s, sgjp_form) in enumerate(sgjp_forms):
  105 + nkjp_word = nkjp_entry[0]
  106 + nkjp_tag = re.sub(r':$', '', nkjp_entry[2])
  107 + if nkjp_tag != nkjp_entry[2]:
  108 + print("Corrected tag %s for %s %s" % (nkjp_entry[2], nkjp_entry[0], nkjp_entry[1]))
  109 + sgjp_word = sgjp_form[0]
  110 + sgjp_tag = sgjp_form[1]
  111 +
  112 + tag_match = tagcomp_func(nkjp_tag, sgjp_tag) # do it once
  113 +
  114 + if sgjp_word == nkjp_word and tag_match: # word_nkjp_word & tag
  115 + found = True
  116 + break
  117 +
  118 + elif tag_match: # tag okay, try with other letter cases
  119 + if len(nkjp_word) > 1 and nkjp_word.lower().capitalize() == nkjp_word: # Aaaa -> aaaa
  120 + if sgjp_word == nkjp_word.lower():
  121 + case1 = True
  122 + if not case1 and nkjp_word.lower() != nkjp_word:
  123 + if sgjp_word == nkjp_word.capitalize(): # AAAA -> Aaaa
  124 + case2 = True
  125 + elif sgjp_word == nkjp_word.lower(): # AAAA -> aaaa, A -> a
  126 + case3 = True
  127 + return (found, case1, case2, case3)
  128 +
  129 +def tab_format(collection, label):
  130 + "Convert a collection used by notmatching() function to a string of tabbed entries."
  131 + fmt = ''
  132 + for etr in collection:
  133 + fmt = fmt + '\t'.join(etr)+ '\t' + label + '\n'
  134 + #print("formatted for "+label+", "+str(len(fmt)) + " bytes")
  135 + return fmt
  136 +
  137 +def nonalphab(string):
  138 + for char in string:
  139 + if unicodedata.category(char)[0] == 'L': # 'letter'
  140 + return False
  141 + return True
  142 +
  143 +def notmatching(nkjp, sgjp, tagcomp_func, result_file):
  144 + notmatching = []
  145 + matching = []
  146 + case1_notmatching = [] # Aaaa -> aaaa
  147 + case2_notmatching = [] # AAAA -> Aaaa
  148 + case3_notmatching = [] # AAAA -> aaaa, A -> a
  149 + lower_matching = [] # matching with form and lemma converted to lowercase
  150 + symbols = []
  151 + notmatching_numeric = []
  152 +
  153 + for (n, nkjp_entry) in enumerate(nkjp):
  154 +
  155 + lemma = nkjp_entry[1].strip()
  156 + form = nkjp_entry[0].strip()
  157 + # Warn about stripped whitespaces.
  158 + if lemma != nkjp_entry[1]:
  159 + print("Stripped whitespaces in lemma: %s" % nkjp_entry[1])
  160 + if form != nkjp_entry[0]:
  161 + print("Stripped whitespaces in form: %s" % nkjp_entry[0])
  162 +
  163 + # Abbreviations are automatically classified as symbols.
  164 + if nkjp_entry[2][:4] == 'brev':
  165 + symbols.append(nkjp_entry)
  166 + continue
  167 +
  168 + sgjp_forms = []
  169 + lowered_lemma = False # indicates if lemma was converted to lowercase
  170 + if lemma in sgjp: # lemma matching
  171 + sgjp_forms = sgjp[lemma]
  172 + else:
  173 + if lemma.lower() in sgjp:
  174 + lowered_lemma = True
  175 + sgjp_forms = sgjp[lemma.lower()]
  176 + else:
  177 + # Continue when we can't find even lowered lemma in SGJP.
  178 + if nonalphab(form) and nonalphab(lemma):
  179 + symbols.append(nkjp_entry)
  180 + elif re.match(r"^[123456789]", form, flags=re.L) != None:
  181 + notmatching_numeric.append(nkjp_entry)
  182 + continue
  183 + else:
  184 + notmatching.append(nkjp_entry)
  185 + continue
  186 +
  187 + # The following is executed only if the lemma (maybe in lowercase) was found in SGJP.
  188 +
  189 + # Go through the entry if it wasn't found in SGJP
  190 + found, case1, case2, case3 = 0, 1, 2, 3 # indices in boolean tuple below
  191 + case = compare_entries(nkjp_entry, sgjp_forms, tagcomp_func)
  192 +
  193 + # one more desperate attempt at lowering the lemma, if nothing was found
  194 + if (not lowered_lemma) and not True in case:
  195 + if lemma.lower() in sgjp:
  196 + sgjp_forms = sgjp[lemma.lower()]
  197 + case = compare_entries(nkjp_entry, sgjp_forms, tagcomp_func)
  198 + if True in case:
  199 + lowered_lemma = True
  200 + else: # revert for consistency
  201 + sgjp_forms = sgjp[lemma]
  202 +
  203 + if lowered_lemma and (case[found] or case[case1] or case[case2] or case[case3]):
  204 + lower_matching.append(nkjp_entry)
  205 + continue
  206 +
  207 + if case[found]:
  208 + matching.append(nkjp_entry)
  209 + continue
  210 +
  211 + if nonalphab(form) and nonalphab(lemma):
  212 + symbols.append(nkjp_entry)
  213 + continue
  214 + if re.match(r"^[123456789]", form, flags=re.L) != None:
  215 + notmatching_numeric.append(nkjp_entry)
  216 + continue
  217 +
  218 + if case[case1]:
  219 + case1_notmatching.append(nkjp_entry)
  220 + continue
  221 + if case[case2]:
  222 + case2_notmatching.append(nkjp_entry)
  223 + continue
  224 + if case[case3]:
  225 + case3_notmatching.append(nkjp_entry)
  226 + continue
  227 +
  228 + # when everything failed:
  229 + notmatching.append(nkjp_entry)
  230 +
  231 + collections = [nkjp, matching, case1_notmatching, case2_notmatching, case3_notmatching,
  232 + lower_matching, symbols, notmatching_numeric, notmatching]
  233 + # sort the entries in collections by frequency
  234 + collections = list(map((lambda coll: sorted(coll, reverse=True, key=(lambda etr: int(etr[3])))),
  235 + collections))
  236 + freqs = list(map(lambda coll: functools.reduce((lambda x, y: x+y),
  237 + [int(etr[3]) for etr in coll]), # sum of sets' frequencies
  238 + collections))
  239 + descs = ["Total:",
  240 + "Found:",
  241 + "Found when Aaa -> aaa (lemma):",
  242 + "Found when AAA -> Aaa (lemma):",
  243 + "Found when AAA -> aaa (lemma):",
  244 + "Found when word form and lemma are converted to lowercase:",
  245 + "Symbols:",
  246 + "Not found, numeric:",
  247 + "Not found, other:"]
  248 +
  249 + for (i, _) in enumerate(collections):
  250 + info = (len(collections[i]), 100.0*(len(collections[i])/len(collections[0])),
  251 + freqs[i], 100.0*(freqs[i]/freqs[0]))
  252 + print((descs[i]+" %d entries (%.2f%%), %d occurences (%.2f%%)") % info)
  253 +
  254 + # below we skip nkjp, which contains everything
  255 + labels = ['SGJP-EXACT\tNCH\tCORR', 'SGJP-LMM-UNCAPITAL\tNCH\tCORR',
  256 + 'SGJP-LMM-CAPITAL\tNCH\tCORR', 'SGJP-LMM-LOWER\tNCH\tCORR',
  257 + 'SGJP-BTH-LOWER\tNCH\tCORR', 'NON-SGJP\tSYMB\tCORR',
  258 + 'NON-SGJP\tLATEK\tCORR', 'NON-SGJP\tCW\tCORR']
  259 + with open(result_file, 'w+') as out:
  260 + for (c, coll) in enumerate(collections[1:]):
  261 + print(tab_format(coll, labels[c]), file=out)
... ...
morphology/compos_alt.py 0 → 100644
  1 +# Blame Szymon Rutkowski - szymon@szymonrutkowski.pl - Nov 2016.
  2 +# This file is intended to check the (partially tagged) NKJP1M frequency list against list of exce-
  3 +# ptions from morphological rules derived from SGJP.
  4 +# If you want to use this, review the end of this file (filenames, column structure) and run with python3.
  5 +
  6 +import re
  7 +
  8 +# just ripped from compare_morphosyn.py, guess it'll be better to keep those scripts self-contained
  9 +# note that liberal_tagcomp is mainly suitable for checking NKJP against SGJP, when checking
  10 +# a resource obeying more SJGP'ish tagging convention the strict_tagcomp will be better
  11 +def strict_tagcomp(tag1, tag2):
  12 + tag1_items = tag1.split(':')
  13 + tag2_items = tag2.split(':')
  14 +
  15 + if (tag1_items[0] != tag2_items[0] # POS
  16 + or len(tag1_items) != len(tag2_items)):
  17 + return False
  18 +
  19 + for (i, item) in enumerate(tag1_items):
  20 + if not item in tag2_items[i].split('.'):
  21 + return False
  22 +
  23 + return True
  24 +
  25 +def liberal_tagcomp(tag1, tag2):
  26 + tag1_items = tag1.split(':')
  27 + tag2_items = tag2.split(':')
  28 +
  29 + if (tag1_items[0] != tag2_items[0] # POS
  30 + or len(tag1_items) != len(tag2_items)):
  31 + return False
  32 +
  33 + for (i, item) in enumerate(tag1_items):
  34 + # remove tags n1, f1...
  35 + item = re.sub(r'(n1|n2|n3)', 'n', item)
  36 + model = re.sub(r'(n1|n2|n3|p2|p3)', 'n', tag2_items[i]).split('.')
  37 + if not item in model and model[0] != '_': # underscore as a catchall
  38 + return False
  39 +
  40 + return True
  41 +
  42 +# the bulk of the following ripped from check_rule_compos.py
  43 +def esccurl(string) :
  44 + "Escape the curly brackets in the string, for using it with the string formatter."
  45 + return string.replace('{', '{{').replace('}', '}}')
  46 +
  47 +alt_idx = dict() # indexed by data[0] - word form
  48 +
  49 +with open('../resources/SGJP/alt.tab') as alt_src:
  50 + for line in alt_src:
  51 + line = line.strip()
  52 + data = line.split('\t')
  53 + if len(data) != 3:
  54 + print('Skipped line in the alt list: '+line)
  55 + continue
  56 + # handle lemmas with subclassification after colon
  57 + if data[1].find(':') != -1 and data[1] != ':':
  58 + data[1] = data[1][: data[1].find(':')]
  59 + # each entry consists of 0 - list of lemmas, 1 - list of tags
  60 + if not data[0] in alt_idx:
  61 + alt_idx[data[0]] = [[data[1]], [data[2]]]
  62 + else:
  63 + alt_idx[data[0]][0].append(data[1])
  64 + alt_idx[data[0]][1].append(data[2])
  65 +
  66 +with open('../resources/NKJP1M/NKJP1M-tagged-frequency.tab') as inp:
  67 + with open('freq_with_alt.tab', 'w+') as out:
  68 + for line in inp:
  69 + line = line.strip()
  70 + data = line.split('\t')
  71 + if len(data) != 8: # column count of TAGGED frequency list
  72 + print('Skipped line in the list: '+line)
  73 + continue
  74 +
  75 + # The following was added to work on partially done tagged frequency, to get rid of the
  76 + # previous COMPOS classification. Otherwise we'd want to use something like this:
  77 + # fmt = esccurl(line) + '\t{0}' # simple format string, applicable to raw frequency list
  78 + # previous COMPOS column is in data[4], so we skip it below
  79 + fmt = esccurl('\t'.join(data[0:4])) + '\t{0}\t' + esccurl('\t'.join(data[5:]))
  80 +
  81 + matched = False
  82 + if data[0] in alt_idx:
  83 + tagcomps = list(map(lambda x: liberal_tagcomp(data[2], x), alt_idx[data[0]][1]))
  84 + tagnum = True in tagcomps and tagcomps.index(True)
  85 + # (make sure that if lemma is matching, it belongs to the matching tag)
  86 + if tagnum != -1 and tagnum != False and alt_idx[data[0]][0][tagnum] == data[1]:
  87 + print(fmt.format('COMPOS-ALT'), file=out)
  88 + matched = True
  89 + # try again with lowering word form and lemma:
  90 + if not matched and data[0].lower() in alt_idx:
  91 + tagcomps = list(map(lambda x: liberal_tagcomp(data[2], x), # data[2] - tag stays the same
  92 + alt_idx[data[0].lower()][1]))
  93 + tagnum = True in tagcomps and tagcomps.index(True)
  94 + if tagnum != -1 and tagnum != False and alt_idx[data[0].lower()][0][tagnum] == data[1].lower():
  95 + print(fmt.format('COMPOS-LWR-ALT'), file=out)
  96 + matched = True
  97 + if not matched:
  98 + print(line, file=out)
... ...
morphology/data/alternations.dic
... ... @@ -1078,3 +1078,222 @@
1078 1078 zetrz star
1079 1079 źr ziar
1080 1080  
  1081 +@obce_funkcjonalnie_twarde_a
  1082 + ac ak ac
  1083 + aq ak aq
  1084 + ay ay ay
  1085 + c c c
  1086 + c k c
  1087 + ch ch ch
  1088 + cq k cq
  1089 + dok dk dok
  1090 + du du du
  1091 + dź dź dź
  1092 + ec ek ec
  1093 + ey ey ey
  1094 + gh dz gh
  1095 + gh gh gh
  1096 + gi gi gi
  1097 + gn gn gn
  1098 + gy gy gy
  1099 + ic ik ic
  1100 + kie k kie
  1101 + lj lj lj
  1102 + ly ly ly
  1103 + m m m
  1104 + nc nk nc
  1105 + oc ok oc
  1106 + oy oy oy
  1107 + que k que
  1108 + ri ri ri
  1109 + shu shu shu
  1110 + st sti st
  1111 + szu szu szu
  1112 + ti ti ti
  1113 + tu tu tu
  1114 + use uz use
  1115 + v v v
  1116 + x ksi x
  1117 + x x x
  1118 + z z z
  1119 +
  1120 +@obce_funkcjonalnie_twarde_e
  1121 + ac ak ac
  1122 + ac aki ac
  1123 + aq ak aq
  1124 + aq aki aq
  1125 + ay ay ay
  1126 + c c c
  1127 + c ce c
  1128 + co ki c
  1129 + cq k cq
  1130 + cq ki cq
  1131 + cques ki cques
  1132 + dok dk dok
  1133 + dok dki dok
  1134 + dź dź dź
  1135 + ec ek ec
  1136 + ec eki ec
  1137 + ey ey ey
  1138 + gh dz gh
  1139 + gh gh gh
  1140 + gh ghi gh
  1141 + gi gi gi
  1142 + gn gn gn
  1143 + gue gi gue
  1144 + gues gi gues
  1145 + gy gy gy
  1146 + ic iki ic
  1147 + je j je
  1148 + ke ki ke
  1149 + kie k kie
  1150 + kie ki kie
  1151 + lj lj lj
  1152 + ly ly ly
  1153 + nc nk nc
  1154 + nc nki nc
  1155 + oc ok oc
  1156 + oc oki oc
  1157 + ov ov ov
  1158 + oy oy oy
  1159 + que k que
  1160 + que ki que
  1161 + ques ki ques
  1162 + ri ri ri
  1163 + st sti st
  1164 + ti ti ti
  1165 + use uz use
  1166 + x ksi x
  1167 +* ng n ng
  1168 +
  1169 +@obce_funkcjonalnie_twarde_ie
  1170 + ais aisi ais
  1171 + bes bi bes
  1172 + ce si ce
  1173 + ct kci ct
  1174 + de dzi de
  1175 + dh dz dh
  1176 + dh dzi dh
  1177 + fe fi fe
  1178 + h dz h
  1179 + m mi m
  1180 + me mi me
  1181 + ne ni ne
  1182 + nes ni nes
  1183 + nh ni nh
  1184 + ph fi ph
  1185 + phe fi phe
  1186 + re rz re
  1187 + res rz res
  1188 + rh rz rh
  1189 + rs rz rs
  1190 + se si se
  1191 + sne śni sne
  1192 + ste ści ste
  1193 + stes ści stes
  1194 + te ci te
  1195 + tes ci tes
  1196 + th ci th
  1197 + th si th
  1198 + the ci the
  1199 + thes ci thes
  1200 + use uzi use
  1201 + v vi v
  1202 + ve vi ve
  1203 + x ksi x
  1204 +* ng ni ng
  1205 +
  1206 +@obce_funkcjonalnie_twarde_iy
  1207 + ai ai ai
  1208 + bee bee bee
  1209 + co ki c
  1210 + cques ki cques
  1211 + dhi dhi dhi
  1212 + die die die
  1213 + dy dy dy
  1214 + dí dí dí
  1215 + eu eu eu
  1216 + ghi ghi ghi
  1217 + gie gie gie
  1218 + gue gi gue
  1219 + gues gi gues
  1220 + ji ji ji
  1221 + ke ki ke
  1222 + kie kie kie
  1223 + ky c ky
  1224 + ky ki ky
  1225 + ky ky ky
  1226 + lye lye lye
  1227 + nii ni nii
  1228 + nii nii nii
  1229 + nyi nyi nyi
  1230 + pi pi pi
  1231 + pie pie pie
  1232 + que ki que
  1233 + ques ki ques
  1234 + re ry re
  1235 + rii ri rii
  1236 + rii rii rii
  1237 + ssy ssy ssy
  1238 + szky scy szky
  1239 + szky ski szky
  1240 + szky szky szky
  1241 + thy thy thy
  1242 + tie tie tie
  1243 + zo zi zo
  1244 + ři ři ři
  1245 +* ng ngy ng
  1246 +
  1247 +@obce_funkcjonalnie_miekkie_ii_wyglos
  1248 + ay ai ay
  1249 + ci cyj ci
  1250 + czi czyj czi
  1251 + oy oi oy
  1252 +* ay ai ay
  1253 +* oy oi oy
  1254 +
  1255 +@obce_ais
  1256 + ais ais ais
  1257 +
  1258 +@obce_apostrof
  1259 + bes bes bes
  1260 + by by by
  1261 + ce ce ce
  1262 + cy cy cy
  1263 + de de de
  1264 + dy dy dy
  1265 + es es es
  1266 + fe fe fe
  1267 + ge ge ge
  1268 + ges ges ges
  1269 + gues gues gues
  1270 + ke ke ke
  1271 + kes kes kes
  1272 + ky ky ky
  1273 + le le le
  1274 + les les les
  1275 + ly ly ly
  1276 + ly ly ly
  1277 + me me me
  1278 + my my my
  1279 + ne ne ne
  1280 + nes nes nes
  1281 + pe pe pe
  1282 + phe phe phe
  1283 + phy phy phy
  1284 + ques ques ques
  1285 + re re re
  1286 + res res res
  1287 + ry ry ry
  1288 + se se se
  1289 + ses ses ses
  1290 + sy sy sy
  1291 + te te te
  1292 + tes tes tes
  1293 + the the the
  1294 + thes thes thes
  1295 + thy thy thy
  1296 + uy uy uy
  1297 + ve ve ve
  1298 + ze ze ze
  1299 +* es e es
... ...
morphology/data/interps_general.tab 0 → 100644
  1 +adj-sup adj:sg:nom.voc:n1.n2:sup Ca
  2 +adj-sup adj:sg:nom.voc:m1.m2.m3:sup Cb
  3 +adj-sup adj:sg:nom.voc:f:sup Cc
  4 +adj-sup adj:sg:loc:m1.m2.m3.n1.n2:sup Cd
  5 +adj-sup adj:sg:loc:f:sup Ce
  6 +adj-sup adj:sg:inst:m1.m2.m3.n1.n2:sup Cf
  7 +adj-sup adj:sg:inst:f:sup Cg
  8 +adj-sup adj:sg:gen:m1.m2.m3.n1.n2:sup Ch
  9 +adj-sup adj:sg:gen:f:sup Ci
  10 +adj-sup adj:sg:dat:m1.m2.m3.n1.n2:sup Cj
  11 +adj-sup adj:sg:dat:f:sup Ck
  12 +adj-sup adj:sg:acc:n1.n2:sup Cl
  13 +adj-sup adj:sg:acc:m3:sup Cm
  14 +adj-sup adj:sg:acc:m1.m2:sup Cn
  15 +adj-sup adj:sg:acc:f:sup Co
  16 +adj-sup adj:pl:nom.voc:m2.m3.f.n1.n2.p2.p3:sup Cp
  17 +adj-sup adj:pl:nom.voc:m1.p1:sup Cq
  18 +adj-sup adj:pl:loc:m1.m2.m3.f.n1.n2.p1.p2.p3:sup Cr
  19 +adj-sup adj:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:sup Cs
  20 +adj-sup adj:pl:gen:m1.m2.m3.f.n1.n2.p1.p2.p3:sup Ct
  21 +adj-sup adj:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:sup Cu
  22 +adj-sup adj:pl:acc:m2.m3.f.n1.n2.p2.p3:sup Cv
  23 +adj-sup adj:pl:acc:m1.p1:sup Cw
  24 +verb-neg ppas:sg:nom.voc:m1.m2.m3:perf:neg Ua
  25 +verb-neg ppas:sg:nom.voc:m1.m2.m3:imperf:neg Ua
  26 +verb-neg ppas:sg:nom.voc:m1.m2.m3:imperf.perf:neg Ua
  27 +verb-neg ppas:sg:nom.voc:f:perf:neg Ub
  28 +verb-neg ppas:sg:nom.voc:f:imperf:neg Ub
  29 +verb-neg ppas:sg:nom.voc:f:imperf.perf:neg Ub
  30 +verb-neg ppas:sg:nom.acc.voc:n1.n2:perf:neg Uc
  31 +verb-neg ppas:sg:nom.acc.voc:n1.n2:imperf:neg Uc
  32 +verb-neg ppas:sg:nom.acc.voc:n1.n2:imperf.perf:neg Uc
  33 +verb-neg ppas:sg:inst.loc:m1.m2.m3.n1.n2:perf:neg Ud
  34 +verb-neg ppas:sg:inst.loc:m1.m2.m3.n1.n2:imperf:neg Ud
  35 +verb-neg ppas:sg:inst.loc:m1.m2.m3.n1.n2:imperf.perf:neg Ud
  36 +verb-neg ppas:sg:gen:m1.m2.m3.n1.n2:perf:neg Ue
  37 +verb-neg ppas:sg:gen:m1.m2.m3.n1.n2:imperf:neg Ue
  38 +verb-neg ppas:sg:gen:m1.m2.m3.n1.n2:imperf.perf:neg Ue
  39 +verb-neg ppas:sg:gen.dat.loc:f:perf:neg Uf
  40 +verb-neg ppas:sg:gen.dat.loc:f:imperf:neg Uf
  41 +verb-neg ppas:sg:gen.dat.loc:f:imperf.perf:neg Uf
  42 +verb-neg ppas:sg:dat:m1.m2.m3.n1.n2:perf:neg Ug
  43 +verb-neg ppas:sg:dat:m1.m2.m3.n1.n2:imperf:neg Ug
  44 +verb-neg ppas:sg:dat:m1.m2.m3.n1.n2:imperf.perf:neg Ug
  45 +verb-neg ppas:sg:acc:m3:perf:neg Uh
  46 +verb-neg ppas:sg:acc:m3:imperf:neg Uh
  47 +verb-neg ppas:sg:acc:m3:imperf.perf:neg Uh
  48 +verb-neg ppas:sg:acc:m1.m2:perf:neg Ui
  49 +verb-neg ppas:sg:acc:m1.m2:imperf:neg Ui
  50 +verb-neg ppas:sg:acc:m1.m2:imperf.perf:neg Ui
  51 +verb-neg ppas:sg:acc.inst:f:perf:neg Uj
  52 +verb-neg ppas:sg:acc.inst:f:imperf:neg Uj
  53 +verb-neg ppas:sg:acc.inst:f:imperf.perf:neg Uj
  54 +verb-neg ppas:pl:nom.voc:m1.p1:perf:neg Uk
  55 +verb-neg ppas:pl:nom.voc:m1.p1:imperf:neg Uk
  56 +verb-neg ppas:pl:nom.voc:m1.p1:imperf.perf:neg Uk
  57 +verb-neg ppas:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:perf:neg Ul
  58 +verb-neg ppas:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf:neg Ul
  59 +verb-neg ppas:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf.perf:neg Ul
  60 +verb-neg ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:perf:neg Um
  61 +verb-neg ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg Um
  62 +verb-neg ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg Um
  63 +verb-neg ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:perf:neg Un
  64 +verb-neg ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg Un
  65 +verb-neg ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg Un
  66 +verb-neg ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:perf:neg Uo
  67 +verb-neg ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg Uo
  68 +verb-neg ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg Uo
  69 +verb-neg ppas:pl:acc:m1.p1:perf:neg Up
  70 +verb-neg ppas:pl:acc:m1.p1:imperf:neg Up
  71 +verb-neg ppas:pl:acc:m1.p1:imperf.perf:neg Up
  72 +verb-neg pact:sg:nom.voc:m1.m2.m3:imperf:neg Va
  73 +verb-neg pact:sg:nom.voc:m1.m2.m3:imperf.perf:neg Va
  74 +verb-neg pact:sg:nom.voc:f:imperf:neg Vb
  75 +verb-neg pact:sg:nom.voc:f:imperf.perf:neg Vb
  76 +verb-neg pact:sg:nom.acc.voc:n1.n2:imperf:neg Vc
  77 +verb-neg pact:sg:nom.acc.voc:n1.n2:imperf.perf:neg Vc
  78 +verb-neg pact:sg:inst.loc:m1.m2.m3.n1.n2:imperf:neg Vd
  79 +verb-neg pact:sg:inst.loc:m1.m2.m3.n1.n2:imperf.perf:neg Vd
  80 +verb-neg pact:sg:gen:m1.m2.m3.n1.n2:imperf:neg Ve
  81 +verb-neg pact:sg:gen:m1.m2.m3.n1.n2:imperf.perf:neg Ve
  82 +verb-neg pact:sg:gen.dat.loc:f:imperf:neg Vf
  83 +verb-neg pact:sg:gen.dat.loc:f:imperf.perf:neg Vf
  84 +verb-neg pact:sg:dat:m1.m2.m3.n1.n2:imperf:neg Vg
  85 +verb-neg pact:sg:dat:m1.m2.m3.n1.n2:imperf.perf:neg Vg
  86 +verb-neg pact:sg:acc:m3:imperf:neg Vh
  87 +verb-neg pact:sg:acc:m3:imperf.perf:neg Vh
  88 +verb-neg pact:sg:acc:m1.m2:imperf:neg Vi
  89 +verb-neg pact:sg:acc:m1.m2:imperf.perf:neg Vi
  90 +verb-neg pact:sg:acc.inst:f:imperf:neg Vj
  91 +verb-neg pact:sg:acc.inst:f:imperf.perf:neg Vj
  92 +verb-neg pact:pl:nom.voc:m1.p1:imperf:neg Vk
  93 +verb-neg pact:pl:nom.voc:m1.p1:imperf.perf:neg Vk
  94 +verb-neg pact:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf:neg Vl
  95 +verb-neg pact:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf.perf:neg Vl
  96 +verb-neg pact:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg Vm
  97 +verb-neg pact:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg Vm
  98 +verb-neg pact:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg Vn
  99 +verb-neg pact:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg Vn
  100 +verb-neg pact:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg Vo
  101 +verb-neg pact:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg Vo
  102 +verb-neg pact:pl:acc:m1.p1:imperf:neg Vp
  103 +verb-neg pact:pl:acc:m1.p1:imperf.perf:neg Vp
  104 +verb-neg ger:sg:nom.acc:n2:perf:neg Ta
  105 +verb-neg ger:sg:nom.acc:n2:imperf:neg Ta
  106 +verb-neg ger:sg:nom.acc:n2:imperf.perf:neg Ta
  107 +verb-neg ger:sg:inst:n2:perf:neg Tb
  108 +verb-neg ger:sg:inst:n2:imperf:neg Tb
  109 +verb-neg ger:sg:inst:n2:imperf.perf:neg Tb
  110 +verb-neg ger:sg:gen:n2:perf:neg Tc
  111 +verb-neg ger:sg:gen:n2:imperf:neg Tc
  112 +verb-neg ger:sg:gen:n2:imperf.perf:neg Tc
  113 +verb-neg ger:sg:dat.loc:n2:perf:neg Td
  114 +verb-neg ger:sg:dat.loc:n2:imperf:neg Td
  115 +verb-neg ger:sg:dat.loc:n2:imperf.perf:neg Td
  116 +verb-neg ger:pl:nom.acc:n2:perf:neg Te
  117 +verb-neg ger:pl:nom.acc:n2:imperf:neg Te
  118 +verb-neg ger:pl:nom.acc:n2:imperf.perf:neg Te
  119 +verb-neg ger:pl:loc:n2:perf:neg Tf
  120 +verb-neg ger:pl:loc:n2:imperf:neg Tf
  121 +verb-neg ger:pl:loc:n2:imperf.perf:neg Tf
  122 +verb-neg ger:pl:inst:n2:perf:neg Tg
  123 +verb-neg ger:pl:inst:n2:imperf:neg Tg
  124 +verb-neg ger:pl:inst:n2:imperf.perf:neg Tg
  125 +verb-neg ger:pl:gen:n2:perf:neg Th
  126 +verb-neg ger:pl:gen:n2:imperf:neg Th
  127 +verb-neg ger:pl:gen:n2:imperf.perf:neg Th
  128 +verb-neg ger:pl:dat:n2:perf:neg Ti
  129 +verb-neg ger:pl:dat:n2:imperf:neg Ti
  130 +verb-neg ger:pl:dat:n2:imperf.perf:neg Ti
  131 +other winien:sg:n1.n2:ter:imperf W
  132 +other winien:sg:n1.n2:sec:imperf W
  133 +other winien:sg:n1.n2:pri:imperf W
  134 +other winien:sg:n1.n2:imperf W
  135 +other winien:sg:m1.m2.m3:ter:imperf W
  136 +other winien:sg:m1.m2.m3:sec:imperf W
  137 +other winien:sg:m1.m2.m3:pri:imperf W
  138 +other winien:sg:m1.m2.m3:imperf W
  139 +other winien:sg:f:ter:imperf W
  140 +other winien:sg:f:sec:imperf W
  141 +other winien:sg:f:pri:imperf W
  142 +other winien:sg:f:imperf W
  143 +other winien:pl:m2.m3.f.n1.n2.p2.p3:ter:imperf W
  144 +other winien:pl:m2.m3.f.n1.n2.p2.p3:sec:imperf W
  145 +other winien:pl:m2.m3.f.n1.n2.p2.p3:imperf W
  146 +other winien:pl:m1.p1:ter:imperf W
  147 +other winien:pl:m1.p1:sec:imperf W
  148 +other winien:pl:m1.p1:pri:imperf W
  149 +other winien:pl:m1.p1:imperf W
  150 +noun subst:sg:voc:n2 Gva
  151 +noun subst:sg:voc:n1 Gvb
  152 +noun subst:sg:voc:m3 Gvc
  153 +noun subst:sg:voc:m2 Gvd
  154 +noun subst:sg:voc:m1 Gve
  155 +noun subst:sg:voc:f Gvf
  156 +noun subst:sg:nom:n2 Gna
  157 +noun subst:sg:nom:n1 Gnb
  158 +noun subst:sg:nom:m3 Gnc
  159 +noun subst:sg:nom:m2 Gnd
  160 +noun subst:sg:nom:m1 Gne
  161 +noun subst:sg:nom:f Gnf
  162 +noun subst:sg:loc:n2 Gla
  163 +noun subst:sg:loc:n1 Glb
  164 +noun subst:sg:loc:m3 Glc
  165 +noun subst:sg:loc:m2 Gld
  166 +noun subst:sg:loc:m1 Gle
  167 +noun subst:sg:loc:f Glf
  168 +noun subst:sg:inst:n2 Gia
  169 +noun subst:sg:inst:n1 Gib
  170 +noun subst:sg:inst:m3 Gic
  171 +noun subst:sg:inst:m2 Gid
  172 +noun subst:sg:inst:m1 Gie
  173 +noun subst:sg:inst:f Gif
  174 +noun subst:sg:gen:n2 Gga
  175 +noun subst:sg:gen:n1 Ggb
  176 +noun subst:sg:gen:m3 Ggc
  177 +noun subst:sg:gen:m2 Ggd
  178 +noun subst:sg:gen:m1 Gge
  179 +noun subst:sg:gen:f Ggf
  180 +noun subst:sg:dat:n2 Gda
  181 +noun subst:sg:dat:n1 Gdb
  182 +noun subst:sg:dat:m3 Gdc
  183 +noun subst:sg:dat:m2 Gdd
  184 +noun subst:sg:dat:m1 Gde
  185 +noun subst:sg:dat:f Gdf
  186 +noun subst:sg:acc:n2 Gaa
  187 +noun subst:sg:acc:n1 Gab
  188 +noun subst:sg:acc:m3 Gac
  189 +noun subst:sg:acc:m2 Gad
  190 +noun subst:sg:acc:m1 Gae
  191 +noun subst:sg:acc:f Gaf
  192 +noun subst:pl:voc:p3 Yvp
  193 +noun subst:pl:voc:p2 Yvq
  194 +noun subst:pl:voc:p1 Yvr
  195 +noun subst:pl:voc:n2 Yva
  196 +noun subst:pl:voc:n1 Yvb
  197 +noun subst:pl:voc:m3 Yvc
  198 +noun subst:pl:voc:m2 Yvd
  199 +noun subst:pl:voc:m1 Yve
  200 +noun subst:pl:voc:f Yvf
  201 +noun subst:pl:nom:p3 Ynp
  202 +noun subst:pl:nom:p2 Ynq
  203 +noun subst:pl:nom:p1 Ynr
  204 +noun subst:pl:nom:n2 Yna
  205 +noun subst:pl:nom:n1 Ynb
  206 +noun subst:pl:nom:m3 Ync
  207 +noun subst:pl:nom:m2 Ynd
  208 +noun subst:pl:nom:m1 Yne
  209 +noun subst:pl:nom:f Ynf
  210 +noun subst:pl:loc:p3 Ylp
  211 +noun subst:pl:loc:p2 Ylq
  212 +noun subst:pl:loc:p1 Ylr
  213 +noun subst:pl:loc:n2 Yla
  214 +noun subst:pl:loc:n1 Ylb
  215 +noun subst:pl:loc:m3 Ylc
  216 +noun subst:pl:loc:m2 Yld
  217 +noun subst:pl:loc:m1 Yle
  218 +noun subst:pl:loc:f Ylf
  219 +noun subst:pl:inst:p3 Yip
  220 +noun subst:pl:inst:p2 Yiq
  221 +noun subst:pl:inst:p1 Yir
  222 +noun subst:pl:inst:n2 Yia
  223 +noun subst:pl:inst:n1 Yib
  224 +noun subst:pl:inst:m3 Yic
  225 +noun subst:pl:inst:m2 Yid
  226 +noun subst:pl:inst:m1 Yie
  227 +noun subst:pl:inst:f Yif
  228 +noun subst:pl:gen:p3 Ygp
  229 +noun subst:pl:gen:p2 Ygq
  230 +noun subst:pl:gen:p1 Ygr
  231 +noun subst:pl:gen:n2 Yga
  232 +noun subst:pl:gen:n1 Ygb
  233 +noun subst:pl:gen:m3 Ygc
  234 +noun subst:pl:gen:m2 Ygd
  235 +noun subst:pl:gen:m1 Yge
  236 +noun subst:pl:gen:f Ygf
  237 +noun subst:pl:dat:p3 Ydp
  238 +noun subst:pl:dat:p2 Ydq
  239 +noun subst:pl:dat:p1 Ydr
  240 +noun subst:pl:dat:n2 Yda
  241 +noun subst:pl:dat:n1 Ydb
  242 +noun subst:pl:dat:m3 Ydc
  243 +noun subst:pl:dat:m2 Ydd
  244 +noun subst:pl:dat:m1 Yde
  245 +noun subst:pl:dat:f Ydf
  246 +noun subst:pl:acc:p3 Yap
  247 +noun subst:pl:acc:p2 Yaq
  248 +noun subst:pl:acc:p1 Yar
  249 +noun subst:pl:acc:n2 Yaa
  250 +noun subst:pl:acc:n1 Yab
  251 +noun subst:pl:acc:m3 Yac
  252 +noun subst:pl:acc:m2 Yad
  253 +noun subst:pl:acc:m1 Yae
  254 +noun subst:pl:acc:f Yaf
  255 +other qub W
  256 +other prep:nom W
  257 +other prep:loc:wok W
  258 +other prep:loc:nwok W
  259 +other prep:loc W
  260 +other prep:inst:wok W
  261 +other prep:inst:nwok W
  262 +other prep:inst W
  263 +other prep:gen:wok W
  264 +other prep:gen:nwok W
  265 +other prep:gen W
  266 +other prep:dat W
  267 +other prep:acc:wok W
  268 +other prep:acc:nwok W
  269 +other prep:acc W
  270 +other pred W
  271 +verb praet:sg:n1.n2:ter:perf Ja
  272 +verb praet:sg:n1.n2:ter:imperf.perf Ja
  273 +verb praet:sg:n1.n2:ter:imperf Ja
  274 +verb praet:sg:n1.n2:sec:perf Jb
  275 +verb praet:sg:n1.n2:sec:imperf.perf Jb
  276 +verb praet:sg:n1.n2:sec:imperf Jb
  277 +verb praet:sg:n1.n2:pri:perf Jc
  278 +verb praet:sg:n1.n2:pri:imperf.perf Jc
  279 +verb praet:sg:n1.n2:pri:imperf Jc
  280 +verb praet:sg:n1.n2:perf Jd
  281 +verb praet:sg:n1.n2:imperf.perf Jd
  282 +verb praet:sg:n1.n2:imperf Jd
  283 +verb praet:sg:m1.m2.m3:ter:perf Je
  284 +verb praet:sg:m1.m2.m3:ter:imperf.perf Je
  285 +verb praet:sg:m1.m2.m3:ter:imperf Je
  286 +verb praet:sg:m1.m2.m3:sec:perf Jf
  287 +verb praet:sg:m1.m2.m3:sec:imperf.perf Jf
  288 +verb praet:sg:m1.m2.m3:sec:imperf Jf
  289 +verb praet:sg:m1.m2.m3:pri:perf Jg
  290 +verb praet:sg:m1.m2.m3:pri:imperf.perf Jg
  291 +verb praet:sg:m1.m2.m3:pri:imperf Jg
  292 +verb praet:sg:m1.m2.m3:perf:nagl.agl Jh
  293 +verb praet:sg:m1.m2.m3:imperf:nagl.agl Jh
  294 +verb praet:sg:m1.m2.m3:imperf.perf Jh
  295 +verb praet:sg:f:ter:perf Ji
  296 +verb praet:sg:f:ter:imperf Ji
  297 +verb praet:sg:f:sec:perf Jj
  298 +verb praet:sg:f:sec:imperf Jj
  299 +verb praet:sg:f:pri:perf Jk
  300 +verb praet:sg:f:pri:imperf Jk
  301 +verb praet:sg:f:perf Jl
  302 +verb praet:sg:f:imperf Jl
  303 +verb praet:pl:m2.m3.f.n1.n2.p2.p3:ter:perf Jm
  304 +verb praet:pl:m2.m3.f.n1.n2.p2.p3:ter:imperf.perf Jm
  305 +verb praet:pl:m2.m3.f.n1.n2.p2.p3:ter:imperf Jm
  306 +verb praet:pl:m2.m3.f.n1.n2.p2.p3:sec:perf Jn
  307 +verb praet:pl:m2.m3.f.n1.n2.p2.p3:sec:imperf.perf Jn
  308 +verb praet:pl:m2.m3.f.n1.n2.p2.p3:sec:imperf Jn
  309 +verb praet:pl:m2.m3.f.n1.n2.p2.p3:pri:perf Jo
  310 +verb praet:pl:m2.m3.f.n1.n2.p2.p3:pri:imperf.perf Jo
  311 +verb praet:pl:m2.m3.f.n1.n2.p2.p3:pri:imperf Jo
  312 +verb praet:pl:m2.m3.f.n1.n2.p2.p3:perf Jp
  313 +verb praet:pl:m2.m3.f.n1.n2.p2.p3:imperf.perf Jp
  314 +verb praet:pl:m2.m3.f.n1.n2.p2.p3:imperf Jp
  315 +verb praet:pl:m1.p1:ter:perf Jq
  316 +verb praet:pl:m1.p1:ter:imperf Jq
  317 +verb praet:pl:m1.p1:sec:perf Jr
  318 +verb praet:pl:m1.p1:sec:imperf Jr
  319 +verb praet:pl:m1.p1:pri:perf Js
  320 +verb praet:pl:m1.p1:pri:imperf Js
  321 +verb praet:pl:m1.p1:perf Jt
  322 +verb praet:pl:m1.p1:imperf Jt
  323 +other ppron3:sg:nom:n1.n2:ter:akc.nakc:praep.npraep W
  324 +other ppron3:sg:nom:m1.m2.m3:ter:akc.nakc:praep.npraep W
  325 +other ppron3:sg:nom:f:ter:akc.nakc:praep.npraep W
  326 +other ppron3:sg:loc:n1.n2:ter:akc.nakc:praep.npraep W
  327 +other ppron3:sg:loc:m1.m2.m3:ter:akc.nakc:praep.npraep W
  328 +other ppron3:sg:loc:f:ter:akc.nakc:praep.npraep W
  329 +other ppron3:sg:inst:n1.n2:ter:akc.nakc:praep.npraep W
  330 +other ppron3:sg:inst:m1.m2.m3:ter:akc.nakc:praep.npraep W
  331 +other ppron3:sg:inst:f:ter:akc.nakc:praep.npraep W
  332 +other ppron3:sg:gen:n1.n2:ter:nakc:npraep W
  333 +other ppron3:sg:gen:n1.n2:ter:akc:npraep W
  334 +other ppron3:sg:gen:n1.n2:ter:akc.nakc:praep W
  335 +other ppron3:sg:gen:m1.m2.m3:ter:nakc:praep W
  336 +other ppron3:sg:gen:m1.m2.m3:ter:nakc:npraep W
  337 +other ppron3:sg:gen:m1.m2.m3:ter:akc:praep W
  338 +other ppron3:sg:gen:m1.m2.m3:ter:akc:npraep W
  339 +other ppron3:sg:gen:f:ter:akc.nakc:praep W
  340 +other ppron3:sg:gen:f:ter:akc.nakc:npraep W
  341 +other ppron3:sg:dat:n1.n2:ter:nakc:npraep W
  342 +other ppron3:sg:dat:n1.n2:ter:akc:npraep W
  343 +other ppron3:sg:dat:n1.n2:ter:akc.nakc:praep W
  344 +other ppron3:sg:dat:m1.m2.m3:ter:nakc:npraep W
  345 +other ppron3:sg:dat:m1.m2.m3:ter:akc:npraep W
  346 +other ppron3:sg:dat:m1.m2.m3:ter:akc.nakc:praep W
  347 +other ppron3:sg:dat:f:ter:akc.nakc:praep W
  348 +other ppron3:sg:dat:f:ter:akc.nakc:npraep W
  349 +other ppron3:sg:acc:n1.n2:ter:akc.nakc:praep W
  350 +other ppron3:sg:acc:n1.n2:ter:akc.nakc:npraep W
  351 +other ppron3:sg:acc:m1.m2.m3:ter:nakc:praep W
  352 +other ppron3:sg:acc:m1.m2.m3:ter:nakc:npraep W
  353 +other ppron3:sg:acc:m1.m2.m3:ter:akc:praep W
  354 +other ppron3:sg:acc:m1.m2.m3:ter:akc:npraep W
  355 +other ppron3:sg:acc:f:ter:akc.nakc:praep W
  356 +other ppron3:sg:acc:f:ter:akc.nakc:npraep W
  357 +other ppron3:pl:nom:m2.m3.f.n1.n2.p2.p3:ter:akc.nakc:praep.npraep W
  358 +other ppron3:pl:nom:m1.p1:ter:akc.nakc:praep.npraep W
  359 +other ppron3:pl:loc:_:ter:akc.nakc:praep.npraep W
  360 +other ppron3:pl:inst:_:ter:akc.nakc:praep.npraep W
  361 +other ppron3:pl:gen:_:ter:akc.nakc:praep W
  362 +other ppron3:pl:gen:_:ter:akc.nakc:npraep W
  363 +other ppron3:pl:dat:_:ter:akc.nakc:praep W
  364 +other ppron3:pl:dat:_:ter:akc.nakc:npraep W
  365 +other ppron3:pl:acc:m2.m3.f.n1.n2.p2.p3:ter:akc.nakc:praep W
  366 +other ppron3:pl:acc:m2.m3.f.n1.n2.p2.p3:ter:akc.nakc:npraep W
  367 +other ppron3:pl:acc:m1.p1:ter:akc.nakc:praep W
  368 +other ppron3:pl:acc:m1.p1:ter:akc.nakc:npraep W
  369 +other ppron12:sg:voc:m1.m2.m3.f.n1.n2:sec W
  370 +other ppron12:sg:voc:m1.m2.m3.f.n1.n2:pri W
  371 +other ppron12:sg:nom:m1.m2.m3.f.n1.n2:sec W
  372 +other ppron12:sg:nom:m1.m2.m3.f.n1.n2:pri W
  373 +other ppron12:sg:loc:m1.m2.m3.f.n1.n2:sec W
  374 +other ppron12:sg:loc:m1.m2.m3.f.n1.n2:pri W
  375 +other ppron12:sg:inst:m1.m2.m3.f.n1.n2:sec W
  376 +other ppron12:sg:inst:m1.m2.m3.f.n1.n2:pri W
  377 +other ppron12:sg:gen:m1.m2.m3.f.n1.n2:sec:nakc W
  378 +other ppron12:sg:gen:m1.m2.m3.f.n1.n2:sec:akc W
  379 +other ppron12:sg:gen:m1.m2.m3.f.n1.n2:pri:nakc W
  380 +other ppron12:sg:gen:m1.m2.m3.f.n1.n2:pri:akc W
  381 +other ppron12:sg:dat:m1.m2.m3.f.n1.n2:sec:nakc W
  382 +other ppron12:sg:dat:m1.m2.m3.f.n1.n2:sec:akc W
  383 +other ppron12:sg:dat:m1.m2.m3.f.n1.n2:pri:nakc W
  384 +other ppron12:sg:dat:m1.m2.m3.f.n1.n2:pri:akc W
  385 +other ppron12:sg:acc:m1.m2.m3.f.n1.n2:sec:nakc W
  386 +other ppron12:sg:acc:m1.m2.m3.f.n1.n2:sec:akc W
  387 +other ppron12:sg:acc:m1.m2.m3.f.n1.n2:pri:nakc W
  388 +other ppron12:sg:acc:m1.m2.m3.f.n1.n2:pri:akc W
  389 +other ppron12:pl:voc:_:sec W
  390 +other ppron12:pl:voc:_:pri W
  391 +other ppron12:pl:nom:_:sec W
  392 +other ppron12:pl:nom:_:pri W
  393 +other ppron12:pl:loc:_:sec W
  394 +other ppron12:pl:loc:_:pri W
  395 +other ppron12:pl:inst:_:sec W
  396 +other ppron12:pl:inst:_:pri W
  397 +other ppron12:pl:gen:_:sec W
  398 +other ppron12:pl:gen:_:pri W
  399 +other ppron12:pl:dat:_:sec W
  400 +other ppron12:pl:dat:_:pri W
  401 +other ppron12:pl:acc:_:sec W
  402 +other ppron12:pl:acc:_:pri W
  403 +verb ppas:sg:nom.voc:m1.m2.m3:perf:aff Ra
  404 +verb ppas:sg:nom.voc:m1.m2.m3:imperf:aff Ra
  405 +verb ppas:sg:nom.voc:m1.m2.m3:imperf.perf:aff Ra
  406 +verb ppas:sg:nom.voc:f:perf:aff Rb
  407 +verb ppas:sg:nom.voc:f:imperf:aff Rb
  408 +verb ppas:sg:nom.voc:f:imperf.perf:aff Rb
  409 +verb ppas:sg:nom.acc.voc:n1.n2:perf:aff Rc
  410 +verb ppas:sg:nom.acc.voc:n1.n2:imperf:aff Rc
  411 +verb ppas:sg:nom.acc.voc:n1.n2:imperf.perf:aff Rc
  412 +verb ppas:sg:inst.loc:m1.m2.m3.n1.n2:perf:aff Rd
  413 +verb ppas:sg:inst.loc:m1.m2.m3.n1.n2:imperf:aff Rd
  414 +verb ppas:sg:inst.loc:m1.m2.m3.n1.n2:imperf.perf:aff Rd
  415 +verb ppas:sg:gen:m1.m2.m3.n1.n2:perf:aff Re
  416 +verb ppas:sg:gen:m1.m2.m3.n1.n2:imperf:aff Re
  417 +verb ppas:sg:gen:m1.m2.m3.n1.n2:imperf.perf:aff Re
  418 +verb ppas:sg:gen.dat.loc:f:perf:aff Rf
  419 +verb ppas:sg:gen.dat.loc:f:imperf:aff Rf
  420 +verb ppas:sg:gen.dat.loc:f:imperf.perf:aff Rf
  421 +verb ppas:sg:dat:m1.m2.m3.n1.n2:perf:aff Rg
  422 +verb ppas:sg:dat:m1.m2.m3.n1.n2:imperf:aff Rg
  423 +verb ppas:sg:dat:m1.m2.m3.n1.n2:imperf.perf:aff Rg
  424 +verb ppas:sg:acc:m3:perf:aff Rh
  425 +verb ppas:sg:acc:m3:imperf:aff Rh
  426 +verb ppas:sg:acc:m3:imperf.perf:aff Rh
  427 +verb ppas:sg:acc:m1.m2:perf:aff Ri
  428 +verb ppas:sg:acc:m1.m2:imperf:aff Ri
  429 +verb ppas:sg:acc:m1.m2:imperf.perf:aff Ri
  430 +verb ppas:sg:acc.inst:f:perf:aff Rj
  431 +verb ppas:sg:acc.inst:f:imperf:aff Rj
  432 +verb ppas:sg:acc.inst:f:imperf.perf:aff Rj
  433 +verb ppas:pl:nom.voc:m1.p1:perf:aff Rk
  434 +verb ppas:pl:nom.voc:m1.p1:imperf:aff Rk
  435 +verb ppas:pl:nom.voc:m1.p1:imperf.perf:aff Rk
  436 +verb ppas:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:perf:aff Rl
  437 +verb ppas:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf:aff Rl
  438 +verb ppas:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf.perf:aff Rl
  439 +verb ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:perf:aff Rm
  440 +verb ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff Rm
  441 +verb ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:aff Rm
  442 +verb ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:perf:aff Rn
  443 +verb ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff Rn
  444 +verb ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:aff Rn
  445 +verb ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:perf:aff Ro
  446 +verb ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff Ro
  447 +verb ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:aff Ro
  448 +verb ppas:pl:acc:m1.p1:perf:aff Rp
  449 +verb ppas:pl:acc:m1.p1:imperf:aff Rp
  450 +verb ppas:pl:acc:m1.p1:imperf.perf:aff Rp
  451 +verb pcon:imperf O
  452 +verb pant:perf P
  453 +verb pact:sg:nom.voc:m1.m2.m3:imperf:aff Qa
  454 +verb pact:sg:nom.voc:m1.m2.m3:imperf.perf:aff Qa
  455 +verb pact:sg:nom.voc:f:imperf:aff Qb
  456 +verb pact:sg:nom.voc:f:imperf.perf:aff Qb
  457 +verb pact:sg:nom.acc.voc:n1.n2:imperf:aff Qc
  458 +verb pact:sg:nom.acc.voc:n1.n2:imperf.perf:aff Qc
  459 +verb pact:sg:inst.loc:m1.m2.m3.n1.n2:imperf:aff Qd
  460 +verb pact:sg:inst.loc:m1.m2.m3.n1.n2:imperf.perf:aff Qd
  461 +verb pact:sg:gen:m1.m2.m3.n1.n2:imperf:aff Qe
  462 +verb pact:sg:gen:m1.m2.m3.n1.n2:imperf.perf:aff Qe
  463 +verb pact:sg:gen.dat.loc:f:imperf:aff Qf
  464 +verb pact:sg:gen.dat.loc:f:imperf.perf:aff Qf
  465 +verb pact:sg:dat:m1.m2.m3.n1.n2:imperf:aff Qg
  466 +verb pact:sg:dat:m1.m2.m3.n1.n2:imperf.perf:aff Qg
  467 +verb pact:sg:acc:m3:imperf:aff Qh
  468 +verb pact:sg:acc:m3:imperf.perf:aff Qh
  469 +verb pact:sg:acc:m1.m2:imperf:aff Qi
  470 +verb pact:sg:acc:m1.m2:imperf.perf:aff Qi
  471 +verb pact:sg:acc.inst:f:imperf:aff Qj
  472 +verb pact:sg:acc.inst:f:imperf.perf:aff Qj
  473 +verb pact:pl:nom.voc:m1.p1:imperf:aff Qk
  474 +verb pact:pl:nom.voc:m1.p1:imperf.perf:aff Qk
  475 +verb pact:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf:aff Ql
  476 +verb pact:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf.perf:aff Ql
  477 +verb pact:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff Qm
  478 +verb pact:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:aff Qm
  479 +verb pact:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff Qn
  480 +verb pact:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:aff Qn
  481 +verb pact:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff Qo
  482 +verb pact:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:aff Qo
  483 +verb pact:pl:acc:m1.p1:imperf:aff Qp
  484 +verb pact:pl:acc:m1.p1:imperf.perf:aff Qp
  485 +other num:sg:nom.gen.dat.inst.acc.loc.voc:m1.m2.m3.n1.n2:rec W
  486 +other num:sg:nom.gen.dat.inst.acc.loc.voc:m1.m2.m3.f.n1.n2:rec W
  487 +other num:sg:nom.gen.dat.inst.acc.loc.voc:f:rec W
  488 +other num:sg:nom.acc:m1.m2.m3.f.n1.n2:rec W
  489 +other num:sg.pl:nom.acc:m1.m2.m3.f.n1.n2.p1.p2:rec W
  490 +other num:pl:nom.voc:m1:rec W
  491 +other num:pl:nom.voc:m1:congr W
  492 +other num:pl:nom.gen.dat.inst.acc.loc.voc:m1.m2.m3.f.n1.n2.p1.p2:rec W
  493 +other num:pl:nom.acc:m1.m2.m3.f.n1.n2.p1.p2:rec W
  494 +other num:pl:nom.acc.voc:n1.p1.p2:rec W
  495 +other num:pl:nom.acc.voc:m2.m3.n2:congr W
  496 +other num:pl:nom.acc.voc:m2.m3.n2.f:congr W
  497 +other num:pl:nom.acc.voc:m2.m3.f.n2:rec W
  498 +other num:pl:nom.acc.voc:m2.m3.f.n1.n2.p1.p2:rec W
  499 +other num:pl:nom.acc.voc:m1:rec W
  500 +other num:pl:nom.acc.voc:f:congr W
  501 +other num:pl:inst:n1.p1.p2:rec W
  502 +other num:pl:inst:m1.m2.m3.n2:congr W
  503 +other num:pl:inst:m1.m2.m3.n2.f:congr W
  504 +other num:pl:inst:m1.m2.m3.f.n2:congr W
  505 +other num:pl:inst:m1.m2.m3.f.n1.n2.p1.p2:congr W
  506 +other num:pl:inst:f:congr W
  507 +other num:pl:gen:n1.p1.p2:rec W
  508 +other num:pl:gen.loc:m1.m2.m3.n2.f:congr W
  509 +other num:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2:congr W
  510 +other num:pl:gen.dat.loc:m1.m2.m3.n2.f:congr W
  511 +other num:pl:gen.dat.inst.loc:m1.m2.m3.f.n2:congr W
  512 +other num:pl:gen.dat.inst.loc:m1.m2.m3.f.n1.n2.p1.p2:congr W
  513 +other num:pl:dat:m1.m2.m3.n2.f:congr W
  514 +other num:pl:dat.loc:n1.p1.p2:congr.rec W
  515 +other num:pl:acc:m1:rec W
  516 +other num:comp W
  517 +other interj W
  518 +verb inf:perf I
  519 +verb inf:imperf.perf I
  520 +verb inf:imperf I
  521 +verb impt:sg:sec:perf Ma
  522 +verb impt:sg:sec:imperf.perf Ma
  523 +verb impt:sg:sec:imperf Ma
  524 +verb impt:pl:sec:perf Mb
  525 +verb impt:pl:sec:imperf.perf Mb
  526 +verb impt:pl:sec:imperf Mb
  527 +verb impt:pl:pri:perf Mc
  528 +verb impt:pl:pri:imperf.perf Mc
  529 +verb impt:pl:pri:imperf Mc
  530 +verb imps:perf N
  531 +verb imps:imperf.perf N
  532 +verb imps:imperf N
  533 +verb ger:sg:nom.acc:n2:perf:aff Xa
  534 +verb ger:sg:nom.acc:n2:imperf:aff Xa
  535 +verb ger:sg:nom.acc:n2:imperf.perf:aff Xa
  536 +verb ger:sg:inst:n2:perf:aff Xb
  537 +verb ger:sg:inst:n2:imperf:aff Xb
  538 +verb ger:sg:inst:n2:imperf.perf:aff Xb
  539 +verb ger:sg:gen:n2:perf:aff Xc
  540 +verb ger:sg:gen:n2:imperf:aff Xc
  541 +verb ger:sg:gen:n2:imperf.perf:aff Xc
  542 +verb ger:sg:dat.loc:n2:perf:aff Xd
  543 +verb ger:sg:dat.loc:n2:imperf:aff Xd
  544 +verb ger:sg:dat.loc:n2:imperf.perf:aff Xd
  545 +verb ger:pl:nom.acc:n2:perf:aff Xe
  546 +verb ger:pl:nom.acc:n2:imperf:aff Xe
  547 +verb ger:pl:nom.acc:n2:imperf.perf:aff Xe
  548 +verb ger:pl:loc:n2:perf:aff Xf
  549 +verb ger:pl:loc:n2:imperf:aff Xf
  550 +verb ger:pl:loc:n2:imperf.perf:aff Xf
  551 +verb ger:pl:inst:n2:perf:aff Xg
  552 +verb ger:pl:inst:n2:imperf:aff Xg
  553 +verb ger:pl:inst:n2:imperf.perf:aff Xg
  554 +verb ger:pl:gen:n2:perf:aff Xh
  555 +verb ger:pl:gen:n2:imperf:aff Xh
  556 +verb ger:pl:gen:n2:imperf.perf:aff Xh
  557 +verb ger:pl:dat:n2:perf:aff Xi
  558 +verb ger:pl:dat:n2:imperf:aff Xi
  559 +verb ger:pl:dat:n2:imperf.perf:aff Xi
  560 +verb fin:sg:ter:perf La
  561 +verb fin:sg:ter:imperf.perf La
  562 +verb fin:sg:ter:imperf La
  563 +verb fin:sg:sec:perf Lb
  564 +verb fin:sg:sec:imperf.perf Lb
  565 +verb fin:sg:sec:imperf Lb
  566 +verb fin:sg:pri:perf Lc
  567 +verb fin:sg:pri:imperf.perf Lc
  568 +verb fin:sg:pri:imperf Lc
  569 +verb fin:pl:ter:perf Ld
  570 +verb fin:pl:ter:imperf.perf Ld
  571 +verb fin:pl:ter:imperf Ld
  572 +verb fin:pl:sec:perf Le
  573 +verb fin:pl:sec:imperf.perf Le
  574 +verb fin:pl:sec:imperf Le
  575 +verb fin:pl:pri:perf Lf
  576 +verb fin:pl:pri:imperf.perf Lf
  577 +verb fin:pl:pri:imperf Lf
  578 +noun depr:pl:voc:m2 Hv
  579 +noun depr:pl:nom:m2 Hn
  580 +other conj W
  581 +verb cond:sg:n1.n2:ter:perf Ka
  582 +verb cond:sg:n1.n2:ter:imperf.perf Ka
  583 +verb cond:sg:n1.n2:ter:imperf Ka
  584 +verb cond:sg:n1.n2:sec:perf Kb
  585 +verb cond:sg:n1.n2:sec:imperf.perf Kb
  586 +verb cond:sg:n1.n2:sec:imperf Kb
  587 +verb cond:sg:n1.n2:pri:perf Kc
  588 +verb cond:sg:n1.n2:pri:imperf.perf Kc
  589 +verb cond:sg:n1.n2:pri:imperf Kc
  590 +verb cond:sg:n1.n2:perf Kd
  591 +verb cond:sg:n1.n2:imperf.perf Kd
  592 +verb cond:sg:n1.n2:imperf Kd
  593 +verb cond:sg:m1.m2.m3:ter:perf Ke
  594 +verb cond:sg:m1.m2.m3:ter:imperf.perf Ke
  595 +verb cond:sg:m1.m2.m3:ter:imperf Ke
  596 +verb cond:sg:m1.m2.m3:sec:perf Kf
  597 +verb cond:sg:m1.m2.m3:sec:imperf.perf Kf
  598 +verb cond:sg:m1.m2.m3:sec:imperf Kf
  599 +verb cond:sg:m1.m2.m3:pri:perf Kg
  600 +verb cond:sg:m1.m2.m3:pri:imperf.perf Kg
  601 +verb cond:sg:m1.m2.m3:pri:imperf Kg
  602 +verb cond:sg:f:ter:perf Kh
  603 +verb cond:sg:f:ter:imperf.perf Kh
  604 +verb cond:sg:f:ter:imperf Kh
  605 +verb cond:sg:f:sec:perf Ki
  606 +verb cond:sg:f:sec:imperf.perf Ki
  607 +verb cond:sg:f:sec:imperf Ki
  608 +verb cond:sg:f:pri:perf Kj
  609 +verb cond:sg:f:pri:imperf.perf Kj
  610 +verb cond:sg:f:pri:imperf Kj
  611 +verb cond:pl:m2.m3.f.n1.n2.p2.p3:ter:perf Kk
  612 +verb cond:pl:m2.m3.f.n1.n2.p2.p3:ter:imperf.perf Kk
  613 +verb cond:pl:m2.m3.f.n1.n2.p2.p3:ter:imperf Kk
  614 +verb cond:pl:m2.m3.f.n1.n2.p2.p3:sec:perf Kl
  615 +verb cond:pl:m2.m3.f.n1.n2.p2.p3:sec:imperf.perf Kl
  616 +verb cond:pl:m2.m3.f.n1.n2.p2.p3:sec:imperf Kl
  617 +verb cond:pl:m2.m3.f.n1.n2.p2.p3:pri:perf Km
  618 +verb cond:pl:m2.m3.f.n1.n2.p2.p3:pri:imperf.perf Km
  619 +verb cond:pl:m2.m3.f.n1.n2.p2.p3:pri:imperf Km
  620 +verb cond:pl:m1.p1:ter:perf Kn
  621 +verb cond:pl:m1.p1:ter:imperf.perf Kn
  622 +verb cond:pl:m1.p1:ter:imperf Kn
  623 +verb cond:pl:m1.p1:sec:perf Ko
  624 +verb cond:pl:m1.p1:sec:imperf.perf Ko
  625 +verb cond:pl:m1.p1:sec:imperf Ko
  626 +verb cond:pl:m1.p1:pri:perf Kp
  627 +verb cond:pl:m1.p1:pri:imperf.perf Kp
  628 +verb cond:pl:m1.p1:pri:imperf Kp
  629 +other comp W
  630 +other burk W
  631 +other brev:pun W
  632 +other brev:npun W
  633 +other bedzie:sg:ter:imperf W
  634 +other bedzie:sg:sec:imperf W
  635 +other bedzie:sg:pri:imperf W
  636 +other bedzie:pl:ter:imperf W
  637 +other bedzie:pl:sec:imperf W
  638 +other bedzie:pl:pri:imperf W
  639 +other aglt:sg:sec:imperf:wok W
  640 +other aglt:sg:sec:imperf:nwok W
  641 +other aglt:sg:pri:imperf:wok W
  642 +other aglt:sg:pri:imperf:nwok W
  643 +other aglt:pl:sec:imperf:wok W
  644 +other aglt:pl:sec:imperf:nwok W
  645 +other aglt:pl:pri:imperf:wok W
  646 +other aglt:pl:pri:imperf:nwok W
  647 +adv-sup adv:sup W
  648 +adv adv:pos W
  649 +adv-com adv:com W
  650 +adv adv W
  651 +adj adjp F
  652 +adj adjc E
  653 +adj adja D
  654 +adj adj:sg:nom.voc:n1.n2:pos Aa
  655 +adj-com adj:sg:nom.voc:n1.n2:com Ba
  656 +adj adj:sg:nom.voc:m1.m2.m3:pos Ab
  657 +adj-com adj:sg:nom.voc:m1.m2.m3:com Bb
  658 +adj adj:sg:nom.voc:f:pos Ac
  659 +adj-com adj:sg:nom.voc:f:com Bc
  660 +adj adj:sg:loc:m1.m2.m3.n1.n2:pos Ad
  661 +adj-com adj:sg:loc:m1.m2.m3.n1.n2:com Bd
  662 +adj adj:sg:loc:f:pos Ae
  663 +adj-com adj:sg:loc:f:com Be
  664 +adj adj:sg:inst:m1.m2.m3.n1.n2:pos Af
  665 +adj-com adj:sg:inst:m1.m2.m3.n1.n2:com Bf
  666 +adj adj:sg:inst:f:pos Ag
  667 +adj-com adj:sg:inst:f:com Bg
  668 +adj adj:sg:gen:m1.m2.m3.n1.n2:pos Ah
  669 +adj-com adj:sg:gen:m1.m2.m3.n1.n2:com Bh
  670 +adj adj:sg:gen:f:pos Ai
  671 +adj-com adj:sg:gen:f:com Bi
  672 +adj adj:sg:dat:m1.m2.m3.n1.n2:pos Aj
  673 +adj-com adj:sg:dat:m1.m2.m3.n1.n2:com Bj
  674 +adj adj:sg:dat:f:pos Ak
  675 +adj-com adj:sg:dat:f:com Bk
  676 +adj adj:sg:acc:n1.n2:pos Al
  677 +adj-com adj:sg:acc:n1.n2:com Bl
  678 +adj adj:sg:acc:m3:pos Am
  679 +adj-com adj:sg:acc:m3:com Bm
  680 +adj adj:sg:acc:m1.m2:pos An
  681 +adj-com adj:sg:acc:m1.m2:com Bn
  682 +adj adj:sg:acc:f:pos Ao
  683 +adj-com adj:sg:acc:f:com Bo
  684 +adj adj:pl:nom.voc:m2.m3.f.n1.n2.p2.p3:pos Ap
  685 +adj-com adj:pl:nom.voc:m2.m3.f.n1.n2.p2.p3:com Bp
  686 +adj adj:pl:nom.voc:m1.p1:pos Aq
  687 +adj-com adj:pl:nom.voc:m1.p1:com Bq
  688 +adj adj:pl:loc:m1.m2.m3.f.n1.n2.p1.p2.p3:pos Ar
  689 +adj-com adj:pl:loc:m1.m2.m3.f.n1.n2.p1.p2.p3:com Br
  690 +adj adj:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:pos As
  691 +adj-com adj:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:com Bs
  692 +adj adj:pl:gen:m1.m2.m3.f.n1.n2.p1.p2.p3:pos At
  693 +adj-com adj:pl:gen:m1.m2.m3.f.n1.n2.p1.p2.p3:com Bt
  694 +adj adj:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:pos Au
  695 +adj-com adj:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:com Bu
  696 +adj adj:pl:acc:m2.m3.f.n1.n2.p2.p3:pos Av
  697 +adj-com adj:pl:acc:m2.m3.f.n1.n2.p2.p3:com Bv
  698 +adj adj:pl:acc:m1.p1:pos Aw
  699 +adj-com adj:pl:acc:m1.p1:com Bw
  700 +verb pacta XYZ
... ...
morphology/data/rev_rules.dic
... ... @@ -32,6 +32,7 @@
32 32 @NOUN-LEMMA-CAP
33 33 kapitaliki_wyglos lemma=ε
34 34 kapitaliki_wyglos A lemma=A
  35 +# lemma=końcówka_lematu
35 36  
36 37 @VERB-LEMMA-PATAL
37 38 funkcjonalnie_miekkie_iy ć lemma=ć
... ... @@ -57,3 +58,14 @@
57 58 dowolne yć lemma=yć
58 59 inf_e eć lemma=eć
59 60  
  61 +@NOUN-LEMMA-FOREIGN
  62 + obce_funkcjonalnie_twarde_a a lemma=a
  63 +* obce_funkcjonalnie_twarde_a ah lemma=ah
  64 + obce_funkcjonalnie_twarde_e lemma=ε
  65 + obce_funkcjonalnie_twarde_ie lemma=ε
  66 + obce_funkcjonalnie_twarde_ie a lemma=a
  67 + obce_funkcjonalnie_twarde_iy lemma=y
  68 + obce_funkcjonalnie_miekkie_ii_wyglos a lemma=a
  69 + obce_funkcjonalnie_miekkie_ii_wyglos lemma=ε
  70 + obce_ais lemma=ais
  71 + obce_apostrof lemma='
... ...
morphology/data/rules.dic
... ... @@ -312,3 +312,67 @@
312 312 verb_j e group=e cat=verb
313 313 funkcjonalnie_twarde_i e group=ie cat=verb
314 314  
  315 +@NOUN-FLEX-FOREIGN
  316 + obce_funkcjonalnie_twarde_iy flex=y2 con cat=noun
  317 + obce_funkcjonalnie_twarde_iy ch flex=ych con cat=noun
  318 + obce_funkcjonalnie_twarde_iy m flex=ym con cat=noun
  319 + obce_funkcjonalnie_twarde_iy mi flex=ymi con cat=noun
  320 +* obce_funkcjonalnie_twarde_iy es flex=s con cat=noun
  321 +* obce_funkcjonalnie_twarde_iy s flex=s con cat=noun
  322 + obce_funkcjonalnie_twarde_e e flex=e1 con cat=noun
  323 + obce_funkcjonalnie_twarde_ie e flex=ie con cat=noun
  324 + obce_funkcjonalnie_twarde_e ego flex=ego con cat=noun
  325 + obce_funkcjonalnie_twarde_e ej flex=ej con cat=noun
  326 + obce_funkcjonalnie_twarde_e em flex=em2 con cat=noun
  327 + obce_funkcjonalnie_twarde_e emu flex=emu con cat=noun
  328 + obce_funkcjonalnie_twarde_a a flex=a1 con cat=noun
  329 + obce_funkcjonalnie_twarde_a ach flex=ach con cat=noun
  330 + obce_funkcjonalnie_twarde_a ami flex=ami1 con cat=noun
  331 + obce_funkcjonalnie_twarde_a ą flex=ą con cat=noun
  332 + obce_funkcjonalnie_twarde_a ę flex=ę con cat=noun
  333 + obce_funkcjonalnie_twarde_a o flex=o2 con cat=noun
  334 + obce_funkcjonalnie_twarde_a om flex=om con cat=noun
  335 + obce_funkcjonalnie_twarde_a ów flex=ów con cat=noun
  336 + obce_funkcjonalnie_twarde_a owie flex=owie con cat=noun
  337 + obce_funkcjonalnie_twarde_a u flex=u1 con cat=noun
  338 + obce_funkcjonalnie_twarde_a i flex=i2 con cat=noun
  339 + obce_funkcjonalnie_twarde_a y flex=i2 con cat=noun
  340 + obce_funkcjonalnie_miekkie_ii_wyglos flex=ε3 con cat=noun
  341 +
  342 +@NOUN-FLEX-APOSTROF
  343 + obce_ais ’go flex=ego con cat=noun
  344 + obce_ais ’mu flex=emu con cat=noun
  345 + obce_ais ’m flex=em con cat=noun
  346 + obce_ais 'go flex=ego con cat=noun
  347 + obce_ais 'mu flex=emu con cat=noun
  348 + obce_ais 'm flex=em con cat=noun
  349 + obce_ais flex=ε con cat=noun
  350 + obce_apostrof 'a flex='a con cat=noun
  351 + obce_apostrof ’u flex='u con cat=noun
  352 + obce_apostrof ’owi flex='owi con cat=noun
  353 + obce_apostrof ’em flex='em con cat=noun
  354 + obce_apostrof ’ie flex='ie con cat=noun
  355 + obce_apostrof ’u flex='u con cat=noun
  356 + obce_apostrof ’y flex='y con cat=noun
  357 + obce_apostrof ’owie flex='owie con cat=noun
  358 + obce_apostrof ’e flex='e con cat=noun
  359 + obce_apostrof ’ów flex='ów con cat=noun
  360 + obce_apostrof ’i flex='i con cat=noun
  361 + obce_apostrof ’om flex='om con cat=noun
  362 + obce_apostrof ’ami flex='ami con cat=noun
  363 + obce_apostrof ’ach flex='ach con cat=noun
  364 + obce_apostrof 'a flex='a con cat=noun
  365 + obce_apostrof 'u flex='u con cat=noun
  366 + obce_apostrof 'owi flex='owi con cat=noun
  367 + obce_apostrof 'em flex='em con cat=noun
  368 + obce_apostrof 'ie flex='ie con cat=noun
  369 + obce_apostrof 'u flex='u con cat=noun
  370 + obce_apostrof 'y flex='y con cat=noun
  371 + obce_apostrof 'owie flex='owie con cat=noun
  372 + obce_apostrof 'e flex='e con cat=noun
  373 + obce_apostrof 'ów flex='ów con cat=noun
  374 + obce_apostrof 'i flex='i con cat=noun
  375 + obce_apostrof 'om flex='om con cat=noun
  376 + obce_apostrof 'ami flex='ami con cat=noun
  377 + obce_apostrof 'ach flex='ach con cat=noun
  378 + obce_apostrof flex=ε con cat=noun
... ...
morphology/data/schemata.dic
... ... @@ -27,3 +27,5 @@ PREF-ε FIN-FLEX-J VERB-FLEX2-J VERB-GROUP-J-NĄ VERB-LEMMA-NĄ
27 27 PREF-NIE GER-FLEX VERB-FLEX2-J VERB-GROUP-J-NĄ VERB-LEMMA-NĄ
28 28 PREF-NIE PACT-FLEX VERB-FLEX2-J VERB-GROUP-J-NĄ VERB-LEMMA-NĄ
29 29 PREF-NIE PPAS-FLEX VERB-FLEX2-J VERB-GROUP-J-NĄ VERB-LEMMA-NĄ
  30 +NOUN-FLEX-FOREIGN NOUN-LEMMA-FOREIGN
  31 +NOUN-FLEX-APOSTROF NOUN-LEMMA-FOREIGN
... ...
morphology/doc/decyzje-scalanie.txt 0 → 100644
  1 +Plik freqListInterps.ml wykorzystuje plik data/interps_general.tab, skąd usunięte zostały niektóre wpisy,
  2 +głównie dotyczące adj (powodujące dwuznaczności przy scalaniu).
  3 +
  4 +Pozostawione intepretacje:
  5 +-siebie, numcol (nieobecne w SGJP) pozostają jak są
  6 +-ppron12, ppron3 pozostają jak są
  7 +(tzn. nie ma znalezionych odpowiedników, są pomijane)
  8 +
  9 +W przypadku praet, imps, imp, fin, inf, ger, pact, ppas wybrana została interpretacja najbardziej podobna
  10 +do oryginalnej licząc od końca, co rozwiązuje problemy perf.impef, neg.aff
  11 +
  12 +Przekształcenia:
  13 +-qub:wok, qub:nwok -> qub
  14 +
  15 +Co do num spoza SGJP:
  16 +-jeżeli forma składa się wyłącznie z cyfr arabskich i rzymskich, wybierana jest najdłuższa interpretacja
  17 +Zapewne w rzeczywistości powinny wtedy obejmować wszystkie możliwe tagi.
  18 +-w przeciwnym wypadku wybierana jest najkrótsza (najwęższa) interpretacja
... ...
morphology/doc/model.pdf
No preview for this file type
morphology/doc/model.tex
... ... @@ -6,89 +6,162 @@
6 6 \usepackage[polish]{babel}
7 7 % \usepackage{tikz}
8 8 % \usetikzlibrary{conceptgraph}
  9 +\usepackage{amsthm}
9 10  
10 11 \parindent 0pt
11 12 \parskip 4pt
12 13  
13   -% \newcommand{\tensor}{\otimes}
14   -% \newcommand{\forward}{\operatorname{/}}
15   -% \newcommand{\backward}{\operatorname{\backslash}}
16   -% \newcommand{\both}{\mid}
17   -% \newcommand{\plus}{\oplus}
18   -% \newcommand{\zero}{0}
19   -% \newcommand{\one}{1}
20   -% \newcommand{\letin}[2]{{\bf let}\;#1\;{\bf in}\;#2}
21   -% \newcommand{\caseof}[2]{{\bf case}\;#1\;{\bf of}\;#2}
22   -% \newcommand{\emp}{{\bf emp}}
23   -% \newcommand{\inl}{{\bf inl}}
24   -% \newcommand{\inr}{{\bf inr}}
25   -% \newcommand{\coord}[1]{{#1}^\star}
26   -% \newcommand{\map}[2]{{\bf map}\;#1\;#2}
27   -% \newcommand{\concat}[1]{{\bf concat}\;#1}
28   -% \newcommand{\makeset}[1]{{\bf makeset}\;#1}
29   -% \newcommand{\maketerm}[1]{{\bf maketerm}\;#1}
30   -% \newcommand{\addlist}[2]{{\bf add}\;#1\;#2}
31   -% \newcommand{\ana}[1]{{\bf ana}(#1)}
32   -% \newcommand{\One}{\bullet}
33   -
34   -
35   -\title{Model probabilistyczny guessera dla języka polskiego}
  14 +\newcommand{\form}{{\it form}}
  15 +\newcommand{\lemma}{{\it lemma}}
  16 +\newcommand{\cat}{{\it cat}}
  17 +\newcommand{\interp}{{\it interp}}
  18 +\newcommand{\fsuf}{{\it fsuf}}
  19 +\newcommand{\lsuf}{{\it lsuf}}
  20 +
  21 +\newtheorem{task}{Zadanie}
  22 +\newtheorem{answer}{Odpowiedź}
  23 +
  24 +\title{Model probabilistyczny fleksji języka polskiego}
36 25 \author{Wojciech Jaworski}
37 26 %\date{}
38 27  
39 28 \begin{document}
40 29 \maketitle
41 30  
42   -Zakładamy, że język jest rozkładem probabilistycznym na czwórkach (form,lemma,cat,interp),
  31 +Zakładamy, że język jest rozkładem probabilistycznym na czwórkach (\form,\lemma,\cat,\interp),
43 32 czyli, że wystąpienia kolejnych słów w tekście są od siebie niezależne.
44   -Interpretacja interp jest zbiorem tagów zgodnym a tagsetem SGJP.
45   -Kategoria $cat \in \{ noun, adj, adv, verb, other \}$
  33 +Interpretacja \interp{} jest zbiorem tagów zgodnym a tagsetem SGJP.
  34 +Kategoria $\cat \in \{ {\rm noun}, {\rm adj}, {\rm adv}, {\rm verb}, {\rm other} \}$
46 35 Zakładamy też, że język jest poprawny, tzn. nie ma literówek, ani błędów gramatycznych.
47 36  
48 37 Dysponujemy następującymi danymi:
49 38 \begin{itemize}
50 39 \item słownikiem gramatycznym S, czyli zbiorem czwórek, o których wiemy, że należą do języka;
51   -\item zbiorem reguł, czyli zbiorem czwórek (fsuf,lsuf,cat,interp)
  40 +\item zbiorem reguł, czyli zbiorem czwórek (\fsuf,\lsuf,\cat,\interp)
52 41 \item zbiorem wyjątków, czyli zbiorem czwórek, o których wiemy, że należą do języka, które nie są opisywane przez reguły
53 42 \item otagowaną listą frekwencyjną.
54 43 \end{itemize}
55   -Reguła przyłożona do formy ucina fsuf i przykleja lsuf.
  44 +Reguła przyłożona do formy ucina \fsuf{} i przykleja \lsuf.
  45 +
  46 +Lista frekwencyjna wytworzona jest na podstawie NKJP1M. Usunięte zostały z niej symbole
  47 +(formy do których odczytania nie wystarczy znajomość reguł wymowy takie, jak liczby zapisane cyframi, oznaczenia godzin i lat,
  48 +znaki interpunkcyjne, skróty, emotikony). Usunięte zostały również formy odmienialne z użyciem myślnika i apostrofu
  49 +(np. odmienione akronimy i nazwiska obce, formy takie jak ,,12-latek``).
  50 +Interpretacje na liście frekwencyjnej zostały skonwertowane do postaci takiej jaka występuje w SGJP,
  51 +łączącej interpretacje form identycznych. Na przykład interpretacje adj:pl:nom:m1:pos, adj:pl:voc:m1:pos, adj:pl:nom:p1:pos i adj:pl:voc:p1:pos
  52 +zostały złączone w adj:pl:nom.voc:m1.p1:pos, a frekwencje form zsumowane.
  53 +
  54 +Celem jest aproksymacja wartości P(\lemma,\cat,\interp|\form).
  55 +
  56 +%Jakość aproksymacji mierzymy licząc jak często wśród $k$ najbardziej prawdopodobnych trójek $\lemma,\cat,\interp$
  57 +%wskazanych przez model dla zadanej formy znajduje się trójka poprawna. Wyniki dla poszczególnych form agregujemy
  58 +%za pomocą średniej ważonej po ich częstościach.
  59 +
  60 +%Pytanie 0: Ile wynosi powyższa miara liczona z użyciem p-stw wziętych z listy frekwencyjnej? (To jest ograniczenie górne dla modelu)
  61 +
  62 +%Pytanie 0': Ile wynosi powyższa miara liczona z użyciem częstości wziętych ze zbioru reguł? (To jest ograniczenie dolne dla modelu)
  63 +
  64 +Pierwszym kryterium jest przynależność formy do słownika S.
  65 +Jeśli forma należy do S zakładamy, że jedno z haseł S zawierające tę formę
  66 +poprawnie opisuje jej lemat, kategorię i interpretację.
  67 +
  68 +\begin{task}
  69 +Jakie jest prawdopodobieństwo trafienia na formę, której lemat, kategoria i interpretacja należy do słownika, czyli
  70 +\[P((\form,\lemma,\cat,\interp) \in S)\]
  71 +Jakie jest prawdopodobieństwo trafienia na formę, która należy do słownika, ale jej lemat, kategoria lub interpretacja należy do słownika, czyli
  72 +\[P((\form,\lemma,\cat,\interp) \not\in S \wedge \form \in S)\]
  73 +\end{task}
  74 +
  75 +\begin{answer}
  76 +Prawdopodobieństwo natrafienia na formę należącą do słownika wynosi 95,67\%, zaś natrafienia na formę należącą do SGJP bez odpowiedniej
  77 +interpretacji -- 3,92\% (lista tych form znajduje się w pliku traps.txt).
  78 +\end{answer}
  79 +
  80 +W przypadku form należących do słownika różnorodność interpretacji będzie niewielka,
  81 +natomiast istotne będzie prawdopodobieństwo wystąpienia danego lematu.
  82 +Zaś w przypadku form nie należących do słownika prawdopodobieństwo wystąpienia lematu
  83 +będzie zawsze małe.
56 84  
57   -Celem jest aproksymacja wartości P(lemma,cat,interp|form).
  85 +Dzielimy teraz listę frekwencyjną na część należącą do S i nie należącą do S.
  86 +Od tej pory budujemy model osobno dla każdej z części.
58 87  
59   -Pytanie 1: $P((form,lemma,cat,interp) \in S)$
  88 +W przypadku cześci należącej do S zauważamy, że \[P(\lemma,\cat,\interp|\form)=P(\form|\lemma,\cat,\interp)\frac{P(\lemma,\cat,\interp)}{P(\form)}\]
60 89  
61   -Pytanie 2: $P((form,lemma,cat,interp) \not\in S \wedge form \in S)$
  90 +Zakładamy, że \interp{} jest niezależne od \lemma, pod warunkiem określonego \cat
  91 +\[P(\lemma,\cat,\interp)=P(\lemma,\cat)P(\interp|\lemma,\cat)=P(\lemma,\cat)P(\interp|\cat)\]
62 92  
63   -Załóżmy, że reguły i wyjątki mają postać taką, że do danej formy można zaaplikować tylko jedną z nich
64   -(dla żadnej reguły sufix nie jest podciągiem innego sufixu). Wtedy
65   -\[P(lemma,cat,interp|form)\approx P(rule|form)=P(rule|fsuf)\]
66   -(W powyższym drzewie sufixowym w każdym węźle mamy dowiązania do sufixów o jeden znak dłuższych oraz kategorię pozostałe traktową łącznie
  93 +$P(\form)$, $P(\lemma,\cat)$ i $P(\interp|\cat)$ szacujemy na podstawie listy frekwencyjnej,
  94 +w przypadku pierwszych dwu stosując wygładzanie. Wyliczenie $P(\form)$ zawiera uogólniona lista frekwencyjna
  95 +(ścieżka {\tt resources/NKJP1M/NKJP1M-generalized-frequency.tab} w repozytorium ENIAM), $P(\lemma,\cat)$ -- plik
  96 + {\tt prob\_lemmacat.txt}, zaś $P(\interp|\cat)$ -- {\tt prob\_itp\_givencat.txt} (oba zawarte w katalogu {\tt morphology/doc}).
67 97  
68   -Pytanie 3: Czy faktycznie zachodzi powyższa zależność? Jak zmierzyć podobieństwo?
  98 +$P(\form|\lemma,\cat,\interp)$ wynosi 0, gdy w S nie ma krotki postaci (\form,\lemma,\cat,\interp);
  99 +1, gdy jest dokładnie jedna krotka z (\lemma,\cat,\interp). Gdy jest ich więcej oznacza to, że
  100 +lemat ma przynajmniej dwa warianty odmiany. Są to przypadki rzadkie. Przypisujemy każdej z możliwości
  101 +prawdopodobieństwo 1.
69 102  
70   -Problem tu jest taki, że lista frekwencyjna jest zbyt mała by precyzyjnie określić p-stwo ok. 40000 reguł
  103 +\begin{task}
  104 +Przejrzeć SGJP i znaleźć wszystkie przykłady, w których dla ustalonego lematu, kategorii i interpretacji
  105 +jest więcej niż jedna forma. Znaleźć wystąpienia tych krotek na liście frekwencyjnej.
  106 +\end{task}
71 107  
72   -\[P(rule|fsuf)=P(lsuf,cat,interp|fsuf)=P(fsuf|lsuf,cat,interp)\frac{P(lsuf,cat,interp)}{P(fsuf)}\]
  108 +\begin{answer}
  109 +Lista takich form znajduje się w pliku multi\_forms.txt.
  110 +\end{answer}
73 111  
74   -$P(fsuf)$ jest prawdopodobieństwem tego, że do języka należy słowo o zadanym sufixie.
  112 +Teraz zanalizujemy drugą część listy frekwencyjnej.
  113 +Załóżmy, że reguły mają postać taką, że sufiks żadnej reguły nie jest podciągiem sufixu innej z nich.
  114 +Sufiksy reguł tworzą drzewo, które w każdym węźle ma dowiązania do sufixów o jeden znak dłuższych oraz kategorię pozostałe traktową łącznie.
  115 +Przyjmujemy następujące założenie modelowe:
  116 +\[P(\lemma,\cat,\interp|\form)\approx P(rule|\form)=P(rule|\fsuf)\]
  117 +Wynika ono z tego, że mając nieznaną formę musimy oprzeć się na ogólnych regułach
  118 +odmiany i nie możemy korzystać z tego że ma ona jakieś konkretne brzmienie.
  119 +Korzystamy tutaj tylko z reguł oznaczonych jako produktywne.
  120 +
  121 +Problem tu jest taki, że lista frekwencyjna jest zbyt mała by precyzyjnie określić p-stwo ok. 40000 reguł.
  122 +Dlatego znowu stosujemy zabieg z prawdopodobieństwem warunkowym.
  123 +
  124 +\[P(rule|\fsuf)=P(\lsuf,\cat,\interp|\fsuf)=P(\fsuf|\lsuf,\cat,\interp)\frac{P(\lsuf,\cat,\interp)}{P(\fsuf)}\]
  125 +
  126 +$P(\fsuf)$ jest prawdopodobieństwem tego, że do języka należy słowo o zadanym sufixie.
75 127 Można je oszacować za pomocą listy frekwencyjnej.
76 128  
77   -Zakładamy, że interp jest niezależne od lsuf, pod warunkiem określonego cat
78   -$P(lsuf,cat,interp)=P(lsuf,cat)P(interp|lsuf,cat)=P(lsuf,cat)P(interp|cat)$
  129 +Zakładamy, że \interp{} jest niezależne od \lsuf, pod warunkiem określonego \cat
  130 +\[P(\lsuf,\cat,\interp)=P(\lsuf,\cat)P(\interp|\lsuf,\cat)=P(\lsuf,\cat)P(\interp|\cat)\]
79 131  
80   -$P(lsuf,cat)$ i $P(interp|cat)$ można oszacować na podstawie listy frekwencyjnej.
  132 +$P(\lsuf,\cat)$ i $P(\interp|\cat)$ można oszacować na podstawie listy frekwencyjnej.
81 133  
82   -$P(fsuf|lsuf,cat,interp)$ wynosi 0, gdy nie ma reguły postaci (fsuf,lsuf,cat,interp);
83   -1, gdy jest dokładnie jedna reguła z (lsuf,cat,interp), a gdy jest ich więcej trzeba
84   -oszacować z listy frekwencyjnej.
  134 +\begin{task}
  135 +Oszacować $P(\fsuf)$ i $P(\lsuf,\cat)$ na podstawie listy frekwencyjnej.
  136 +Sprawdzić dla jakich sufiksów próbka jest mała albo nie ma jej wcale.
  137 +\end{task}
85 138  
86   -Pytanie 4: Czy powyższe przybliżenie jest poprawne, jak często jest więcej niż jedna reguła i ile wynoszą wówczas p-stwa?
  139 +% w razie gdyby był problem można próbować dzielić sufiksy na części i założyć niezależność tych części
  140 +
  141 +$P(\fsuf|\lsuf,\cat,\interp)$ wynosi 0, gdy nie ma reguły postaci (\fsuf,\lsuf,\cat,\interp);
  142 +1, gdy jest dokładnie jedna reguła z (\fsuf,\lsuf,\cat,\interp). Ustawiamy produktywność reguł tak
  143 +by nie pojawiało się więcej pasujących reguł.
  144 +
  145 +\begin{task}
  146 +Określić produktywność reguł i sprawdzić, czy nie ma niejednoznacznych dopasowań.
  147 +\end{task}
87 148  
88   -Pytanie 5: Co zrobić z niejednoznacznymi interpretacjami?
  149 +\begin{task}
  150 +Określić jakość modelu.
  151 +\end{task}
89 152  
90   -Zadania poboczne: wytworzenie otagowanej listy frekwencyjnej, wytworzenie zbioru reguł, wskazanie, które reguły opisują sytuacje wyjątkowe.
  153 +\begin{answer}
  154 +Wyliczona jakość modelu (stopień pokrycia listy frekwencyjnej przez co najmniej 95\% najbardziej prawdopodobnych interpretacji wg modelu) wyniosła 79,90\%.
  155 +\end{answer}
  156 +
  157 +%czasowniki produktywne to te z lematem ać ować ywać, ić, yć, (nąć)
  158 +
  159 +Pytanie 4: Czy powyższe przybliżenie jest poprawne, jak często jest więcej niż jedna reguła i ile wynoszą wówczas p-stwa?
  160 +
  161 +Zadania poboczne: wytworzenie otagowanej listy frekwencyjnej, wytworzenie (uzupełnienie) zbioru reguł na podstawie SGJP i listy frekwencyjnej, wskazanie, które reguły opisują sytuacje wyjątkowe.
91 162  
92 163 Zadanie na przyszłość: reguły słowotwórstwa i ich interpretacja semantyczna.
93 164  
94   -\end{document}
95 165 \ No newline at end of file
  166 +Do powyższego modelu trzeba jeszcze dodać prefixy nie i naj.
  167 +
  168 +\end{document}
... ...