Commit 5bff1aaed9336292eb84f0a18bfb5bd725bc2567

Authored by Wojciech Jaworski
1 parent accee0d9

biblioteka eniam-lexSemantics-1.0

integration/eniam-integration-1.0.tar.bz2 0 → 100644
No preview for this file type
integration/makefile
... ... @@ -15,8 +15,8 @@ install: all
15 15 cp eniam-integration.cmxa eniam-integration.a eniam-integration.cma $(INSTALLDIR)
16 16 cp ENIAM_CONLL.cmi ENIAMpreIntegration.cmi $(INSTALLDIR)
17 17 cp ENIAM_CONLL.cmx ENIAMpreIntegration.cmx $(INSTALLDIR)
18   - mkdir -p /usr/share/eniam/integration
19   - cp resources/* /usr/share/eniam/integration
  18 + # mkdir -p /usr/share/eniam/integration
  19 + # cp resources/* /usr/share/eniam/integration
20 20  
21 21 eniam-integration.cma: $(SOURCES)
22 22 ocamlc -linkall -a -o eniam-integration.cma $(OCAMLFLAGS) $^
... ...
lexSemantics/ENIAMlexSemantics.ml
... ... @@ -18,7 +18,9 @@
18 18 *)
19 19  
20 20 open ENIAMtokenizerTypes
  21 +open ENIAMsubsyntaxTypes
21 22 open ENIAMlexSemanticsTypes
  23 +open ENIAMwalTypes
22 24 open Xstd
23 25  
24 26 let string_of_lex_sems tokens lex_sems =
... ... @@ -55,13 +57,12 @@ let find_proper_names tokens i t =
55 57 match t.token with
56 58 Lemma(lemma,pos,interp) ->
57 59 if StringMap.mem proper_names lemma then
58   - let t = {t with token=Proper(lemma,pos,interp,StringMap.find proper_names lemma);
59   - attrs=remove t.attrs "notvalidated proper"} in
60   - ExtArray.set tokens i t else
  60 + {t with token=Proper(lemma,pos,interp,StringMap.find proper_names lemma);
  61 + attrs=remove t.attrs "notvalidated proper"} else
61 62 if Xlist.mem t.attrs "notvalidated proper" then
62   - let t = {t with token=Proper(lemma,pos,interp,[])} in
63   - ExtArray.set tokens i t
64   - | _ -> ()
  63 + {t with token=Proper(lemma,pos,interp,[])}
  64 + else t
  65 + | _ -> t
65 66  
66 67 let find_senses t = (* FIXME: sensy zawierające 'się' *)
67 68 match t.token with
... ... @@ -69,36 +70,67 @@ let find_senses t = (* FIXME: sensy zawierające 'się' *)
69 70 | Proper(_,_,_,senses) -> ENIAMplWordnet.find_proper_senses senses
70 71 | _ -> []
71 72  
72   -
73   -let assign tokens text =
74   - let lex_sems = ExtArray.make (ExtArray.size tokens) empty_lex_sem in
75   - let _ = ExtArray.add lex_sems empty_lex_sem in
76   - Int.iter 1 (ExtArray.size tokens - 1) (fun i ->
77   - let token = ExtArray.get tokens i in
78   - find_proper_names tokens i token;
79   - let senses = find_senses token in
80   - let lex_sem = {empty_lex_sem with senses=senses} in
81   - let j = ExtArray.add lex_sems lex_sem in
82   - if j <> i then failwith "assign_semantic_valence");
83   - lex_sems
84   -
85   -(*
86   -(* print_endline "a14"; *)
87   - let paths = assign_valence paths in
88   -(* print_endline "a15"; *)
89   -(* print_endline "a16"; *)
90   - let paths = disambiguate_senses paths in
91   - let paths = assign_simplified_valence paths in
92   - let paths = PreSemantics.assign_semantics paths in
93   -(* print_endline "a16"; *)
94   -
95   -
96   -
97   -
98   -
99   -let assign_valence paths =
100   - let lexemes = Xlist.fold paths StringMap.empty (fun lexemes t ->
101   - match t.token with
  73 +let rec find a l i =
  74 + if a.(i) = max_int then (
  75 + a.(i) <- i;
  76 + i) else
  77 + if a.(i) = i then (
  78 + Xlist.iter l (fun j -> a.(j) <- i);
  79 + i) else
  80 + find a (i :: l) a.(i)
  81 +
  82 +let union a i j =
  83 + if i = j then i else
  84 + let x = min i j in
  85 + let y = max i j in
  86 + a.(y) <- x;
  87 + x
  88 +
  89 +let rec split_tokens_into_groups_sentence a = function
  90 + RawSentence s -> ()
  91 + | StructSentence([],_) -> ()
  92 + | StructSentence((id,_,_) :: paths,_) ->
  93 + ignore (Xlist.fold paths (find a [] id) (fun m (id,_,_) ->
  94 + union a m (find a [] id)))
  95 + | DepSentence(paths) ->
  96 + if Array.length paths = 0 then () else
  97 + let id,_,_ = paths.(0) in
  98 + ignore (Int.fold 1 (Array.length paths - 1) (find a [] id) (fun m i ->
  99 + let id,_,_ = paths.(i) in
  100 + union a m (find a [] id)))
  101 + | QuotedSentences sentences ->
  102 + Xlist.iter sentences (fun p ->
  103 + split_tokens_into_groups_sentence a p.psentence)
  104 + | AltSentence l -> Xlist.iter l (fun (mode,sentence) ->
  105 + split_tokens_into_groups_sentence a sentence)
  106 +
  107 +let rec split_tokens_into_groups_paragraph a = function
  108 + RawParagraph s -> ()
  109 + | StructParagraph sentences ->
  110 + Xlist.iter sentences (fun p -> split_tokens_into_groups_sentence a p.psentence)
  111 + | AltParagraph l -> Xlist.iter l (fun (mode,paragraph) ->
  112 + split_tokens_into_groups_paragraph a paragraph)
  113 +
  114 +let rec split_tokens_into_groups_text a = function
  115 + RawText s -> ()
  116 + | StructText paragraphs ->
  117 + Xlist.iter paragraphs (split_tokens_into_groups_paragraph a)
  118 + | AltText l -> Xlist.iter l (fun (mode,text) ->
  119 + split_tokens_into_groups_text a text)
  120 +
  121 +let split_tokens_into_groups size text =
  122 + let a = Array.make size max_int in
  123 + split_tokens_into_groups_text a text;
  124 + Int.iter 1 (Array.length a - 1) (fun i ->
  125 + if a.(i) <> max_int then a.(i) <- a.(a.(i)));
  126 + let map = Int.fold 1 (Array.length a - 1) IntMap.empty (fun map i ->
  127 + if a.(i) = max_int then map else
  128 + IntMap.add_inc map a.(i) [i] (fun l -> i :: l)) in
  129 + IntMap.fold map [] (fun l _ v -> v :: l)
  130 +
  131 +let assign_valence tokens lex_sems group =
  132 + let lexemes = Xlist.fold group StringMap.empty (fun lexemes id ->
  133 + match (ExtArray.get tokens id).token with
102 134 Lemma(lemma,pos,_) ->
103 135 StringMap.add_inc lexemes lemma (StringSet.singleton pos) (fun set -> StringSet.add set pos)
104 136 | Proper(lemma,pos,_,_) ->
... ... @@ -108,104 +140,51 @@ let assign_valence paths =
108 140 | _ -> pos (*failwith ("assign_valence: Proper " ^ pos ^ " " ^ lemma)*) in
109 141 StringMap.add_inc lexemes lemma (StringSet.singleton pos) (fun set -> StringSet.add set pos) (* nazwy własne mają przypisywaną domyślną walencję rzeczowników *)
110 142 | _ -> lexemes) in
111   - let valence = WalFrames.find_frames lexemes in
112   - List.rev (Xlist.rev_map paths (fun t ->
113   - match t.token with
114   - Lemma(lemma,pos,_) -> {t with valence=try Xlist.rev_map (StringMap.find (StringMap.find valence lemma) pos) (fun frame -> 0,frame) with Not_found -> []}
115   - | Proper(lemma,pos,interp,_) -> {t with valence=(try Xlist.rev_map (StringMap.find (StringMap.find valence lemma)
116   - (if pos = "subst" || pos = "depr" then "p" ^ pos else pos)) (fun frame -> 0,frame) with Not_found -> [](*failwith ("assign_valence: Proper(" ^ lemma ^ "," ^ pos ^ ")")*));
117   - token=Lemma(lemma,pos,interp)}
118   - | _ -> t))
119   -
120   -(**********************************************************************************)
121   -
122   -(* let prepare_indexes (paths,_) =
123   - let set = Xlist.fold paths IntSet.empty (fun set t ->
124   - IntSet.add (IntSet.add set t.beg) t.next) in
125   - let map,last = Xlist.fold (Xlist.sort (IntSet.to_list set) compare) (IntMap.empty,0) (fun (map,n) x ->
126   - IntMap.add map x n, n+1) in
127   - List.rev (Xlist.rev_map paths (fun t ->
128   - {t with lnode=IntMap.find map t.beg; rnode=IntMap.find map t.next})), last - 1 *)
  143 + let valence = ENIAMwalenty.find_frames lexemes in
  144 + Xlist.iter group (fun id ->
  145 + match (ExtArray.get tokens id).token with
  146 + Lemma(lemma,pos,_) ->
  147 + ExtArray.set lex_sems id {(ExtArray.get lex_sems id) with
  148 + valence=try Xlist.rev_map (StringMap.find (StringMap.find valence lemma) pos) (fun frame -> 0,frame) with Not_found -> []}
  149 + | Proper(lemma,pos,interp,_) ->
  150 + ExtArray.set lex_sems id {(ExtArray.get lex_sems id) with
  151 + valence=(try Xlist.rev_map (StringMap.find (StringMap.find valence lemma)
  152 + (if pos = "subst" || pos = "depr" then "p" ^ pos else pos)) (fun frame -> 0,frame) with Not_found -> [](*failwith ("assign_valence: Proper(" ^ lemma ^ "," ^ pos ^ ")")*))};
  153 + ExtArray.set tokens id {(ExtArray.get tokens id) with token=Lemma(lemma,pos,interp)}
  154 + | _ -> ())
129 155  
130 156 let get_prefs_schema prefs schema =
131 157 Xlist.fold schema prefs (fun prefs t ->
132   - Xlist.fold t.WalTypes.sel_prefs prefs StringSet.add)
  158 + Xlist.fold t.sel_prefs prefs StringSet.add)
133 159  
134 160 let map_prefs_schema senses schema =
135 161 Xlist.map schema (fun t ->
136   - if Xlist.mem t.WalTypes.morfs (WalTypes.Phrase WalTypes.Pro) || Xlist.mem t.WalTypes.morfs (WalTypes.Phrase WalTypes.ProNG) then t else
137   - {t with WalTypes.sel_prefs = Xlist.fold t.WalTypes.sel_prefs [] (fun l s ->
  162 + if Xlist.mem t.morfs (Phrase Pro) || Xlist.mem t.morfs (Phrase ProNG) then t else
  163 + {t with sel_prefs = Xlist.fold t.sel_prefs [] (fun l s ->
138 164 if StringSet.mem senses s then s :: l else l)})
139 165  
140   -let disambiguate_senses paths =
141   - let prefs = Xlist.fold paths (StringSet.singleton "ALL") (fun prefs t ->
142   - Xlist.fold t.valence prefs (fun prefs -> function
143   - _,WalTypes.Frame(_,schema) -> get_prefs_schema prefs schema
144   - | _,WalTypes.LexFrame(_,_,_,schema) -> get_prefs_schema prefs schema
145   - | _,WalTypes.ComprepFrame(_,_,_,schema) -> get_prefs_schema prefs schema)) in
146   - let hipero = Xlist.fold paths (StringSet.singleton "ALL") (fun hipero t ->
147   - Xlist.fold t.senses hipero (fun hipero (_,l,_) ->
  166 +let disambiguate_senses lex_sems group =
  167 + let prefs = Xlist.fold group (StringSet.singleton "ALL") (fun prefs id ->
  168 + Xlist.fold (ExtArray.get lex_sems id).valence prefs (fun prefs -> function
  169 + _,Frame(_,schema) -> get_prefs_schema prefs schema
  170 + | _,LexFrame(_,_,_,schema) -> get_prefs_schema prefs schema
  171 + | _,ComprepFrame(_,_,_,schema) -> get_prefs_schema prefs schema)) in
  172 + let hipero = Xlist.fold group (StringSet.singleton "ALL") (fun hipero id ->
  173 + Xlist.fold (ExtArray.get lex_sems id).senses hipero (fun hipero (_,l,_) ->
148 174 Xlist.fold l hipero StringSet.add)) in
149 175 let senses = StringSet.intersection prefs hipero in
150 176 let is_zero = StringSet.mem hipero "0" in
151 177 let senses = if is_zero then StringSet.add senses "0" else senses in
152   - Xlist.map paths (fun t ->
153   - {t with valence = if is_zero then t.valence else
  178 + Xlist.iter group (fun id ->
  179 + let t = ExtArray.get lex_sems id in
  180 + ExtArray.set lex_sems id {t with valence = if is_zero then t.valence else
154 181 Xlist.map t.valence (function
155   - n,WalTypes.Frame(a,schema) -> n,WalTypes.Frame(a,map_prefs_schema senses schema)
156   - | n,WalTypes.LexFrame(s,p,r,schema) -> n,WalTypes.LexFrame(s,p,r,map_prefs_schema senses schema)
157   - | n,WalTypes.ComprepFrame(s,p,r,schema) -> n,WalTypes.ComprepFrame(s,p,r,map_prefs_schema senses schema));
  182 + n,Frame(a,schema) -> n,Frame(a,map_prefs_schema senses schema)
  183 + | n,LexFrame(s,p,r,schema) -> n,LexFrame(s,p,r,map_prefs_schema senses schema)
  184 + | n,ComprepFrame(s,p,r,schema) -> n,ComprepFrame(s,p,r,map_prefs_schema senses schema));
158 185 senses = Xlist.map t.senses (fun (s,l,w) ->
159 186 s, List.rev (Xlist.fold l [] (fun l s -> if StringSet.mem senses s then s :: l else l)),w)})
160 187  
161   -(*let single_sense (paths,last) =
162   - List.rev (Xlist.rev_map paths (fun t ->
163   - let sense =
164   - if t.senses = [] then [] else
165   - [Xlist.fold t.senses ("",[],-.max_float) (fun (max_meaning,max_hipero,max_weight) (meaning,hipero,weight) ->
166   - if max_weight >= weight then max_meaning,max_hipero,max_weight else meaning,hipero,weight)] in
167   - {t with senses=sense})), last*)
168   -
169   -open WalTypes
170   -
171   -(*let single_schema schemata =
172   - let map = Xlist.fold schemata StringMap.empty (fun map schema ->
173   - let t = WalStringOf.schema (List.sort compare (Xlist.fold schema [] (fun l s ->
174   - if s.gf <> ARG && s.gf <> ADJUNCT then {s with role=""; role_attr=""; sel_prefs=[]} :: l else
175   - if s.cr <> [] || s.ce <> [] then {s with role=""; role_attr=""; sel_prefs=[]} :: l else l))) in
176   - StringMap.add_inc map t [schema] (fun l -> schema :: l)) in
177   - StringMap.fold map [] (fun l _ schemata ->
178   - let map = Xlist.fold schemata StringMap.empty (fun map schema ->
179   - Xlist.fold schema map (fun map s ->
180   - let t = WalStringOf.schema [{s with role=""; role_attr=""; sel_prefs=[]}] in
181   - StringMap.add_inc map t [s] (fun l -> s :: l))) in
182   - let schema = StringMap.fold map [] (fun schema _ l ->
183   - let s = List.hd l in
184   - {s with sel_prefs=Xlist.fold s.sel_prefs [] (fun l t -> if t = "0" || t = "T" then t :: l else l)} :: schema) in
185   - schema :: l)*)
186   -
187   -let remove_meaning = function
188   - DefaultAtrs(m,r,o,neg,p,a) -> DefaultAtrs([],r,o,neg,p,a)
189   - | EmptyAtrs m -> EmptyAtrs []
190   - | NounAtrs(m,nsyn,s(*,typ*)) -> NounAtrs([],nsyn,s(*,typ*))
191   - | AdjAtrs(m,c,adjsyn(*,adjsem,typ*)) -> AdjAtrs([],c,adjsyn(*,adjsem,typ*))
192   - | PersAtrs(m,le,neg,mo,t,au,a) -> PersAtrs([],le,neg,mo,t,au,a)
193   - | GerAtrs(m,le,neg,a) -> GerAtrs([],le,neg,a)
194   - | NonPersAtrs(m,le,role,role_attr,neg,a) -> NonPersAtrs([],le,role,role_attr,neg,a)
195   - | _ -> failwith "remove_meaning"
196   -
197   -
198   -(*let single_frame (paths,last) =
199   - List.rev (Xlist.rev_map paths (fun t ->
200   - let lex_frames,frames = Xlist.fold t.valence ([],StringMap.empty) (fun (lex_frames,frames) -> function
201   - Frame(attrs,schema) ->
202   - let attrs = remove_meaning attrs in
203   - lex_frames, StringMap.add_inc frames (WalStringOf.frame_atrs attrs) (attrs,[schema]) (fun (_,l) -> attrs, schema :: l)
204   - | frame -> frame :: lex_frames, frames) in
205   - let frames = StringMap.fold frames lex_frames (fun frames _ (attrs,schemata) ->
206   - Xlist.fold (single_schema schemata) frames (fun frames frame -> Frame(attrs,frame) :: frames)) in
207   - {t with valence=frames})), last *)
208   -
209 188 let simplify_position_verb l = function (* FIXME: dodać czyszczenie E Pro *)
210 189 Phrase(NP(Case "dat")) -> l
211 190 | Phrase(NP(Case "inst")) -> l
... ... @@ -283,33 +262,34 @@ let simplify_schemata pos schemata =
283 262 if s.gf <> ARG && s.gf <> ADJUNCT then s :: l else
284 263 (* if s.cr <> [] || s.ce <> [] then s :: l else *)
285 264 simplify_position pos l s)) in
286   - StringMap.add_inc schemata (WalStringOf.schema schema) (schema,[frame]) (fun (_,frames) -> schema, frame :: frames)) in
  265 + StringMap.add_inc schemata (ENIAMwalStringOf.schema schema) (schema,[frame]) (fun (_,frames) -> schema, frame :: frames)) in
287 266 StringMap.fold schemata [] (fun l _ s -> s :: l)
288 267  
289 268 (* FIXME: problem ComprepNP i PrepNCP *)
290 269 (* FIXME: problem gdy ten sam token występuje w kilku ścieżkach *)
291 270 let generate_verb_prep_adjuncts preps =
292   - Xlist.map preps (fun (lemma,case) -> WalFrames.verb_prep_adjunct_schema_field lemma case)
  271 + Xlist.map preps (fun (lemma,case) -> ENIAMwalFrames.verb_prep_adjunct_schema_field lemma case)
293 272  
294 273 let generate_verb_comprep_adjuncts compreps =
295   - Xlist.map compreps (fun lemma -> WalFrames.verb_comprep_adjunct_schema_field lemma)
  274 + Xlist.map compreps (fun lemma -> ENIAMwalFrames.verb_comprep_adjunct_schema_field lemma)
296 275  
297 276 let generate_verb_compar_adjuncts compars =
298   - Xlist.map compars (fun lemma -> WalFrames.verb_compar_adjunct_schema_field lemma)
  277 + Xlist.map compars (fun lemma -> ENIAMwalFrames.verb_compar_adjunct_schema_field lemma)
299 278  
300 279 let generate_noun_prep_adjuncts preps =
301   - WalFrames.noun_prep_adjunct_schema_field preps
  280 + ENIAMwalFrames.noun_prep_adjunct_schema_field preps
302 281  
303 282 let generate_noun_compar_adjuncts compars =
304   - WalFrames.noun_compar_adjunct_schema_field compars
  283 + ENIAMwalFrames.noun_compar_adjunct_schema_field compars
305 284  
306 285 let generate_adj_compar_adjuncts compars =
307   - WalFrames.noun_compar_adjunct_schema_field compars
  286 + ENIAMwalFrames.noun_compar_adjunct_schema_field compars
308 287  
309 288 let compars = StringSet.of_list ["jak";"jako";"niż";"niczym";"niby";"co"]
310 289  
311   -let generate_prep_adjunct_tokens paths =
312   - let map = Xlist.fold paths StringMap.empty (fun map t ->
  290 +let generate_prep_adjunct_tokens tokens group =
  291 + let map = Xlist.fold group StringMap.empty (fun map id ->
  292 + let t = ExtArray.get tokens id in
313 293 match t.token with
314 294 Lemma(lemma,"prep",interp) ->
315 295 let map = if lemma = "po" then StringMap.add map "po:postp" ("po","postp") else map in
... ... @@ -321,17 +301,19 @@ let generate_prep_adjunct_tokens paths =
321 301 | _ -> map) in
322 302 StringMap.fold map [] (fun l _ v -> v :: l)
323 303  
324   -let generate_comprep_adjunct_tokens paths =
325   - let lemmas = Xlist.fold paths StringSet.empty (fun lemmas t ->
  304 +let generate_comprep_adjunct_tokens tokens group =
  305 + let lemmas = Xlist.fold group StringSet.empty (fun lemmas id ->
  306 + let t = ExtArray.get tokens id in
326 307 match t.token with
327 308 Lemma(lemma,_,_) -> StringSet.add lemmas lemma
328 309 | _ -> lemmas) in
329   - StringMap.fold WalFrames.comprep_reqs [] (fun compreps comprep reqs ->
  310 + StringMap.fold ENIAMwalFrames.comprep_reqs [] (fun compreps comprep reqs ->
330 311 let b = Xlist.fold reqs true (fun b s -> b && StringSet.mem lemmas s) in
331 312 if b then comprep :: compreps else compreps)
332 313  
333   -let generate_compar_adjunct_tokens paths =
334   - let set = Xlist.fold paths StringSet.empty (fun set t ->
  314 +let generate_compar_adjunct_tokens tokens group =
  315 + let set = Xlist.fold group StringSet.empty (fun set id ->
  316 + let t = ExtArray.get tokens id in
335 317 match t.token with
336 318 Lemma(lemma,"prep",interp) ->
337 319 if not (StringSet.mem compars lemma) then set else
... ... @@ -343,29 +325,40 @@ let is_measure = function
343 325 NounAtrs(_,_,Common "measure") -> true
344 326 | _ -> false
345 327  
346   -let assign_simplified_valence paths =
347   - let preps = generate_prep_adjunct_tokens paths in
348   - let compreps = generate_comprep_adjunct_tokens paths in
349   - let compars = generate_compar_adjunct_tokens paths in
  328 +let remove_meaning = function
  329 + DefaultAtrs(m,r,o,neg,p,a) -> DefaultAtrs([],r,o,neg,p,a)
  330 + | EmptyAtrs m -> EmptyAtrs []
  331 + | NounAtrs(m,nsyn,s(*,typ*)) -> NounAtrs([],nsyn,s(*,typ*))
  332 + | AdjAtrs(m,c,adjsyn(*,adjsem,typ*)) -> AdjAtrs([],c,adjsyn(*,adjsem,typ*))
  333 + | PersAtrs(m,le,neg,mo,t,au,a) -> PersAtrs([],le,neg,mo,t,au,a)
  334 + | GerAtrs(m,le,neg,a) -> GerAtrs([],le,neg,a)
  335 + | NonPersAtrs(m,le,role,role_attr,neg,a) -> NonPersAtrs([],le,role,role_attr,neg,a)
  336 + | _ -> failwith "remove_meaning"
  337 +
  338 +let assign_simplified_valence tokens lex_sems group =
  339 + let preps = generate_prep_adjunct_tokens tokens group in
  340 + let compreps = generate_comprep_adjunct_tokens tokens group in
  341 + let compars = generate_compar_adjunct_tokens tokens group in
350 342 let verb_prep_adjuncts = generate_verb_prep_adjuncts preps in
351 343 let verb_comprep_adjuncts = generate_verb_comprep_adjuncts compreps in
352 344 let verb_compar_adjuncts = generate_verb_compar_adjuncts compars in
353 345 let noun_prep_adjuncts = generate_noun_prep_adjuncts preps compreps in
354 346 let noun_compar_adjuncts = generate_noun_compar_adjuncts compars in
355 347 let adj_compar_adjuncts = generate_adj_compar_adjuncts compars in
356   - let verb_adjuncts = WalFrames.verb_adjuncts_simp @ verb_prep_adjuncts @ verb_comprep_adjuncts @ verb_compar_adjuncts in
357   - let noun_adjuncts = WalFrames.noun_adjuncts_simp @ [noun_prep_adjuncts] @ [noun_compar_adjuncts] in
358   - let noun_measure_adjuncts = WalFrames.noun_measure_adjuncts_simp @ [noun_prep_adjuncts] @ [noun_compar_adjuncts] in
359   - let adj_adjuncts = WalFrames.adj_adjuncts_simp @ [adj_compar_adjuncts] in
360   - let adv_adjuncts = WalFrames.adv_adjuncts_simp @ [adj_compar_adjuncts] in
361   - List.rev (Xlist.rev_map paths (fun t ->
362   - let pos = match t.token with
363   - Lemma(_,pos,_) -> WalFrames.simplify_pos pos
  348 + let verb_adjuncts = ENIAMwalFrames.verb_adjuncts_simp @ verb_prep_adjuncts @ verb_comprep_adjuncts @ verb_compar_adjuncts in
  349 + let noun_adjuncts = ENIAMwalFrames.noun_adjuncts_simp @ [noun_prep_adjuncts] @ [noun_compar_adjuncts] in
  350 + let noun_measure_adjuncts = ENIAMwalFrames.noun_measure_adjuncts_simp @ [noun_prep_adjuncts] @ [noun_compar_adjuncts] in
  351 + let adj_adjuncts = ENIAMwalFrames.adj_adjuncts_simp @ [adj_compar_adjuncts] in
  352 + let adv_adjuncts = ENIAMwalFrames.adv_adjuncts_simp @ [adj_compar_adjuncts] in
  353 + Xlist.iter group (fun id ->
  354 + let t = ExtArray.get lex_sems id in
  355 + let pos = match (ExtArray.get tokens id).token with
  356 + Lemma(_,pos,_) -> ENIAMwalFrames.simplify_pos pos
364 357 | _ -> "" in
365 358 let lex_frames,frames = Xlist.fold t.valence ([],StringMap.empty) (fun (lex_frames,frames) -> function
366 359 _,(Frame(attrs,schema) as frame) ->
367 360 let attrs = remove_meaning attrs in
368   - lex_frames, StringMap.add_inc frames (WalStringOf.frame_atrs attrs) (attrs,[schema,frame]) (fun (_,l) -> attrs, (schema,frame) :: l)
  361 + lex_frames, StringMap.add_inc frames (ENIAMwalStringOf.frame_atrs attrs) (attrs,[schema,frame]) (fun (_,l) -> attrs, (schema,frame) :: l)
369 362 | _,frame -> frame :: lex_frames, frames) in
370 363 let simp_frames,full_frames,n = Xlist.fold lex_frames ([],[],1) (fun (simp_frames,full_frames,n) frame ->
371 364 (n,frame) :: simp_frames, (n,frame) :: full_frames, n+1) in
... ... @@ -380,93 +373,23 @@ let assign_simplified_valence paths =
380 373 (n,Frame(attrs,schema)) :: simp_frames,
381 374 Xlist.fold frames full_frames (fun full_frames frame -> (n,frame) :: full_frames),
382 375 n+1)) in
383   - {t with simple_valence=simp_frames; valence=full_frames}))
384   -
385   -(* FIXME: dodać do walencji preferencje selekcyjne nadrzędników symboli: dzień, godzina, rysunek itp. *)
386   -(* FIXME: sprawdzić czy walencja nazw własnych jest dobrze zrobiona. *)
387   -
388   -(* let first_id = 1 (* id=0 jest zarezerwowane dla pro; FIXME: czy to jest jeszcze aktualne? *)
389   -
390   -let add_ids (paths,last) next_id =
391   - let paths,next_id = Xlist.fold ((*List.rev*) paths) ([],next_id) (fun (paths,id) t ->
392   - {t with id=id} :: paths, id+1) in
393   - (paths,last),next_id *)
394   -
395   -
396   -
397   -let parse query =
398   -(* print_endline "a1"; *)
399   - let l = Xunicode.classified_chars_of_utf8_string query in
400   -(* print_endline "a2"; *)
401   - let l = PreTokenizer.tokenize l in
402   -(* print_endline "a3"; *)
403   - let l = PrePatterns.normalize_tokens [] l in
404   -(* print_endline "a4"; *)
405   - let l = PrePatterns.find_replacement_patterns l in
406   -(* print_endline "a5"; *)
407   - let l = PrePatterns.remove_spaces [] l in
408   - let l = PrePatterns.find_abr_patterns PreAcronyms.abr_patterns l in
409   - let l = PrePatterns.normalize_tokens [] l in
410   -(* print_endline "a6"; *)
411   - let paths = PrePaths.translate_into_paths l in
412   -(* print_endline "a7"; *)
413   - let paths = PrePaths.lemmatize paths in
414   -(* print_endline "a8"; *)
415   - let paths,_ = PreMWE.process paths in
416   -(* print_endline "a12"; *)
417   - let paths = find_proper_names paths in
418   -(* print_endline "a13"; *)
419   - let paths = modify_weights paths in
420   - let paths = translate_digs paths in
421   - let paths = assign_senses paths in
422   -(* print_endline "a14"; *)
423   - let paths = assign_valence paths in
424   -(* print_endline "a15"; *)
425   - let paths = combine_interps paths in
426   -(* print_endline "a16"; *)
427   - let paths = disambiguate_senses paths in
428   - let paths = assign_simplified_valence paths in
429   - let paths = PreSemantics.assign_semantics paths in
430   -(* print_endline "a16"; *)
431   - let paths = select_tokens paths in
432   -(* print_endline "a17"; *)
433   -(* let paths = if !single_sense_flag then single_sense paths else paths in
434   - let paths = if !single_frame_flag then single_frame paths else paths in*)
435   - (*let paths, next_id = add_ids paths next_id in
436   - let paths = prepare_indexes paths in*)
437   -(* print_endline "a18"; *)
438   - paths(*, next_id*)
439   -(* print_endline (PrePaths.to_string paths); *)
440   -(* let paths =
441   - if PrePaths.no_possible_path (PrePaths.map paths PreLemmatization.remove_postags) then
442   - PrePaths.map paths process_ign
443   - else paths in
444   - let paths = PrePaths.map paths PreLemmatization.remove_postags in
445   - let paths = PreCaseShift.manage_lower_upper_case paths in (* FIXME: niepotrzebnie powiększa pierwszy token (przymiotniki partykuły itp.) *)
446   - let paths = PreLemmatization.combine_interps paths in
447   -(* print_endline (PrePaths.to_string paths); *)*)
448   -
449   -let parse_conll tokens dep_paths = (* FIXME: sprawdzić, czy zachowana jest kolejność elementów paths !!! *)
450   - let paths = List.rev (Int.fold 1 (Array.length dep_paths - 1) [] (fun paths conll_id ->
451   - let id,_,_ = dep_paths.(conll_id) in
452   - ExtArray.get tokens id :: paths)) in
453   - (* print_endline "a12"; *)
454   - let paths = find_proper_names paths in
455   - (* print_endline "a13"; *)
456   - let paths = modify_weights paths in
457   - let paths = PreWordnet.assign_senses paths in
458   - (* print_endline "a14"; *)
459   - (* let paths = combine_interps paths in (* FIXME: to powinno też działać dla Proper *) *)
460   - (* print_endline "a15"; *)
461   - let paths = assign_valence paths in
462   - (* print_endline "a16"; *)
463   - let paths = disambiguate_senses paths in
464   - let paths = assign_simplified_valence paths in
465   - let paths = PreSemantics.assign_semantics paths in
466   - (* print_endline "a16"; *)
467   - let _ = Xlist.fold paths 1 (fun conll_id t ->
468   - let id,_,_ = dep_paths.(conll_id) in
469   - ExtArray.set tokens id t;
470   - conll_id + 1) in
471   - ()
472   -*)
  376 + ExtArray.set lex_sems id {t with simple_valence=simp_frames; valence=full_frames})
  377 +
  378 +let assign tokens text =
  379 + let lex_sems = ExtArray.make (ExtArray.size tokens) empty_lex_sem in
  380 + let _ = ExtArray.add lex_sems empty_lex_sem in
  381 + Int.iter 1 (ExtArray.size tokens - 1) (fun i ->
  382 + let token = ExtArray.get tokens i in
  383 + let token = find_proper_names tokens i token in
  384 + ExtArray.set tokens i token;
  385 + let senses = find_senses token in
  386 + let lex_sem = {empty_lex_sem with senses=senses} in
  387 + let _ = ExtArray.add lex_sems lex_sem in
  388 + ());
  389 + let groups = split_tokens_into_groups (ExtArray.size tokens) text in
  390 + (* Xlist.iter groups (fun group -> print_endline (String.concat " " (Xlist.map group string_of_int))); *)
  391 + Xlist.iter groups (fun group -> assign_valence tokens lex_sems group);
  392 + Xlist.iter groups (fun group -> disambiguate_senses lex_sems group);
  393 + Xlist.iter groups (fun group -> assign_simplified_valence tokens lex_sems group);
  394 + Xlist.iter groups (fun group -> ENIAMlexSemanticsData.assign_semantics tokens lex_sems group);
  395 + lex_sems
... ...
lexSemantics/ENIAMlexSemanticsData.ml
... ... @@ -20,7 +20,7 @@
20 20 open ENIAMtokenizerTypes
21 21 open ENIAMlexSemanticsTypes
22 22 open Xstd
23   -(*
  23 +
24 24 let subst_inst_roles = Xlist.fold [
25 25 "wiosna", "Time","";
26 26 "lato", "Time","";
... ... @@ -365,7 +365,7 @@ let prep_roles = Xlist.fold [ (* lemma,case,role,role_attr,meaning/hipero,sel_pr
365 365 ] StringMap.empty (fun map (lemma,case,role,role_attr,hipero,sel_prefs) ->
366 366 let hipero = Xlist.fold hipero StringSet.empty ENIAMplWordnet.get_hipero_rec in
367 367 let map2 = try StringMap.find map lemma with Not_found -> StringMap.empty in
368   - let map2 = StringMap.add_inc map2 case [role,role_attr,hipero,sel_prefs] (fun l -> (role,role_attr,hipero,sel_prefs) :: l) in
  368 + let map2 = StringMap.add_inc map2 case [case,role,role_attr,hipero,sel_prefs] (fun l -> (case,role,role_attr,hipero,sel_prefs) :: l) in
369 369 StringMap.add map lemma map2)
370 370 (* "przeciwko","dat","Dat";
371 371 "przeciw","dat","Dat";
... ... @@ -377,10 +377,10 @@ let prep_roles = Xlist.fold [ (* lemma,case,role,role_attr,meaning/hipero,sel_pr
377 377 let assign_prep_semantics lemma cases t =
378 378 try
379 379 let map = StringMap.find prep_roles lemma in
380   - Xlist.map cases (fun case ->
381   - {t with semantics=(try PrepSemantics (StringMap.find map case) with Not_found -> Normal);
382   - token=Lemma(lemma,"prep",[[[case]]])})
383   - with Not_found -> [t]
  380 + let l = List.flatten (Xlist.map cases (fun case ->
  381 + try StringMap.find map case with Not_found -> [])) in
  382 + if l = [] then Normal else PrepSemantics l
  383 + with Not_found -> Normal
384 384  
385 385 let subst_special_lexemes = Xlist.fold [
386 386 "jutro", ["indexical"];(*"dzień"*)
... ... @@ -532,21 +532,24 @@ let pron_lexemes = Xlist.fold [
532 532 ] StringMap.empty (fun map (k,v,w) -> StringMap.add map k (SpecialMod(v,w)))*)
533 533  
534 534 (* UWAGA: przy przetwarzaniu danych zdezambiguowanych ta procedura nie zmienia liczby tokenów *)
535   -let assign_semantics paths =
536   - List.rev (List.flatten (Xlist.rev_map paths (fun t ->
537   - match t.token with
538   - Lemma(lemma,"subst",_) -> [{t with lroles=(try StringMap.find subst_inst_roles lemma with Not_found -> "",""); semantics=try StringMap.find subst_special_lexemes lemma with Not_found -> Normal}]
539   - | Lemma(lemma,"depr",_) -> [{t with semantics=try StringMap.find subst_special_lexemes lemma with Not_found -> Normal}]
540   - | Lemma(lemma,"adj",_) -> [{t with lroles=(try StringMap.find adj_roles lemma with Not_found -> "",""); semantics=try StringMap.find adj_special_lexemes lemma with Not_found -> Normal}]
541   - | Lemma(lemma,"adjc",_) -> [{t with lroles=(try StringMap.find adj_roles lemma with Not_found -> "",""); semantics=try StringMap.find adj_special_lexemes lemma with Not_found -> Normal}]
542   - | Lemma(lemma,"adjp",_) -> [{t with lroles=(try StringMap.find adj_roles lemma with Not_found -> "",""); semantics=try StringMap.find adj_special_lexemes lemma with Not_found -> Normal}]
543   - | Lemma(lemma,"adv",_) -> [{t with lroles=(try StringMap.find adv_roles lemma with Not_found -> "",""); semantics=try StringMap.find adv_special_lexemes lemma with Not_found -> Normal}]
544   - | Lemma(lemma,"qub",_) -> [{t with lroles=(try StringMap.find qub_roles lemma with Not_found -> "",""); semantics=try StringMap.find qub_special_lexemes lemma with Not_found -> Normal}]
  535 +let assign_semantics tokens lex_sems group =
  536 + Xlist.iter group (fun id ->
  537 + let token = (ExtArray.get tokens id).token in
  538 + let t = ExtArray.get lex_sems id in
  539 + let t = match token with
  540 + Lemma(lemma,"subst",_) -> {t with lroles=(try StringMap.find subst_inst_roles lemma with Not_found -> "",""); semantics=try StringMap.find subst_special_lexemes lemma with Not_found -> Normal}
  541 + | Lemma(lemma,"depr",_) -> {t with semantics=try StringMap.find subst_special_lexemes lemma with Not_found -> Normal}
  542 + | Lemma(lemma,"adj",_) -> {t with lroles=(try StringMap.find adj_roles lemma with Not_found -> "",""); semantics=try StringMap.find adj_special_lexemes lemma with Not_found -> Normal}
  543 + | Lemma(lemma,"adjc",_) -> {t with lroles=(try StringMap.find adj_roles lemma with Not_found -> "",""); semantics=try StringMap.find adj_special_lexemes lemma with Not_found -> Normal}
  544 + | Lemma(lemma,"adjp",_) -> {t with lroles=(try StringMap.find adj_roles lemma with Not_found -> "",""); semantics=try StringMap.find adj_special_lexemes lemma with Not_found -> Normal}
  545 + | Lemma(lemma,"adv",_) -> {t with lroles=(try StringMap.find adv_roles lemma with Not_found -> "",""); semantics=try StringMap.find adv_special_lexemes lemma with Not_found -> Normal}
  546 + | Lemma(lemma,"qub",_) -> {t with lroles=(try StringMap.find qub_roles lemma with Not_found -> "",""); semantics=try StringMap.find qub_special_lexemes lemma with Not_found -> Normal}
545 547 (* | Lemma(lemma,"num",_) -> [{t with semantics=try StringMap.find num_lexemes lemma with Not_found -> Normal}] *)
546   - | Lemma(lemma,"ppron12",_) -> [{t with semantics=try StringMap.find pron_lexemes lemma with Not_found -> Normal}]
547   - | Lemma(lemma,"ppron3",_) -> [{t with semantics=try StringMap.find pron_lexemes lemma with Not_found -> Normal}]
548   - | Lemma(lemma,"siebie",_) -> [{t with semantics=try StringMap.find pron_lexemes lemma with Not_found -> Normal}]
549   - | Lemma(lemma,"prep",l) -> Xlist.fold l [] (fun l -> function cases :: _ -> assign_prep_semantics lemma cases t @ l | [] -> l)
550   - | _ -> [t]
551   - )))
552   -*)
  548 + | Lemma(lemma,"ppron12",_) -> {t with semantics=try StringMap.find pron_lexemes lemma with Not_found -> Normal}
  549 + | Lemma(lemma,"ppron3",_) -> {t with semantics=try StringMap.find pron_lexemes lemma with Not_found -> Normal}
  550 + | Lemma(lemma,"siebie",_) -> {t with semantics=try StringMap.find pron_lexemes lemma with Not_found -> Normal}
  551 + | Lemma(lemma,"prep",l) ->
  552 + let cases = Xlist.fold l StringSet.empty (fun set -> function cases :: _ -> Xlist.fold cases set StringSet.add | _ -> set) in
  553 + {t with semantics=assign_prep_semantics lemma (StringSet.to_list cases) t}
  554 + | _ -> t in
  555 + ExtArray.set lex_sems id t)
... ...
lexSemantics/ENIAMlexSemanticsTypes.ml
... ... @@ -33,7 +33,7 @@ type semantics =
33 33 | Special of string list
34 34 (* | SpecialNoun of type_arg list * type_term
35 35 | SpecialMod of string * (type_arg list * type_term)*)
36   - | PrepSemantics of (string * string * StringSet.t * string list) list (* role,role_attr,hipero,sel_prefs *)
  36 + | PrepSemantics of (string * string * string * StringSet.t * string list) list (* case,role,role_attr,hipero,sel_prefs *)
37 37  
38 38 type lex_sem = {
39 39 e: labels;
... ...
lexSemantics/eniam-lexSemantics-1.0.tar.bz2 0 → 100644
No preview for this file type
lexSemantics/makefile
... ... @@ -3,7 +3,7 @@ OCAMLOPT=ocamlopt
3 3 OCAMLDEP=ocamldep
4 4 INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I +eniam
5 5 OCAMLFLAGS=$(INCLUDES) -g
6   -OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa eniam-plWordnet.cmxa eniam-walenty.cmxa #eniam-lexSemantics.cmxa
  6 +OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa eniam-plWordnet.cmxa eniam-walenty.cmxa eniam-lexSemantics.cmxa
7 7 INSTALLDIR=`ocamlc -where`/eniam
8 8  
9 9 SOURCES= ENIAMlexSemanticsTypes.ml ENIAMlexSemanticsData.ml ENIAMlexSemantics.ml
... ... @@ -27,7 +27,7 @@ eniam-lexSemantics.cmxa: $(SOURCES)
27 27 ocamlopt -linkall -a -o eniam-lexSemantics.cmxa $(INCLUDES) $^
28 28  
29 29 test: test.ml
30   - $(OCAMLOPT) -o test $(OCAMLOPTFLAGS) $(SOURCES) test.ml
  30 + $(OCAMLOPT) -o test $(OCAMLOPTFLAGS) test.ml
31 31  
32 32 .SUFFIXES: .mll .mly .ml .mli .cmo .cmi .cmx
33 33  
... ...
lexSemantics/test.ml
... ... @@ -21,6 +21,11 @@
21 21 let test_strings = [
22 22 "Szpak frunie zimą.";
23 23 "Kot miauczy w październiku.";
  24 + "Np. Ala.";
  25 + "Kot np. miauczy.";
  26 + "Szpak frunie. Kot miauczy.";
  27 + "Szpak powiedział: „Frunę. Kiszę.”";
  28 + "W XX w. Warszawa.";
24 29 (* "a gdybym miałem";
25 30 "A Gdy Miałem";
26 31 "GDY MIAŁEM";
... ...
pre/makefile
1 1 OCAMLC=ocamlc
2 2 OCAMLOPT=ocamlopt
3 3 OCAMLDEP=ocamldep
4   -INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I +eniam -I ../morphology -I ../parser -I ../corpora
  4 +INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I +eniam
5 5 OCAMLFLAGS=$(INCLUDES) -g
6   -OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-plWordnet.cmxa eniam-walenty.cmxa
  6 +OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa eniam-plWordnet.cmxa eniam-walenty.cmxa eniam-integration.cmxa eniam-lexSemantics.cmxa
7 7 INSTALLDIR=`ocamlc -where`
8 8  
9   -WAL= paths.ml preTypes.ml
10   -PRE= preSemantics.ml ../corpora/CONLL.ml preProcessing.ml
  9 +WAL= paths.ml
  10 +PRE= preProcessing.ml
11 11  
12 12 all:
13 13 $(OCAMLOPT) -o pre $(OCAMLOPTFLAGS) $(WAL) $(PRE)
... ... @@ -33,4 +33,4 @@ all:
33 33 $(OCAMLOPT) $(OCAMLOPTFLAGS) -c $<
34 34  
35 35 clean:
36   - rm -f *~ *.cm[oix] *.o pre concraft_test
  36 + rm -f *~ *.cm[oix] *.o pre
... ...
pre/preProcessing.ml
... ... @@ -17,355 +17,7 @@
17 17 * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 18 *)
19 19  
20   -open Xstd
21   -open PreTypes
22   -
23   -(* uwagi:
24   -jak przetwarzać num:comp
25   -czy rzeczownik niepoliczalny w liczbie mnogiej jest nadal niepoliczalny np. "Wody szumią."
26   -trzeba zrobić słownik mwe, i nazw własnych
27   -trzeba zweryfikować słownik niepoliczalnych
28   -przetwarzanie liczebników złożonych np dwadzieścia jeden, jedna druga
29   -*)
30   -
31   -
32   -
33   -(**********************************************************************************)
34   -
35   -let proper_names =
36   - let l = Str.split_delim (Str.regexp "\n") (File.load_file Paths.proper_names_filename) in
37   - let l2 = Str.split_delim (Str.regexp "\n") (File.load_file Paths.proper_names_filename2) in
38   - Xlist.fold (l2 @ l) StringMap.empty (fun proper line ->
39   - if String.length line = 0 then proper else
40   - if String.get line 0 = '#' then proper else
41   - match Str.split_delim (Str.regexp "\t") line with
42   - [lemma; types] ->
43   - let types = Str.split (Str.regexp "|") types in
44   - StringMap.add_inc proper lemma types (fun types2 -> types @ types2)
45   - | _ -> failwith ("proper_names: " ^ line))
46   -
47   -let remove l s =
48   - Xlist.fold l [] (fun l t ->
49   - if s = t then l else t :: l)
50   -
51   -let find_proper_names paths =
52   - List.rev (Xlist.rev_map paths (fun t ->
53   - match t.token with
54   - Lemma(lemma,pos,interp) ->
55   - if StringMap.mem proper_names lemma then
56   - {t with token=Proper(lemma,pos,interp,StringMap.find proper_names lemma);
57   - attrs=remove t.attrs "notvalidated proper"}
58   - else
59   - if Xlist.mem t.attrs "notvalidated proper" then
60   - {t with token=Proper(lemma,pos,interp,[])}
61   - else t
62   - | _ -> t))
63   -
64   -
65   -let assign_valence paths =
66   - let lexemes = Xlist.fold paths StringMap.empty (fun lexemes t ->
67   - match t.token with
68   - Lemma(lemma,pos,_) ->
69   - StringMap.add_inc lexemes lemma (StringSet.singleton pos) (fun set -> StringSet.add set pos)
70   - | Proper(lemma,pos,_,_) ->
71   - let pos = match pos with
72   - "subst" -> "psubst"
73   - | "depr" -> "pdepr"
74   - | _ -> pos (*failwith ("assign_valence: Proper " ^ pos ^ " " ^ lemma)*) in
75   - StringMap.add_inc lexemes lemma (StringSet.singleton pos) (fun set -> StringSet.add set pos) (* nazwy własne mają przypisywaną domyślną walencję rzeczowników *)
76   - | _ -> lexemes) in
77   - let valence = WalFrames.find_frames lexemes in
78   - List.rev (Xlist.rev_map paths (fun t ->
79   - match t.token with
80   - Lemma(lemma,pos,_) -> {t with valence=try Xlist.rev_map (StringMap.find (StringMap.find valence lemma) pos) (fun frame -> 0,frame) with Not_found -> []}
81   - | Proper(lemma,pos,interp,_) -> {t with valence=(try Xlist.rev_map (StringMap.find (StringMap.find valence lemma)
82   - (if pos = "subst" || pos = "depr" then "p" ^ pos else pos)) (fun frame -> 0,frame) with Not_found -> [](*failwith ("assign_valence: Proper(" ^ lemma ^ "," ^ pos ^ ")")*));
83   - token=Lemma(lemma,pos,interp)}
84   - | _ -> t))
85   -
86   -(**********************************************************************************)
87   -
88   -(* let prepare_indexes (paths,_) =
89   - let set = Xlist.fold paths IntSet.empty (fun set t ->
90   - IntSet.add (IntSet.add set t.beg) t.next) in
91   - let map,last = Xlist.fold (Xlist.sort (IntSet.to_list set) compare) (IntMap.empty,0) (fun (map,n) x ->
92   - IntMap.add map x n, n+1) in
93   - List.rev (Xlist.rev_map paths (fun t ->
94   - {t with lnode=IntMap.find map t.beg; rnode=IntMap.find map t.next})), last - 1 *)
95   -
96   -let get_prefs_schema prefs schema =
97   - Xlist.fold schema prefs (fun prefs t ->
98   - Xlist.fold t.WalTypes.sel_prefs prefs StringSet.add)
99   -
100   -let map_prefs_schema senses schema =
101   - Xlist.map schema (fun t ->
102   - if Xlist.mem t.WalTypes.morfs (WalTypes.Phrase WalTypes.Pro) || Xlist.mem t.WalTypes.morfs (WalTypes.Phrase WalTypes.ProNG) then t else
103   - {t with WalTypes.sel_prefs = Xlist.fold t.WalTypes.sel_prefs [] (fun l s ->
104   - if StringSet.mem senses s then s :: l else l)})
105   -
106   -let disambiguate_senses paths =
107   - let prefs = Xlist.fold paths (StringSet.singleton "ALL") (fun prefs t ->
108   - Xlist.fold t.valence prefs (fun prefs -> function
109   - _,WalTypes.Frame(_,schema) -> get_prefs_schema prefs schema
110   - | _,WalTypes.LexFrame(_,_,_,schema) -> get_prefs_schema prefs schema
111   - | _,WalTypes.ComprepFrame(_,_,_,schema) -> get_prefs_schema prefs schema)) in
112   - let hipero = Xlist.fold paths (StringSet.singleton "ALL") (fun hipero t ->
113   - Xlist.fold t.senses hipero (fun hipero (_,l,_) ->
114   - Xlist.fold l hipero StringSet.add)) in
115   - let senses = StringSet.intersection prefs hipero in
116   - let is_zero = StringSet.mem hipero "0" in
117   - let senses = if is_zero then StringSet.add senses "0" else senses in
118   - Xlist.map paths (fun t ->
119   - {t with valence = if is_zero then t.valence else
120   - Xlist.map t.valence (function
121   - n,WalTypes.Frame(a,schema) -> n,WalTypes.Frame(a,map_prefs_schema senses schema)
122   - | n,WalTypes.LexFrame(s,p,r,schema) -> n,WalTypes.LexFrame(s,p,r,map_prefs_schema senses schema)
123   - | n,WalTypes.ComprepFrame(s,p,r,schema) -> n,WalTypes.ComprepFrame(s,p,r,map_prefs_schema senses schema));
124   - senses = Xlist.map t.senses (fun (s,l,w) ->
125   - s, List.rev (Xlist.fold l [] (fun l s -> if StringSet.mem senses s then s :: l else l)),w)})
126   -
127   -(*let single_sense (paths,last) =
128   - List.rev (Xlist.rev_map paths (fun t ->
129   - let sense =
130   - if t.senses = [] then [] else
131   - [Xlist.fold t.senses ("",[],-.max_float) (fun (max_meaning,max_hipero,max_weight) (meaning,hipero,weight) ->
132   - if max_weight >= weight then max_meaning,max_hipero,max_weight else meaning,hipero,weight)] in
133   - {t with senses=sense})), last*)
134   -
135   -open WalTypes
136   -
137   -(*let single_schema schemata =
138   - let map = Xlist.fold schemata StringMap.empty (fun map schema ->
139   - let t = WalStringOf.schema (List.sort compare (Xlist.fold schema [] (fun l s ->
140   - if s.gf <> ARG && s.gf <> ADJUNCT then {s with role=""; role_attr=""; sel_prefs=[]} :: l else
141   - if s.cr <> [] || s.ce <> [] then {s with role=""; role_attr=""; sel_prefs=[]} :: l else l))) in
142   - StringMap.add_inc map t [schema] (fun l -> schema :: l)) in
143   - StringMap.fold map [] (fun l _ schemata ->
144   - let map = Xlist.fold schemata StringMap.empty (fun map schema ->
145   - Xlist.fold schema map (fun map s ->
146   - let t = WalStringOf.schema [{s with role=""; role_attr=""; sel_prefs=[]}] in
147   - StringMap.add_inc map t [s] (fun l -> s :: l))) in
148   - let schema = StringMap.fold map [] (fun schema _ l ->
149   - let s = List.hd l in
150   - {s with sel_prefs=Xlist.fold s.sel_prefs [] (fun l t -> if t = "0" || t = "T" then t :: l else l)} :: schema) in
151   - schema :: l)*)
152   -
153   -let remove_meaning = function
154   - DefaultAtrs(m,r,o,neg,p,a) -> DefaultAtrs([],r,o,neg,p,a)
155   - | EmptyAtrs m -> EmptyAtrs []
156   - | NounAtrs(m,nsyn,s(*,typ*)) -> NounAtrs([],nsyn,s(*,typ*))
157   - | AdjAtrs(m,c,adjsyn(*,adjsem,typ*)) -> AdjAtrs([],c,adjsyn(*,adjsem,typ*))
158   - | PersAtrs(m,le,neg,mo,t,au,a) -> PersAtrs([],le,neg,mo,t,au,a)
159   - | GerAtrs(m,le,neg,a) -> GerAtrs([],le,neg,a)
160   - | NonPersAtrs(m,le,role,role_attr,neg,a) -> NonPersAtrs([],le,role,role_attr,neg,a)
161   - | _ -> failwith "remove_meaning"
162   -
163   -
164   -(*let single_frame (paths,last) =
165   - List.rev (Xlist.rev_map paths (fun t ->
166   - let lex_frames,frames = Xlist.fold t.valence ([],StringMap.empty) (fun (lex_frames,frames) -> function
167   - Frame(attrs,schema) ->
168   - let attrs = remove_meaning attrs in
169   - lex_frames, StringMap.add_inc frames (WalStringOf.frame_atrs attrs) (attrs,[schema]) (fun (_,l) -> attrs, schema :: l)
170   - | frame -> frame :: lex_frames, frames) in
171   - let frames = StringMap.fold frames lex_frames (fun frames _ (attrs,schemata) ->
172   - Xlist.fold (single_schema schemata) frames (fun frames frame -> Frame(attrs,frame) :: frames)) in
173   - {t with valence=frames})), last *)
174   -
175   -let simplify_position_verb l = function (* FIXME: dodać czyszczenie E Pro *)
176   - Phrase(NP(Case "dat")) -> l
177   - | Phrase(NP(Case "inst")) -> l
178   - | Phrase(PrepNP _) -> l
179   - | Phrase(PrepAdjP _) -> l
180   - | Phrase(NumP (Case "dat")) -> l
181   - | Phrase(NumP (Case "inst")) -> l
182   - | Phrase(PrepNumP _) -> l
183   - | Phrase(ComprepNP _) -> l
184   - | Phrase(ComparNP _) -> l
185   - | Phrase(ComparPP _) -> l
186   - | Phrase(IP) -> l
187   - | Phrase(CP _) -> l
188   - | Phrase(NCP(Case "dat",_,_)) -> l
189   - | Phrase(NCP(Case "inst",_,_)) -> l
190   - | Phrase(PrepNCP _) -> l
191   -(* | Phrase(PadvP) -> l *)
192   - | Phrase(AdvP) -> l
193   - | Phrase(PrepP) -> l
194   - | Phrase(Or) -> l
195   - | Phrase(Qub) -> l
196   - | Phrase(Adja) -> l
197   - | Phrase(Inclusion) -> l
198   - | Phrase Pro -> Phrase Null :: l
199   - | t -> t :: l
200   -
201   -let simplify_position_noun l = function
202   - Phrase(NP(Case "gen")) -> l
203   - | Phrase(NP(Case "nom")) -> l
204   - | Phrase(NP(CaseAgr)) -> l
205   - | Phrase(PrepNP _) -> l
206   - | Phrase(AdjP AllAgr) -> l
207   - | Phrase(NumP (Case "gen")) -> l
208   - | Phrase(NumP (Case "nom")) -> l
209   - | Phrase(NumP (CaseAgr)) -> l
210   - | Phrase(PrepNumP _) -> l
211   - | Phrase(ComprepNP _) -> l
212   - | Phrase(ComparNP _) -> l
213   - | Phrase(ComparPP _) -> l
214   - | Phrase(IP) -> l
215   - | Phrase(NCP(Case "gen",_,_)) -> l
216   - | Phrase(PrepNCP _) -> l
217   - | Phrase(PrepP) -> l
218   - | Phrase(Qub) -> l
219   - | Phrase(Adja) -> l
220   - | Phrase(Inclusion) -> l
221   - | Phrase Pro -> Phrase Null :: l
222   - | t -> t :: l
223   -
224   -let simplify_position_adj l = function
225   - Phrase(AdvP) -> l
226   - | t -> t :: l
227   -
228   -let simplify_position_adv l = function
229   - Phrase(AdvP) -> l
230   - | t -> t :: l
231   -
232   -
233   -let simplify_position pos l s =
234   - let morfs = match pos with
235   - "verb" -> List.rev (Xlist.fold s.morfs [] simplify_position_verb)
236   - | "noun" -> List.rev (Xlist.fold s.morfs [] simplify_position_noun)
237   - | "adj" -> List.rev (Xlist.fold s.morfs [] simplify_position_adj)
238   - | "adv" -> List.rev (Xlist.fold s.morfs [] simplify_position_adv)
239   - | _ -> s.morfs in
240   - match morfs with
241   - [] -> l
242   - | [Phrase Null] -> l
243   - | _ -> {s with morfs=morfs} :: l
244   -
245   -let simplify_schemata pos schemata =
246   - let schemata = Xlist.fold schemata StringMap.empty (fun schemata (schema,frame) ->
247   - let schema = List.sort compare (Xlist.fold schema [] (fun l s ->
248   - let s = {s with role=""; role_attr=""; sel_prefs=[]; cr=[]; ce=[]; morfs=List.sort compare s.morfs} in
249   - if s.gf <> ARG && s.gf <> ADJUNCT then s :: l else
250   -(* if s.cr <> [] || s.ce <> [] then s :: l else *)
251   - simplify_position pos l s)) in
252   - StringMap.add_inc schemata (WalStringOf.schema schema) (schema,[frame]) (fun (_,frames) -> schema, frame :: frames)) in
253   - StringMap.fold schemata [] (fun l _ s -> s :: l)
254   -
255   -(* FIXME: problem ComprepNP i PrepNCP *)
256   -(* FIXME: problem gdy ten sam token występuje w kilku ścieżkach *)
257   -let generate_verb_prep_adjuncts preps =
258   - Xlist.map preps (fun (lemma,case) -> WalFrames.verb_prep_adjunct_schema_field lemma case)
259   -
260   -let generate_verb_comprep_adjuncts compreps =
261   - Xlist.map compreps (fun lemma -> WalFrames.verb_comprep_adjunct_schema_field lemma)
262   -
263   -let generate_verb_compar_adjuncts compars =
264   - Xlist.map compars (fun lemma -> WalFrames.verb_compar_adjunct_schema_field lemma)
265   -
266   -let generate_noun_prep_adjuncts preps =
267   - WalFrames.noun_prep_adjunct_schema_field preps
268   -
269   -let generate_noun_compar_adjuncts compars =
270   - WalFrames.noun_compar_adjunct_schema_field compars
271   -
272   -let generate_adj_compar_adjuncts compars =
273   - WalFrames.noun_compar_adjunct_schema_field compars
274   -
275   -let compars = StringSet.of_list ["jak";"jako";"niż";"niczym";"niby";"co"]
276   -
277   -let generate_prep_adjunct_tokens paths =
278   - let map = Xlist.fold paths StringMap.empty (fun map t ->
279   - match t.token with
280   - Lemma(lemma,"prep",interp) ->
281   - let map = if lemma = "po" then StringMap.add map "po:postp" ("po","postp") else map in
282   - if StringSet.mem compars lemma then map else
283   - Xlist.fold interp map (fun map -> function
284   - [cases] -> Xlist.fold cases map (fun map case -> StringMap.add map (lemma ^ ":" ^ case) (lemma,case))
285   - | [cases;_] -> Xlist.fold cases map (fun map case -> StringMap.add map (lemma ^ ":" ^ case) (lemma,case))
286   - | _ -> map)
287   - | _ -> map) in
288   - StringMap.fold map [] (fun l _ v -> v :: l)
289   -
290   -let generate_comprep_adjunct_tokens paths =
291   - let lemmas = Xlist.fold paths StringSet.empty (fun lemmas t ->
292   - match t.token with
293   - Lemma(lemma,_,_) -> StringSet.add lemmas lemma
294   - | _ -> lemmas) in
295   - StringMap.fold WalFrames.comprep_reqs [] (fun compreps comprep reqs ->
296   - let b = Xlist.fold reqs true (fun b s -> b && StringSet.mem lemmas s) in
297   - if b then comprep :: compreps else compreps)
298   -
299   -let generate_compar_adjunct_tokens paths =
300   - let set = Xlist.fold paths StringSet.empty (fun set t ->
301   - match t.token with
302   - Lemma(lemma,"prep",interp) ->
303   - if not (StringSet.mem compars lemma) then set else
304   - StringSet.add set lemma
305   - | _ -> set) in
306   - StringSet.to_list set
307   -
308   -let is_measure = function
309   - NounAtrs(_,_,Common "measure") -> true
310   - | _ -> false
311   -
312   -let assign_simplified_valence paths =
313   - let preps = generate_prep_adjunct_tokens paths in
314   - let compreps = generate_comprep_adjunct_tokens paths in
315   - let compars = generate_compar_adjunct_tokens paths in
316   - let verb_prep_adjuncts = generate_verb_prep_adjuncts preps in
317   - let verb_comprep_adjuncts = generate_verb_comprep_adjuncts compreps in
318   - let verb_compar_adjuncts = generate_verb_compar_adjuncts compars in
319   - let noun_prep_adjuncts = generate_noun_prep_adjuncts preps compreps in
320   - let noun_compar_adjuncts = generate_noun_compar_adjuncts compars in
321   - let adj_compar_adjuncts = generate_adj_compar_adjuncts compars in
322   - let verb_adjuncts = WalFrames.verb_adjuncts_simp @ verb_prep_adjuncts @ verb_comprep_adjuncts @ verb_compar_adjuncts in
323   - let noun_adjuncts = WalFrames.noun_adjuncts_simp @ [noun_prep_adjuncts] @ [noun_compar_adjuncts] in
324   - let noun_measure_adjuncts = WalFrames.noun_measure_adjuncts_simp @ [noun_prep_adjuncts] @ [noun_compar_adjuncts] in
325   - let adj_adjuncts = WalFrames.adj_adjuncts_simp @ [adj_compar_adjuncts] in
326   - let adv_adjuncts = WalFrames.adv_adjuncts_simp @ [adj_compar_adjuncts] in
327   - List.rev (Xlist.rev_map paths (fun t ->
328   - let pos = match t.token with
329   - Lemma(_,pos,_) -> WalFrames.simplify_pos pos
330   - | _ -> "" in
331   - let lex_frames,frames = Xlist.fold t.valence ([],StringMap.empty) (fun (lex_frames,frames) -> function
332   - _,(Frame(attrs,schema) as frame) ->
333   - let attrs = remove_meaning attrs in
334   - lex_frames, StringMap.add_inc frames (WalStringOf.frame_atrs attrs) (attrs,[schema,frame]) (fun (_,l) -> attrs, (schema,frame) :: l)
335   - | _,frame -> frame :: lex_frames, frames) in
336   - let simp_frames,full_frames,n = Xlist.fold lex_frames ([],[],1) (fun (simp_frames,full_frames,n) frame ->
337   - (n,frame) :: simp_frames, (n,frame) :: full_frames, n+1) in
338   - let simp_frames,full_frames,_ = StringMap.fold frames (simp_frames,full_frames,n) (fun (simp_frames,full_frames,n) _ (attrs,schemata) ->
339   - Xlist.fold (simplify_schemata pos schemata) (simp_frames,full_frames,n) (fun (simp_frames,full_frames,n) (schema,frames) ->
340   - let schema = match pos with
341   - "verb" -> schema @ verb_adjuncts
342   - | "noun" -> schema @ (if is_measure attrs then noun_measure_adjuncts else noun_adjuncts)
343   - | "adj" -> schema @ adj_adjuncts
344   - | "adv" -> schema @ adv_adjuncts
345   - | _ -> schema in
346   - (n,Frame(attrs,schema)) :: simp_frames,
347   - Xlist.fold frames full_frames (fun full_frames frame -> (n,frame) :: full_frames),
348   - n+1)) in
349   - {t with simple_valence=simp_frames; valence=full_frames}))
350   -
351   -(* FIXME: dodać do walencji preferencje selekcyjne nadrzędników symboli: dzień, godzina, rysunek itp. *)
352   -(* FIXME: sprawdzić czy walencja nazw własnych jest dobrze zrobiona. *)
353   -
354   -(* let first_id = 1 (* id=0 jest zarezerwowane dla pro; FIXME: czy to jest jeszcze aktualne? *)
355   -
356   -let add_ids (paths,last) next_id =
357   - let paths,next_id = Xlist.fold ((*List.rev*) paths) ([],next_id) (fun (paths,id) t ->
358   - {t with id=id} :: paths, id+1) in
359   - (paths,last),next_id *)
360   -
361   -let assign_senses paths = (* FIXME: sensy zawierające 'się' *)
362   - List.rev (Xlist.rev_map paths (fun t ->
363   - match t.token with
364   - Lemma(lemma,pos,_) -> {t with senses=ENIAMplWordnet.find_senses lemma pos}
365   - | Proper(_,_,_,senses) -> {t with senses=ENIAMplWordnet.find_proper_senses senses}
366   - | _ -> t))
367   -
368   -
  20 +(*
369 21 let parse query =
370 22 (* print_endline "a1"; *)
371 23 let l = Xunicode.classified_chars_of_utf8_string query in
... ... @@ -442,107 +94,12 @@ let parse_conll tokens dep_paths = (* FIXME: sprawdzić, czy zachowana jest kole
442 94 conll_id + 1) in
443 95 ()
444 96  
445   -(*
446   -UWAGA: Aby korzytać z concrafta trzeba najpierw postawić serwer wpisując z linii poleceń:
447   -concraft-pl server --inmodel ../concraft/nkjp-model-0.2.gz
448   -*)
449   -
450   -let read_whole_channel c =
451   - let r = ref [] in
452   - try
453   - while true do
454   - r := (input_line c) :: !r
455   - done;
456   - !r
457   - with End_of_file -> List.rev (!r)
458   -
459   -let rec process_concraft_result orth lemma interp others rev = function
460   - [] -> List.rev ((orth,(lemma,interp) :: others) :: rev)
461   - | "" :: l -> process_concraft_result orth lemma interp others rev l
462   - | line :: l ->
463   - (match Xstring.split_delim "\t" line with
464   - [orth2;s] when s = "none" || s = "space" ->
465   - if orth = "" then process_concraft_result orth2 lemma interp others rev l
466   - else process_concraft_result orth2 "" "" [] ((orth,(lemma,interp) :: others) :: rev) l
467   - | ["";lemma2;interp2] -> process_concraft_result orth lemma interp ((lemma2,interp2) :: others) rev l
468   - | ["";lemma;interp;"disamb"] -> process_concraft_result orth lemma interp others rev l
469   - | _ -> failwith ("process_concraft_result: " ^ line))
470   -
471   -let concraft_parse s =
472   - let concraft_in, concraft_out, concraft_err =
473   - Unix.open_process_full ("echo \"" ^ s ^ "\" | concraft-pl client")
474   - [|"PATH=" ^ Sys.getenv "PATH"; "LANG=en_GB.UTF-8"|] in
475   - let err_msg = String.concat "\n" (read_whole_channel concraft_err) in
476   - let result = read_whole_channel concraft_in in
477   - if err_msg <> "" then failwith err_msg else
478   - process_concraft_result "" "" "" [] [] result
479   -
480   -(*let rec load_concraft_sentence white orth rev ic =
481   - (* print_endline "load_concraft_sentence 1"; *)
482   - (* print_endline ("concraft error message: " ^ input_line concraft_err); *)
483   - let s = input_line ic in
484   - (* print_endline ("load_concraft_sentence: " ^ s); *)
485   - if s = "" then List.rev rev else
486   - match Xstring.split_delim "\t" s with
487   - [""; lemma; interp; "disamb"] -> load_concraft_sentence "" "" ((white,orth,lemma,interp) :: rev) ic
488   - | [""; lemma; interp] -> load_concraft_sentence white orth rev ic
489   - | [orth; white] -> load_concraft_sentence white orth rev ic
490   - | _ -> failwith ("load_concraft_sentence: " ^ s)*)
491   -
492   -let make_token (orth,l) =
493   - if l = [] then failwith "make_token 1" else
494   - let lemma,interp = List.hd l in
495   - let cat,interp = match Xstring.split ":" interp with
496   - cat :: l -> cat, [Xlist.map l (fun tag -> [tag])]
497   - | _ -> failwith ("make_token 2: " ^ orth ^ " " ^ lemma ^ " " ^ interp) in
498   - {empty_token with orth = orth; token = Lemma(lemma,cat,interp)}
499   -
500   -let parse_mate tokens pbeg s =
501   - (* print_endline ("parse_mate: " ^ s); *)
502   - (* Printf.fprintf concraft_out "%s\n\n%!" s;
503   - let l = load_concraft_sentence "" "" [] concraft_in in *)
504   - let l = concraft_parse s in
505   - let l = Xlist.map l make_token in
506   - let l = {empty_token with token = Interp "<conll_root>"} :: l in
507   - let l = Xlist.map l (fun t -> ExtArray.add tokens t,-1,"") in
508   - let _ = CONLL.establish_for_token pbeg s tokens (List.tl l) in
509   - let dep_paths = Array.of_list l in
510   - parse_conll tokens dep_paths;
511   - dep_paths
512   -
513   -let rec parse_mate_sentence tokens mode pbeg = function
514   - RawSentence s -> if mode <> Mate || not Paths.config.Paths.concraft_enabled then RawSentence s else DepSentence (parse_mate tokens pbeg s)
515   - | StructSentence(paths,last) -> StructSentence(paths,last)
516   - | DepSentence(paths) -> DepSentence(paths)
517   - | QuotedSentences sentences ->
518   - QuotedSentences(Xlist.map sentences (fun p ->
519   - {pid=p.PreTypes.pid; pbeg=p.PreTypes.pbeg; plen=p.PreTypes.plen; pnext=p.PreTypes.pnext; pfile_prefix=p.PreTypes.pfile_prefix;
520   - psentence=parse_mate_sentence tokens mode pbeg p.PreTypes.psentence}))
521   - | AltSentence l -> AltSentence(Xlist.map l (fun (mode,sentence) ->
522   - mode, parse_mate_sentence tokens mode pbeg sentence))
523   -
524   -let parse_mate_sentences tokens sentences =
525   - Xlist.map sentences (fun p ->
526   - {p with psentence=parse_mate_sentence tokens Struct p.pbeg p.psentence})
527   -
528 97 let parse_text = function
529 98 RawText query ->
530   - (* print_endline ("parse_text: " ^ query); *)
531   - let tokens = ExtArray.make 100 empty_token in
532   - let _ = ExtArray.add tokens empty_token in (* id=0 jest zarezerwowane dla pro; FIXME: czy to jest jeszcze aktualne? *)
533   - let paragraphs = Xstring.split "\n\\|\r" query in
534   - let paragraphs = List.rev (Xlist.fold paragraphs [] (fun l -> function "" -> l | s -> s :: l)) in
535   - let n = if Xlist.size paragraphs = 1 then 0 else 1 in
536   - let paragraphs,_ = Xlist.fold paragraphs ([],n) (fun (paragraphs,n) paragraph ->
537   - let paths = parse paragraph in
538   - (* print_endline "parse_text 1"; *)
539   - let pid = if n = 0 then "" else string_of_int n ^ "_" in
540   - let sentences = PreSentences.split_into_sentences pid paragraph tokens paths in
541   - (* print_endline "parse_text 2"; *)
542   - let sentences = parse_mate_sentences tokens sentences in
543   - (* print_endline "parse_text 3"; *)
544   - (AltParagraph[Raw,RawParagraph paragraph; Struct,StructParagraph sentences]) :: paragraphs, n+1) in
545   - AltText[Raw,RawText query; Struct,StructText(List.rev paragraphs, tokens)]
  99 + let text,tokens = ENIAMsubsyntax.parse_text query in
  100 + let text = ENIAMpreIntegration.parse_text ENIAMsubsyntaxTypes.Struct tokens text in
  101 + let lex_sems = ENIAMlexSemantics.assign tokens text in
  102 + text,tokens,lex_sems
546 103 | AltText[Raw,RawText query;CONLL,StructText([
547 104 StructParagraph[{psentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence dep_paths]} as p]],tokens)] ->
548 105 parse_conll tokens dep_paths;
... ... @@ -553,28 +110,54 @@ let parse_text = function
553 110 @ if Paths.config.Paths.mate_parser_enabled then [Mate, DepSentence m_dep_paths] else [])}] in
554 111 AltText[Raw,RawText query; Struct, StructText([
555 112 AltParagraph[Raw,RawParagraph query; ENIAM, StructParagraph sentences; CONLL, conll]],tokens)]
  113 + | _ -> failwith "parse_text: not implemented"*)
  114 +
  115 +open ENIAMsubsyntaxTypes
  116 +
  117 +let parse_text = function
  118 + RawText query,_ ->
  119 + let text,tokens = ENIAMsubsyntax.parse_text query in
  120 + let text = ENIAMpreIntegration.parse_text ENIAMsubsyntaxTypes.Struct tokens text in
  121 + let lex_sems = ENIAMlexSemantics.assign tokens text in
  122 + text,tokens,lex_sems
  123 + | AltText[Raw,RawText query;CONLL,StructText[
  124 + StructParagraph[{psentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence dep_paths]} as p]]],tokens ->
  125 + let m_dep_paths = Array.map (fun (id,_,_) -> id,-1,"") dep_paths in
  126 + let conll = StructParagraph[{p with psentence = AltSentence([Raw, RawSentence text; CONLL, DepSentence dep_paths]
  127 + @ if Paths.config.Paths.mate_parser_enabled then [Mate, DepSentence m_dep_paths] else [])}] in
  128 + let paths = ENIAMsubsyntax.parse query in
  129 + let sentences = ENIAMsentences.split_into_sentences "" query tokens paths in
  130 + let text = AltText[Raw,RawText query; Struct, StructText([
  131 + AltParagraph[Raw,RawParagraph query; ENIAM, StructParagraph sentences; CONLL, conll]])] in
  132 + let lex_sems = ENIAMlexSemantics.assign tokens text in
  133 + text,tokens,lex_sems
556 134 | _ -> failwith "parse_text: not implemented"
557 135  
558 136 let rec main_loop in_chan out_chan =
559 137 (* print_endline "main_loop 1"; *)
560   - let query = (Marshal.from_channel in_chan : text) in
  138 + let query = (Marshal.from_channel in_chan : text * ENIAMtokenizerTypes.token_record ExtArray.t) in
561 139 (* print_endline "main_loop 2"; *)
562   - if query = RawText "" then () else (
  140 + if fst query = RawText "" then () else (
563 141 (try
564 142 (* let time0 = Sys.time () in *)
565 143 let utime0 = Unix.gettimeofday () in
566 144 (* print_endline "main_loop 3a"; *)
567   - let text = parse_text query in
  145 + let text,tokens,lex_sems = parse_text query in
568 146 (* print_endline "main_loop 4a"; *)
569 147 (* let time2 = Sys.time () in *)
570 148 let utime2 = Unix.gettimeofday () in
571 149 (* Printf.printf "time=%f utime=%f\n%!" (time2 -. time0) (utime2 -. utime0); *)
572   - Marshal.to_channel out_chan (text(*paths,last,next_id*),"",utime2 -. utime0) [];
  150 + Marshal.to_channel out_chan (text,tokens,lex_sems,"",utime2 -. utime0) [];
573 151 (* print_endline "main_loop 5"; *)
574 152 ()
575 153 with e -> (
576 154 (* print_endline "main_loop 7"; *)
577   - Marshal.to_channel out_chan (RawText ""(*[],0*),Printexc.to_string e,0.) []));
  155 + Marshal.to_channel out_chan (
  156 + RawText "",
  157 + ExtArray.make 1 ENIAMtokenizerTypes.empty_token,
  158 + ExtArray.make 1 ENIAMlexSemanticsTypes.empty_lex_sem,
  159 + Printexc.to_string e,
  160 + 0.) []));
578 161 (* print_endline "main_loop 6"; *)
579 162 flush out_chan;
580 163 main_loop in_chan out_chan)
... ...
pre/preTypes.ml deleted
1   -(*
2   - * ENIAM: Categorial Syntactic-Semantic Parser for Polish
3   - * Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
4   - * Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
5   - *
6   - * This program is free software: you can redistribute it and/or modify
7   - * it under the terms of the GNU General Public License as published by
8   - * the Free Software Foundation, either version 3 of the License, or
9   - * (at your option) any later version.
10   - *
11   - * This program is distributed in the hope that it will be useful,
12   - * but WITHOUT ANY WARRANTY; without even the implied warranty of
13   - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14   - * GNU General Public License for more details.
15   - *
16   - * You should have received a copy of the GNU General Public License
17   - * along with this program. If not, see <http://www.gnu.org/licenses/>.
18   - *)
19   -
20   -open Xstd
21   -
22   -(*let single_sense_flag = ref false(*true*)
23   -let single_frame_flag = ref false(*true*)*)
24   -
25   -(*type pos_record = {interp: string list list list; attrs: string list; proper: string list; senses: string list}
26   -
27   -type dict = {lemmas: pos_record StringMap.t StringMap.t; dbeg: int; dlen: int}*)
28   -
29   -(* type selector = Orth of string | Pos of string (*| All *) *)
30   -
31   -(* Długość pojedynczego znaku w tekście *)
32   -let factor = 100
33   -
34   -type labels = {
35   - number: string;
36   - case: string;
37   - gender: string;
38   - person: string;
39   - aspect: string;
40   - }
41   -
42   -
43   -(*type type_arg =
44   - TArg of string
45   - | TWith of type_arg list
46   -
47   -and type_term =
48   - TConst of string * type_arg list
49   - | TMod of type_term * type_term
50   - | TName of string
51   - | TVariant of type_term * type_term*)
52   -
53   -
54   -type semantics =
55   - Normal
56   - | Special of string list
57   -(* | SpecialNoun of type_arg list * type_term
58   - | SpecialMod of string * (type_arg list * type_term)*)
59   - | PrepSemantics of (string * string * StringSet.t * string list) list (* role,role_attr,hipero,sel_prefs *)
60   -
61   -type token =
62   - SmallLetter of string (* orth *)
63   - | CapLetter of string * string (* orth * lowercase *)
64   - | AllSmall of string (* orth *)
65   - | AllCap of string * string * string (* orth * lowercase * all lowercase *)
66   - | FirstCap of string * string * string * string (* orth * all lowercase * first letter uppercase * first letter lowercase *)
67   - | SomeCap of string (* orth *)
68   - | RomanDig of string * string (* value * cat *)
69   - | Interp of string (* orth *)
70   - | Symbol of string (* orth *)
71   - | Dig of string * string (* value * cat *)
72   - | Other2 of string (* orth *)
73   - | Lemma of string * string * string list list list (* lemma * cat * interp *)
74   - | Proper of string * string * string list list list * string list (* lemma * cat * interp * senses *)
75   -(* | Sense of string * string * string list list list * (string * string * string list) list (* lemma * cat * interp * senses *) *)
76   - | Compound of string * token list (* sense * components *)
77   - | Tokens of string * int list (*cat * token id list *)
78   -
79   -(* Tekst reprezentuję jako zbiór obiektów typu token_record zawierających
80   - informacje o poszczególnych tokenach *)
81   -and token_record = {
82   - orth: string; (* sekwencja znaków pierwotnego tekstu składająca się na token *)
83   - corr_orth: string; (* sekwencja znaków pierwotnego tekstu składająca się na token z poprawionymi błędami *)
84   - beg: int; (* pozycja początkowa tokenu względem początku akapitu *)
85   - len: int; (* długość tokenu *)
86   - next: int; (* pozycja początkowa następnego tokenu względem początku akapitu *)
87   - token: token; (* treść tokenu *)
88   - attrs: string list; (* dodatkowe atrybuty *)
89   - weight: float;
90   - e: labels;
91   - valence: (int * ENIAMwalTypes.frame) list;
92   - simple_valence: (int * ENIAMwalTypes.frame) list;
93   - senses: (string * string list * float) list;
94   - lroles: string * string;
95   - semantics: semantics;
96   - }
97   -
98   -(* Tokeny umieszczone są w strukturze danych umożliwiającej efektywne wyszukiwanie ich sekwencji,
99   - struktura danych sama z siebie nie wnosi informacji *)
100   -type tokens =
101   - | Token of token_record
102   - | Variant of tokens list
103   - | Seq of tokens list
104   -
105   -type pat = L | CL | D of string | C of string | S of string | RD of string | O of string
106   -
107   -let empty_labels = {
108   - number="";
109   - case="";
110   - gender="";
111   - person="";
112   - aspect="";
113   - }
114   -
115   -let empty_token = {
116   - orth="";corr_orth="";beg=0;len=0;next=0; token=Symbol ""; weight=0.; e=empty_labels;
117   - attrs=[]; valence=[]; simple_valence=[]; senses=[];
118   - lroles="",""; semantics=Normal}
119   -
120   -type mode =
121   - Raw | Struct | CONLL | ENIAM | Mate | Swigra | POLFIE
122   -
123   -(* warstwy nkjp1m do analizy:
124   -header
125   -text
126   -ann_segmentation
127   -ann_morphosyntax
128   -ann_named
129   -*)
130   -
131   -(* zdania wydobyte na zewnątrz *)
132   -(* struktura ponadzdaniowa przetwarzana przed strukturą zależnościową *)
133   -(* istnieje ryzyko eksplozji interpretacji *)
134   -type sentence =
135   - RawSentence of string
136   - (* | CONLL of conll list *)
137   - | StructSentence of (int * int * int) list * int (* (id * lnode * rnode) list * last *)
138   - | DepSentence of (int * int * string) array (* (id * super * label) conll_id *)
139   - | QuotedSentences of paragraph_record list
140   - (* | NKJP1M of nkjp1m list *)
141   - (* | Skladnica of skladnica_tree *)
142   - | AltSentence of (mode * sentence) list (* string = etykieta np raw, nkjp, krzaki *)
143   -
144   -and paragraph_record = {pid: string; pbeg: int; plen: int; pnext: int; psentence: sentence; pfile_prefix: string} (* beg i len liczone po znakach unicode ( * 100 ???) *)
145   -
146   -and paragraph =
147   - RawParagraph of string
148   - | StructParagraph of paragraph_record list (* zdania *)
149   - | AltParagraph of (mode * paragraph) list
150   -
151   -type text =
152   - RawText of string
153   - | StructText of paragraph list * token_record ExtArray.t (* akapity * tokeny *)
154   - | AltText of (mode * text) list
subsyntax/ENIAMsentences.ml
... ... @@ -181,10 +181,7 @@ let rec extract_sentences_rec tokens id =
181 181 Tokens("sentence",ids) ->
182 182 let paths,last = make_paths tokens ids in
183 183 [{pid=string_of_int id; pbeg=t.beg; plen=t.len; pnext=t.next; pfile_prefix="";
184   - psentence=AltSentence([Raw,RawSentence t.orth; ENIAM,StructSentence(paths,last)] (*@
185   - (if Paths.config.Paths.mate_parser_enabled then [Mate,RawSentence t.orth] else []) @
186   - (if Paths.config.Paths.swigra_enabled then [Swigra,RawSentence t.orth] else [])*)(* @
187   - (if Paths.config.Paths.polfie_enabled then [POLFIE,RawSentence t.orth] else []) *))}]
  184 + psentence=AltSentence([Raw,RawSentence t.orth; ENIAM,StructSentence(paths,last)])}]
188 185 | Tokens("quoted_sentences",ids) ->
189 186 [{pid=string_of_int id; pbeg=t.beg; plen=t.len; pnext=t.next; pfile_prefix="";
190 187 psentence=AltSentence[Raw,RawSentence t.orth;
... ...
subsyntax/ENIAMsubsyntax.ml
... ... @@ -234,8 +234,5 @@ let parse_text query =
234 234 (* print_endline "parse_text 1"; *)
235 235 let pid = if n = 0 then "" else string_of_int n ^ "_" in
236 236 let sentences = ENIAMsentences.split_into_sentences pid paragraph tokens paths in
237   - (* print_endline "parse_text 2"; *)
238   - (* let sentences = parse_mate_sentences tokens sentences in *)
239   - (* print_endline "parse_text 3"; *)
240 237 (AltParagraph[Raw,RawParagraph paragraph; Struct,StructParagraph sentences]) :: paragraphs, n+1) in
241 238 AltText[Raw,RawText query; Struct,StructText(List.rev paragraphs)], tokens
... ...
subsyntax/TODO
... ... @@ -2,5 +2,5 @@
2 2 - dodać zasoby MWE
3 3 - rozpoznawanie MWE ze Słowosieci
4 4  
5   -- przenieść odwołanie do Świgry i Mate z ENIAMsentences
6   -- przenieść odwołanie do parse_mate_sentences z ENIAMsubsyntax
  5 +- jak przetwarzać num:comp
  6 +- przetwarzanie liczebników złożonych np. dwadzieścia jeden, jedna druga
... ...
walenty/TODO
... ... @@ -4,3 +4,5 @@ a jedynie instalator, który dostaje Walentego i go przetwarza.
4 4 - dowiązanie symboliczne do Walentego - udokumentować.
5 5 - uporządkować położenia słowników pojemników itp.
6 6 - uporządkować położenie fixed.tab
  7 +- czy rzeczownik niepoliczalny w liczbie mnogiej jest nadal niepoliczalny np. "Wody szumią."
  8 +- trzeba zweryfikować słownik niepoliczalnych
... ...