Commit 5bff1aaed9336292eb84f0a18bfb5bd725bc2567
1 parent
accee0d9
biblioteka eniam-lexSemantics-1.0
Showing 15 changed files with 240 additions and 884 deletions
integration/eniam-integration-1.0.tar.bz2
0 → 100644
No preview for this file type
integration/makefile
... | ... | @@ -15,8 +15,8 @@ install: all |
15 | 15 | cp eniam-integration.cmxa eniam-integration.a eniam-integration.cma $(INSTALLDIR) |
16 | 16 | cp ENIAM_CONLL.cmi ENIAMpreIntegration.cmi $(INSTALLDIR) |
17 | 17 | cp ENIAM_CONLL.cmx ENIAMpreIntegration.cmx $(INSTALLDIR) |
18 | - mkdir -p /usr/share/eniam/integration | |
19 | - cp resources/* /usr/share/eniam/integration | |
18 | + # mkdir -p /usr/share/eniam/integration | |
19 | + # cp resources/* /usr/share/eniam/integration | |
20 | 20 | |
21 | 21 | eniam-integration.cma: $(SOURCES) |
22 | 22 | ocamlc -linkall -a -o eniam-integration.cma $(OCAMLFLAGS) $^ |
... | ... |
lexSemantics/ENIAMlexSemantics.ml
... | ... | @@ -18,7 +18,9 @@ |
18 | 18 | *) |
19 | 19 | |
20 | 20 | open ENIAMtokenizerTypes |
21 | +open ENIAMsubsyntaxTypes | |
21 | 22 | open ENIAMlexSemanticsTypes |
23 | +open ENIAMwalTypes | |
22 | 24 | open Xstd |
23 | 25 | |
24 | 26 | let string_of_lex_sems tokens lex_sems = |
... | ... | @@ -55,13 +57,12 @@ let find_proper_names tokens i t = |
55 | 57 | match t.token with |
56 | 58 | Lemma(lemma,pos,interp) -> |
57 | 59 | if StringMap.mem proper_names lemma then |
58 | - let t = {t with token=Proper(lemma,pos,interp,StringMap.find proper_names lemma); | |
59 | - attrs=remove t.attrs "notvalidated proper"} in | |
60 | - ExtArray.set tokens i t else | |
60 | + {t with token=Proper(lemma,pos,interp,StringMap.find proper_names lemma); | |
61 | + attrs=remove t.attrs "notvalidated proper"} else | |
61 | 62 | if Xlist.mem t.attrs "notvalidated proper" then |
62 | - let t = {t with token=Proper(lemma,pos,interp,[])} in | |
63 | - ExtArray.set tokens i t | |
64 | - | _ -> () | |
63 | + {t with token=Proper(lemma,pos,interp,[])} | |
64 | + else t | |
65 | + | _ -> t | |
65 | 66 | |
66 | 67 | let find_senses t = (* FIXME: sensy zawierające 'się' *) |
67 | 68 | match t.token with |
... | ... | @@ -69,36 +70,67 @@ let find_senses t = (* FIXME: sensy zawierające 'się' *) |
69 | 70 | | Proper(_,_,_,senses) -> ENIAMplWordnet.find_proper_senses senses |
70 | 71 | | _ -> [] |
71 | 72 | |
72 | - | |
73 | -let assign tokens text = | |
74 | - let lex_sems = ExtArray.make (ExtArray.size tokens) empty_lex_sem in | |
75 | - let _ = ExtArray.add lex_sems empty_lex_sem in | |
76 | - Int.iter 1 (ExtArray.size tokens - 1) (fun i -> | |
77 | - let token = ExtArray.get tokens i in | |
78 | - find_proper_names tokens i token; | |
79 | - let senses = find_senses token in | |
80 | - let lex_sem = {empty_lex_sem with senses=senses} in | |
81 | - let j = ExtArray.add lex_sems lex_sem in | |
82 | - if j <> i then failwith "assign_semantic_valence"); | |
83 | - lex_sems | |
84 | - | |
85 | -(* | |
86 | -(* print_endline "a14"; *) | |
87 | - let paths = assign_valence paths in | |
88 | -(* print_endline "a15"; *) | |
89 | -(* print_endline "a16"; *) | |
90 | - let paths = disambiguate_senses paths in | |
91 | - let paths = assign_simplified_valence paths in | |
92 | - let paths = PreSemantics.assign_semantics paths in | |
93 | -(* print_endline "a16"; *) | |
94 | - | |
95 | - | |
96 | - | |
97 | - | |
98 | - | |
99 | -let assign_valence paths = | |
100 | - let lexemes = Xlist.fold paths StringMap.empty (fun lexemes t -> | |
101 | - match t.token with | |
73 | +let rec find a l i = | |
74 | + if a.(i) = max_int then ( | |
75 | + a.(i) <- i; | |
76 | + i) else | |
77 | + if a.(i) = i then ( | |
78 | + Xlist.iter l (fun j -> a.(j) <- i); | |
79 | + i) else | |
80 | + find a (i :: l) a.(i) | |
81 | + | |
82 | +let union a i j = | |
83 | + if i = j then i else | |
84 | + let x = min i j in | |
85 | + let y = max i j in | |
86 | + a.(y) <- x; | |
87 | + x | |
88 | + | |
89 | +let rec split_tokens_into_groups_sentence a = function | |
90 | + RawSentence s -> () | |
91 | + | StructSentence([],_) -> () | |
92 | + | StructSentence((id,_,_) :: paths,_) -> | |
93 | + ignore (Xlist.fold paths (find a [] id) (fun m (id,_,_) -> | |
94 | + union a m (find a [] id))) | |
95 | + | DepSentence(paths) -> | |
96 | + if Array.length paths = 0 then () else | |
97 | + let id,_,_ = paths.(0) in | |
98 | + ignore (Int.fold 1 (Array.length paths - 1) (find a [] id) (fun m i -> | |
99 | + let id,_,_ = paths.(i) in | |
100 | + union a m (find a [] id))) | |
101 | + | QuotedSentences sentences -> | |
102 | + Xlist.iter sentences (fun p -> | |
103 | + split_tokens_into_groups_sentence a p.psentence) | |
104 | + | AltSentence l -> Xlist.iter l (fun (mode,sentence) -> | |
105 | + split_tokens_into_groups_sentence a sentence) | |
106 | + | |
107 | +let rec split_tokens_into_groups_paragraph a = function | |
108 | + RawParagraph s -> () | |
109 | + | StructParagraph sentences -> | |
110 | + Xlist.iter sentences (fun p -> split_tokens_into_groups_sentence a p.psentence) | |
111 | + | AltParagraph l -> Xlist.iter l (fun (mode,paragraph) -> | |
112 | + split_tokens_into_groups_paragraph a paragraph) | |
113 | + | |
114 | +let rec split_tokens_into_groups_text a = function | |
115 | + RawText s -> () | |
116 | + | StructText paragraphs -> | |
117 | + Xlist.iter paragraphs (split_tokens_into_groups_paragraph a) | |
118 | + | AltText l -> Xlist.iter l (fun (mode,text) -> | |
119 | + split_tokens_into_groups_text a text) | |
120 | + | |
121 | +let split_tokens_into_groups size text = | |
122 | + let a = Array.make size max_int in | |
123 | + split_tokens_into_groups_text a text; | |
124 | + Int.iter 1 (Array.length a - 1) (fun i -> | |
125 | + if a.(i) <> max_int then a.(i) <- a.(a.(i))); | |
126 | + let map = Int.fold 1 (Array.length a - 1) IntMap.empty (fun map i -> | |
127 | + if a.(i) = max_int then map else | |
128 | + IntMap.add_inc map a.(i) [i] (fun l -> i :: l)) in | |
129 | + IntMap.fold map [] (fun l _ v -> v :: l) | |
130 | + | |
131 | +let assign_valence tokens lex_sems group = | |
132 | + let lexemes = Xlist.fold group StringMap.empty (fun lexemes id -> | |
133 | + match (ExtArray.get tokens id).token with | |
102 | 134 | Lemma(lemma,pos,_) -> |
103 | 135 | StringMap.add_inc lexemes lemma (StringSet.singleton pos) (fun set -> StringSet.add set pos) |
104 | 136 | | Proper(lemma,pos,_,_) -> |
... | ... | @@ -108,104 +140,51 @@ let assign_valence paths = |
108 | 140 | | _ -> pos (*failwith ("assign_valence: Proper " ^ pos ^ " " ^ lemma)*) in |
109 | 141 | StringMap.add_inc lexemes lemma (StringSet.singleton pos) (fun set -> StringSet.add set pos) (* nazwy własne mają przypisywaną domyślną walencję rzeczowników *) |
110 | 142 | | _ -> lexemes) in |
111 | - let valence = WalFrames.find_frames lexemes in | |
112 | - List.rev (Xlist.rev_map paths (fun t -> | |
113 | - match t.token with | |
114 | - Lemma(lemma,pos,_) -> {t with valence=try Xlist.rev_map (StringMap.find (StringMap.find valence lemma) pos) (fun frame -> 0,frame) with Not_found -> []} | |
115 | - | Proper(lemma,pos,interp,_) -> {t with valence=(try Xlist.rev_map (StringMap.find (StringMap.find valence lemma) | |
116 | - (if pos = "subst" || pos = "depr" then "p" ^ pos else pos)) (fun frame -> 0,frame) with Not_found -> [](*failwith ("assign_valence: Proper(" ^ lemma ^ "," ^ pos ^ ")")*)); | |
117 | - token=Lemma(lemma,pos,interp)} | |
118 | - | _ -> t)) | |
119 | - | |
120 | -(**********************************************************************************) | |
121 | - | |
122 | -(* let prepare_indexes (paths,_) = | |
123 | - let set = Xlist.fold paths IntSet.empty (fun set t -> | |
124 | - IntSet.add (IntSet.add set t.beg) t.next) in | |
125 | - let map,last = Xlist.fold (Xlist.sort (IntSet.to_list set) compare) (IntMap.empty,0) (fun (map,n) x -> | |
126 | - IntMap.add map x n, n+1) in | |
127 | - List.rev (Xlist.rev_map paths (fun t -> | |
128 | - {t with lnode=IntMap.find map t.beg; rnode=IntMap.find map t.next})), last - 1 *) | |
143 | + let valence = ENIAMwalenty.find_frames lexemes in | |
144 | + Xlist.iter group (fun id -> | |
145 | + match (ExtArray.get tokens id).token with | |
146 | + Lemma(lemma,pos,_) -> | |
147 | + ExtArray.set lex_sems id {(ExtArray.get lex_sems id) with | |
148 | + valence=try Xlist.rev_map (StringMap.find (StringMap.find valence lemma) pos) (fun frame -> 0,frame) with Not_found -> []} | |
149 | + | Proper(lemma,pos,interp,_) -> | |
150 | + ExtArray.set lex_sems id {(ExtArray.get lex_sems id) with | |
151 | + valence=(try Xlist.rev_map (StringMap.find (StringMap.find valence lemma) | |
152 | + (if pos = "subst" || pos = "depr" then "p" ^ pos else pos)) (fun frame -> 0,frame) with Not_found -> [](*failwith ("assign_valence: Proper(" ^ lemma ^ "," ^ pos ^ ")")*))}; | |
153 | + ExtArray.set tokens id {(ExtArray.get tokens id) with token=Lemma(lemma,pos,interp)} | |
154 | + | _ -> ()) | |
129 | 155 | |
130 | 156 | let get_prefs_schema prefs schema = |
131 | 157 | Xlist.fold schema prefs (fun prefs t -> |
132 | - Xlist.fold t.WalTypes.sel_prefs prefs StringSet.add) | |
158 | + Xlist.fold t.sel_prefs prefs StringSet.add) | |
133 | 159 | |
134 | 160 | let map_prefs_schema senses schema = |
135 | 161 | Xlist.map schema (fun t -> |
136 | - if Xlist.mem t.WalTypes.morfs (WalTypes.Phrase WalTypes.Pro) || Xlist.mem t.WalTypes.morfs (WalTypes.Phrase WalTypes.ProNG) then t else | |
137 | - {t with WalTypes.sel_prefs = Xlist.fold t.WalTypes.sel_prefs [] (fun l s -> | |
162 | + if Xlist.mem t.morfs (Phrase Pro) || Xlist.mem t.morfs (Phrase ProNG) then t else | |
163 | + {t with sel_prefs = Xlist.fold t.sel_prefs [] (fun l s -> | |
138 | 164 | if StringSet.mem senses s then s :: l else l)}) |
139 | 165 | |
140 | -let disambiguate_senses paths = | |
141 | - let prefs = Xlist.fold paths (StringSet.singleton "ALL") (fun prefs t -> | |
142 | - Xlist.fold t.valence prefs (fun prefs -> function | |
143 | - _,WalTypes.Frame(_,schema) -> get_prefs_schema prefs schema | |
144 | - | _,WalTypes.LexFrame(_,_,_,schema) -> get_prefs_schema prefs schema | |
145 | - | _,WalTypes.ComprepFrame(_,_,_,schema) -> get_prefs_schema prefs schema)) in | |
146 | - let hipero = Xlist.fold paths (StringSet.singleton "ALL") (fun hipero t -> | |
147 | - Xlist.fold t.senses hipero (fun hipero (_,l,_) -> | |
166 | +let disambiguate_senses lex_sems group = | |
167 | + let prefs = Xlist.fold group (StringSet.singleton "ALL") (fun prefs id -> | |
168 | + Xlist.fold (ExtArray.get lex_sems id).valence prefs (fun prefs -> function | |
169 | + _,Frame(_,schema) -> get_prefs_schema prefs schema | |
170 | + | _,LexFrame(_,_,_,schema) -> get_prefs_schema prefs schema | |
171 | + | _,ComprepFrame(_,_,_,schema) -> get_prefs_schema prefs schema)) in | |
172 | + let hipero = Xlist.fold group (StringSet.singleton "ALL") (fun hipero id -> | |
173 | + Xlist.fold (ExtArray.get lex_sems id).senses hipero (fun hipero (_,l,_) -> | |
148 | 174 | Xlist.fold l hipero StringSet.add)) in |
149 | 175 | let senses = StringSet.intersection prefs hipero in |
150 | 176 | let is_zero = StringSet.mem hipero "0" in |
151 | 177 | let senses = if is_zero then StringSet.add senses "0" else senses in |
152 | - Xlist.map paths (fun t -> | |
153 | - {t with valence = if is_zero then t.valence else | |
178 | + Xlist.iter group (fun id -> | |
179 | + let t = ExtArray.get lex_sems id in | |
180 | + ExtArray.set lex_sems id {t with valence = if is_zero then t.valence else | |
154 | 181 | Xlist.map t.valence (function |
155 | - n,WalTypes.Frame(a,schema) -> n,WalTypes.Frame(a,map_prefs_schema senses schema) | |
156 | - | n,WalTypes.LexFrame(s,p,r,schema) -> n,WalTypes.LexFrame(s,p,r,map_prefs_schema senses schema) | |
157 | - | n,WalTypes.ComprepFrame(s,p,r,schema) -> n,WalTypes.ComprepFrame(s,p,r,map_prefs_schema senses schema)); | |
182 | + n,Frame(a,schema) -> n,Frame(a,map_prefs_schema senses schema) | |
183 | + | n,LexFrame(s,p,r,schema) -> n,LexFrame(s,p,r,map_prefs_schema senses schema) | |
184 | + | n,ComprepFrame(s,p,r,schema) -> n,ComprepFrame(s,p,r,map_prefs_schema senses schema)); | |
158 | 185 | senses = Xlist.map t.senses (fun (s,l,w) -> |
159 | 186 | s, List.rev (Xlist.fold l [] (fun l s -> if StringSet.mem senses s then s :: l else l)),w)}) |
160 | 187 | |
161 | -(*let single_sense (paths,last) = | |
162 | - List.rev (Xlist.rev_map paths (fun t -> | |
163 | - let sense = | |
164 | - if t.senses = [] then [] else | |
165 | - [Xlist.fold t.senses ("",[],-.max_float) (fun (max_meaning,max_hipero,max_weight) (meaning,hipero,weight) -> | |
166 | - if max_weight >= weight then max_meaning,max_hipero,max_weight else meaning,hipero,weight)] in | |
167 | - {t with senses=sense})), last*) | |
168 | - | |
169 | -open WalTypes | |
170 | - | |
171 | -(*let single_schema schemata = | |
172 | - let map = Xlist.fold schemata StringMap.empty (fun map schema -> | |
173 | - let t = WalStringOf.schema (List.sort compare (Xlist.fold schema [] (fun l s -> | |
174 | - if s.gf <> ARG && s.gf <> ADJUNCT then {s with role=""; role_attr=""; sel_prefs=[]} :: l else | |
175 | - if s.cr <> [] || s.ce <> [] then {s with role=""; role_attr=""; sel_prefs=[]} :: l else l))) in | |
176 | - StringMap.add_inc map t [schema] (fun l -> schema :: l)) in | |
177 | - StringMap.fold map [] (fun l _ schemata -> | |
178 | - let map = Xlist.fold schemata StringMap.empty (fun map schema -> | |
179 | - Xlist.fold schema map (fun map s -> | |
180 | - let t = WalStringOf.schema [{s with role=""; role_attr=""; sel_prefs=[]}] in | |
181 | - StringMap.add_inc map t [s] (fun l -> s :: l))) in | |
182 | - let schema = StringMap.fold map [] (fun schema _ l -> | |
183 | - let s = List.hd l in | |
184 | - {s with sel_prefs=Xlist.fold s.sel_prefs [] (fun l t -> if t = "0" || t = "T" then t :: l else l)} :: schema) in | |
185 | - schema :: l)*) | |
186 | - | |
187 | -let remove_meaning = function | |
188 | - DefaultAtrs(m,r,o,neg,p,a) -> DefaultAtrs([],r,o,neg,p,a) | |
189 | - | EmptyAtrs m -> EmptyAtrs [] | |
190 | - | NounAtrs(m,nsyn,s(*,typ*)) -> NounAtrs([],nsyn,s(*,typ*)) | |
191 | - | AdjAtrs(m,c,adjsyn(*,adjsem,typ*)) -> AdjAtrs([],c,adjsyn(*,adjsem,typ*)) | |
192 | - | PersAtrs(m,le,neg,mo,t,au,a) -> PersAtrs([],le,neg,mo,t,au,a) | |
193 | - | GerAtrs(m,le,neg,a) -> GerAtrs([],le,neg,a) | |
194 | - | NonPersAtrs(m,le,role,role_attr,neg,a) -> NonPersAtrs([],le,role,role_attr,neg,a) | |
195 | - | _ -> failwith "remove_meaning" | |
196 | - | |
197 | - | |
198 | -(*let single_frame (paths,last) = | |
199 | - List.rev (Xlist.rev_map paths (fun t -> | |
200 | - let lex_frames,frames = Xlist.fold t.valence ([],StringMap.empty) (fun (lex_frames,frames) -> function | |
201 | - Frame(attrs,schema) -> | |
202 | - let attrs = remove_meaning attrs in | |
203 | - lex_frames, StringMap.add_inc frames (WalStringOf.frame_atrs attrs) (attrs,[schema]) (fun (_,l) -> attrs, schema :: l) | |
204 | - | frame -> frame :: lex_frames, frames) in | |
205 | - let frames = StringMap.fold frames lex_frames (fun frames _ (attrs,schemata) -> | |
206 | - Xlist.fold (single_schema schemata) frames (fun frames frame -> Frame(attrs,frame) :: frames)) in | |
207 | - {t with valence=frames})), last *) | |
208 | - | |
209 | 188 | let simplify_position_verb l = function (* FIXME: dodać czyszczenie E Pro *) |
210 | 189 | Phrase(NP(Case "dat")) -> l |
211 | 190 | | Phrase(NP(Case "inst")) -> l |
... | ... | @@ -283,33 +262,34 @@ let simplify_schemata pos schemata = |
283 | 262 | if s.gf <> ARG && s.gf <> ADJUNCT then s :: l else |
284 | 263 | (* if s.cr <> [] || s.ce <> [] then s :: l else *) |
285 | 264 | simplify_position pos l s)) in |
286 | - StringMap.add_inc schemata (WalStringOf.schema schema) (schema,[frame]) (fun (_,frames) -> schema, frame :: frames)) in | |
265 | + StringMap.add_inc schemata (ENIAMwalStringOf.schema schema) (schema,[frame]) (fun (_,frames) -> schema, frame :: frames)) in | |
287 | 266 | StringMap.fold schemata [] (fun l _ s -> s :: l) |
288 | 267 | |
289 | 268 | (* FIXME: problem ComprepNP i PrepNCP *) |
290 | 269 | (* FIXME: problem gdy ten sam token występuje w kilku ścieżkach *) |
291 | 270 | let generate_verb_prep_adjuncts preps = |
292 | - Xlist.map preps (fun (lemma,case) -> WalFrames.verb_prep_adjunct_schema_field lemma case) | |
271 | + Xlist.map preps (fun (lemma,case) -> ENIAMwalFrames.verb_prep_adjunct_schema_field lemma case) | |
293 | 272 | |
294 | 273 | let generate_verb_comprep_adjuncts compreps = |
295 | - Xlist.map compreps (fun lemma -> WalFrames.verb_comprep_adjunct_schema_field lemma) | |
274 | + Xlist.map compreps (fun lemma -> ENIAMwalFrames.verb_comprep_adjunct_schema_field lemma) | |
296 | 275 | |
297 | 276 | let generate_verb_compar_adjuncts compars = |
298 | - Xlist.map compars (fun lemma -> WalFrames.verb_compar_adjunct_schema_field lemma) | |
277 | + Xlist.map compars (fun lemma -> ENIAMwalFrames.verb_compar_adjunct_schema_field lemma) | |
299 | 278 | |
300 | 279 | let generate_noun_prep_adjuncts preps = |
301 | - WalFrames.noun_prep_adjunct_schema_field preps | |
280 | + ENIAMwalFrames.noun_prep_adjunct_schema_field preps | |
302 | 281 | |
303 | 282 | let generate_noun_compar_adjuncts compars = |
304 | - WalFrames.noun_compar_adjunct_schema_field compars | |
283 | + ENIAMwalFrames.noun_compar_adjunct_schema_field compars | |
305 | 284 | |
306 | 285 | let generate_adj_compar_adjuncts compars = |
307 | - WalFrames.noun_compar_adjunct_schema_field compars | |
286 | + ENIAMwalFrames.noun_compar_adjunct_schema_field compars | |
308 | 287 | |
309 | 288 | let compars = StringSet.of_list ["jak";"jako";"niż";"niczym";"niby";"co"] |
310 | 289 | |
311 | -let generate_prep_adjunct_tokens paths = | |
312 | - let map = Xlist.fold paths StringMap.empty (fun map t -> | |
290 | +let generate_prep_adjunct_tokens tokens group = | |
291 | + let map = Xlist.fold group StringMap.empty (fun map id -> | |
292 | + let t = ExtArray.get tokens id in | |
313 | 293 | match t.token with |
314 | 294 | Lemma(lemma,"prep",interp) -> |
315 | 295 | let map = if lemma = "po" then StringMap.add map "po:postp" ("po","postp") else map in |
... | ... | @@ -321,17 +301,19 @@ let generate_prep_adjunct_tokens paths = |
321 | 301 | | _ -> map) in |
322 | 302 | StringMap.fold map [] (fun l _ v -> v :: l) |
323 | 303 | |
324 | -let generate_comprep_adjunct_tokens paths = | |
325 | - let lemmas = Xlist.fold paths StringSet.empty (fun lemmas t -> | |
304 | +let generate_comprep_adjunct_tokens tokens group = | |
305 | + let lemmas = Xlist.fold group StringSet.empty (fun lemmas id -> | |
306 | + let t = ExtArray.get tokens id in | |
326 | 307 | match t.token with |
327 | 308 | Lemma(lemma,_,_) -> StringSet.add lemmas lemma |
328 | 309 | | _ -> lemmas) in |
329 | - StringMap.fold WalFrames.comprep_reqs [] (fun compreps comprep reqs -> | |
310 | + StringMap.fold ENIAMwalFrames.comprep_reqs [] (fun compreps comprep reqs -> | |
330 | 311 | let b = Xlist.fold reqs true (fun b s -> b && StringSet.mem lemmas s) in |
331 | 312 | if b then comprep :: compreps else compreps) |
332 | 313 | |
333 | -let generate_compar_adjunct_tokens paths = | |
334 | - let set = Xlist.fold paths StringSet.empty (fun set t -> | |
314 | +let generate_compar_adjunct_tokens tokens group = | |
315 | + let set = Xlist.fold group StringSet.empty (fun set id -> | |
316 | + let t = ExtArray.get tokens id in | |
335 | 317 | match t.token with |
336 | 318 | Lemma(lemma,"prep",interp) -> |
337 | 319 | if not (StringSet.mem compars lemma) then set else |
... | ... | @@ -343,29 +325,40 @@ let is_measure = function |
343 | 325 | NounAtrs(_,_,Common "measure") -> true |
344 | 326 | | _ -> false |
345 | 327 | |
346 | -let assign_simplified_valence paths = | |
347 | - let preps = generate_prep_adjunct_tokens paths in | |
348 | - let compreps = generate_comprep_adjunct_tokens paths in | |
349 | - let compars = generate_compar_adjunct_tokens paths in | |
328 | +let remove_meaning = function | |
329 | + DefaultAtrs(m,r,o,neg,p,a) -> DefaultAtrs([],r,o,neg,p,a) | |
330 | + | EmptyAtrs m -> EmptyAtrs [] | |
331 | + | NounAtrs(m,nsyn,s(*,typ*)) -> NounAtrs([],nsyn,s(*,typ*)) | |
332 | + | AdjAtrs(m,c,adjsyn(*,adjsem,typ*)) -> AdjAtrs([],c,adjsyn(*,adjsem,typ*)) | |
333 | + | PersAtrs(m,le,neg,mo,t,au,a) -> PersAtrs([],le,neg,mo,t,au,a) | |
334 | + | GerAtrs(m,le,neg,a) -> GerAtrs([],le,neg,a) | |
335 | + | NonPersAtrs(m,le,role,role_attr,neg,a) -> NonPersAtrs([],le,role,role_attr,neg,a) | |
336 | + | _ -> failwith "remove_meaning" | |
337 | + | |
338 | +let assign_simplified_valence tokens lex_sems group = | |
339 | + let preps = generate_prep_adjunct_tokens tokens group in | |
340 | + let compreps = generate_comprep_adjunct_tokens tokens group in | |
341 | + let compars = generate_compar_adjunct_tokens tokens group in | |
350 | 342 | let verb_prep_adjuncts = generate_verb_prep_adjuncts preps in |
351 | 343 | let verb_comprep_adjuncts = generate_verb_comprep_adjuncts compreps in |
352 | 344 | let verb_compar_adjuncts = generate_verb_compar_adjuncts compars in |
353 | 345 | let noun_prep_adjuncts = generate_noun_prep_adjuncts preps compreps in |
354 | 346 | let noun_compar_adjuncts = generate_noun_compar_adjuncts compars in |
355 | 347 | let adj_compar_adjuncts = generate_adj_compar_adjuncts compars in |
356 | - let verb_adjuncts = WalFrames.verb_adjuncts_simp @ verb_prep_adjuncts @ verb_comprep_adjuncts @ verb_compar_adjuncts in | |
357 | - let noun_adjuncts = WalFrames.noun_adjuncts_simp @ [noun_prep_adjuncts] @ [noun_compar_adjuncts] in | |
358 | - let noun_measure_adjuncts = WalFrames.noun_measure_adjuncts_simp @ [noun_prep_adjuncts] @ [noun_compar_adjuncts] in | |
359 | - let adj_adjuncts = WalFrames.adj_adjuncts_simp @ [adj_compar_adjuncts] in | |
360 | - let adv_adjuncts = WalFrames.adv_adjuncts_simp @ [adj_compar_adjuncts] in | |
361 | - List.rev (Xlist.rev_map paths (fun t -> | |
362 | - let pos = match t.token with | |
363 | - Lemma(_,pos,_) -> WalFrames.simplify_pos pos | |
348 | + let verb_adjuncts = ENIAMwalFrames.verb_adjuncts_simp @ verb_prep_adjuncts @ verb_comprep_adjuncts @ verb_compar_adjuncts in | |
349 | + let noun_adjuncts = ENIAMwalFrames.noun_adjuncts_simp @ [noun_prep_adjuncts] @ [noun_compar_adjuncts] in | |
350 | + let noun_measure_adjuncts = ENIAMwalFrames.noun_measure_adjuncts_simp @ [noun_prep_adjuncts] @ [noun_compar_adjuncts] in | |
351 | + let adj_adjuncts = ENIAMwalFrames.adj_adjuncts_simp @ [adj_compar_adjuncts] in | |
352 | + let adv_adjuncts = ENIAMwalFrames.adv_adjuncts_simp @ [adj_compar_adjuncts] in | |
353 | + Xlist.iter group (fun id -> | |
354 | + let t = ExtArray.get lex_sems id in | |
355 | + let pos = match (ExtArray.get tokens id).token with | |
356 | + Lemma(_,pos,_) -> ENIAMwalFrames.simplify_pos pos | |
364 | 357 | | _ -> "" in |
365 | 358 | let lex_frames,frames = Xlist.fold t.valence ([],StringMap.empty) (fun (lex_frames,frames) -> function |
366 | 359 | _,(Frame(attrs,schema) as frame) -> |
367 | 360 | let attrs = remove_meaning attrs in |
368 | - lex_frames, StringMap.add_inc frames (WalStringOf.frame_atrs attrs) (attrs,[schema,frame]) (fun (_,l) -> attrs, (schema,frame) :: l) | |
361 | + lex_frames, StringMap.add_inc frames (ENIAMwalStringOf.frame_atrs attrs) (attrs,[schema,frame]) (fun (_,l) -> attrs, (schema,frame) :: l) | |
369 | 362 | | _,frame -> frame :: lex_frames, frames) in |
370 | 363 | let simp_frames,full_frames,n = Xlist.fold lex_frames ([],[],1) (fun (simp_frames,full_frames,n) frame -> |
371 | 364 | (n,frame) :: simp_frames, (n,frame) :: full_frames, n+1) in |
... | ... | @@ -380,93 +373,23 @@ let assign_simplified_valence paths = |
380 | 373 | (n,Frame(attrs,schema)) :: simp_frames, |
381 | 374 | Xlist.fold frames full_frames (fun full_frames frame -> (n,frame) :: full_frames), |
382 | 375 | n+1)) in |
383 | - {t with simple_valence=simp_frames; valence=full_frames})) | |
384 | - | |
385 | -(* FIXME: dodać do walencji preferencje selekcyjne nadrzędników symboli: dzień, godzina, rysunek itp. *) | |
386 | -(* FIXME: sprawdzić czy walencja nazw własnych jest dobrze zrobiona. *) | |
387 | - | |
388 | -(* let first_id = 1 (* id=0 jest zarezerwowane dla pro; FIXME: czy to jest jeszcze aktualne? *) | |
389 | - | |
390 | -let add_ids (paths,last) next_id = | |
391 | - let paths,next_id = Xlist.fold ((*List.rev*) paths) ([],next_id) (fun (paths,id) t -> | |
392 | - {t with id=id} :: paths, id+1) in | |
393 | - (paths,last),next_id *) | |
394 | - | |
395 | - | |
396 | - | |
397 | -let parse query = | |
398 | -(* print_endline "a1"; *) | |
399 | - let l = Xunicode.classified_chars_of_utf8_string query in | |
400 | -(* print_endline "a2"; *) | |
401 | - let l = PreTokenizer.tokenize l in | |
402 | -(* print_endline "a3"; *) | |
403 | - let l = PrePatterns.normalize_tokens [] l in | |
404 | -(* print_endline "a4"; *) | |
405 | - let l = PrePatterns.find_replacement_patterns l in | |
406 | -(* print_endline "a5"; *) | |
407 | - let l = PrePatterns.remove_spaces [] l in | |
408 | - let l = PrePatterns.find_abr_patterns PreAcronyms.abr_patterns l in | |
409 | - let l = PrePatterns.normalize_tokens [] l in | |
410 | -(* print_endline "a6"; *) | |
411 | - let paths = PrePaths.translate_into_paths l in | |
412 | -(* print_endline "a7"; *) | |
413 | - let paths = PrePaths.lemmatize paths in | |
414 | -(* print_endline "a8"; *) | |
415 | - let paths,_ = PreMWE.process paths in | |
416 | -(* print_endline "a12"; *) | |
417 | - let paths = find_proper_names paths in | |
418 | -(* print_endline "a13"; *) | |
419 | - let paths = modify_weights paths in | |
420 | - let paths = translate_digs paths in | |
421 | - let paths = assign_senses paths in | |
422 | -(* print_endline "a14"; *) | |
423 | - let paths = assign_valence paths in | |
424 | -(* print_endline "a15"; *) | |
425 | - let paths = combine_interps paths in | |
426 | -(* print_endline "a16"; *) | |
427 | - let paths = disambiguate_senses paths in | |
428 | - let paths = assign_simplified_valence paths in | |
429 | - let paths = PreSemantics.assign_semantics paths in | |
430 | -(* print_endline "a16"; *) | |
431 | - let paths = select_tokens paths in | |
432 | -(* print_endline "a17"; *) | |
433 | -(* let paths = if !single_sense_flag then single_sense paths else paths in | |
434 | - let paths = if !single_frame_flag then single_frame paths else paths in*) | |
435 | - (*let paths, next_id = add_ids paths next_id in | |
436 | - let paths = prepare_indexes paths in*) | |
437 | -(* print_endline "a18"; *) | |
438 | - paths(*, next_id*) | |
439 | -(* print_endline (PrePaths.to_string paths); *) | |
440 | -(* let paths = | |
441 | - if PrePaths.no_possible_path (PrePaths.map paths PreLemmatization.remove_postags) then | |
442 | - PrePaths.map paths process_ign | |
443 | - else paths in | |
444 | - let paths = PrePaths.map paths PreLemmatization.remove_postags in | |
445 | - let paths = PreCaseShift.manage_lower_upper_case paths in (* FIXME: niepotrzebnie powiększa pierwszy token (przymiotniki partykuły itp.) *) | |
446 | - let paths = PreLemmatization.combine_interps paths in | |
447 | -(* print_endline (PrePaths.to_string paths); *)*) | |
448 | - | |
449 | -let parse_conll tokens dep_paths = (* FIXME: sprawdzić, czy zachowana jest kolejność elementów paths !!! *) | |
450 | - let paths = List.rev (Int.fold 1 (Array.length dep_paths - 1) [] (fun paths conll_id -> | |
451 | - let id,_,_ = dep_paths.(conll_id) in | |
452 | - ExtArray.get tokens id :: paths)) in | |
453 | - (* print_endline "a12"; *) | |
454 | - let paths = find_proper_names paths in | |
455 | - (* print_endline "a13"; *) | |
456 | - let paths = modify_weights paths in | |
457 | - let paths = PreWordnet.assign_senses paths in | |
458 | - (* print_endline "a14"; *) | |
459 | - (* let paths = combine_interps paths in (* FIXME: to powinno też działać dla Proper *) *) | |
460 | - (* print_endline "a15"; *) | |
461 | - let paths = assign_valence paths in | |
462 | - (* print_endline "a16"; *) | |
463 | - let paths = disambiguate_senses paths in | |
464 | - let paths = assign_simplified_valence paths in | |
465 | - let paths = PreSemantics.assign_semantics paths in | |
466 | - (* print_endline "a16"; *) | |
467 | - let _ = Xlist.fold paths 1 (fun conll_id t -> | |
468 | - let id,_,_ = dep_paths.(conll_id) in | |
469 | - ExtArray.set tokens id t; | |
470 | - conll_id + 1) in | |
471 | - () | |
472 | -*) | |
376 | + ExtArray.set lex_sems id {t with simple_valence=simp_frames; valence=full_frames}) | |
377 | + | |
378 | +let assign tokens text = | |
379 | + let lex_sems = ExtArray.make (ExtArray.size tokens) empty_lex_sem in | |
380 | + let _ = ExtArray.add lex_sems empty_lex_sem in | |
381 | + Int.iter 1 (ExtArray.size tokens - 1) (fun i -> | |
382 | + let token = ExtArray.get tokens i in | |
383 | + let token = find_proper_names tokens i token in | |
384 | + ExtArray.set tokens i token; | |
385 | + let senses = find_senses token in | |
386 | + let lex_sem = {empty_lex_sem with senses=senses} in | |
387 | + let _ = ExtArray.add lex_sems lex_sem in | |
388 | + ()); | |
389 | + let groups = split_tokens_into_groups (ExtArray.size tokens) text in | |
390 | + (* Xlist.iter groups (fun group -> print_endline (String.concat " " (Xlist.map group string_of_int))); *) | |
391 | + Xlist.iter groups (fun group -> assign_valence tokens lex_sems group); | |
392 | + Xlist.iter groups (fun group -> disambiguate_senses lex_sems group); | |
393 | + Xlist.iter groups (fun group -> assign_simplified_valence tokens lex_sems group); | |
394 | + Xlist.iter groups (fun group -> ENIAMlexSemanticsData.assign_semantics tokens lex_sems group); | |
395 | + lex_sems | |
... | ... |
lexSemantics/ENIAMlexSemanticsData.ml
... | ... | @@ -20,7 +20,7 @@ |
20 | 20 | open ENIAMtokenizerTypes |
21 | 21 | open ENIAMlexSemanticsTypes |
22 | 22 | open Xstd |
23 | -(* | |
23 | + | |
24 | 24 | let subst_inst_roles = Xlist.fold [ |
25 | 25 | "wiosna", "Time",""; |
26 | 26 | "lato", "Time",""; |
... | ... | @@ -365,7 +365,7 @@ let prep_roles = Xlist.fold [ (* lemma,case,role,role_attr,meaning/hipero,sel_pr |
365 | 365 | ] StringMap.empty (fun map (lemma,case,role,role_attr,hipero,sel_prefs) -> |
366 | 366 | let hipero = Xlist.fold hipero StringSet.empty ENIAMplWordnet.get_hipero_rec in |
367 | 367 | let map2 = try StringMap.find map lemma with Not_found -> StringMap.empty in |
368 | - let map2 = StringMap.add_inc map2 case [role,role_attr,hipero,sel_prefs] (fun l -> (role,role_attr,hipero,sel_prefs) :: l) in | |
368 | + let map2 = StringMap.add_inc map2 case [case,role,role_attr,hipero,sel_prefs] (fun l -> (case,role,role_attr,hipero,sel_prefs) :: l) in | |
369 | 369 | StringMap.add map lemma map2) |
370 | 370 | (* "przeciwko","dat","Dat"; |
371 | 371 | "przeciw","dat","Dat"; |
... | ... | @@ -377,10 +377,10 @@ let prep_roles = Xlist.fold [ (* lemma,case,role,role_attr,meaning/hipero,sel_pr |
377 | 377 | let assign_prep_semantics lemma cases t = |
378 | 378 | try |
379 | 379 | let map = StringMap.find prep_roles lemma in |
380 | - Xlist.map cases (fun case -> | |
381 | - {t with semantics=(try PrepSemantics (StringMap.find map case) with Not_found -> Normal); | |
382 | - token=Lemma(lemma,"prep",[[[case]]])}) | |
383 | - with Not_found -> [t] | |
380 | + let l = List.flatten (Xlist.map cases (fun case -> | |
381 | + try StringMap.find map case with Not_found -> [])) in | |
382 | + if l = [] then Normal else PrepSemantics l | |
383 | + with Not_found -> Normal | |
384 | 384 | |
385 | 385 | let subst_special_lexemes = Xlist.fold [ |
386 | 386 | "jutro", ["indexical"];(*"dzień"*) |
... | ... | @@ -532,21 +532,24 @@ let pron_lexemes = Xlist.fold [ |
532 | 532 | ] StringMap.empty (fun map (k,v,w) -> StringMap.add map k (SpecialMod(v,w)))*) |
533 | 533 | |
534 | 534 | (* UWAGA: przy przetwarzaniu danych zdezambiguowanych ta procedura nie zmienia liczby tokenów *) |
535 | -let assign_semantics paths = | |
536 | - List.rev (List.flatten (Xlist.rev_map paths (fun t -> | |
537 | - match t.token with | |
538 | - Lemma(lemma,"subst",_) -> [{t with lroles=(try StringMap.find subst_inst_roles lemma with Not_found -> "",""); semantics=try StringMap.find subst_special_lexemes lemma with Not_found -> Normal}] | |
539 | - | Lemma(lemma,"depr",_) -> [{t with semantics=try StringMap.find subst_special_lexemes lemma with Not_found -> Normal}] | |
540 | - | Lemma(lemma,"adj",_) -> [{t with lroles=(try StringMap.find adj_roles lemma with Not_found -> "",""); semantics=try StringMap.find adj_special_lexemes lemma with Not_found -> Normal}] | |
541 | - | Lemma(lemma,"adjc",_) -> [{t with lroles=(try StringMap.find adj_roles lemma with Not_found -> "",""); semantics=try StringMap.find adj_special_lexemes lemma with Not_found -> Normal}] | |
542 | - | Lemma(lemma,"adjp",_) -> [{t with lroles=(try StringMap.find adj_roles lemma with Not_found -> "",""); semantics=try StringMap.find adj_special_lexemes lemma with Not_found -> Normal}] | |
543 | - | Lemma(lemma,"adv",_) -> [{t with lroles=(try StringMap.find adv_roles lemma with Not_found -> "",""); semantics=try StringMap.find adv_special_lexemes lemma with Not_found -> Normal}] | |
544 | - | Lemma(lemma,"qub",_) -> [{t with lroles=(try StringMap.find qub_roles lemma with Not_found -> "",""); semantics=try StringMap.find qub_special_lexemes lemma with Not_found -> Normal}] | |
535 | +let assign_semantics tokens lex_sems group = | |
536 | + Xlist.iter group (fun id -> | |
537 | + let token = (ExtArray.get tokens id).token in | |
538 | + let t = ExtArray.get lex_sems id in | |
539 | + let t = match token with | |
540 | + Lemma(lemma,"subst",_) -> {t with lroles=(try StringMap.find subst_inst_roles lemma with Not_found -> "",""); semantics=try StringMap.find subst_special_lexemes lemma with Not_found -> Normal} | |
541 | + | Lemma(lemma,"depr",_) -> {t with semantics=try StringMap.find subst_special_lexemes lemma with Not_found -> Normal} | |
542 | + | Lemma(lemma,"adj",_) -> {t with lroles=(try StringMap.find adj_roles lemma with Not_found -> "",""); semantics=try StringMap.find adj_special_lexemes lemma with Not_found -> Normal} | |
543 | + | Lemma(lemma,"adjc",_) -> {t with lroles=(try StringMap.find adj_roles lemma with Not_found -> "",""); semantics=try StringMap.find adj_special_lexemes lemma with Not_found -> Normal} | |
544 | + | Lemma(lemma,"adjp",_) -> {t with lroles=(try StringMap.find adj_roles lemma with Not_found -> "",""); semantics=try StringMap.find adj_special_lexemes lemma with Not_found -> Normal} | |
545 | + | Lemma(lemma,"adv",_) -> {t with lroles=(try StringMap.find adv_roles lemma with Not_found -> "",""); semantics=try StringMap.find adv_special_lexemes lemma with Not_found -> Normal} | |
546 | + | Lemma(lemma,"qub",_) -> {t with lroles=(try StringMap.find qub_roles lemma with Not_found -> "",""); semantics=try StringMap.find qub_special_lexemes lemma with Not_found -> Normal} | |
545 | 547 | (* | Lemma(lemma,"num",_) -> [{t with semantics=try StringMap.find num_lexemes lemma with Not_found -> Normal}] *) |
546 | - | Lemma(lemma,"ppron12",_) -> [{t with semantics=try StringMap.find pron_lexemes lemma with Not_found -> Normal}] | |
547 | - | Lemma(lemma,"ppron3",_) -> [{t with semantics=try StringMap.find pron_lexemes lemma with Not_found -> Normal}] | |
548 | - | Lemma(lemma,"siebie",_) -> [{t with semantics=try StringMap.find pron_lexemes lemma with Not_found -> Normal}] | |
549 | - | Lemma(lemma,"prep",l) -> Xlist.fold l [] (fun l -> function cases :: _ -> assign_prep_semantics lemma cases t @ l | [] -> l) | |
550 | - | _ -> [t] | |
551 | - ))) | |
552 | -*) | |
548 | + | Lemma(lemma,"ppron12",_) -> {t with semantics=try StringMap.find pron_lexemes lemma with Not_found -> Normal} | |
549 | + | Lemma(lemma,"ppron3",_) -> {t with semantics=try StringMap.find pron_lexemes lemma with Not_found -> Normal} | |
550 | + | Lemma(lemma,"siebie",_) -> {t with semantics=try StringMap.find pron_lexemes lemma with Not_found -> Normal} | |
551 | + | Lemma(lemma,"prep",l) -> | |
552 | + let cases = Xlist.fold l StringSet.empty (fun set -> function cases :: _ -> Xlist.fold cases set StringSet.add | _ -> set) in | |
553 | + {t with semantics=assign_prep_semantics lemma (StringSet.to_list cases) t} | |
554 | + | _ -> t in | |
555 | + ExtArray.set lex_sems id t) | |
... | ... |
lexSemantics/ENIAMlexSemanticsTypes.ml
... | ... | @@ -33,7 +33,7 @@ type semantics = |
33 | 33 | | Special of string list |
34 | 34 | (* | SpecialNoun of type_arg list * type_term |
35 | 35 | | SpecialMod of string * (type_arg list * type_term)*) |
36 | - | PrepSemantics of (string * string * StringSet.t * string list) list (* role,role_attr,hipero,sel_prefs *) | |
36 | + | PrepSemantics of (string * string * string * StringSet.t * string list) list (* case,role,role_attr,hipero,sel_prefs *) | |
37 | 37 | |
38 | 38 | type lex_sem = { |
39 | 39 | e: labels; |
... | ... |
lexSemantics/eniam-lexSemantics-1.0.tar.bz2
0 → 100644
No preview for this file type
lexSemantics/makefile
... | ... | @@ -3,7 +3,7 @@ OCAMLOPT=ocamlopt |
3 | 3 | OCAMLDEP=ocamldep |
4 | 4 | INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I +eniam |
5 | 5 | OCAMLFLAGS=$(INCLUDES) -g |
6 | -OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa eniam-plWordnet.cmxa eniam-walenty.cmxa #eniam-lexSemantics.cmxa | |
6 | +OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa eniam-plWordnet.cmxa eniam-walenty.cmxa eniam-lexSemantics.cmxa | |
7 | 7 | INSTALLDIR=`ocamlc -where`/eniam |
8 | 8 | |
9 | 9 | SOURCES= ENIAMlexSemanticsTypes.ml ENIAMlexSemanticsData.ml ENIAMlexSemantics.ml |
... | ... | @@ -27,7 +27,7 @@ eniam-lexSemantics.cmxa: $(SOURCES) |
27 | 27 | ocamlopt -linkall -a -o eniam-lexSemantics.cmxa $(INCLUDES) $^ |
28 | 28 | |
29 | 29 | test: test.ml |
30 | - $(OCAMLOPT) -o test $(OCAMLOPTFLAGS) $(SOURCES) test.ml | |
30 | + $(OCAMLOPT) -o test $(OCAMLOPTFLAGS) test.ml | |
31 | 31 | |
32 | 32 | .SUFFIXES: .mll .mly .ml .mli .cmo .cmi .cmx |
33 | 33 | |
... | ... |
lexSemantics/test.ml
... | ... | @@ -21,6 +21,11 @@ |
21 | 21 | let test_strings = [ |
22 | 22 | "Szpak frunie zimą."; |
23 | 23 | "Kot miauczy w październiku."; |
24 | + "Np. Ala."; | |
25 | + "Kot np. miauczy."; | |
26 | + "Szpak frunie. Kot miauczy."; | |
27 | + "Szpak powiedział: „Frunę. Kiszę.”"; | |
28 | + "W XX w. Warszawa."; | |
24 | 29 | (* "a gdybym miałem"; |
25 | 30 | "A Gdy Miałem"; |
26 | 31 | "GDY MIAŁEM"; |
... | ... |
pre/makefile
1 | 1 | OCAMLC=ocamlc |
2 | 2 | OCAMLOPT=ocamlopt |
3 | 3 | OCAMLDEP=ocamldep |
4 | -INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I +eniam -I ../morphology -I ../parser -I ../corpora | |
4 | +INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I +eniam | |
5 | 5 | OCAMLFLAGS=$(INCLUDES) -g |
6 | -OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-plWordnet.cmxa eniam-walenty.cmxa | |
6 | +OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa eniam-plWordnet.cmxa eniam-walenty.cmxa eniam-integration.cmxa eniam-lexSemantics.cmxa | |
7 | 7 | INSTALLDIR=`ocamlc -where` |
8 | 8 | |
9 | -WAL= paths.ml preTypes.ml | |
10 | -PRE= preSemantics.ml ../corpora/CONLL.ml preProcessing.ml | |
9 | +WAL= paths.ml | |
10 | +PRE= preProcessing.ml | |
11 | 11 | |
12 | 12 | all: |
13 | 13 | $(OCAMLOPT) -o pre $(OCAMLOPTFLAGS) $(WAL) $(PRE) |
... | ... | @@ -33,4 +33,4 @@ all: |
33 | 33 | $(OCAMLOPT) $(OCAMLOPTFLAGS) -c $< |
34 | 34 | |
35 | 35 | clean: |
36 | - rm -f *~ *.cm[oix] *.o pre concraft_test | |
36 | + rm -f *~ *.cm[oix] *.o pre | |
... | ... |
pre/preProcessing.ml
... | ... | @@ -17,355 +17,7 @@ |
17 | 17 | * along with this program. If not, see <http://www.gnu.org/licenses/>. |
18 | 18 | *) |
19 | 19 | |
20 | -open Xstd | |
21 | -open PreTypes | |
22 | - | |
23 | -(* uwagi: | |
24 | -jak przetwarzać num:comp | |
25 | -czy rzeczownik niepoliczalny w liczbie mnogiej jest nadal niepoliczalny np. "Wody szumią." | |
26 | -trzeba zrobić słownik mwe, i nazw własnych | |
27 | -trzeba zweryfikować słownik niepoliczalnych | |
28 | -przetwarzanie liczebników złożonych np dwadzieścia jeden, jedna druga | |
29 | -*) | |
30 | - | |
31 | - | |
32 | - | |
33 | -(**********************************************************************************) | |
34 | - | |
35 | -let proper_names = | |
36 | - let l = Str.split_delim (Str.regexp "\n") (File.load_file Paths.proper_names_filename) in | |
37 | - let l2 = Str.split_delim (Str.regexp "\n") (File.load_file Paths.proper_names_filename2) in | |
38 | - Xlist.fold (l2 @ l) StringMap.empty (fun proper line -> | |
39 | - if String.length line = 0 then proper else | |
40 | - if String.get line 0 = '#' then proper else | |
41 | - match Str.split_delim (Str.regexp "\t") line with | |
42 | - [lemma; types] -> | |
43 | - let types = Str.split (Str.regexp "|") types in | |
44 | - StringMap.add_inc proper lemma types (fun types2 -> types @ types2) | |
45 | - | _ -> failwith ("proper_names: " ^ line)) | |
46 | - | |
47 | -let remove l s = | |
48 | - Xlist.fold l [] (fun l t -> | |
49 | - if s = t then l else t :: l) | |
50 | - | |
51 | -let find_proper_names paths = | |
52 | - List.rev (Xlist.rev_map paths (fun t -> | |
53 | - match t.token with | |
54 | - Lemma(lemma,pos,interp) -> | |
55 | - if StringMap.mem proper_names lemma then | |
56 | - {t with token=Proper(lemma,pos,interp,StringMap.find proper_names lemma); | |
57 | - attrs=remove t.attrs "notvalidated proper"} | |
58 | - else | |
59 | - if Xlist.mem t.attrs "notvalidated proper" then | |
60 | - {t with token=Proper(lemma,pos,interp,[])} | |
61 | - else t | |
62 | - | _ -> t)) | |
63 | - | |
64 | - | |
65 | -let assign_valence paths = | |
66 | - let lexemes = Xlist.fold paths StringMap.empty (fun lexemes t -> | |
67 | - match t.token with | |
68 | - Lemma(lemma,pos,_) -> | |
69 | - StringMap.add_inc lexemes lemma (StringSet.singleton pos) (fun set -> StringSet.add set pos) | |
70 | - | Proper(lemma,pos,_,_) -> | |
71 | - let pos = match pos with | |
72 | - "subst" -> "psubst" | |
73 | - | "depr" -> "pdepr" | |
74 | - | _ -> pos (*failwith ("assign_valence: Proper " ^ pos ^ " " ^ lemma)*) in | |
75 | - StringMap.add_inc lexemes lemma (StringSet.singleton pos) (fun set -> StringSet.add set pos) (* nazwy własne mają przypisywaną domyślną walencję rzeczowników *) | |
76 | - | _ -> lexemes) in | |
77 | - let valence = WalFrames.find_frames lexemes in | |
78 | - List.rev (Xlist.rev_map paths (fun t -> | |
79 | - match t.token with | |
80 | - Lemma(lemma,pos,_) -> {t with valence=try Xlist.rev_map (StringMap.find (StringMap.find valence lemma) pos) (fun frame -> 0,frame) with Not_found -> []} | |
81 | - | Proper(lemma,pos,interp,_) -> {t with valence=(try Xlist.rev_map (StringMap.find (StringMap.find valence lemma) | |
82 | - (if pos = "subst" || pos = "depr" then "p" ^ pos else pos)) (fun frame -> 0,frame) with Not_found -> [](*failwith ("assign_valence: Proper(" ^ lemma ^ "," ^ pos ^ ")")*)); | |
83 | - token=Lemma(lemma,pos,interp)} | |
84 | - | _ -> t)) | |
85 | - | |
86 | -(**********************************************************************************) | |
87 | - | |
88 | -(* let prepare_indexes (paths,_) = | |
89 | - let set = Xlist.fold paths IntSet.empty (fun set t -> | |
90 | - IntSet.add (IntSet.add set t.beg) t.next) in | |
91 | - let map,last = Xlist.fold (Xlist.sort (IntSet.to_list set) compare) (IntMap.empty,0) (fun (map,n) x -> | |
92 | - IntMap.add map x n, n+1) in | |
93 | - List.rev (Xlist.rev_map paths (fun t -> | |
94 | - {t with lnode=IntMap.find map t.beg; rnode=IntMap.find map t.next})), last - 1 *) | |
95 | - | |
96 | -let get_prefs_schema prefs schema = | |
97 | - Xlist.fold schema prefs (fun prefs t -> | |
98 | - Xlist.fold t.WalTypes.sel_prefs prefs StringSet.add) | |
99 | - | |
100 | -let map_prefs_schema senses schema = | |
101 | - Xlist.map schema (fun t -> | |
102 | - if Xlist.mem t.WalTypes.morfs (WalTypes.Phrase WalTypes.Pro) || Xlist.mem t.WalTypes.morfs (WalTypes.Phrase WalTypes.ProNG) then t else | |
103 | - {t with WalTypes.sel_prefs = Xlist.fold t.WalTypes.sel_prefs [] (fun l s -> | |
104 | - if StringSet.mem senses s then s :: l else l)}) | |
105 | - | |
106 | -let disambiguate_senses paths = | |
107 | - let prefs = Xlist.fold paths (StringSet.singleton "ALL") (fun prefs t -> | |
108 | - Xlist.fold t.valence prefs (fun prefs -> function | |
109 | - _,WalTypes.Frame(_,schema) -> get_prefs_schema prefs schema | |
110 | - | _,WalTypes.LexFrame(_,_,_,schema) -> get_prefs_schema prefs schema | |
111 | - | _,WalTypes.ComprepFrame(_,_,_,schema) -> get_prefs_schema prefs schema)) in | |
112 | - let hipero = Xlist.fold paths (StringSet.singleton "ALL") (fun hipero t -> | |
113 | - Xlist.fold t.senses hipero (fun hipero (_,l,_) -> | |
114 | - Xlist.fold l hipero StringSet.add)) in | |
115 | - let senses = StringSet.intersection prefs hipero in | |
116 | - let is_zero = StringSet.mem hipero "0" in | |
117 | - let senses = if is_zero then StringSet.add senses "0" else senses in | |
118 | - Xlist.map paths (fun t -> | |
119 | - {t with valence = if is_zero then t.valence else | |
120 | - Xlist.map t.valence (function | |
121 | - n,WalTypes.Frame(a,schema) -> n,WalTypes.Frame(a,map_prefs_schema senses schema) | |
122 | - | n,WalTypes.LexFrame(s,p,r,schema) -> n,WalTypes.LexFrame(s,p,r,map_prefs_schema senses schema) | |
123 | - | n,WalTypes.ComprepFrame(s,p,r,schema) -> n,WalTypes.ComprepFrame(s,p,r,map_prefs_schema senses schema)); | |
124 | - senses = Xlist.map t.senses (fun (s,l,w) -> | |
125 | - s, List.rev (Xlist.fold l [] (fun l s -> if StringSet.mem senses s then s :: l else l)),w)}) | |
126 | - | |
127 | -(*let single_sense (paths,last) = | |
128 | - List.rev (Xlist.rev_map paths (fun t -> | |
129 | - let sense = | |
130 | - if t.senses = [] then [] else | |
131 | - [Xlist.fold t.senses ("",[],-.max_float) (fun (max_meaning,max_hipero,max_weight) (meaning,hipero,weight) -> | |
132 | - if max_weight >= weight then max_meaning,max_hipero,max_weight else meaning,hipero,weight)] in | |
133 | - {t with senses=sense})), last*) | |
134 | - | |
135 | -open WalTypes | |
136 | - | |
137 | -(*let single_schema schemata = | |
138 | - let map = Xlist.fold schemata StringMap.empty (fun map schema -> | |
139 | - let t = WalStringOf.schema (List.sort compare (Xlist.fold schema [] (fun l s -> | |
140 | - if s.gf <> ARG && s.gf <> ADJUNCT then {s with role=""; role_attr=""; sel_prefs=[]} :: l else | |
141 | - if s.cr <> [] || s.ce <> [] then {s with role=""; role_attr=""; sel_prefs=[]} :: l else l))) in | |
142 | - StringMap.add_inc map t [schema] (fun l -> schema :: l)) in | |
143 | - StringMap.fold map [] (fun l _ schemata -> | |
144 | - let map = Xlist.fold schemata StringMap.empty (fun map schema -> | |
145 | - Xlist.fold schema map (fun map s -> | |
146 | - let t = WalStringOf.schema [{s with role=""; role_attr=""; sel_prefs=[]}] in | |
147 | - StringMap.add_inc map t [s] (fun l -> s :: l))) in | |
148 | - let schema = StringMap.fold map [] (fun schema _ l -> | |
149 | - let s = List.hd l in | |
150 | - {s with sel_prefs=Xlist.fold s.sel_prefs [] (fun l t -> if t = "0" || t = "T" then t :: l else l)} :: schema) in | |
151 | - schema :: l)*) | |
152 | - | |
153 | -let remove_meaning = function | |
154 | - DefaultAtrs(m,r,o,neg,p,a) -> DefaultAtrs([],r,o,neg,p,a) | |
155 | - | EmptyAtrs m -> EmptyAtrs [] | |
156 | - | NounAtrs(m,nsyn,s(*,typ*)) -> NounAtrs([],nsyn,s(*,typ*)) | |
157 | - | AdjAtrs(m,c,adjsyn(*,adjsem,typ*)) -> AdjAtrs([],c,adjsyn(*,adjsem,typ*)) | |
158 | - | PersAtrs(m,le,neg,mo,t,au,a) -> PersAtrs([],le,neg,mo,t,au,a) | |
159 | - | GerAtrs(m,le,neg,a) -> GerAtrs([],le,neg,a) | |
160 | - | NonPersAtrs(m,le,role,role_attr,neg,a) -> NonPersAtrs([],le,role,role_attr,neg,a) | |
161 | - | _ -> failwith "remove_meaning" | |
162 | - | |
163 | - | |
164 | -(*let single_frame (paths,last) = | |
165 | - List.rev (Xlist.rev_map paths (fun t -> | |
166 | - let lex_frames,frames = Xlist.fold t.valence ([],StringMap.empty) (fun (lex_frames,frames) -> function | |
167 | - Frame(attrs,schema) -> | |
168 | - let attrs = remove_meaning attrs in | |
169 | - lex_frames, StringMap.add_inc frames (WalStringOf.frame_atrs attrs) (attrs,[schema]) (fun (_,l) -> attrs, schema :: l) | |
170 | - | frame -> frame :: lex_frames, frames) in | |
171 | - let frames = StringMap.fold frames lex_frames (fun frames _ (attrs,schemata) -> | |
172 | - Xlist.fold (single_schema schemata) frames (fun frames frame -> Frame(attrs,frame) :: frames)) in | |
173 | - {t with valence=frames})), last *) | |
174 | - | |
175 | -let simplify_position_verb l = function (* FIXME: dodać czyszczenie E Pro *) | |
176 | - Phrase(NP(Case "dat")) -> l | |
177 | - | Phrase(NP(Case "inst")) -> l | |
178 | - | Phrase(PrepNP _) -> l | |
179 | - | Phrase(PrepAdjP _) -> l | |
180 | - | Phrase(NumP (Case "dat")) -> l | |
181 | - | Phrase(NumP (Case "inst")) -> l | |
182 | - | Phrase(PrepNumP _) -> l | |
183 | - | Phrase(ComprepNP _) -> l | |
184 | - | Phrase(ComparNP _) -> l | |
185 | - | Phrase(ComparPP _) -> l | |
186 | - | Phrase(IP) -> l | |
187 | - | Phrase(CP _) -> l | |
188 | - | Phrase(NCP(Case "dat",_,_)) -> l | |
189 | - | Phrase(NCP(Case "inst",_,_)) -> l | |
190 | - | Phrase(PrepNCP _) -> l | |
191 | -(* | Phrase(PadvP) -> l *) | |
192 | - | Phrase(AdvP) -> l | |
193 | - | Phrase(PrepP) -> l | |
194 | - | Phrase(Or) -> l | |
195 | - | Phrase(Qub) -> l | |
196 | - | Phrase(Adja) -> l | |
197 | - | Phrase(Inclusion) -> l | |
198 | - | Phrase Pro -> Phrase Null :: l | |
199 | - | t -> t :: l | |
200 | - | |
201 | -let simplify_position_noun l = function | |
202 | - Phrase(NP(Case "gen")) -> l | |
203 | - | Phrase(NP(Case "nom")) -> l | |
204 | - | Phrase(NP(CaseAgr)) -> l | |
205 | - | Phrase(PrepNP _) -> l | |
206 | - | Phrase(AdjP AllAgr) -> l | |
207 | - | Phrase(NumP (Case "gen")) -> l | |
208 | - | Phrase(NumP (Case "nom")) -> l | |
209 | - | Phrase(NumP (CaseAgr)) -> l | |
210 | - | Phrase(PrepNumP _) -> l | |
211 | - | Phrase(ComprepNP _) -> l | |
212 | - | Phrase(ComparNP _) -> l | |
213 | - | Phrase(ComparPP _) -> l | |
214 | - | Phrase(IP) -> l | |
215 | - | Phrase(NCP(Case "gen",_,_)) -> l | |
216 | - | Phrase(PrepNCP _) -> l | |
217 | - | Phrase(PrepP) -> l | |
218 | - | Phrase(Qub) -> l | |
219 | - | Phrase(Adja) -> l | |
220 | - | Phrase(Inclusion) -> l | |
221 | - | Phrase Pro -> Phrase Null :: l | |
222 | - | t -> t :: l | |
223 | - | |
224 | -let simplify_position_adj l = function | |
225 | - Phrase(AdvP) -> l | |
226 | - | t -> t :: l | |
227 | - | |
228 | -let simplify_position_adv l = function | |
229 | - Phrase(AdvP) -> l | |
230 | - | t -> t :: l | |
231 | - | |
232 | - | |
233 | -let simplify_position pos l s = | |
234 | - let morfs = match pos with | |
235 | - "verb" -> List.rev (Xlist.fold s.morfs [] simplify_position_verb) | |
236 | - | "noun" -> List.rev (Xlist.fold s.morfs [] simplify_position_noun) | |
237 | - | "adj" -> List.rev (Xlist.fold s.morfs [] simplify_position_adj) | |
238 | - | "adv" -> List.rev (Xlist.fold s.morfs [] simplify_position_adv) | |
239 | - | _ -> s.morfs in | |
240 | - match morfs with | |
241 | - [] -> l | |
242 | - | [Phrase Null] -> l | |
243 | - | _ -> {s with morfs=morfs} :: l | |
244 | - | |
245 | -let simplify_schemata pos schemata = | |
246 | - let schemata = Xlist.fold schemata StringMap.empty (fun schemata (schema,frame) -> | |
247 | - let schema = List.sort compare (Xlist.fold schema [] (fun l s -> | |
248 | - let s = {s with role=""; role_attr=""; sel_prefs=[]; cr=[]; ce=[]; morfs=List.sort compare s.morfs} in | |
249 | - if s.gf <> ARG && s.gf <> ADJUNCT then s :: l else | |
250 | -(* if s.cr <> [] || s.ce <> [] then s :: l else *) | |
251 | - simplify_position pos l s)) in | |
252 | - StringMap.add_inc schemata (WalStringOf.schema schema) (schema,[frame]) (fun (_,frames) -> schema, frame :: frames)) in | |
253 | - StringMap.fold schemata [] (fun l _ s -> s :: l) | |
254 | - | |
255 | -(* FIXME: problem ComprepNP i PrepNCP *) | |
256 | -(* FIXME: problem gdy ten sam token występuje w kilku ścieżkach *) | |
257 | -let generate_verb_prep_adjuncts preps = | |
258 | - Xlist.map preps (fun (lemma,case) -> WalFrames.verb_prep_adjunct_schema_field lemma case) | |
259 | - | |
260 | -let generate_verb_comprep_adjuncts compreps = | |
261 | - Xlist.map compreps (fun lemma -> WalFrames.verb_comprep_adjunct_schema_field lemma) | |
262 | - | |
263 | -let generate_verb_compar_adjuncts compars = | |
264 | - Xlist.map compars (fun lemma -> WalFrames.verb_compar_adjunct_schema_field lemma) | |
265 | - | |
266 | -let generate_noun_prep_adjuncts preps = | |
267 | - WalFrames.noun_prep_adjunct_schema_field preps | |
268 | - | |
269 | -let generate_noun_compar_adjuncts compars = | |
270 | - WalFrames.noun_compar_adjunct_schema_field compars | |
271 | - | |
272 | -let generate_adj_compar_adjuncts compars = | |
273 | - WalFrames.noun_compar_adjunct_schema_field compars | |
274 | - | |
275 | -let compars = StringSet.of_list ["jak";"jako";"niż";"niczym";"niby";"co"] | |
276 | - | |
277 | -let generate_prep_adjunct_tokens paths = | |
278 | - let map = Xlist.fold paths StringMap.empty (fun map t -> | |
279 | - match t.token with | |
280 | - Lemma(lemma,"prep",interp) -> | |
281 | - let map = if lemma = "po" then StringMap.add map "po:postp" ("po","postp") else map in | |
282 | - if StringSet.mem compars lemma then map else | |
283 | - Xlist.fold interp map (fun map -> function | |
284 | - [cases] -> Xlist.fold cases map (fun map case -> StringMap.add map (lemma ^ ":" ^ case) (lemma,case)) | |
285 | - | [cases;_] -> Xlist.fold cases map (fun map case -> StringMap.add map (lemma ^ ":" ^ case) (lemma,case)) | |
286 | - | _ -> map) | |
287 | - | _ -> map) in | |
288 | - StringMap.fold map [] (fun l _ v -> v :: l) | |
289 | - | |
290 | -let generate_comprep_adjunct_tokens paths = | |
291 | - let lemmas = Xlist.fold paths StringSet.empty (fun lemmas t -> | |
292 | - match t.token with | |
293 | - Lemma(lemma,_,_) -> StringSet.add lemmas lemma | |
294 | - | _ -> lemmas) in | |
295 | - StringMap.fold WalFrames.comprep_reqs [] (fun compreps comprep reqs -> | |
296 | - let b = Xlist.fold reqs true (fun b s -> b && StringSet.mem lemmas s) in | |
297 | - if b then comprep :: compreps else compreps) | |
298 | - | |
299 | -let generate_compar_adjunct_tokens paths = | |
300 | - let set = Xlist.fold paths StringSet.empty (fun set t -> | |
301 | - match t.token with | |
302 | - Lemma(lemma,"prep",interp) -> | |
303 | - if not (StringSet.mem compars lemma) then set else | |
304 | - StringSet.add set lemma | |
305 | - | _ -> set) in | |
306 | - StringSet.to_list set | |
307 | - | |
308 | -let is_measure = function | |
309 | - NounAtrs(_,_,Common "measure") -> true | |
310 | - | _ -> false | |
311 | - | |
312 | -let assign_simplified_valence paths = | |
313 | - let preps = generate_prep_adjunct_tokens paths in | |
314 | - let compreps = generate_comprep_adjunct_tokens paths in | |
315 | - let compars = generate_compar_adjunct_tokens paths in | |
316 | - let verb_prep_adjuncts = generate_verb_prep_adjuncts preps in | |
317 | - let verb_comprep_adjuncts = generate_verb_comprep_adjuncts compreps in | |
318 | - let verb_compar_adjuncts = generate_verb_compar_adjuncts compars in | |
319 | - let noun_prep_adjuncts = generate_noun_prep_adjuncts preps compreps in | |
320 | - let noun_compar_adjuncts = generate_noun_compar_adjuncts compars in | |
321 | - let adj_compar_adjuncts = generate_adj_compar_adjuncts compars in | |
322 | - let verb_adjuncts = WalFrames.verb_adjuncts_simp @ verb_prep_adjuncts @ verb_comprep_adjuncts @ verb_compar_adjuncts in | |
323 | - let noun_adjuncts = WalFrames.noun_adjuncts_simp @ [noun_prep_adjuncts] @ [noun_compar_adjuncts] in | |
324 | - let noun_measure_adjuncts = WalFrames.noun_measure_adjuncts_simp @ [noun_prep_adjuncts] @ [noun_compar_adjuncts] in | |
325 | - let adj_adjuncts = WalFrames.adj_adjuncts_simp @ [adj_compar_adjuncts] in | |
326 | - let adv_adjuncts = WalFrames.adv_adjuncts_simp @ [adj_compar_adjuncts] in | |
327 | - List.rev (Xlist.rev_map paths (fun t -> | |
328 | - let pos = match t.token with | |
329 | - Lemma(_,pos,_) -> WalFrames.simplify_pos pos | |
330 | - | _ -> "" in | |
331 | - let lex_frames,frames = Xlist.fold t.valence ([],StringMap.empty) (fun (lex_frames,frames) -> function | |
332 | - _,(Frame(attrs,schema) as frame) -> | |
333 | - let attrs = remove_meaning attrs in | |
334 | - lex_frames, StringMap.add_inc frames (WalStringOf.frame_atrs attrs) (attrs,[schema,frame]) (fun (_,l) -> attrs, (schema,frame) :: l) | |
335 | - | _,frame -> frame :: lex_frames, frames) in | |
336 | - let simp_frames,full_frames,n = Xlist.fold lex_frames ([],[],1) (fun (simp_frames,full_frames,n) frame -> | |
337 | - (n,frame) :: simp_frames, (n,frame) :: full_frames, n+1) in | |
338 | - let simp_frames,full_frames,_ = StringMap.fold frames (simp_frames,full_frames,n) (fun (simp_frames,full_frames,n) _ (attrs,schemata) -> | |
339 | - Xlist.fold (simplify_schemata pos schemata) (simp_frames,full_frames,n) (fun (simp_frames,full_frames,n) (schema,frames) -> | |
340 | - let schema = match pos with | |
341 | - "verb" -> schema @ verb_adjuncts | |
342 | - | "noun" -> schema @ (if is_measure attrs then noun_measure_adjuncts else noun_adjuncts) | |
343 | - | "adj" -> schema @ adj_adjuncts | |
344 | - | "adv" -> schema @ adv_adjuncts | |
345 | - | _ -> schema in | |
346 | - (n,Frame(attrs,schema)) :: simp_frames, | |
347 | - Xlist.fold frames full_frames (fun full_frames frame -> (n,frame) :: full_frames), | |
348 | - n+1)) in | |
349 | - {t with simple_valence=simp_frames; valence=full_frames})) | |
350 | - | |
351 | -(* FIXME: dodać do walencji preferencje selekcyjne nadrzędników symboli: dzień, godzina, rysunek itp. *) | |
352 | -(* FIXME: sprawdzić czy walencja nazw własnych jest dobrze zrobiona. *) | |
353 | - | |
354 | -(* let first_id = 1 (* id=0 jest zarezerwowane dla pro; FIXME: czy to jest jeszcze aktualne? *) | |
355 | - | |
356 | -let add_ids (paths,last) next_id = | |
357 | - let paths,next_id = Xlist.fold ((*List.rev*) paths) ([],next_id) (fun (paths,id) t -> | |
358 | - {t with id=id} :: paths, id+1) in | |
359 | - (paths,last),next_id *) | |
360 | - | |
361 | -let assign_senses paths = (* FIXME: sensy zawierające 'się' *) | |
362 | - List.rev (Xlist.rev_map paths (fun t -> | |
363 | - match t.token with | |
364 | - Lemma(lemma,pos,_) -> {t with senses=ENIAMplWordnet.find_senses lemma pos} | |
365 | - | Proper(_,_,_,senses) -> {t with senses=ENIAMplWordnet.find_proper_senses senses} | |
366 | - | _ -> t)) | |
367 | - | |
368 | - | |
20 | +(* | |
369 | 21 | let parse query = |
370 | 22 | (* print_endline "a1"; *) |
371 | 23 | let l = Xunicode.classified_chars_of_utf8_string query in |
... | ... | @@ -442,107 +94,12 @@ let parse_conll tokens dep_paths = (* FIXME: sprawdzić, czy zachowana jest kole |
442 | 94 | conll_id + 1) in |
443 | 95 | () |
444 | 96 | |
445 | -(* | |
446 | -UWAGA: Aby korzytać z concrafta trzeba najpierw postawić serwer wpisując z linii poleceń: | |
447 | -concraft-pl server --inmodel ../concraft/nkjp-model-0.2.gz | |
448 | -*) | |
449 | - | |
450 | -let read_whole_channel c = | |
451 | - let r = ref [] in | |
452 | - try | |
453 | - while true do | |
454 | - r := (input_line c) :: !r | |
455 | - done; | |
456 | - !r | |
457 | - with End_of_file -> List.rev (!r) | |
458 | - | |
459 | -let rec process_concraft_result orth lemma interp others rev = function | |
460 | - [] -> List.rev ((orth,(lemma,interp) :: others) :: rev) | |
461 | - | "" :: l -> process_concraft_result orth lemma interp others rev l | |
462 | - | line :: l -> | |
463 | - (match Xstring.split_delim "\t" line with | |
464 | - [orth2;s] when s = "none" || s = "space" -> | |
465 | - if orth = "" then process_concraft_result orth2 lemma interp others rev l | |
466 | - else process_concraft_result orth2 "" "" [] ((orth,(lemma,interp) :: others) :: rev) l | |
467 | - | ["";lemma2;interp2] -> process_concraft_result orth lemma interp ((lemma2,interp2) :: others) rev l | |
468 | - | ["";lemma;interp;"disamb"] -> process_concraft_result orth lemma interp others rev l | |
469 | - | _ -> failwith ("process_concraft_result: " ^ line)) | |
470 | - | |
471 | -let concraft_parse s = | |
472 | - let concraft_in, concraft_out, concraft_err = | |
473 | - Unix.open_process_full ("echo \"" ^ s ^ "\" | concraft-pl client") | |
474 | - [|"PATH=" ^ Sys.getenv "PATH"; "LANG=en_GB.UTF-8"|] in | |
475 | - let err_msg = String.concat "\n" (read_whole_channel concraft_err) in | |
476 | - let result = read_whole_channel concraft_in in | |
477 | - if err_msg <> "" then failwith err_msg else | |
478 | - process_concraft_result "" "" "" [] [] result | |
479 | - | |
480 | -(*let rec load_concraft_sentence white orth rev ic = | |
481 | - (* print_endline "load_concraft_sentence 1"; *) | |
482 | - (* print_endline ("concraft error message: " ^ input_line concraft_err); *) | |
483 | - let s = input_line ic in | |
484 | - (* print_endline ("load_concraft_sentence: " ^ s); *) | |
485 | - if s = "" then List.rev rev else | |
486 | - match Xstring.split_delim "\t" s with | |
487 | - [""; lemma; interp; "disamb"] -> load_concraft_sentence "" "" ((white,orth,lemma,interp) :: rev) ic | |
488 | - | [""; lemma; interp] -> load_concraft_sentence white orth rev ic | |
489 | - | [orth; white] -> load_concraft_sentence white orth rev ic | |
490 | - | _ -> failwith ("load_concraft_sentence: " ^ s)*) | |
491 | - | |
492 | -let make_token (orth,l) = | |
493 | - if l = [] then failwith "make_token 1" else | |
494 | - let lemma,interp = List.hd l in | |
495 | - let cat,interp = match Xstring.split ":" interp with | |
496 | - cat :: l -> cat, [Xlist.map l (fun tag -> [tag])] | |
497 | - | _ -> failwith ("make_token 2: " ^ orth ^ " " ^ lemma ^ " " ^ interp) in | |
498 | - {empty_token with orth = orth; token = Lemma(lemma,cat,interp)} | |
499 | - | |
500 | -let parse_mate tokens pbeg s = | |
501 | - (* print_endline ("parse_mate: " ^ s); *) | |
502 | - (* Printf.fprintf concraft_out "%s\n\n%!" s; | |
503 | - let l = load_concraft_sentence "" "" [] concraft_in in *) | |
504 | - let l = concraft_parse s in | |
505 | - let l = Xlist.map l make_token in | |
506 | - let l = {empty_token with token = Interp "<conll_root>"} :: l in | |
507 | - let l = Xlist.map l (fun t -> ExtArray.add tokens t,-1,"") in | |
508 | - let _ = CONLL.establish_for_token pbeg s tokens (List.tl l) in | |
509 | - let dep_paths = Array.of_list l in | |
510 | - parse_conll tokens dep_paths; | |
511 | - dep_paths | |
512 | - | |
513 | -let rec parse_mate_sentence tokens mode pbeg = function | |
514 | - RawSentence s -> if mode <> Mate || not Paths.config.Paths.concraft_enabled then RawSentence s else DepSentence (parse_mate tokens pbeg s) | |
515 | - | StructSentence(paths,last) -> StructSentence(paths,last) | |
516 | - | DepSentence(paths) -> DepSentence(paths) | |
517 | - | QuotedSentences sentences -> | |
518 | - QuotedSentences(Xlist.map sentences (fun p -> | |
519 | - {pid=p.PreTypes.pid; pbeg=p.PreTypes.pbeg; plen=p.PreTypes.plen; pnext=p.PreTypes.pnext; pfile_prefix=p.PreTypes.pfile_prefix; | |
520 | - psentence=parse_mate_sentence tokens mode pbeg p.PreTypes.psentence})) | |
521 | - | AltSentence l -> AltSentence(Xlist.map l (fun (mode,sentence) -> | |
522 | - mode, parse_mate_sentence tokens mode pbeg sentence)) | |
523 | - | |
524 | -let parse_mate_sentences tokens sentences = | |
525 | - Xlist.map sentences (fun p -> | |
526 | - {p with psentence=parse_mate_sentence tokens Struct p.pbeg p.psentence}) | |
527 | - | |
528 | 97 | let parse_text = function |
529 | 98 | RawText query -> |
530 | - (* print_endline ("parse_text: " ^ query); *) | |
531 | - let tokens = ExtArray.make 100 empty_token in | |
532 | - let _ = ExtArray.add tokens empty_token in (* id=0 jest zarezerwowane dla pro; FIXME: czy to jest jeszcze aktualne? *) | |
533 | - let paragraphs = Xstring.split "\n\\|\r" query in | |
534 | - let paragraphs = List.rev (Xlist.fold paragraphs [] (fun l -> function "" -> l | s -> s :: l)) in | |
535 | - let n = if Xlist.size paragraphs = 1 then 0 else 1 in | |
536 | - let paragraphs,_ = Xlist.fold paragraphs ([],n) (fun (paragraphs,n) paragraph -> | |
537 | - let paths = parse paragraph in | |
538 | - (* print_endline "parse_text 1"; *) | |
539 | - let pid = if n = 0 then "" else string_of_int n ^ "_" in | |
540 | - let sentences = PreSentences.split_into_sentences pid paragraph tokens paths in | |
541 | - (* print_endline "parse_text 2"; *) | |
542 | - let sentences = parse_mate_sentences tokens sentences in | |
543 | - (* print_endline "parse_text 3"; *) | |
544 | - (AltParagraph[Raw,RawParagraph paragraph; Struct,StructParagraph sentences]) :: paragraphs, n+1) in | |
545 | - AltText[Raw,RawText query; Struct,StructText(List.rev paragraphs, tokens)] | |
99 | + let text,tokens = ENIAMsubsyntax.parse_text query in | |
100 | + let text = ENIAMpreIntegration.parse_text ENIAMsubsyntaxTypes.Struct tokens text in | |
101 | + let lex_sems = ENIAMlexSemantics.assign tokens text in | |
102 | + text,tokens,lex_sems | |
546 | 103 | | AltText[Raw,RawText query;CONLL,StructText([ |
547 | 104 | StructParagraph[{psentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence dep_paths]} as p]],tokens)] -> |
548 | 105 | parse_conll tokens dep_paths; |
... | ... | @@ -553,28 +110,54 @@ let parse_text = function |
553 | 110 | @ if Paths.config.Paths.mate_parser_enabled then [Mate, DepSentence m_dep_paths] else [])}] in |
554 | 111 | AltText[Raw,RawText query; Struct, StructText([ |
555 | 112 | AltParagraph[Raw,RawParagraph query; ENIAM, StructParagraph sentences; CONLL, conll]],tokens)] |
113 | + | _ -> failwith "parse_text: not implemented"*) | |
114 | + | |
115 | +open ENIAMsubsyntaxTypes | |
116 | + | |
117 | +let parse_text = function | |
118 | + RawText query,_ -> | |
119 | + let text,tokens = ENIAMsubsyntax.parse_text query in | |
120 | + let text = ENIAMpreIntegration.parse_text ENIAMsubsyntaxTypes.Struct tokens text in | |
121 | + let lex_sems = ENIAMlexSemantics.assign tokens text in | |
122 | + text,tokens,lex_sems | |
123 | + | AltText[Raw,RawText query;CONLL,StructText[ | |
124 | + StructParagraph[{psentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence dep_paths]} as p]]],tokens -> | |
125 | + let m_dep_paths = Array.map (fun (id,_,_) -> id,-1,"") dep_paths in | |
126 | + let conll = StructParagraph[{p with psentence = AltSentence([Raw, RawSentence text; CONLL, DepSentence dep_paths] | |
127 | + @ if Paths.config.Paths.mate_parser_enabled then [Mate, DepSentence m_dep_paths] else [])}] in | |
128 | + let paths = ENIAMsubsyntax.parse query in | |
129 | + let sentences = ENIAMsentences.split_into_sentences "" query tokens paths in | |
130 | + let text = AltText[Raw,RawText query; Struct, StructText([ | |
131 | + AltParagraph[Raw,RawParagraph query; ENIAM, StructParagraph sentences; CONLL, conll]])] in | |
132 | + let lex_sems = ENIAMlexSemantics.assign tokens text in | |
133 | + text,tokens,lex_sems | |
556 | 134 | | _ -> failwith "parse_text: not implemented" |
557 | 135 | |
558 | 136 | let rec main_loop in_chan out_chan = |
559 | 137 | (* print_endline "main_loop 1"; *) |
560 | - let query = (Marshal.from_channel in_chan : text) in | |
138 | + let query = (Marshal.from_channel in_chan : text * ENIAMtokenizerTypes.token_record ExtArray.t) in | |
561 | 139 | (* print_endline "main_loop 2"; *) |
562 | - if query = RawText "" then () else ( | |
140 | + if fst query = RawText "" then () else ( | |
563 | 141 | (try |
564 | 142 | (* let time0 = Sys.time () in *) |
565 | 143 | let utime0 = Unix.gettimeofday () in |
566 | 144 | (* print_endline "main_loop 3a"; *) |
567 | - let text = parse_text query in | |
145 | + let text,tokens,lex_sems = parse_text query in | |
568 | 146 | (* print_endline "main_loop 4a"; *) |
569 | 147 | (* let time2 = Sys.time () in *) |
570 | 148 | let utime2 = Unix.gettimeofday () in |
571 | 149 | (* Printf.printf "time=%f utime=%f\n%!" (time2 -. time0) (utime2 -. utime0); *) |
572 | - Marshal.to_channel out_chan (text(*paths,last,next_id*),"",utime2 -. utime0) []; | |
150 | + Marshal.to_channel out_chan (text,tokens,lex_sems,"",utime2 -. utime0) []; | |
573 | 151 | (* print_endline "main_loop 5"; *) |
574 | 152 | () |
575 | 153 | with e -> ( |
576 | 154 | (* print_endline "main_loop 7"; *) |
577 | - Marshal.to_channel out_chan (RawText ""(*[],0*),Printexc.to_string e,0.) [])); | |
155 | + Marshal.to_channel out_chan ( | |
156 | + RawText "", | |
157 | + ExtArray.make 1 ENIAMtokenizerTypes.empty_token, | |
158 | + ExtArray.make 1 ENIAMlexSemanticsTypes.empty_lex_sem, | |
159 | + Printexc.to_string e, | |
160 | + 0.) [])); | |
578 | 161 | (* print_endline "main_loop 6"; *) |
579 | 162 | flush out_chan; |
580 | 163 | main_loop in_chan out_chan) |
... | ... |
pre/preTypes.ml deleted
1 | -(* | |
2 | - * ENIAM: Categorial Syntactic-Semantic Parser for Polish | |
3 | - * Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl> | |
4 | - * Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences | |
5 | - * | |
6 | - * This program is free software: you can redistribute it and/or modify | |
7 | - * it under the terms of the GNU General Public License as published by | |
8 | - * the Free Software Foundation, either version 3 of the License, or | |
9 | - * (at your option) any later version. | |
10 | - * | |
11 | - * This program is distributed in the hope that it will be useful, | |
12 | - * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | - * GNU General Public License for more details. | |
15 | - * | |
16 | - * You should have received a copy of the GNU General Public License | |
17 | - * along with this program. If not, see <http://www.gnu.org/licenses/>. | |
18 | - *) | |
19 | - | |
20 | -open Xstd | |
21 | - | |
22 | -(*let single_sense_flag = ref false(*true*) | |
23 | -let single_frame_flag = ref false(*true*)*) | |
24 | - | |
25 | -(*type pos_record = {interp: string list list list; attrs: string list; proper: string list; senses: string list} | |
26 | - | |
27 | -type dict = {lemmas: pos_record StringMap.t StringMap.t; dbeg: int; dlen: int}*) | |
28 | - | |
29 | -(* type selector = Orth of string | Pos of string (*| All *) *) | |
30 | - | |
31 | -(* Długość pojedynczego znaku w tekście *) | |
32 | -let factor = 100 | |
33 | - | |
34 | -type labels = { | |
35 | - number: string; | |
36 | - case: string; | |
37 | - gender: string; | |
38 | - person: string; | |
39 | - aspect: string; | |
40 | - } | |
41 | - | |
42 | - | |
43 | -(*type type_arg = | |
44 | - TArg of string | |
45 | - | TWith of type_arg list | |
46 | - | |
47 | -and type_term = | |
48 | - TConst of string * type_arg list | |
49 | - | TMod of type_term * type_term | |
50 | - | TName of string | |
51 | - | TVariant of type_term * type_term*) | |
52 | - | |
53 | - | |
54 | -type semantics = | |
55 | - Normal | |
56 | - | Special of string list | |
57 | -(* | SpecialNoun of type_arg list * type_term | |
58 | - | SpecialMod of string * (type_arg list * type_term)*) | |
59 | - | PrepSemantics of (string * string * StringSet.t * string list) list (* role,role_attr,hipero,sel_prefs *) | |
60 | - | |
61 | -type token = | |
62 | - SmallLetter of string (* orth *) | |
63 | - | CapLetter of string * string (* orth * lowercase *) | |
64 | - | AllSmall of string (* orth *) | |
65 | - | AllCap of string * string * string (* orth * lowercase * all lowercase *) | |
66 | - | FirstCap of string * string * string * string (* orth * all lowercase * first letter uppercase * first letter lowercase *) | |
67 | - | SomeCap of string (* orth *) | |
68 | - | RomanDig of string * string (* value * cat *) | |
69 | - | Interp of string (* orth *) | |
70 | - | Symbol of string (* orth *) | |
71 | - | Dig of string * string (* value * cat *) | |
72 | - | Other2 of string (* orth *) | |
73 | - | Lemma of string * string * string list list list (* lemma * cat * interp *) | |
74 | - | Proper of string * string * string list list list * string list (* lemma * cat * interp * senses *) | |
75 | -(* | Sense of string * string * string list list list * (string * string * string list) list (* lemma * cat * interp * senses *) *) | |
76 | - | Compound of string * token list (* sense * components *) | |
77 | - | Tokens of string * int list (*cat * token id list *) | |
78 | - | |
79 | -(* Tekst reprezentuję jako zbiór obiektów typu token_record zawierających | |
80 | - informacje o poszczególnych tokenach *) | |
81 | -and token_record = { | |
82 | - orth: string; (* sekwencja znaków pierwotnego tekstu składająca się na token *) | |
83 | - corr_orth: string; (* sekwencja znaków pierwotnego tekstu składająca się na token z poprawionymi błędami *) | |
84 | - beg: int; (* pozycja początkowa tokenu względem początku akapitu *) | |
85 | - len: int; (* długość tokenu *) | |
86 | - next: int; (* pozycja początkowa następnego tokenu względem początku akapitu *) | |
87 | - token: token; (* treść tokenu *) | |
88 | - attrs: string list; (* dodatkowe atrybuty *) | |
89 | - weight: float; | |
90 | - e: labels; | |
91 | - valence: (int * ENIAMwalTypes.frame) list; | |
92 | - simple_valence: (int * ENIAMwalTypes.frame) list; | |
93 | - senses: (string * string list * float) list; | |
94 | - lroles: string * string; | |
95 | - semantics: semantics; | |
96 | - } | |
97 | - | |
98 | -(* Tokeny umieszczone są w strukturze danych umożliwiającej efektywne wyszukiwanie ich sekwencji, | |
99 | - struktura danych sama z siebie nie wnosi informacji *) | |
100 | -type tokens = | |
101 | - | Token of token_record | |
102 | - | Variant of tokens list | |
103 | - | Seq of tokens list | |
104 | - | |
105 | -type pat = L | CL | D of string | C of string | S of string | RD of string | O of string | |
106 | - | |
107 | -let empty_labels = { | |
108 | - number=""; | |
109 | - case=""; | |
110 | - gender=""; | |
111 | - person=""; | |
112 | - aspect=""; | |
113 | - } | |
114 | - | |
115 | -let empty_token = { | |
116 | - orth="";corr_orth="";beg=0;len=0;next=0; token=Symbol ""; weight=0.; e=empty_labels; | |
117 | - attrs=[]; valence=[]; simple_valence=[]; senses=[]; | |
118 | - lroles="",""; semantics=Normal} | |
119 | - | |
120 | -type mode = | |
121 | - Raw | Struct | CONLL | ENIAM | Mate | Swigra | POLFIE | |
122 | - | |
123 | -(* warstwy nkjp1m do analizy: | |
124 | -header | |
125 | -text | |
126 | -ann_segmentation | |
127 | -ann_morphosyntax | |
128 | -ann_named | |
129 | -*) | |
130 | - | |
131 | -(* zdania wydobyte na zewnątrz *) | |
132 | -(* struktura ponadzdaniowa przetwarzana przed strukturą zależnościową *) | |
133 | -(* istnieje ryzyko eksplozji interpretacji *) | |
134 | -type sentence = | |
135 | - RawSentence of string | |
136 | - (* | CONLL of conll list *) | |
137 | - | StructSentence of (int * int * int) list * int (* (id * lnode * rnode) list * last *) | |
138 | - | DepSentence of (int * int * string) array (* (id * super * label) conll_id *) | |
139 | - | QuotedSentences of paragraph_record list | |
140 | - (* | NKJP1M of nkjp1m list *) | |
141 | - (* | Skladnica of skladnica_tree *) | |
142 | - | AltSentence of (mode * sentence) list (* string = etykieta np raw, nkjp, krzaki *) | |
143 | - | |
144 | -and paragraph_record = {pid: string; pbeg: int; plen: int; pnext: int; psentence: sentence; pfile_prefix: string} (* beg i len liczone po znakach unicode ( * 100 ???) *) | |
145 | - | |
146 | -and paragraph = | |
147 | - RawParagraph of string | |
148 | - | StructParagraph of paragraph_record list (* zdania *) | |
149 | - | AltParagraph of (mode * paragraph) list | |
150 | - | |
151 | -type text = | |
152 | - RawText of string | |
153 | - | StructText of paragraph list * token_record ExtArray.t (* akapity * tokeny *) | |
154 | - | AltText of (mode * text) list |
subsyntax/ENIAMsentences.ml
... | ... | @@ -181,10 +181,7 @@ let rec extract_sentences_rec tokens id = |
181 | 181 | Tokens("sentence",ids) -> |
182 | 182 | let paths,last = make_paths tokens ids in |
183 | 183 | [{pid=string_of_int id; pbeg=t.beg; plen=t.len; pnext=t.next; pfile_prefix=""; |
184 | - psentence=AltSentence([Raw,RawSentence t.orth; ENIAM,StructSentence(paths,last)] (*@ | |
185 | - (if Paths.config.Paths.mate_parser_enabled then [Mate,RawSentence t.orth] else []) @ | |
186 | - (if Paths.config.Paths.swigra_enabled then [Swigra,RawSentence t.orth] else [])*)(* @ | |
187 | - (if Paths.config.Paths.polfie_enabled then [POLFIE,RawSentence t.orth] else []) *))}] | |
184 | + psentence=AltSentence([Raw,RawSentence t.orth; ENIAM,StructSentence(paths,last)])}] | |
188 | 185 | | Tokens("quoted_sentences",ids) -> |
189 | 186 | [{pid=string_of_int id; pbeg=t.beg; plen=t.len; pnext=t.next; pfile_prefix=""; |
190 | 187 | psentence=AltSentence[Raw,RawSentence t.orth; |
... | ... |
subsyntax/ENIAMsubsyntax.ml
... | ... | @@ -234,8 +234,5 @@ let parse_text query = |
234 | 234 | (* print_endline "parse_text 1"; *) |
235 | 235 | let pid = if n = 0 then "" else string_of_int n ^ "_" in |
236 | 236 | let sentences = ENIAMsentences.split_into_sentences pid paragraph tokens paths in |
237 | - (* print_endline "parse_text 2"; *) | |
238 | - (* let sentences = parse_mate_sentences tokens sentences in *) | |
239 | - (* print_endline "parse_text 3"; *) | |
240 | 237 | (AltParagraph[Raw,RawParagraph paragraph; Struct,StructParagraph sentences]) :: paragraphs, n+1) in |
241 | 238 | AltText[Raw,RawText query; Struct,StructText(List.rev paragraphs)], tokens |
... | ... |
subsyntax/TODO
... | ... | @@ -2,5 +2,5 @@ |
2 | 2 | - dodać zasoby MWE |
3 | 3 | - rozpoznawanie MWE ze Słowosieci |
4 | 4 | |
5 | -- przenieść odwołanie do Świgry i Mate z ENIAMsentences | |
6 | -- przenieść odwołanie do parse_mate_sentences z ENIAMsubsyntax | |
5 | +- jak przetwarzać num:comp | |
6 | +- przetwarzanie liczebników złożonych np dwadzieścia jeden, jedna druga | |
... | ... |
walenty/TODO
... | ... | @@ -4,3 +4,5 @@ a jedynie instalator, który dostaje Walentego i go przetwarza. |
4 | 4 | - dowiązanie symboliczne do Walentego - udokumentować. |
5 | 5 | - uporządkować położenia słowników pojemników itp. |
6 | 6 | - uporządkować położenie fixed.tab |
7 | +- czy rzeczownik niepoliczalny w liczbie mnogiej jest nadal niepoliczalny np. "Wody szumią." | |
8 | +- trzeba zweryfikować słownik niepoliczalnych | |
... | ... |