Commit 282d8aec4adcce710d0dd8181d56b8723fe1ba41
1 parent
d3e13e84
wstępna wersja biblioteki eniam-lexSemantics-1.0
Showing
14 changed files
with
880 additions
and
13 deletions
integration/README
... | ... | @@ -8,8 +8,9 @@ Install |
8 | 8 | |
9 | 9 | ENIAMintegration requires OCaml version 4.02.3 compiler |
10 | 10 | together with Xlib library version 3.1 or later, |
11 | -ENIAMtokenizer library version 1.0, ENIAMmorphology library version 1.0 | |
12 | -and ENIAMsubsyntax library version 1.0. | |
11 | +ENIAMtokenizer library version 1.0, ENIAMmorphology library version 1.0, | |
12 | +ENIAMsubsyntax library version 1.0, ENIAMwalenty library version 1.0 | |
13 | +and ENIAMplWordnet library version 1.0. | |
13 | 14 | |
14 | 15 | In order to install type: |
15 | 16 | |
... | ... |
integration/makefile
... | ... | @@ -3,7 +3,7 @@ OCAMLOPT=ocamlopt |
3 | 3 | OCAMLDEP=ocamldep |
4 | 4 | INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I +eniam |
5 | 5 | OCAMLFLAGS=$(INCLUDES) -g |
6 | -OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa | |
6 | +OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa eniam-integration.cmxa | |
7 | 7 | INSTALLDIR=`ocamlc -where`/eniam |
8 | 8 | |
9 | 9 | SOURCES= ENIAM_CONLL.ml ENIAMpreIntegration.ml |
... | ... | @@ -25,7 +25,7 @@ eniam-integration.cmxa: $(SOURCES) |
25 | 25 | ocamlopt -linkall -a -o eniam-integration.cmxa $(INCLUDES) $^ |
26 | 26 | |
27 | 27 | test: test.ml |
28 | - $(OCAMLOPT) -o test $(OCAMLOPTFLAGS) $(SOURCES) test.ml | |
28 | + $(OCAMLOPT) -o test $(OCAMLOPTFLAGS) test.ml | |
29 | 29 | |
30 | 30 | concraft_test: concraft_test.ml |
31 | 31 | $(OCAMLOPT) -o concraft_test $(OCAMLOPTFLAGS) concraft_test.ml |
... | ... | @@ -51,4 +51,4 @@ concraft_test: concraft_test.ml |
51 | 51 | $(OCAMLOPT) $(OCAMLOPTFLAGS) -c $< |
52 | 52 | |
53 | 53 | clean: |
54 | - rm -f *~ *.cm[aoix] *.o *.so *.cmxa *.a test | |
54 | + rm -f *~ *.cm[aoix] *.o *.so *.cmxa *.a test concraft_test | |
... | ... |
integration/test deleted
No preview for this file type
lexSemantics/ENIAMlexSemantics.ml
0 → 100644
1 | +(* | |
2 | + * ENIAMlexSemantics is a library that assigns tokens with lexicosemantic information. | |
3 | + * Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl> | |
4 | + * Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences | |
5 | + * | |
6 | + * This library is free software: you can redistribute it and/or modify | |
7 | + * it under the terms of the GNU Lesser General Public License as published by | |
8 | + * the Free Software Foundation, either version 3 of the License, or | |
9 | + * (at your option) any later version. | |
10 | + * | |
11 | + * This library is distributed in the hope that it will be useful, | |
12 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | + * GNU Lesser General Public License for more details. | |
15 | + * | |
16 | + * You should have received a copy of the GNU Lesser General Public License | |
17 | + * along with this program. If not, see <http://www.gnu.org/licenses/>. | |
18 | + *) | |
19 | + | |
20 | +open ENIAMtokenizerTypes | |
21 | +open ENIAMlexSemanticsTypes | |
22 | +open Xstd | |
23 | + | |
24 | +let load_proper_name proper fields = (* fold step: records one [lemma; types] row of a proper-names file in the map *) | |
25 | + match fields with | |
26 | + | [lemma; types] -> (* types is a "|"-separated list; add_inc presumably inserts it for a new lemma, else applies the merge function *) | |
27 | + let new_types = Str.split (Str.regexp "|") types in StringMap.add_inc proper lemma new_types (fun old_types -> new_types @ old_types) | |
28 | + | row -> failwith ("proper_names: " ^ String.concat " " row) (* any other row shape aborts loading *) | |
29 | + | |
30 | +let proper_names = (* lemma -> types map built from both proper-name resource files at module initialisation *) | |
31 | + let proper = File.fold_tab proper_names_filename StringMap.empty load_proper_name in | |
32 | + (* seed the second fold with the first result; starting again from StringMap.empty would discard the first file *) | |
33 | + File.fold_tab proper_names_filename2 proper load_proper_name | |
34 | + | |
35 | +let remove l s = (* drops every element equal to s; survivors are consed onto the accumulator, so presumably reversed if Xlist.fold is a left fold *) | |
36 | + let keep acc x = if x = s then acc else x :: acc in | |
37 | + Xlist.fold l [] keep | |
38 | + | |
39 | +(* Rewrites tokens.(i) in place as a Proper token when t is a Lemma that is listed in proper_names or carries the proper-name flag. *) | |
40 | +let find_proper_names tokens i t = | |
41 | + let flag = "notvalidated proper" in | |
42 | + match t.token with | |
43 | + | Lemma(lemma,pos,interp) when StringMap.mem proper_names lemma -> (* dictionary hit: attach the stored types and drop the flag *) | |
44 | + let types = StringMap.find proper_names lemma in | |
45 | + ExtArray.set tokens i {t with token=Proper(lemma,pos,interp,types); attrs=remove t.attrs flag} | |
46 | + | Lemma(lemma,pos,interp) when Xlist.mem t.attrs flag -> (* flagged but not in the dictionary: Proper with no types, flag kept *) | |
47 | + ExtArray.set tokens i {t with token=Proper(lemma,pos,interp,[])} | |
48 | + (* every other token, including an unflagged Lemma missing from the dictionary, is left untouched *) | |
49 | + | _ -> () | |
50 | + | |
51 | +let find_senses t = (* FIXME: senses containing 'się' *) | |
52 | + let lookup = function (* senses are fetched from ENIAMplWordnet; tokens other than Lemma/Proper get the empty list *) | |
53 | + | Lemma(lemma,pos,_) -> ENIAMplWordnet.find_senses lemma pos | |
54 | + | Proper(_,_,_,types) -> ENIAMplWordnet.find_proper_senses types | |
55 | + | _ -> [] in lookup t.token | |
56 | + | |
57 | + | |
58 | +let assign tokens text = (* NOTE(review): lex_sems is filled but never returned and `text` is unused; the result type is kept as is, confirm what callers need *) | |
59 | + let lex_sems = ExtArray.make (ExtArray.size tokens) empty_lex_sem in | |
60 | + let _ = ExtArray.add lex_sems empty_lex_sem in (* placeholder entry, presumably index 0, so that entry i lines up with token i *) | |
61 | + Int.iter 1 (ExtArray.size tokens - 1) (fun i -> | |
62 | + find_proper_names tokens i (ExtArray.get tokens i); | |
63 | + (* re-read the token: find_proper_names may have replaced tokens.(i) with a Proper token, and senses must be looked up for that updated token *) | |
64 | + let senses = find_senses (ExtArray.get tokens i) in | |
65 | + let j = ExtArray.add lex_sems {empty_lex_sem with senses=senses} in | |
66 | + (* the index returned for the new entry must equal the token index *) | |
67 | + if j <> i then failwith "assign_semantic_valence") | |
68 | + | |
69 | +(* | |
70 | +(* print_endline "a14"; *) | |
71 | + let paths = assign_valence paths in | |
72 | +(* print_endline "a15"; *) | |
73 | +(* print_endline "a16"; *) | |
74 | + let paths = disambiguate_senses paths in | |
75 | + let paths = assign_simplified_valence paths in | |
76 | + let paths = PreSemantics.assign_semantics paths in | |
77 | +(* print_endline "a16"; *) | |
78 | + | |
79 | + | |
80 | + | |
81 | + | |
82 | + | |
83 | +let assign_valence paths = | |
84 | + let lexemes = Xlist.fold paths StringMap.empty (fun lexemes t -> | |
85 | + match t.token with | |
86 | + Lemma(lemma,pos,_) -> | |
87 | + StringMap.add_inc lexemes lemma (StringSet.singleton pos) (fun set -> StringSet.add set pos) | |
88 | + | Proper(lemma,pos,_,_) -> | |
89 | + let pos = match pos with | |
90 | + "subst" -> "psubst" | |
91 | + | "depr" -> "pdepr" | |
92 | + | _ -> pos (*failwith ("assign_valence: Proper " ^ pos ^ " " ^ lemma)*) in | |
93 | + StringMap.add_inc lexemes lemma (StringSet.singleton pos) (fun set -> StringSet.add set pos) (* nazwy własne mają przypisywaną domyślną walencję rzeczowników *) | |
94 | + | _ -> lexemes) in | |
95 | + let valence = WalFrames.find_frames lexemes in | |
96 | + List.rev (Xlist.rev_map paths (fun t -> | |
97 | + match t.token with | |
98 | + Lemma(lemma,pos,_) -> {t with valence=try Xlist.rev_map (StringMap.find (StringMap.find valence lemma) pos) (fun frame -> 0,frame) with Not_found -> []} | |
99 | + | Proper(lemma,pos,interp,_) -> {t with valence=(try Xlist.rev_map (StringMap.find (StringMap.find valence lemma) | |
100 | + (if pos = "subst" || pos = "depr" then "p" ^ pos else pos)) (fun frame -> 0,frame) with Not_found -> [](*failwith ("assign_valence: Proper(" ^ lemma ^ "," ^ pos ^ ")")*)); | |
101 | + token=Lemma(lemma,pos,interp)} | |
102 | + | _ -> t)) | |
103 | + | |
104 | +(**********************************************************************************) | |
105 | + | |
106 | +(* let prepare_indexes (paths,_) = | |
107 | + let set = Xlist.fold paths IntSet.empty (fun set t -> | |
108 | + IntSet.add (IntSet.add set t.beg) t.next) in | |
109 | + let map,last = Xlist.fold (Xlist.sort (IntSet.to_list set) compare) (IntMap.empty,0) (fun (map,n) x -> | |
110 | + IntMap.add map x n, n+1) in | |
111 | + List.rev (Xlist.rev_map paths (fun t -> | |
112 | + {t with lnode=IntMap.find map t.beg; rnode=IntMap.find map t.next})), last - 1 *) | |
113 | + | |
114 | +let get_prefs_schema prefs schema = | |
115 | + Xlist.fold schema prefs (fun prefs t -> | |
116 | + Xlist.fold t.WalTypes.sel_prefs prefs StringSet.add) | |
117 | + | |
118 | +let map_prefs_schema senses schema = | |
119 | + Xlist.map schema (fun t -> | |
120 | + if Xlist.mem t.WalTypes.morfs (WalTypes.Phrase WalTypes.Pro) || Xlist.mem t.WalTypes.morfs (WalTypes.Phrase WalTypes.ProNG) then t else | |
121 | + {t with WalTypes.sel_prefs = Xlist.fold t.WalTypes.sel_prefs [] (fun l s -> | |
122 | + if StringSet.mem senses s then s :: l else l)}) | |
123 | + | |
124 | +let disambiguate_senses paths = | |
125 | + let prefs = Xlist.fold paths (StringSet.singleton "ALL") (fun prefs t -> | |
126 | + Xlist.fold t.valence prefs (fun prefs -> function | |
127 | + _,WalTypes.Frame(_,schema) -> get_prefs_schema prefs schema | |
128 | + | _,WalTypes.LexFrame(_,_,_,schema) -> get_prefs_schema prefs schema | |
129 | + | _,WalTypes.ComprepFrame(_,_,_,schema) -> get_prefs_schema prefs schema)) in | |
130 | + let hipero = Xlist.fold paths (StringSet.singleton "ALL") (fun hipero t -> | |
131 | + Xlist.fold t.senses hipero (fun hipero (_,l,_) -> | |
132 | + Xlist.fold l hipero StringSet.add)) in | |
133 | + let senses = StringSet.intersection prefs hipero in | |
134 | + let is_zero = StringSet.mem hipero "0" in | |
135 | + let senses = if is_zero then StringSet.add senses "0" else senses in | |
136 | + Xlist.map paths (fun t -> | |
137 | + {t with valence = if is_zero then t.valence else | |
138 | + Xlist.map t.valence (function | |
139 | + n,WalTypes.Frame(a,schema) -> n,WalTypes.Frame(a,map_prefs_schema senses schema) | |
140 | + | n,WalTypes.LexFrame(s,p,r,schema) -> n,WalTypes.LexFrame(s,p,r,map_prefs_schema senses schema) | |
141 | + | n,WalTypes.ComprepFrame(s,p,r,schema) -> n,WalTypes.ComprepFrame(s,p,r,map_prefs_schema senses schema)); | |
142 | + senses = Xlist.map t.senses (fun (s,l,w) -> | |
143 | + s, List.rev (Xlist.fold l [] (fun l s -> if StringSet.mem senses s then s :: l else l)),w)}) | |
144 | + | |
145 | +(*let single_sense (paths,last) = | |
146 | + List.rev (Xlist.rev_map paths (fun t -> | |
147 | + let sense = | |
148 | + if t.senses = [] then [] else | |
149 | + [Xlist.fold t.senses ("",[],-.max_float) (fun (max_meaning,max_hipero,max_weight) (meaning,hipero,weight) -> | |
150 | + if max_weight >= weight then max_meaning,max_hipero,max_weight else meaning,hipero,weight)] in | |
151 | + {t with senses=sense})), last*) | |
152 | + | |
153 | +open WalTypes | |
154 | + | |
155 | +(*let single_schema schemata = | |
156 | + let map = Xlist.fold schemata StringMap.empty (fun map schema -> | |
157 | + let t = WalStringOf.schema (List.sort compare (Xlist.fold schema [] (fun l s -> | |
158 | + if s.gf <> ARG && s.gf <> ADJUNCT then {s with role=""; role_attr=""; sel_prefs=[]} :: l else | |
159 | + if s.cr <> [] || s.ce <> [] then {s with role=""; role_attr=""; sel_prefs=[]} :: l else l))) in | |
160 | + StringMap.add_inc map t [schema] (fun l -> schema :: l)) in | |
161 | + StringMap.fold map [] (fun l _ schemata -> | |
162 | + let map = Xlist.fold schemata StringMap.empty (fun map schema -> | |
163 | + Xlist.fold schema map (fun map s -> | |
164 | + let t = WalStringOf.schema [{s with role=""; role_attr=""; sel_prefs=[]}] in | |
165 | + StringMap.add_inc map t [s] (fun l -> s :: l))) in | |
166 | + let schema = StringMap.fold map [] (fun schema _ l -> | |
167 | + let s = List.hd l in | |
168 | + {s with sel_prefs=Xlist.fold s.sel_prefs [] (fun l t -> if t = "0" || t = "T" then t :: l else l)} :: schema) in | |
169 | + schema :: l)*) | |
170 | + | |
171 | +let remove_meaning = function | |
172 | + DefaultAtrs(m,r,o,neg,p,a) -> DefaultAtrs([],r,o,neg,p,a) | |
173 | + | EmptyAtrs m -> EmptyAtrs [] | |
174 | + | NounAtrs(m,nsyn,s(*,typ*)) -> NounAtrs([],nsyn,s(*,typ*)) | |
175 | + | AdjAtrs(m,c,adjsyn(*,adjsem,typ*)) -> AdjAtrs([],c,adjsyn(*,adjsem,typ*)) | |
176 | + | PersAtrs(m,le,neg,mo,t,au,a) -> PersAtrs([],le,neg,mo,t,au,a) | |
177 | + | GerAtrs(m,le,neg,a) -> GerAtrs([],le,neg,a) | |
178 | + | NonPersAtrs(m,le,role,role_attr,neg,a) -> NonPersAtrs([],le,role,role_attr,neg,a) | |
179 | + | _ -> failwith "remove_meaning" | |
180 | + | |
181 | + | |
182 | +(*let single_frame (paths,last) = | |
183 | + List.rev (Xlist.rev_map paths (fun t -> | |
184 | + let lex_frames,frames = Xlist.fold t.valence ([],StringMap.empty) (fun (lex_frames,frames) -> function | |
185 | + Frame(attrs,schema) -> | |
186 | + let attrs = remove_meaning attrs in | |
187 | + lex_frames, StringMap.add_inc frames (WalStringOf.frame_atrs attrs) (attrs,[schema]) (fun (_,l) -> attrs, schema :: l) | |
188 | + | frame -> frame :: lex_frames, frames) in | |
189 | + let frames = StringMap.fold frames lex_frames (fun frames _ (attrs,schemata) -> | |
190 | + Xlist.fold (single_schema schemata) frames (fun frames frame -> Frame(attrs,frame) :: frames)) in | |
191 | + {t with valence=frames})), last *) | |
192 | + | |
193 | +let simplify_position_verb l = function (* FIXME: dodać czyszczenie E Pro *) | |
194 | + Phrase(NP(Case "dat")) -> l | |
195 | + | Phrase(NP(Case "inst")) -> l | |
196 | + | Phrase(PrepNP _) -> l | |
197 | + | Phrase(PrepAdjP _) -> l | |
198 | + | Phrase(NumP (Case "dat")) -> l | |
199 | + | Phrase(NumP (Case "inst")) -> l | |
200 | + | Phrase(PrepNumP _) -> l | |
201 | + | Phrase(ComprepNP _) -> l | |
202 | + | Phrase(ComparNP _) -> l | |
203 | + | Phrase(ComparPP _) -> l | |
204 | + | Phrase(IP) -> l | |
205 | + | Phrase(CP _) -> l | |
206 | + | Phrase(NCP(Case "dat",_,_)) -> l | |
207 | + | Phrase(NCP(Case "inst",_,_)) -> l | |
208 | + | Phrase(PrepNCP _) -> l | |
209 | +(* | Phrase(PadvP) -> l *) | |
210 | + | Phrase(AdvP) -> l | |
211 | + | Phrase(PrepP) -> l | |
212 | + | Phrase(Or) -> l | |
213 | + | Phrase(Qub) -> l | |
214 | + | Phrase(Adja) -> l | |
215 | + | Phrase(Inclusion) -> l | |
216 | + | Phrase Pro -> Phrase Null :: l | |
217 | + | t -> t :: l | |
218 | + | |
219 | +let simplify_position_noun l = function | |
220 | + Phrase(NP(Case "gen")) -> l | |
221 | + | Phrase(NP(Case "nom")) -> l | |
222 | + | Phrase(NP(CaseAgr)) -> l | |
223 | + | Phrase(PrepNP _) -> l | |
224 | + | Phrase(AdjP AllAgr) -> l | |
225 | + | Phrase(NumP (Case "gen")) -> l | |
226 | + | Phrase(NumP (Case "nom")) -> l | |
227 | + | Phrase(NumP (CaseAgr)) -> l | |
228 | + | Phrase(PrepNumP _) -> l | |
229 | + | Phrase(ComprepNP _) -> l | |
230 | + | Phrase(ComparNP _) -> l | |
231 | + | Phrase(ComparPP _) -> l | |
232 | + | Phrase(IP) -> l | |
233 | + | Phrase(NCP(Case "gen",_,_)) -> l | |
234 | + | Phrase(PrepNCP _) -> l | |
235 | + | Phrase(PrepP) -> l | |
236 | + | Phrase(Qub) -> l | |
237 | + | Phrase(Adja) -> l | |
238 | + | Phrase(Inclusion) -> l | |
239 | + | Phrase Pro -> Phrase Null :: l | |
240 | + | t -> t :: l | |
241 | + | |
242 | +let simplify_position_adj l = function | |
243 | + Phrase(AdvP) -> l | |
244 | + | t -> t :: l | |
245 | + | |
246 | +let simplify_position_adv l = function | |
247 | + Phrase(AdvP) -> l | |
248 | + | t -> t :: l | |
249 | + | |
250 | + | |
251 | +let simplify_position pos l s = | |
252 | + let morfs = match pos with | |
253 | + "verb" -> List.rev (Xlist.fold s.morfs [] simplify_position_verb) | |
254 | + | "noun" -> List.rev (Xlist.fold s.morfs [] simplify_position_noun) | |
255 | + | "adj" -> List.rev (Xlist.fold s.morfs [] simplify_position_adj) | |
256 | + | "adv" -> List.rev (Xlist.fold s.morfs [] simplify_position_adv) | |
257 | + | _ -> s.morfs in | |
258 | + match morfs with | |
259 | + [] -> l | |
260 | + | [Phrase Null] -> l | |
261 | + | _ -> {s with morfs=morfs} :: l | |
262 | + | |
263 | +let simplify_schemata pos schemata = | |
264 | + let schemata = Xlist.fold schemata StringMap.empty (fun schemata (schema,frame) -> | |
265 | + let schema = List.sort compare (Xlist.fold schema [] (fun l s -> | |
266 | + let s = {s with role=""; role_attr=""; sel_prefs=[]; cr=[]; ce=[]; morfs=List.sort compare s.morfs} in | |
267 | + if s.gf <> ARG && s.gf <> ADJUNCT then s :: l else | |
268 | +(* if s.cr <> [] || s.ce <> [] then s :: l else *) | |
269 | + simplify_position pos l s)) in | |
270 | + StringMap.add_inc schemata (WalStringOf.schema schema) (schema,[frame]) (fun (_,frames) -> schema, frame :: frames)) in | |
271 | + StringMap.fold schemata [] (fun l _ s -> s :: l) | |
272 | + | |
273 | +(* FIXME: problem ComprepNP i PrepNCP *) | |
274 | +(* FIXME: problem gdy ten sam token występuje w kilku ścieżkach *) | |
275 | +let generate_verb_prep_adjuncts preps = | |
276 | + Xlist.map preps (fun (lemma,case) -> WalFrames.verb_prep_adjunct_schema_field lemma case) | |
277 | + | |
278 | +let generate_verb_comprep_adjuncts compreps = | |
279 | + Xlist.map compreps (fun lemma -> WalFrames.verb_comprep_adjunct_schema_field lemma) | |
280 | + | |
281 | +let generate_verb_compar_adjuncts compars = | |
282 | + Xlist.map compars (fun lemma -> WalFrames.verb_compar_adjunct_schema_field lemma) | |
283 | + | |
284 | +let generate_noun_prep_adjuncts preps = | |
285 | + WalFrames.noun_prep_adjunct_schema_field preps | |
286 | + | |
287 | +let generate_noun_compar_adjuncts compars = | |
288 | + WalFrames.noun_compar_adjunct_schema_field compars | |
289 | + | |
290 | +let generate_adj_compar_adjuncts compars = | |
291 | + WalFrames.noun_compar_adjunct_schema_field compars | |
292 | + | |
293 | +let compars = StringSet.of_list ["jak";"jako";"niż";"niczym";"niby";"co"] | |
294 | + | |
295 | +let generate_prep_adjunct_tokens paths = | |
296 | + let map = Xlist.fold paths StringMap.empty (fun map t -> | |
297 | + match t.token with | |
298 | + Lemma(lemma,"prep",interp) -> | |
299 | + let map = if lemma = "po" then StringMap.add map "po:postp" ("po","postp") else map in | |
300 | + if StringSet.mem compars lemma then map else | |
301 | + Xlist.fold interp map (fun map -> function | |
302 | + [cases] -> Xlist.fold cases map (fun map case -> StringMap.add map (lemma ^ ":" ^ case) (lemma,case)) | |
303 | + | [cases;_] -> Xlist.fold cases map (fun map case -> StringMap.add map (lemma ^ ":" ^ case) (lemma,case)) | |
304 | + | _ -> map) | |
305 | + | _ -> map) in | |
306 | + StringMap.fold map [] (fun l _ v -> v :: l) | |
307 | + | |
308 | +let generate_comprep_adjunct_tokens paths = | |
309 | + let lemmas = Xlist.fold paths StringSet.empty (fun lemmas t -> | |
310 | + match t.token with | |
311 | + Lemma(lemma,_,_) -> StringSet.add lemmas lemma | |
312 | + | _ -> lemmas) in | |
313 | + StringMap.fold WalFrames.comprep_reqs [] (fun compreps comprep reqs -> | |
314 | + let b = Xlist.fold reqs true (fun b s -> b && StringSet.mem lemmas s) in | |
315 | + if b then comprep :: compreps else compreps) | |
316 | + | |
317 | +let generate_compar_adjunct_tokens paths = | |
318 | + let set = Xlist.fold paths StringSet.empty (fun set t -> | |
319 | + match t.token with | |
320 | + Lemma(lemma,"prep",interp) -> | |
321 | + if not (StringSet.mem compars lemma) then set else | |
322 | + StringSet.add set lemma | |
323 | + | _ -> set) in | |
324 | + StringSet.to_list set | |
325 | + | |
326 | +let is_measure = function | |
327 | + NounAtrs(_,_,Common "measure") -> true | |
328 | + | _ -> false | |
329 | + | |
330 | +let assign_simplified_valence paths = | |
331 | + let preps = generate_prep_adjunct_tokens paths in | |
332 | + let compreps = generate_comprep_adjunct_tokens paths in | |
333 | + let compars = generate_compar_adjunct_tokens paths in | |
334 | + let verb_prep_adjuncts = generate_verb_prep_adjuncts preps in | |
335 | + let verb_comprep_adjuncts = generate_verb_comprep_adjuncts compreps in | |
336 | + let verb_compar_adjuncts = generate_verb_compar_adjuncts compars in | |
337 | + let noun_prep_adjuncts = generate_noun_prep_adjuncts preps compreps in | |
338 | + let noun_compar_adjuncts = generate_noun_compar_adjuncts compars in | |
339 | + let adj_compar_adjuncts = generate_adj_compar_adjuncts compars in | |
340 | + let verb_adjuncts = WalFrames.verb_adjuncts_simp @ verb_prep_adjuncts @ verb_comprep_adjuncts @ verb_compar_adjuncts in | |
341 | + let noun_adjuncts = WalFrames.noun_adjuncts_simp @ [noun_prep_adjuncts] @ [noun_compar_adjuncts] in | |
342 | + let noun_measure_adjuncts = WalFrames.noun_measure_adjuncts_simp @ [noun_prep_adjuncts] @ [noun_compar_adjuncts] in | |
343 | + let adj_adjuncts = WalFrames.adj_adjuncts_simp @ [adj_compar_adjuncts] in | |
344 | + let adv_adjuncts = WalFrames.adv_adjuncts_simp @ [adj_compar_adjuncts] in | |
345 | + List.rev (Xlist.rev_map paths (fun t -> | |
346 | + let pos = match t.token with | |
347 | + Lemma(_,pos,_) -> WalFrames.simplify_pos pos | |
348 | + | _ -> "" in | |
349 | + let lex_frames,frames = Xlist.fold t.valence ([],StringMap.empty) (fun (lex_frames,frames) -> function | |
350 | + _,(Frame(attrs,schema) as frame) -> | |
351 | + let attrs = remove_meaning attrs in | |
352 | + lex_frames, StringMap.add_inc frames (WalStringOf.frame_atrs attrs) (attrs,[schema,frame]) (fun (_,l) -> attrs, (schema,frame) :: l) | |
353 | + | _,frame -> frame :: lex_frames, frames) in | |
354 | + let simp_frames,full_frames,n = Xlist.fold lex_frames ([],[],1) (fun (simp_frames,full_frames,n) frame -> | |
355 | + (n,frame) :: simp_frames, (n,frame) :: full_frames, n+1) in | |
356 | + let simp_frames,full_frames,_ = StringMap.fold frames (simp_frames,full_frames,n) (fun (simp_frames,full_frames,n) _ (attrs,schemata) -> | |
357 | + Xlist.fold (simplify_schemata pos schemata) (simp_frames,full_frames,n) (fun (simp_frames,full_frames,n) (schema,frames) -> | |
358 | + let schema = match pos with | |
359 | + "verb" -> schema @ verb_adjuncts | |
360 | + | "noun" -> schema @ (if is_measure attrs then noun_measure_adjuncts else noun_adjuncts) | |
361 | + | "adj" -> schema @ adj_adjuncts | |
362 | + | "adv" -> schema @ adv_adjuncts | |
363 | + | _ -> schema in | |
364 | + (n,Frame(attrs,schema)) :: simp_frames, | |
365 | + Xlist.fold frames full_frames (fun full_frames frame -> (n,frame) :: full_frames), | |
366 | + n+1)) in | |
367 | + {t with simple_valence=simp_frames; valence=full_frames})) | |
368 | + | |
369 | +(* FIXME: dodać do walencji preferencje selekcyjne nadrzędników symboli: dzień, godzina, rysunek itp. *) | |
370 | +(* FIXME: sprawdzić czy walencja nazw własnych jest dobrze zrobiona. *) | |
371 | + | |
372 | +(* let first_id = 1 (* id=0 jest zarezerwowane dla pro; FIXME: czy to jest jeszcze aktualne? *) | |
373 | + | |
374 | +let add_ids (paths,last) next_id = | |
375 | + let paths,next_id = Xlist.fold ((*List.rev*) paths) ([],next_id) (fun (paths,id) t -> | |
376 | + {t with id=id} :: paths, id+1) in | |
377 | + (paths,last),next_id *) | |
378 | + | |
379 | + | |
380 | + | |
381 | +let parse query = | |
382 | +(* print_endline "a1"; *) | |
383 | + let l = Xunicode.classified_chars_of_utf8_string query in | |
384 | +(* print_endline "a2"; *) | |
385 | + let l = PreTokenizer.tokenize l in | |
386 | +(* print_endline "a3"; *) | |
387 | + let l = PrePatterns.normalize_tokens [] l in | |
388 | +(* print_endline "a4"; *) | |
389 | + let l = PrePatterns.find_replacement_patterns l in | |
390 | +(* print_endline "a5"; *) | |
391 | + let l = PrePatterns.remove_spaces [] l in | |
392 | + let l = PrePatterns.find_abr_patterns PreAcronyms.abr_patterns l in | |
393 | + let l = PrePatterns.normalize_tokens [] l in | |
394 | +(* print_endline "a6"; *) | |
395 | + let paths = PrePaths.translate_into_paths l in | |
396 | +(* print_endline "a7"; *) | |
397 | + let paths = PrePaths.lemmatize paths in | |
398 | +(* print_endline "a8"; *) | |
399 | + let paths,_ = PreMWE.process paths in | |
400 | +(* print_endline "a12"; *) | |
401 | + let paths = find_proper_names paths in | |
402 | +(* print_endline "a13"; *) | |
403 | + let paths = modify_weights paths in | |
404 | + let paths = translate_digs paths in | |
405 | + let paths = assign_senses paths in | |
406 | +(* print_endline "a14"; *) | |
407 | + let paths = assign_valence paths in | |
408 | +(* print_endline "a15"; *) | |
409 | + let paths = combine_interps paths in | |
410 | +(* print_endline "a16"; *) | |
411 | + let paths = disambiguate_senses paths in | |
412 | + let paths = assign_simplified_valence paths in | |
413 | + let paths = PreSemantics.assign_semantics paths in | |
414 | +(* print_endline "a16"; *) | |
415 | + let paths = select_tokens paths in | |
416 | +(* print_endline "a17"; *) | |
417 | +(* let paths = if !single_sense_flag then single_sense paths else paths in | |
418 | + let paths = if !single_frame_flag then single_frame paths else paths in*) | |
419 | + (*let paths, next_id = add_ids paths next_id in | |
420 | + let paths = prepare_indexes paths in*) | |
421 | +(* print_endline "a18"; *) | |
422 | + paths(*, next_id*) | |
423 | +(* print_endline (PrePaths.to_string paths); *) | |
424 | +(* let paths = | |
425 | + if PrePaths.no_possible_path (PrePaths.map paths PreLemmatization.remove_postags) then | |
426 | + PrePaths.map paths process_ign | |
427 | + else paths in | |
428 | + let paths = PrePaths.map paths PreLemmatization.remove_postags in | |
429 | + let paths = PreCaseShift.manage_lower_upper_case paths in (* FIXME: niepotrzebnie powiększa pierwszy token (przymiotniki partykuły itp.) *) | |
430 | + let paths = PreLemmatization.combine_interps paths in | |
431 | +(* print_endline (PrePaths.to_string paths); *)*) | |
432 | + | |
433 | +let parse_conll tokens dep_paths = (* FIXME: sprawdzić, czy zachowana jest kolejność elementów paths !!! *) | |
434 | + let paths = List.rev (Int.fold 1 (Array.length dep_paths - 1) [] (fun paths conll_id -> | |
435 | + let id,_,_ = dep_paths.(conll_id) in | |
436 | + ExtArray.get tokens id :: paths)) in | |
437 | + (* print_endline "a12"; *) | |
438 | + let paths = find_proper_names paths in | |
439 | + (* print_endline "a13"; *) | |
440 | + let paths = modify_weights paths in | |
441 | + let paths = PreWordnet.assign_senses paths in | |
442 | + (* print_endline "a14"; *) | |
443 | + (* let paths = combine_interps paths in (* FIXME: to powinno też działać dla Proper *) *) | |
444 | + (* print_endline "a15"; *) | |
445 | + let paths = assign_valence paths in | |
446 | + (* print_endline "a16"; *) | |
447 | + let paths = disambiguate_senses paths in | |
448 | + let paths = assign_simplified_valence paths in | |
449 | + let paths = PreSemantics.assign_semantics paths in | |
450 | + (* print_endline "a16"; *) | |
451 | + let _ = Xlist.fold paths 1 (fun conll_id t -> | |
452 | + let id,_,_ = dep_paths.(conll_id) in | |
453 | + ExtArray.set tokens id t; | |
454 | + conll_id + 1) in | |
455 | + () | |
456 | +*) | |
... | ... |
pre/preSemantics.ml renamed to lexSemantics/ENIAMlexSemanticsData.ml
1 | 1 | (* |
2 | - * ENIAM: Categorial Syntactic-Semantic Parser for Polish | |
2 | + * ENIAMlexSemantics is a library that assigns tokens with lexicosemantic information. | |
3 | 3 | * Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl> |
4 | 4 | * Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences |
5 | 5 | * |
6 | - * This program is free software: you can redistribute it and/or modify | |
7 | - * it under the terms of the GNU General Public License as published by | |
6 | + * This library is free software: you can redistribute it and/or modify | |
7 | + * it under the terms of the GNU Lesser General Public License as published by | |
8 | 8 | * the Free Software Foundation, either version 3 of the License, or |
9 | 9 | * (at your option) any later version. |
10 | 10 | * |
11 | - * This program is distributed in the hope that it will be useful, | |
11 | + * This library is distributed in the hope that it will be useful, | |
12 | 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
14 | - * GNU General Public License for more details. | |
14 | + * GNU Lesser General Public License for more details. | |
15 | 15 | * |
16 | - * You should have received a copy of the GNU General Public License | |
16 | + * You should have received a copy of the GNU Lesser General Public License | |
17 | 17 | * along with this program. If not, see <http://www.gnu.org/licenses/>. |
18 | 18 | *) |
19 | 19 | |
... | ... |
lexSemantics/ENIAMlexSemanticsTypes.ml
0 → 100644
1 | +(* | |
2 | + * ENIAMlexSemantics is a library that assigns tokens with lexicosemantic information. | |
3 | + * Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl> | |
4 | + * Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences | |
5 | + * | |
6 | + * This library is free software: you can redistribute it and/or modify | |
7 | + * it under the terms of the GNU Lesser General Public License as published by | |
8 | + * the Free Software Foundation, either version 3 of the License, or | |
9 | + * (at your option) any later version. | |
10 | + * | |
11 | + * This library is distributed in the hope that it will be useful, | |
12 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | + * GNU Lesser General Public License for more details. | |
15 | + * | |
16 | + * You should have received a copy of the GNU Lesser General Public License | |
17 | + * along with this program. If not, see <http://www.gnu.org/licenses/>. | |
18 | + *) | |
19 | + | |
20 | +open ENIAMtokenizerTypes | |
21 | +open Xstd | |
22 | + | |
23 | +type labels = { (* five string-valued labels; the field names suggest grammatical categories — TODO confirm the value format *) | |
24 | + number: string; | |
25 | + case: string; | |
26 | + gender: string; | |
27 | + person: string; | |
28 | + aspect: string; | |
29 | + } | |
30 | + | |
31 | +type semantics = (* kind of semantic annotation stored in the semantics field of lex_sem *) | |
32 | + Normal (* the value used by empty_lex_sem *) | |
33 | + | Special of string list | |
34 | +(* | SpecialNoun of type_arg list * type_term | |
35 | + | SpecialMod of string * (type_arg list * type_term)*) | |
36 | + | PrepSemantics of (string * string * StringSet.t * string list) list (* role,role_attr,hipero,sel_prefs *) | |
37 | + | |
38 | +type lex_sem = { (* lexico-semantic record; ENIAMlexSemantics.assign adds one per token index *) | |
39 | + e: labels; | |
40 | + valence: (int * ENIAMwalTypes.frame) list; (* frames, each paired with an int *) | |
41 | + simple_valence: (int * ENIAMwalTypes.frame) list; (* same shape as valence *) | |
42 | + senses: (string * string list * float) list; (* presumably (meaning, hipero, weight), going by the commented-out single_sense code — confirm *) | |
43 | + lroles: string * string; | |
44 | + semantics: semantics; | |
45 | + } | |
46 | + | |
47 | +let empty_labels = { (* labels value with every field set to the empty string *) | |
48 | + number=""; | |
49 | + case=""; | |
50 | + gender=""; | |
51 | + person=""; | |
52 | + aspect=""; | |
53 | + } | |
54 | + | |
55 | +let empty_lex_sem = { (* default lex_sem: empty labels, empty lists, empty role pair, Normal semantics *) | |
56 | + e=empty_labels; valence=[]; simple_valence=[]; senses=[]; | |
57 | + lroles="",""; semantics=Normal} | |
58 | + | |
59 | +let proper_names_filename = resource_path ^ "/lexSemantics/proper_names_sgjp_polimorf.tab" (* first proper-names table; resource_path is presumably provided by ENIAMtokenizerTypes — confirm *) | |
60 | +let proper_names_filename2 = resource_path ^ "/lexSemantics/proper_names.tab" (* second proper-names table, read by the same loader *) | |
... | ... |
lexSemantics/README
0 → 100644
1 | +ENIAMlexSemantics Version 1.0 : | |
2 | +----------------------- | |
3 | + | |
4 | +ENIAMlexSemantics is a library that assigns lexicosemantic information to tokens. | |
5 | +It recognizes named entities and assigns thematic roles, | |
6 | +senses, valence and other semantic information to tokens. | |
7 | + | |
8 | +Install | |
9 | +------- | |
10 | + | |
11 | +ENIAMlexSemantics requires OCaml version 4.02.3 compiler | |
12 | +together with Xlib library version 3.1 or later, | |
13 | +ENIAMtokenizer library version 1.0, ENIAMmorphology library version 1.0 | |
14 | +and ENIAMsubsyntax library version 1.0. | |
15 | + | |
16 | +In order to install type: | |
17 | + | |
18 | +make install | |
19 | + | |
20 | +By default, ENIAMlexSemantics is installed in the 'ocamlc -where'/eniam directory. | |
21 | +You can change it by editing the Makefile. | |
22 | + | |
23 | +In order to test the library, type: | |
24 | +make test | |
25 | +./test | |
26 | + | |
27 | +By default ENIAMlexSemantics looks for resources in /usr/share/eniam directory. | |
28 | +However this behaviour may be changed by setting and exporting ENIAM_RESOURCE_PATH | |
29 | +environment variable. | |
30 | + | |
31 | +Credits | |
32 | +------- | |
33 | +Copyright © 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl> | |
34 | +Copyright © 2016 Institute of Computer Science Polish Academy of Sciences | |
35 | + | |
36 | +The library uses the following licensed resources: | |
37 | + | |
38 | +SGJP: Grammatical Dictionary of Polish, version 20151020 | |
39 | +Copyright © 2007–2015 Zygmunt Saloni, Włodzimierz Gruszczyński, Marcin | |
40 | +Woliński, Robert Wołosz, Danuta Skowrońska | |
41 | +http://sgjp.pl | |
42 | + | |
43 | +Licence | |
44 | +------- | |
45 | + | |
46 | +This library is free software: you can redistribute it and/or modify | |
47 | +it under the terms of the GNU Lesser General Public License as published by | |
48 | +the Free Software Foundation, either version 3 of the License, or | |
49 | +(at your option) any later version. | |
50 | + | |
51 | +This library is distributed in the hope that it will be useful, | |
52 | +but WITHOUT ANY WARRANTY; without even the implied warranty of | |
53 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
54 | +GNU Lesser General Public License for more details. | |
55 | + | |
56 | +You should have received a copy of the GNU Lesser General Public License | |
57 | +along with this program. If not, see <http://www.gnu.org/licenses/>. | |
... | ... |
lexSemantics/lgpl-3.0.txt
0 → 100644
1 | + GNU LESSER GENERAL PUBLIC LICENSE | |
2 | + Version 3, 29 June 2007 | |
3 | + | |
4 | + Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/> | |
5 | + Everyone is permitted to copy and distribute verbatim copies | |
6 | + of this license document, but changing it is not allowed. | |
7 | + | |
8 | + | |
9 | + This version of the GNU Lesser General Public License incorporates | |
10 | +the terms and conditions of version 3 of the GNU General Public | |
11 | +License, supplemented by the additional permissions listed below. | |
12 | + | |
13 | + 0. Additional Definitions. | |
14 | + | |
15 | + As used herein, "this License" refers to version 3 of the GNU Lesser | |
16 | +General Public License, and the "GNU GPL" refers to version 3 of the GNU | |
17 | +General Public License. | |
18 | + | |
19 | + "The Library" refers to a covered work governed by this License, | |
20 | +other than an Application or a Combined Work as defined below. | |
21 | + | |
22 | + An "Application" is any work that makes use of an interface provided | |
23 | +by the Library, but which is not otherwise based on the Library. | |
24 | +Defining a subclass of a class defined by the Library is deemed a mode | |
25 | +of using an interface provided by the Library. | |
26 | + | |
27 | + A "Combined Work" is a work produced by combining or linking an | |
28 | +Application with the Library. The particular version of the Library | |
29 | +with which the Combined Work was made is also called the "Linked | |
30 | +Version". | |
31 | + | |
32 | + The "Minimal Corresponding Source" for a Combined Work means the | |
33 | +Corresponding Source for the Combined Work, excluding any source code | |
34 | +for portions of the Combined Work that, considered in isolation, are | |
35 | +based on the Application, and not on the Linked Version. | |
36 | + | |
37 | + The "Corresponding Application Code" for a Combined Work means the | |
38 | +object code and/or source code for the Application, including any data | |
39 | +and utility programs needed for reproducing the Combined Work from the | |
40 | +Application, but excluding the System Libraries of the Combined Work. | |
41 | + | |
42 | + 1. Exception to Section 3 of the GNU GPL. | |
43 | + | |
44 | + You may convey a covered work under sections 3 and 4 of this License | |
45 | +without being bound by section 3 of the GNU GPL. | |
46 | + | |
47 | + 2. Conveying Modified Versions. | |
48 | + | |
49 | + If you modify a copy of the Library, and, in your modifications, a | |
50 | +facility refers to a function or data to be supplied by an Application | |
51 | +that uses the facility (other than as an argument passed when the | |
52 | +facility is invoked), then you may convey a copy of the modified | |
53 | +version: | |
54 | + | |
55 | + a) under this License, provided that you make a good faith effort to | |
56 | + ensure that, in the event an Application does not supply the | |
57 | + function or data, the facility still operates, and performs | |
58 | + whatever part of its purpose remains meaningful, or | |
59 | + | |
60 | + b) under the GNU GPL, with none of the additional permissions of | |
61 | + this License applicable to that copy. | |
62 | + | |
63 | + 3. Object Code Incorporating Material from Library Header Files. | |
64 | + | |
65 | + The object code form of an Application may incorporate material from | |
66 | +a header file that is part of the Library. You may convey such object | |
67 | +code under terms of your choice, provided that, if the incorporated | |
68 | +material is not limited to numerical parameters, data structure | |
69 | +layouts and accessors, or small macros, inline functions and templates | |
70 | +(ten or fewer lines in length), you do both of the following: | |
71 | + | |
72 | + a) Give prominent notice with each copy of the object code that the | |
73 | + Library is used in it and that the Library and its use are | |
74 | + covered by this License. | |
75 | + | |
76 | + b) Accompany the object code with a copy of the GNU GPL and this license | |
77 | + document. | |
78 | + | |
79 | + 4. Combined Works. | |
80 | + | |
81 | + You may convey a Combined Work under terms of your choice that, | |
82 | +taken together, effectively do not restrict modification of the | |
83 | +portions of the Library contained in the Combined Work and reverse | |
84 | +engineering for debugging such modifications, if you also do each of | |
85 | +the following: | |
86 | + | |
87 | + a) Give prominent notice with each copy of the Combined Work that | |
88 | + the Library is used in it and that the Library and its use are | |
89 | + covered by this License. | |
90 | + | |
91 | + b) Accompany the Combined Work with a copy of the GNU GPL and this license | |
92 | + document. | |
93 | + | |
94 | + c) For a Combined Work that displays copyright notices during | |
95 | + execution, include the copyright notice for the Library among | |
96 | + these notices, as well as a reference directing the user to the | |
97 | + copies of the GNU GPL and this license document. | |
98 | + | |
99 | + d) Do one of the following: | |
100 | + | |
101 | + 0) Convey the Minimal Corresponding Source under the terms of this | |
102 | + License, and the Corresponding Application Code in a form | |
103 | + suitable for, and under terms that permit, the user to | |
104 | + recombine or relink the Application with a modified version of | |
105 | + the Linked Version to produce a modified Combined Work, in the | |
106 | + manner specified by section 6 of the GNU GPL for conveying | |
107 | + Corresponding Source. | |
108 | + | |
109 | + 1) Use a suitable shared library mechanism for linking with the | |
110 | + Library. A suitable mechanism is one that (a) uses at run time | |
111 | + a copy of the Library already present on the user's computer | |
112 | + system, and (b) will operate properly with a modified version | |
113 | + of the Library that is interface-compatible with the Linked | |
114 | + Version. | |
115 | + | |
116 | + e) Provide Installation Information, but only if you would otherwise | |
117 | + be required to provide such information under section 6 of the | |
118 | + GNU GPL, and only to the extent that such information is | |
119 | + necessary to install and execute a modified version of the | |
120 | + Combined Work produced by recombining or relinking the | |
121 | + Application with a modified version of the Linked Version. (If | |
122 | + you use option 4d0, the Installation Information must accompany | |
123 | + the Minimal Corresponding Source and Corresponding Application | |
124 | + Code. If you use option 4d1, you must provide the Installation | |
125 | + Information in the manner specified by section 6 of the GNU GPL | |
126 | + for conveying Corresponding Source.) | |
127 | + | |
128 | + 5. Combined Libraries. | |
129 | + | |
130 | + You may place library facilities that are a work based on the | |
131 | +Library side by side in a single library together with other library | |
132 | +facilities that are not Applications and are not covered by this | |
133 | +License, and convey such a combined library under terms of your | |
134 | +choice, if you do both of the following: | |
135 | + | |
136 | + a) Accompany the combined library with a copy of the same work based | |
137 | + on the Library, uncombined with any other library facilities, | |
138 | + conveyed under the terms of this License. | |
139 | + | |
140 | + b) Give prominent notice with the combined library that part of it | |
141 | + is a work based on the Library, and explaining where to find the | |
142 | + accompanying uncombined form of the same work. | |
143 | + | |
144 | + 6. Revised Versions of the GNU Lesser General Public License. | |
145 | + | |
146 | + The Free Software Foundation may publish revised and/or new versions | |
147 | +of the GNU Lesser General Public License from time to time. Such new | |
148 | +versions will be similar in spirit to the present version, but may | |
149 | +differ in detail to address new problems or concerns. | |
150 | + | |
151 | + Each version is given a distinguishing version number. If the | |
152 | +Library as you received it specifies that a certain numbered version | |
153 | +of the GNU Lesser General Public License "or any later version" | |
154 | +applies to it, you have the option of following the terms and | |
155 | +conditions either of that published version or of any later version | |
156 | +published by the Free Software Foundation. If the Library as you | |
157 | +received it does not specify a version number of the GNU Lesser | |
158 | +General Public License, you may choose any version of the GNU Lesser | |
159 | +General Public License ever published by the Free Software Foundation. | |
160 | + | |
161 | + If the Library as you received it specifies that a proxy can decide | |
162 | +whether future versions of the GNU Lesser General Public License shall | |
163 | +apply, that proxy's public statement of acceptance of any version is | |
164 | +permanent authorization for you to choose that version for the | |
165 | +Library. | |
... | ... |
lexSemantics/makefile
0 → 100644
1 | +OCAMLC=ocamlc | |
2 | +OCAMLOPT=ocamlopt | |
3 | +OCAMLDEP=ocamldep | |
4 | +INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I +eniam | |
5 | +OCAMLFLAGS=$(INCLUDES) -g | |
6 | +OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa eniam-plWordnet.cmxa eniam-walenty.cmxa #eniam-lexSemantics.cmxa | |
7 | +INSTALLDIR=`ocamlc -where`/eniam | |
8 | + | |
9 | +SOURCES= ENIAMlexSemanticsTypes.ml ENIAMlexSemantics.ml | |
10 | + | |
11 | +all: eniam-lexSemantics.cma eniam-lexSemantics.cmxa | |
12 | +# Install the compiled library, its interfaces and the resource tables; 'ln -sf' replaces links left by an earlier install instead of failing on them. | |
13 | +install: all | |
14 | + mkdir -p $(INSTALLDIR) | |
15 | + cp eniam-lexSemantics.cmxa eniam-lexSemantics.a eniam-lexSemantics.cma $(INSTALLDIR) | |
16 | + cp ENIAMlexSemanticsTypes.cmi ENIAMlexSemantics.cmi $(INSTALLDIR) | |
17 | + cp ENIAMlexSemanticsTypes.cmx ENIAMlexSemantics.cmx $(INSTALLDIR) | |
18 | + mkdir -p /usr/share/eniam/lexSemantics | |
19 | + cp resources/* /usr/share/eniam/lexSemantics | |
20 | + ln -sf /usr/share/eniam/lexSemantics/proper_names_20160104.tab /usr/share/eniam/lexSemantics/proper_names.tab | |
21 | + ln -sf /usr/share/eniam/lexSemantics/proper_names_sgjp_polimorf_20151020.tab /usr/share/eniam/lexSemantics/proper_names_sgjp_polimorf.tab | |
22 | + | |
23 | +eniam-lexSemantics.cma: $(SOURCES) | |
24 | + ocamlc -linkall -a -o eniam-lexSemantics.cma $(OCAMLFLAGS) $^ | |
25 | + | |
26 | +eniam-lexSemantics.cmxa: $(SOURCES) | |
27 | + ocamlopt -linkall -a -o eniam-lexSemantics.cmxa $(INCLUDES) $^ | |
28 | + | |
29 | +test: test.ml | |
30 | + $(OCAMLOPT) -o test $(OCAMLOPTFLAGS) $(SOURCES) test.ml | |
31 | + | |
32 | +.SUFFIXES: .mll .mly .ml .mli .cmo .cmi .cmx | |
33 | + | |
34 | +.mll.ml: | |
35 | + ocamllex $< | |
36 | + | |
37 | +.mly.mli: | |
38 | + ocamlyacc $< | |
39 | + | |
40 | +.mly.ml: | |
41 | + ocamlyacc $< | |
42 | + | |
43 | +.ml.cmo: | |
44 | + $(OCAMLC) $(OCAMLFLAGS) -c $< | |
45 | +# Compile .mli interfaces with $(OCAMLFLAGS) so the include paths from $(INCLUDES) reach the compiler. | |
46 | +.mli.cmi: | |
47 | + $(OCAMLC) $(OCAMLFLAGS) -c $< | |
48 | + | |
49 | +.ml.cmx: | |
50 | + $(OCAMLOPT) $(OCAMLOPTFLAGS) -c $< | |
51 | + | |
52 | +clean: | |
53 | + rm -f *~ *.cm[aoix] *.o *.so *.cmxa *.a test | |
... | ... |
resources/SGJP/README renamed to lexSemantics/resources/README
1 | -Files in this folder were created on the basis of | |
1 | +File proper_names_sgjp_polimorf_20151020.tab in this folder was created on the basis of | |
2 | + | |
2 | 3 | SGJP: Grammatical Dictionary of Polish, version 20151020 |
3 | 4 | Copyright © 2007–2015 Zygmunt Saloni, Włodzimierz Gruszczyński, Marcin |
4 | 5 | Woliński, Robert Wołosz, Danuta Skowrońska |
... | ... |
resources/proper_names_20160104.tab renamed to lexSemantics/resources/proper_names_20160104.tab
resources/SGJP/proper_names_sgjp_polimorf_20151020.tab renamed to lexSemantics/resources/proper_names_sgjp_polimorf_20151020.tab
lexSemantics/test.ml
0 → 100644
1 | +(* | |
2 | + * ENIAMlexSemantics is a library that assigns tokens with lexicosemantic information. | |
3 | + * Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl> | |
4 | + * Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences | |
5 | + * | |
6 | + * This library is free software: you can redistribute it and/or modify | |
7 | + * it under the terms of the GNU Lesser General Public License as published by | |
8 | + * the Free Software Foundation, either version 3 of the License, or | |
9 | + * (at your option) any later version. | |
10 | + * | |
11 | + * This library is distributed in the hope that it will be useful, | |
12 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | + * GNU Lesser General Public License for more details. | |
15 | + * | |
16 | + * You should have received a copy of the GNU Lesser General Public License | |
17 | + * along with this program. If not, see <http://www.gnu.org/licenses/>. | |
18 | + *) | |
19 | + | |
20 | +(* Sentences processed by the built-in test run; entries enclosed in comments are currently disabled. *) | |
21 | +let test_strings = [ | |
22 | + "Szpak frunie zimą."; | |
23 | + "Kot miauczy w październiku."; | |
24 | +(* "a gdybym miałem"; | |
25 | + "A Gdy Miałem"; | |
26 | + "GDY MIAŁEM"; | |
27 | + "I II III IV V VI VII VIII IX X MCXIV MXC"; | |
28 | + "Kiedy Piotr Prabucki, przewodniczący Komisji Budżetu PeKaO"; | |
29 | + "25 idzie 20."; | |
30 | + "Kot. Kot. kot."; | |
31 | + "25."; | |
32 | + "25.888.231"; | |
33 | + "Ala 25.888.231.111 ma."; | |
34 | + "Ala 25.888.031,011."; | |
35 | + "Ala -25.888.031,011."; | |
36 | + "Ala -25 ."; | |
37 | + "Ala -1° C 3° ciepła 20—30°C od 180° do 260°C około 6° poniżej horyzontu."; | |
38 | + "Ala 22-25 ."; | |
39 | + "Ala 22.5.2000-25.5.2001 .";*) | |
40 | +(* "Np. Ala.";*) | |
41 | + (* "w. dom."; | |
42 | + "tzn."; | |
43 | + "c.d.n."; *) | |
44 | +(* "Arabia Saudyjska biegnie."; | |
45 | + "Cauchy'ego ONZ-owska biegnie.";*) | |
46 | + (* "TE-cie E-e."; | |
47 | + "MS-DOS-owska CI-cie KRRi-cie UJ-ocie UJ-OCIE."; | |
48 | + "rock'n'rollowy d’Alembertowi staro-cerkiewno-słowiańskimi"; *) | |
49 | +(* "Tom idzie.";*) | |
50 | + (* "Miałem miał."; *) | |
51 | +(* "Szpak śpiewa."; | |
52 | + "Ala ma kota."; | |
53 | + "Ale mają kota:"*) | |
54 | + ] | |
55 | +(* Built-in test driver: for each sentence in test_strings it calls ENIAMsubsyntax.parse_text, passes the result to ENIAMlexSemantics.assign and prints every token. The assign result is not printed yet, so it is bound to _lex_sems to avoid an unused-variable warning. *) | |
56 | +let () = | |
57 | + print_endline "Testy wbudowane"; | |
58 | + Xlist.iter test_strings (fun s -> | |
59 | + print_endline ("\nTEST: " ^ s); | |
60 | + let text,tokens = ENIAMsubsyntax.parse_text s in | |
61 | + let _lex_sems = ENIAMlexSemantics.assign tokens text in | |
62 | + (* print_endline (ENIAMtokenizer.xml_of tokens); *) | |
63 | + Xlist.iter tokens (fun token -> print_endline (ENIAMtokenizer.string_of 0 token))); | |
64 | +(* print_endline "Testy użytkownika."; | |
65 | + print_endline "Wpisz tekst i naciśnij ENTER, pusty tekst kończy."; | |
66 | + let s = ref (read_line ()) in | |
67 | + while !s <> "" do | |
68 | + let tokens = ENIAMtokenizer.parse !s in | |
69 | + (* print_endline (ENIAMtokenizer.xml_of tokens); *) | |
70 | + Xlist.iter tokens (fun token -> print_endline (ENIAMtokenizer.string_of 0 token)); | |
71 | + print_endline "Wpisz tekst i naciśnij ENTER, pusty tekst kończy."; | |
72 | + s := read_line () | |
73 | + done;*) | |
74 | + () | |
... | ... |
tokenizer/ENIAMtokenizerTypes.ml
... | ... | @@ -42,7 +42,7 @@ type token = |
42 | 42 | |
43 | 43 | (* Tekst reprezentuję jako zbiór obiektów typu token_record zawierających |
44 | 44 | informacje o poszczególnych tokenach *) |
45 | -and token_record = { | |
45 | +type token_record = { | |
46 | 46 | orth: string; (* sekwencja znaków pierwotnego tekstu składająca się na token *) |
47 | 47 | corr_orth: string; (* sekwencja znaków pierwotnego tekstu składająca się na token z poprawionymi błędami *) |
48 | 48 | beg: int; (* pozycja początkowa tokenu względem początku akapitu *) |
... | ... |