Commit 282d8aec4adcce710d0dd8181d56b8723fe1ba41

Authored by Wojciech Jaworski
1 parent d3e13e84

wstępna wersja biblioteki eniam-lexSemantics-1.0

integration/README
... ... @@ -8,8 +8,9 @@ Install
8 8  
9 9 ENIAMintegration requires OCaml version 4.02.3 compiler
10 10 together with Xlib library version 3.1 or later,
11   -ENIAMtokenizer library version 1.0, ENIAMmorphology library version 1.0
12   -and ENIAMsubsyntax library version 1.0.
  11 +ENIAMtokenizer library version 1.0, ENIAMmorphology library version 1.0,
  12 +ENIAMsubsyntax library version 1.0, ENIAMwalenty library version 1.0
  13 +and ENIAMplWordnet library version 1.0.
13 14  
14 15 In order to install type:
15 16  
... ...
integration/makefile
... ... @@ -3,7 +3,7 @@ OCAMLOPT=ocamlopt
3 3 OCAMLDEP=ocamldep
4 4 INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I +eniam
5 5 OCAMLFLAGS=$(INCLUDES) -g
6   -OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa
  6 +OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa eniam-integration.cmxa
7 7 INSTALLDIR=`ocamlc -where`/eniam
8 8  
9 9 SOURCES= ENIAM_CONLL.ml ENIAMpreIntegration.ml
... ... @@ -25,7 +25,7 @@ eniam-integration.cmxa: $(SOURCES)
25 25 ocamlopt -linkall -a -o eniam-integration.cmxa $(INCLUDES) $^
26 26  
27 27 test: test.ml
28   - $(OCAMLOPT) -o test $(OCAMLOPTFLAGS) $(SOURCES) test.ml
  28 + $(OCAMLOPT) -o test $(OCAMLOPTFLAGS) test.ml
29 29  
30 30 concraft_test: concraft_test.ml
31 31 $(OCAMLOPT) -o concraft_test $(OCAMLOPTFLAGS) concraft_test.ml
... ... @@ -51,4 +51,4 @@ concraft_test: concraft_test.ml
51 51 $(OCAMLOPT) $(OCAMLOPTFLAGS) -c $<
52 52  
53 53 clean:
54   - rm -f *~ *.cm[aoix] *.o *.so *.cmxa *.a test
  54 + rm -f *~ *.cm[aoix] *.o *.so *.cmxa *.a test concraft_test
... ...
integration/test deleted
No preview for this file type
lexSemantics/ENIAMlexSemantics.ml 0 → 100644
  1 +(*
  2 + * ENIAMlexSemantics is a library that assigns tokens with lexicosemantic information.
  3 + * Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
  4 + * Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
  5 + *
  6 + * This library is free software: you can redistribute it and/or modify
  7 + * it under the terms of the GNU Lesser General Public License as published by
  8 + * the Free Software Foundation, either version 3 of the License, or
  9 + * (at your option) any later version.
  10 + *
  11 + * This library is distributed in the hope that it will be useful,
  12 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14 + * GNU Lesser General Public License for more details.
  15 + *
  16 + * You should have received a copy of the GNU Lesser General Public License
  17 + * along with this program. If not, see <http://www.gnu.org/licenses/>.
  18 + *)
  19 +
  20 +open ENIAMtokenizerTypes
  21 +open ENIAMlexSemanticsTypes
  22 +open Xstd
  23 +
  24 +let load_proper_name proper = function
  25 + [lemma; types] ->
  26 + let types = Str.split (Str.regexp "|") types in
  27 + StringMap.add_inc proper lemma types (fun types2 -> types @ types2)
  28 + | l -> failwith ("proper_names: " ^ String.concat " " l)
  29 +
  30 +let proper_names =
  31 + let proper = File.fold_tab proper_names_filename StringMap.empty load_proper_name in
  32 + let proper = File.fold_tab proper_names_filename2 StringMap.empty load_proper_name in
  33 + proper
  34 +
  35 +let remove l s =
  36 + Xlist.fold l [] (fun l t ->
  37 + if s = t then l else t :: l)
  38 +
  39 +let find_proper_names tokens i t =
  40 + match t.token with
  41 + Lemma(lemma,pos,interp) ->
  42 + if StringMap.mem proper_names lemma then
  43 + let t = {t with token=Proper(lemma,pos,interp,StringMap.find proper_names lemma);
  44 + attrs=remove t.attrs "notvalidated proper"} in
  45 + ExtArray.set tokens i t else
  46 + if Xlist.mem t.attrs "notvalidated proper" then
  47 + let t = {t with token=Proper(lemma,pos,interp,[])} in
  48 + ExtArray.set tokens i t
  49 + | _ -> ()
  50 +
  51 +let find_senses t = (* FIXME: sensy zawierające 'się' *)
  52 + match t.token with
  53 + Lemma(lemma,pos,_) -> ENIAMplWordnet.find_senses lemma pos
  54 + | Proper(_,_,_,senses) -> ENIAMplWordnet.find_proper_senses senses
  55 + | _ -> []
  56 +
  57 +
  58 +let assign tokens text =
  59 + let lex_sems = ExtArray.make (ExtArray.size tokens) empty_lex_sem in
  60 + let _ = ExtArray.add lex_sems empty_lex_sem in
  61 + Int.iter 1 (ExtArray.size tokens - 1) (fun i ->
  62 + let token = ExtArray.get tokens i in
  63 + find_proper_names tokens i token;
  64 + let senses = find_senses token in
  65 + let lex_sem = {empty_lex_sem with senses=senses} in
  66 + let j = ExtArray.add lex_sems lex_sem in
  67 + if j <> i then failwith "assign_semantic_valence")
  68 +
  69 +(*
  70 +(* print_endline "a14"; *)
  71 + let paths = assign_valence paths in
  72 +(* print_endline "a15"; *)
  73 +(* print_endline "a16"; *)
  74 + let paths = disambiguate_senses paths in
  75 + let paths = assign_simplified_valence paths in
  76 + let paths = PreSemantics.assign_semantics paths in
  77 +(* print_endline "a16"; *)
  78 +
  79 +
  80 +
  81 +
  82 +
  83 +let assign_valence paths =
  84 + let lexemes = Xlist.fold paths StringMap.empty (fun lexemes t ->
  85 + match t.token with
  86 + Lemma(lemma,pos,_) ->
  87 + StringMap.add_inc lexemes lemma (StringSet.singleton pos) (fun set -> StringSet.add set pos)
  88 + | Proper(lemma,pos,_,_) ->
  89 + let pos = match pos with
  90 + "subst" -> "psubst"
  91 + | "depr" -> "pdepr"
  92 + | _ -> pos (*failwith ("assign_valence: Proper " ^ pos ^ " " ^ lemma)*) in
  93 + StringMap.add_inc lexemes lemma (StringSet.singleton pos) (fun set -> StringSet.add set pos) (* nazwy własne mają przypisywaną domyślną walencję rzeczowników *)
  94 + | _ -> lexemes) in
  95 + let valence = WalFrames.find_frames lexemes in
  96 + List.rev (Xlist.rev_map paths (fun t ->
  97 + match t.token with
  98 + Lemma(lemma,pos,_) -> {t with valence=try Xlist.rev_map (StringMap.find (StringMap.find valence lemma) pos) (fun frame -> 0,frame) with Not_found -> []}
  99 + | Proper(lemma,pos,interp,_) -> {t with valence=(try Xlist.rev_map (StringMap.find (StringMap.find valence lemma)
  100 + (if pos = "subst" || pos = "depr" then "p" ^ pos else pos)) (fun frame -> 0,frame) with Not_found -> [](*failwith ("assign_valence: Proper(" ^ lemma ^ "," ^ pos ^ ")")*));
  101 + token=Lemma(lemma,pos,interp)}
  102 + | _ -> t))
  103 +
  104 +(**********************************************************************************)
  105 +
  106 +(* let prepare_indexes (paths,_) =
  107 + let set = Xlist.fold paths IntSet.empty (fun set t ->
  108 + IntSet.add (IntSet.add set t.beg) t.next) in
  109 + let map,last = Xlist.fold (Xlist.sort (IntSet.to_list set) compare) (IntMap.empty,0) (fun (map,n) x ->
  110 + IntMap.add map x n, n+1) in
  111 + List.rev (Xlist.rev_map paths (fun t ->
  112 + {t with lnode=IntMap.find map t.beg; rnode=IntMap.find map t.next})), last - 1 *)
  113 +
  114 +let get_prefs_schema prefs schema =
  115 + Xlist.fold schema prefs (fun prefs t ->
  116 + Xlist.fold t.WalTypes.sel_prefs prefs StringSet.add)
  117 +
  118 +let map_prefs_schema senses schema =
  119 + Xlist.map schema (fun t ->
  120 + if Xlist.mem t.WalTypes.morfs (WalTypes.Phrase WalTypes.Pro) || Xlist.mem t.WalTypes.morfs (WalTypes.Phrase WalTypes.ProNG) then t else
  121 + {t with WalTypes.sel_prefs = Xlist.fold t.WalTypes.sel_prefs [] (fun l s ->
  122 + if StringSet.mem senses s then s :: l else l)})
  123 +
  124 +let disambiguate_senses paths =
  125 + let prefs = Xlist.fold paths (StringSet.singleton "ALL") (fun prefs t ->
  126 + Xlist.fold t.valence prefs (fun prefs -> function
  127 + _,WalTypes.Frame(_,schema) -> get_prefs_schema prefs schema
  128 + | _,WalTypes.LexFrame(_,_,_,schema) -> get_prefs_schema prefs schema
  129 + | _,WalTypes.ComprepFrame(_,_,_,schema) -> get_prefs_schema prefs schema)) in
  130 + let hipero = Xlist.fold paths (StringSet.singleton "ALL") (fun hipero t ->
  131 + Xlist.fold t.senses hipero (fun hipero (_,l,_) ->
  132 + Xlist.fold l hipero StringSet.add)) in
  133 + let senses = StringSet.intersection prefs hipero in
  134 + let is_zero = StringSet.mem hipero "0" in
  135 + let senses = if is_zero then StringSet.add senses "0" else senses in
  136 + Xlist.map paths (fun t ->
  137 + {t with valence = if is_zero then t.valence else
  138 + Xlist.map t.valence (function
  139 + n,WalTypes.Frame(a,schema) -> n,WalTypes.Frame(a,map_prefs_schema senses schema)
  140 + | n,WalTypes.LexFrame(s,p,r,schema) -> n,WalTypes.LexFrame(s,p,r,map_prefs_schema senses schema)
  141 + | n,WalTypes.ComprepFrame(s,p,r,schema) -> n,WalTypes.ComprepFrame(s,p,r,map_prefs_schema senses schema));
  142 + senses = Xlist.map t.senses (fun (s,l,w) ->
  143 + s, List.rev (Xlist.fold l [] (fun l s -> if StringSet.mem senses s then s :: l else l)),w)})
  144 +
  145 +(*let single_sense (paths,last) =
  146 + List.rev (Xlist.rev_map paths (fun t ->
  147 + let sense =
  148 + if t.senses = [] then [] else
  149 + [Xlist.fold t.senses ("",[],-.max_float) (fun (max_meaning,max_hipero,max_weight) (meaning,hipero,weight) ->
  150 + if max_weight >= weight then max_meaning,max_hipero,max_weight else meaning,hipero,weight)] in
  151 + {t with senses=sense})), last*)
  152 +
  153 +open WalTypes
  154 +
  155 +(*let single_schema schemata =
  156 + let map = Xlist.fold schemata StringMap.empty (fun map schema ->
  157 + let t = WalStringOf.schema (List.sort compare (Xlist.fold schema [] (fun l s ->
  158 + if s.gf <> ARG && s.gf <> ADJUNCT then {s with role=""; role_attr=""; sel_prefs=[]} :: l else
  159 + if s.cr <> [] || s.ce <> [] then {s with role=""; role_attr=""; sel_prefs=[]} :: l else l))) in
  160 + StringMap.add_inc map t [schema] (fun l -> schema :: l)) in
  161 + StringMap.fold map [] (fun l _ schemata ->
  162 + let map = Xlist.fold schemata StringMap.empty (fun map schema ->
  163 + Xlist.fold schema map (fun map s ->
  164 + let t = WalStringOf.schema [{s with role=""; role_attr=""; sel_prefs=[]}] in
  165 + StringMap.add_inc map t [s] (fun l -> s :: l))) in
  166 + let schema = StringMap.fold map [] (fun schema _ l ->
  167 + let s = List.hd l in
  168 + {s with sel_prefs=Xlist.fold s.sel_prefs [] (fun l t -> if t = "0" || t = "T" then t :: l else l)} :: schema) in
  169 + schema :: l)*)
  170 +
  171 +let remove_meaning = function
  172 + DefaultAtrs(m,r,o,neg,p,a) -> DefaultAtrs([],r,o,neg,p,a)
  173 + | EmptyAtrs m -> EmptyAtrs []
  174 + | NounAtrs(m,nsyn,s(*,typ*)) -> NounAtrs([],nsyn,s(*,typ*))
  175 + | AdjAtrs(m,c,adjsyn(*,adjsem,typ*)) -> AdjAtrs([],c,adjsyn(*,adjsem,typ*))
  176 + | PersAtrs(m,le,neg,mo,t,au,a) -> PersAtrs([],le,neg,mo,t,au,a)
  177 + | GerAtrs(m,le,neg,a) -> GerAtrs([],le,neg,a)
  178 + | NonPersAtrs(m,le,role,role_attr,neg,a) -> NonPersAtrs([],le,role,role_attr,neg,a)
  179 + | _ -> failwith "remove_meaning"
  180 +
  181 +
  182 +(*let single_frame (paths,last) =
  183 + List.rev (Xlist.rev_map paths (fun t ->
  184 + let lex_frames,frames = Xlist.fold t.valence ([],StringMap.empty) (fun (lex_frames,frames) -> function
  185 + Frame(attrs,schema) ->
  186 + let attrs = remove_meaning attrs in
  187 + lex_frames, StringMap.add_inc frames (WalStringOf.frame_atrs attrs) (attrs,[schema]) (fun (_,l) -> attrs, schema :: l)
  188 + | frame -> frame :: lex_frames, frames) in
  189 + let frames = StringMap.fold frames lex_frames (fun frames _ (attrs,schemata) ->
  190 + Xlist.fold (single_schema schemata) frames (fun frames frame -> Frame(attrs,frame) :: frames)) in
  191 + {t with valence=frames})), last *)
  192 +
  193 +let simplify_position_verb l = function (* FIXME: dodać czyszczenie E Pro *)
  194 + Phrase(NP(Case "dat")) -> l
  195 + | Phrase(NP(Case "inst")) -> l
  196 + | Phrase(PrepNP _) -> l
  197 + | Phrase(PrepAdjP _) -> l
  198 + | Phrase(NumP (Case "dat")) -> l
  199 + | Phrase(NumP (Case "inst")) -> l
  200 + | Phrase(PrepNumP _) -> l
  201 + | Phrase(ComprepNP _) -> l
  202 + | Phrase(ComparNP _) -> l
  203 + | Phrase(ComparPP _) -> l
  204 + | Phrase(IP) -> l
  205 + | Phrase(CP _) -> l
  206 + | Phrase(NCP(Case "dat",_,_)) -> l
  207 + | Phrase(NCP(Case "inst",_,_)) -> l
  208 + | Phrase(PrepNCP _) -> l
  209 +(* | Phrase(PadvP) -> l *)
  210 + | Phrase(AdvP) -> l
  211 + | Phrase(PrepP) -> l
  212 + | Phrase(Or) -> l
  213 + | Phrase(Qub) -> l
  214 + | Phrase(Adja) -> l
  215 + | Phrase(Inclusion) -> l
  216 + | Phrase Pro -> Phrase Null :: l
  217 + | t -> t :: l
  218 +
  219 +let simplify_position_noun l = function
  220 + Phrase(NP(Case "gen")) -> l
  221 + | Phrase(NP(Case "nom")) -> l
  222 + | Phrase(NP(CaseAgr)) -> l
  223 + | Phrase(PrepNP _) -> l
  224 + | Phrase(AdjP AllAgr) -> l
  225 + | Phrase(NumP (Case "gen")) -> l
  226 + | Phrase(NumP (Case "nom")) -> l
  227 + | Phrase(NumP (CaseAgr)) -> l
  228 + | Phrase(PrepNumP _) -> l
  229 + | Phrase(ComprepNP _) -> l
  230 + | Phrase(ComparNP _) -> l
  231 + | Phrase(ComparPP _) -> l
  232 + | Phrase(IP) -> l
  233 + | Phrase(NCP(Case "gen",_,_)) -> l
  234 + | Phrase(PrepNCP _) -> l
  235 + | Phrase(PrepP) -> l
  236 + | Phrase(Qub) -> l
  237 + | Phrase(Adja) -> l
  238 + | Phrase(Inclusion) -> l
  239 + | Phrase Pro -> Phrase Null :: l
  240 + | t -> t :: l
  241 +
  242 +let simplify_position_adj l = function
  243 + Phrase(AdvP) -> l
  244 + | t -> t :: l
  245 +
  246 +let simplify_position_adv l = function
  247 + Phrase(AdvP) -> l
  248 + | t -> t :: l
  249 +
  250 +
  251 +let simplify_position pos l s =
  252 + let morfs = match pos with
  253 + "verb" -> List.rev (Xlist.fold s.morfs [] simplify_position_verb)
  254 + | "noun" -> List.rev (Xlist.fold s.morfs [] simplify_position_noun)
  255 + | "adj" -> List.rev (Xlist.fold s.morfs [] simplify_position_adj)
  256 + | "adv" -> List.rev (Xlist.fold s.morfs [] simplify_position_adv)
  257 + | _ -> s.morfs in
  258 + match morfs with
  259 + [] -> l
  260 + | [Phrase Null] -> l
  261 + | _ -> {s with morfs=morfs} :: l
  262 +
  263 +let simplify_schemata pos schemata =
  264 + let schemata = Xlist.fold schemata StringMap.empty (fun schemata (schema,frame) ->
  265 + let schema = List.sort compare (Xlist.fold schema [] (fun l s ->
  266 + let s = {s with role=""; role_attr=""; sel_prefs=[]; cr=[]; ce=[]; morfs=List.sort compare s.morfs} in
  267 + if s.gf <> ARG && s.gf <> ADJUNCT then s :: l else
  268 +(* if s.cr <> [] || s.ce <> [] then s :: l else *)
  269 + simplify_position pos l s)) in
  270 + StringMap.add_inc schemata (WalStringOf.schema schema) (schema,[frame]) (fun (_,frames) -> schema, frame :: frames)) in
  271 + StringMap.fold schemata [] (fun l _ s -> s :: l)
  272 +
  273 +(* FIXME: problem ComprepNP i PrepNCP *)
  274 +(* FIXME: problem gdy ten sam token występuje w kilku ścieżkach *)
  275 +let generate_verb_prep_adjuncts preps =
  276 + Xlist.map preps (fun (lemma,case) -> WalFrames.verb_prep_adjunct_schema_field lemma case)
  277 +
  278 +let generate_verb_comprep_adjuncts compreps =
  279 + Xlist.map compreps (fun lemma -> WalFrames.verb_comprep_adjunct_schema_field lemma)
  280 +
  281 +let generate_verb_compar_adjuncts compars =
  282 + Xlist.map compars (fun lemma -> WalFrames.verb_compar_adjunct_schema_field lemma)
  283 +
  284 +let generate_noun_prep_adjuncts preps =
  285 + WalFrames.noun_prep_adjunct_schema_field preps
  286 +
  287 +let generate_noun_compar_adjuncts compars =
  288 + WalFrames.noun_compar_adjunct_schema_field compars
  289 +
  290 +let generate_adj_compar_adjuncts compars =
  291 + WalFrames.noun_compar_adjunct_schema_field compars
  292 +
  293 +let compars = StringSet.of_list ["jak";"jako";"niż";"niczym";"niby";"co"]
  294 +
  295 +let generate_prep_adjunct_tokens paths =
  296 + let map = Xlist.fold paths StringMap.empty (fun map t ->
  297 + match t.token with
  298 + Lemma(lemma,"prep",interp) ->
  299 + let map = if lemma = "po" then StringMap.add map "po:postp" ("po","postp") else map in
  300 + if StringSet.mem compars lemma then map else
  301 + Xlist.fold interp map (fun map -> function
  302 + [cases] -> Xlist.fold cases map (fun map case -> StringMap.add map (lemma ^ ":" ^ case) (lemma,case))
  303 + | [cases;_] -> Xlist.fold cases map (fun map case -> StringMap.add map (lemma ^ ":" ^ case) (lemma,case))
  304 + | _ -> map)
  305 + | _ -> map) in
  306 + StringMap.fold map [] (fun l _ v -> v :: l)
  307 +
  308 +let generate_comprep_adjunct_tokens paths =
  309 + let lemmas = Xlist.fold paths StringSet.empty (fun lemmas t ->
  310 + match t.token with
  311 + Lemma(lemma,_,_) -> StringSet.add lemmas lemma
  312 + | _ -> lemmas) in
  313 + StringMap.fold WalFrames.comprep_reqs [] (fun compreps comprep reqs ->
  314 + let b = Xlist.fold reqs true (fun b s -> b && StringSet.mem lemmas s) in
  315 + if b then comprep :: compreps else compreps)
  316 +
  317 +let generate_compar_adjunct_tokens paths =
  318 + let set = Xlist.fold paths StringSet.empty (fun set t ->
  319 + match t.token with
  320 + Lemma(lemma,"prep",interp) ->
  321 + if not (StringSet.mem compars lemma) then set else
  322 + StringSet.add set lemma
  323 + | _ -> set) in
  324 + StringSet.to_list set
  325 +
  326 +let is_measure = function
  327 + NounAtrs(_,_,Common "measure") -> true
  328 + | _ -> false
  329 +
  330 +let assign_simplified_valence paths =
  331 + let preps = generate_prep_adjunct_tokens paths in
  332 + let compreps = generate_comprep_adjunct_tokens paths in
  333 + let compars = generate_compar_adjunct_tokens paths in
  334 + let verb_prep_adjuncts = generate_verb_prep_adjuncts preps in
  335 + let verb_comprep_adjuncts = generate_verb_comprep_adjuncts compreps in
  336 + let verb_compar_adjuncts = generate_verb_compar_adjuncts compars in
  337 + let noun_prep_adjuncts = generate_noun_prep_adjuncts preps compreps in
  338 + let noun_compar_adjuncts = generate_noun_compar_adjuncts compars in
  339 + let adj_compar_adjuncts = generate_adj_compar_adjuncts compars in
  340 + let verb_adjuncts = WalFrames.verb_adjuncts_simp @ verb_prep_adjuncts @ verb_comprep_adjuncts @ verb_compar_adjuncts in
  341 + let noun_adjuncts = WalFrames.noun_adjuncts_simp @ [noun_prep_adjuncts] @ [noun_compar_adjuncts] in
  342 + let noun_measure_adjuncts = WalFrames.noun_measure_adjuncts_simp @ [noun_prep_adjuncts] @ [noun_compar_adjuncts] in
  343 + let adj_adjuncts = WalFrames.adj_adjuncts_simp @ [adj_compar_adjuncts] in
  344 + let adv_adjuncts = WalFrames.adv_adjuncts_simp @ [adj_compar_adjuncts] in
  345 + List.rev (Xlist.rev_map paths (fun t ->
  346 + let pos = match t.token with
  347 + Lemma(_,pos,_) -> WalFrames.simplify_pos pos
  348 + | _ -> "" in
  349 + let lex_frames,frames = Xlist.fold t.valence ([],StringMap.empty) (fun (lex_frames,frames) -> function
  350 + _,(Frame(attrs,schema) as frame) ->
  351 + let attrs = remove_meaning attrs in
  352 + lex_frames, StringMap.add_inc frames (WalStringOf.frame_atrs attrs) (attrs,[schema,frame]) (fun (_,l) -> attrs, (schema,frame) :: l)
  353 + | _,frame -> frame :: lex_frames, frames) in
  354 + let simp_frames,full_frames,n = Xlist.fold lex_frames ([],[],1) (fun (simp_frames,full_frames,n) frame ->
  355 + (n,frame) :: simp_frames, (n,frame) :: full_frames, n+1) in
  356 + let simp_frames,full_frames,_ = StringMap.fold frames (simp_frames,full_frames,n) (fun (simp_frames,full_frames,n) _ (attrs,schemata) ->
  357 + Xlist.fold (simplify_schemata pos schemata) (simp_frames,full_frames,n) (fun (simp_frames,full_frames,n) (schema,frames) ->
  358 + let schema = match pos with
  359 + "verb" -> schema @ verb_adjuncts
  360 + | "noun" -> schema @ (if is_measure attrs then noun_measure_adjuncts else noun_adjuncts)
  361 + | "adj" -> schema @ adj_adjuncts
  362 + | "adv" -> schema @ adv_adjuncts
  363 + | _ -> schema in
  364 + (n,Frame(attrs,schema)) :: simp_frames,
  365 + Xlist.fold frames full_frames (fun full_frames frame -> (n,frame) :: full_frames),
  366 + n+1)) in
  367 + {t with simple_valence=simp_frames; valence=full_frames}))
  368 +
  369 +(* FIXME: dodać do walencji preferencje selekcyjne nadrzędników symboli: dzień, godzina, rysunek itp. *)
  370 +(* FIXME: sprawdzić czy walencja nazw własnych jest dobrze zrobiona. *)
  371 +
  372 +(* let first_id = 1 (* id=0 jest zarezerwowane dla pro; FIXME: czy to jest jeszcze aktualne? *)
  373 +
  374 +let add_ids (paths,last) next_id =
  375 + let paths,next_id = Xlist.fold ((*List.rev*) paths) ([],next_id) (fun (paths,id) t ->
  376 + {t with id=id} :: paths, id+1) in
  377 + (paths,last),next_id *)
  378 +
  379 +
  380 +
  381 +let parse query =
  382 +(* print_endline "a1"; *)
  383 + let l = Xunicode.classified_chars_of_utf8_string query in
  384 +(* print_endline "a2"; *)
  385 + let l = PreTokenizer.tokenize l in
  386 +(* print_endline "a3"; *)
  387 + let l = PrePatterns.normalize_tokens [] l in
  388 +(* print_endline "a4"; *)
  389 + let l = PrePatterns.find_replacement_patterns l in
  390 +(* print_endline "a5"; *)
  391 + let l = PrePatterns.remove_spaces [] l in
  392 + let l = PrePatterns.find_abr_patterns PreAcronyms.abr_patterns l in
  393 + let l = PrePatterns.normalize_tokens [] l in
  394 +(* print_endline "a6"; *)
  395 + let paths = PrePaths.translate_into_paths l in
  396 +(* print_endline "a7"; *)
  397 + let paths = PrePaths.lemmatize paths in
  398 +(* print_endline "a8"; *)
  399 + let paths,_ = PreMWE.process paths in
  400 +(* print_endline "a12"; *)
  401 + let paths = find_proper_names paths in
  402 +(* print_endline "a13"; *)
  403 + let paths = modify_weights paths in
  404 + let paths = translate_digs paths in
  405 + let paths = assign_senses paths in
  406 +(* print_endline "a14"; *)
  407 + let paths = assign_valence paths in
  408 +(* print_endline "a15"; *)
  409 + let paths = combine_interps paths in
  410 +(* print_endline "a16"; *)
  411 + let paths = disambiguate_senses paths in
  412 + let paths = assign_simplified_valence paths in
  413 + let paths = PreSemantics.assign_semantics paths in
  414 +(* print_endline "a16"; *)
  415 + let paths = select_tokens paths in
  416 +(* print_endline "a17"; *)
  417 +(* let paths = if !single_sense_flag then single_sense paths else paths in
  418 + let paths = if !single_frame_flag then single_frame paths else paths in*)
  419 + (*let paths, next_id = add_ids paths next_id in
  420 + let paths = prepare_indexes paths in*)
  421 +(* print_endline "a18"; *)
  422 + paths(*, next_id*)
  423 +(* print_endline (PrePaths.to_string paths); *)
  424 +(* let paths =
  425 + if PrePaths.no_possible_path (PrePaths.map paths PreLemmatization.remove_postags) then
  426 + PrePaths.map paths process_ign
  427 + else paths in
  428 + let paths = PrePaths.map paths PreLemmatization.remove_postags in
  429 + let paths = PreCaseShift.manage_lower_upper_case paths in (* FIXME: niepotrzebnie powiększa pierwszy token (przymiotniki partykuły itp.) *)
  430 + let paths = PreLemmatization.combine_interps paths in
  431 +(* print_endline (PrePaths.to_string paths); *)*)
  432 +
  433 +let parse_conll tokens dep_paths = (* FIXME: sprawdzić, czy zachowana jest kolejność elementów paths !!! *)
  434 + let paths = List.rev (Int.fold 1 (Array.length dep_paths - 1) [] (fun paths conll_id ->
  435 + let id,_,_ = dep_paths.(conll_id) in
  436 + ExtArray.get tokens id :: paths)) in
  437 + (* print_endline "a12"; *)
  438 + let paths = find_proper_names paths in
  439 + (* print_endline "a13"; *)
  440 + let paths = modify_weights paths in
  441 + let paths = PreWordnet.assign_senses paths in
  442 + (* print_endline "a14"; *)
  443 + (* let paths = combine_interps paths in (* FIXME: to powinno też działać dla Proper *) *)
  444 + (* print_endline "a15"; *)
  445 + let paths = assign_valence paths in
  446 + (* print_endline "a16"; *)
  447 + let paths = disambiguate_senses paths in
  448 + let paths = assign_simplified_valence paths in
  449 + let paths = PreSemantics.assign_semantics paths in
  450 + (* print_endline "a16"; *)
  451 + let _ = Xlist.fold paths 1 (fun conll_id t ->
  452 + let id,_,_ = dep_paths.(conll_id) in
  453 + ExtArray.set tokens id t;
  454 + conll_id + 1) in
  455 + ()
  456 +*)
... ...
pre/preSemantics.ml renamed to lexSemantics/ENIAMlexSemanticsData.ml
1 1 (*
2   - * ENIAM: Categorial Syntactic-Semantic Parser for Polish
  2 + * ENIAMlexSemantics is a library that assigns tokens with lexicosemantic information.
3 3 * Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
4 4 * Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
5 5 *
6   - * This program is free software: you can redistribute it and/or modify
7   - * it under the terms of the GNU General Public License as published by
  6 + * This library is free software: you can redistribute it and/or modify
  7 + * it under the terms of the GNU Lesser General Public License as published by
8 8 * the Free Software Foundation, either version 3 of the License, or
9 9 * (at your option) any later version.
10 10 *
11   - * This program is distributed in the hope that it will be useful,
  11 + * This library is distributed in the hope that it will be useful,
12 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14   - * GNU General Public License for more details.
  14 + * GNU Lesser General Public License for more details.
15 15 *
16   - * You should have received a copy of the GNU General Public License
  16 + * You should have received a copy of the GNU Lesser General Public License
17 17 * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 18 *)
19 19  
... ...
lexSemantics/ENIAMlexSemanticsTypes.ml 0 → 100644
  1 +(*
  2 + * ENIAMlexSemantics is a library that assigns tokens with lexicosemantic information.
  3 + * Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
  4 + * Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
  5 + *
  6 + * This library is free software: you can redistribute it and/or modify
  7 + * it under the terms of the GNU Lesser General Public License as published by
  8 + * the Free Software Foundation, either version 3 of the License, or
  9 + * (at your option) any later version.
  10 + *
  11 + * This library is distributed in the hope that it will be useful,
  12 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14 + * GNU Lesser General Public License for more details.
  15 + *
  16 + * You should have received a copy of the GNU Lesser General Public License
  17 + * along with this program. If not, see <http://www.gnu.org/licenses/>.
  18 + *)
  19 +
  20 +open ENIAMtokenizerTypes
  21 +open Xstd
  22 +
  23 +type labels = { (* five string fields; stored in the [e] field of [lex_sem] and all empty in [empty_labels] *)
  24 + number: string;
  25 + case: string;
  26 + gender: string;
  27 + person: string;
  28 + aspect: string; (* NOTE(review): the field names suggest grammatical categories, but the value format is not visible here - confirm *)
  29 + }
  30 +
  31 +type semantics = (* type of the [semantics] field of [lex_sem] *)
  32 + Normal (* the value used by [empty_lex_sem] *)
  33 + | Special of string list
  34 +(* | SpecialNoun of type_arg list * type_term
  35 + | SpecialMod of string * (type_arg list * type_term)*)
  36 + | PrepSemantics of (string * string * StringSet.t * string list) list (* tuple components in order: role, role_attr, hipero, sel_prefs *)
  37 +
  38 +type lex_sem = { (* lexico-semantic data for one token; ENIAMlexSemantics.assign appends one such record per token index *)
  39 + e: labels;
  40 + valence: (int * ENIAMwalTypes.frame) list; (* frames paired with an int; NOTE(review): the int looks like a frame number - confirm *)
  41 + simple_valence: (int * ENIAMwalTypes.frame) list; (* same shape as [valence] *)
  42 + senses: (string * string list * float) list; (* set by ENIAMlexSemantics.assign from find_senses; NOTE(review): presumably (meaning, hipero, weight) - confirm *)
  43 + lroles: string * string;
  44 + semantics: semantics;
  45 + }
  46 +
  47 +let empty_labels = {
  48 + number="";
  49 + case="";
  50 + gender="";
  51 + person="";
  52 + aspect="";
  53 + }
  54 +
  55 +let empty_lex_sem = {
  56 + e=empty_labels; valence=[]; simple_valence=[]; senses=[];
  57 + lroles="",""; semantics=Normal}
  58 +
  59 +let proper_names_filename = resource_path ^ "/lexSemantics/proper_names_sgjp_polimorf.tab" (* first table folded by ENIAMlexSemantics.proper_names; resource_path is not defined in the visible part of this file *)
  60 +let proper_names_filename2 = resource_path ^ "/lexSemantics/proper_names.tab" (* second table folded by ENIAMlexSemantics.proper_names *)
... ...
lexSemantics/README 0 → 100644
  1 +ENIAMlexSemantics Version 1.0 :
  2 +-----------------------
  3 +
  4 +ENIAMlexSemantics is a library that assigns lexicosemantic information to tokens.
  5 +It recognizes named entities and assigns thematic roles,
  6 +senses, valence and other semantic information to tokens.
  7 +
  8 +Install
  9 +-------
  10 +
  11 +ENIAMlexSemantics requires OCaml version 4.02.3 compiler
  12 +together with Xlib library version 3.1 or later,
  13 +ENIAMtokenizer library version 1.0, ENIAMmorphology library version 1.0
  14 +and ENIAMsubsyntax library version 1.0.
  15 +
  16 +In order to install type:
  17 +
  18 +make install
  19 +
  20 +By default, ENIAMlexSemantics is installed in the 'ocamlc -where'/eniam directory.
  21 +You can change it by editing the Makefile.
  22 +
  23 +In order to test the library type:
  24 +make test
  25 +./test
  26 +
  27 +By default ENIAMlexSemantics looks for resources in the /usr/share/eniam directory.
  28 +However, this behaviour may be changed by setting and exporting the ENIAM_RESOURCE_PATH
  29 +environment variable.
  30 +
  31 +Credits
  32 +-------
  33 +Copyright © 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
  34 +Copyright © 2016 Institute of Computer Science Polish Academy of Sciences
  35 +
  36 +The library uses the following licensed resources:
  37 +
  38 +SGJP: Grammatical Dictionary of Polish, version 20151020
  39 +Copyright © 2007–2015 Zygmunt Saloni, Włodzimierz Gruszczyński, Marcin
  40 +Woliński, Robert Wołosz, Danuta Skowrońska
  41 +http://sgjp.pl
  42 +
  43 +Licence
  44 +-------
  45 +
  46 +This library is free software: you can redistribute it and/or modify
  47 +it under the terms of the GNU Lesser General Public License as published by
  48 +the Free Software Foundation, either version 3 of the License, or
  49 +(at your option) any later version.
  50 +
  51 +This library is distributed in the hope that it will be useful,
  52 +but WITHOUT ANY WARRANTY; without even the implied warranty of
  53 +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  54 +GNU Lesser General Public License for more details.
  55 +
  56 +You should have received a copy of the GNU Lesser General Public License
  57 +along with this program. If not, see <http://www.gnu.org/licenses/>.
... ...
lexSemantics/lgpl-3.0.txt 0 → 100644
  1 + GNU LESSER GENERAL PUBLIC LICENSE
  2 + Version 3, 29 June 2007
  3 +
  4 + Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
  5 + Everyone is permitted to copy and distribute verbatim copies
  6 + of this license document, but changing it is not allowed.
  7 +
  8 +
  9 + This version of the GNU Lesser General Public License incorporates
  10 +the terms and conditions of version 3 of the GNU General Public
  11 +License, supplemented by the additional permissions listed below.
  12 +
  13 + 0. Additional Definitions.
  14 +
  15 + As used herein, "this License" refers to version 3 of the GNU Lesser
  16 +General Public License, and the "GNU GPL" refers to version 3 of the GNU
  17 +General Public License.
  18 +
  19 + "The Library" refers to a covered work governed by this License,
  20 +other than an Application or a Combined Work as defined below.
  21 +
  22 + An "Application" is any work that makes use of an interface provided
  23 +by the Library, but which is not otherwise based on the Library.
  24 +Defining a subclass of a class defined by the Library is deemed a mode
  25 +of using an interface provided by the Library.
  26 +
  27 + A "Combined Work" is a work produced by combining or linking an
  28 +Application with the Library. The particular version of the Library
  29 +with which the Combined Work was made is also called the "Linked
  30 +Version".
  31 +
  32 + The "Minimal Corresponding Source" for a Combined Work means the
  33 +Corresponding Source for the Combined Work, excluding any source code
  34 +for portions of the Combined Work that, considered in isolation, are
  35 +based on the Application, and not on the Linked Version.
  36 +
  37 + The "Corresponding Application Code" for a Combined Work means the
  38 +object code and/or source code for the Application, including any data
  39 +and utility programs needed for reproducing the Combined Work from the
  40 +Application, but excluding the System Libraries of the Combined Work.
  41 +
  42 + 1. Exception to Section 3 of the GNU GPL.
  43 +
  44 + You may convey a covered work under sections 3 and 4 of this License
  45 +without being bound by section 3 of the GNU GPL.
  46 +
  47 + 2. Conveying Modified Versions.
  48 +
  49 + If you modify a copy of the Library, and, in your modifications, a
  50 +facility refers to a function or data to be supplied by an Application
  51 +that uses the facility (other than as an argument passed when the
  52 +facility is invoked), then you may convey a copy of the modified
  53 +version:
  54 +
  55 + a) under this License, provided that you make a good faith effort to
  56 + ensure that, in the event an Application does not supply the
  57 + function or data, the facility still operates, and performs
  58 + whatever part of its purpose remains meaningful, or
  59 +
  60 + b) under the GNU GPL, with none of the additional permissions of
  61 + this License applicable to that copy.
  62 +
  63 + 3. Object Code Incorporating Material from Library Header Files.
  64 +
  65 + The object code form of an Application may incorporate material from
  66 +a header file that is part of the Library. You may convey such object
  67 +code under terms of your choice, provided that, if the incorporated
  68 +material is not limited to numerical parameters, data structure
  69 +layouts and accessors, or small macros, inline functions and templates
  70 +(ten or fewer lines in length), you do both of the following:
  71 +
  72 + a) Give prominent notice with each copy of the object code that the
  73 + Library is used in it and that the Library and its use are
  74 + covered by this License.
  75 +
  76 + b) Accompany the object code with a copy of the GNU GPL and this license
  77 + document.
  78 +
  79 + 4. Combined Works.
  80 +
  81 + You may convey a Combined Work under terms of your choice that,
  82 +taken together, effectively do not restrict modification of the
  83 +portions of the Library contained in the Combined Work and reverse
  84 +engineering for debugging such modifications, if you also do each of
  85 +the following:
  86 +
  87 + a) Give prominent notice with each copy of the Combined Work that
  88 + the Library is used in it and that the Library and its use are
  89 + covered by this License.
  90 +
  91 + b) Accompany the Combined Work with a copy of the GNU GPL and this license
  92 + document.
  93 +
  94 + c) For a Combined Work that displays copyright notices during
  95 + execution, include the copyright notice for the Library among
  96 + these notices, as well as a reference directing the user to the
  97 + copies of the GNU GPL and this license document.
  98 +
  99 + d) Do one of the following:
  100 +
  101 + 0) Convey the Minimal Corresponding Source under the terms of this
  102 + License, and the Corresponding Application Code in a form
  103 + suitable for, and under terms that permit, the user to
  104 + recombine or relink the Application with a modified version of
  105 + the Linked Version to produce a modified Combined Work, in the
  106 + manner specified by section 6 of the GNU GPL for conveying
  107 + Corresponding Source.
  108 +
  109 + 1) Use a suitable shared library mechanism for linking with the
  110 + Library. A suitable mechanism is one that (a) uses at run time
  111 + a copy of the Library already present on the user's computer
  112 + system, and (b) will operate properly with a modified version
  113 + of the Library that is interface-compatible with the Linked
  114 + Version.
  115 +
  116 + e) Provide Installation Information, but only if you would otherwise
  117 + be required to provide such information under section 6 of the
  118 + GNU GPL, and only to the extent that such information is
  119 + necessary to install and execute a modified version of the
  120 + Combined Work produced by recombining or relinking the
  121 + Application with a modified version of the Linked Version. (If
  122 + you use option 4d0, the Installation Information must accompany
  123 + the Minimal Corresponding Source and Corresponding Application
  124 + Code. If you use option 4d1, you must provide the Installation
  125 + Information in the manner specified by section 6 of the GNU GPL
  126 + for conveying Corresponding Source.)
  127 +
  128 + 5. Combined Libraries.
  129 +
  130 + You may place library facilities that are a work based on the
  131 +Library side by side in a single library together with other library
  132 +facilities that are not Applications and are not covered by this
  133 +License, and convey such a combined library under terms of your
  134 +choice, if you do both of the following:
  135 +
  136 + a) Accompany the combined library with a copy of the same work based
  137 + on the Library, uncombined with any other library facilities,
  138 + conveyed under the terms of this License.
  139 +
  140 + b) Give prominent notice with the combined library that part of it
  141 + is a work based on the Library, and explaining where to find the
  142 + accompanying uncombined form of the same work.
  143 +
  144 + 6. Revised Versions of the GNU Lesser General Public License.
  145 +
  146 + The Free Software Foundation may publish revised and/or new versions
  147 +of the GNU Lesser General Public License from time to time. Such new
  148 +versions will be similar in spirit to the present version, but may
  149 +differ in detail to address new problems or concerns.
  150 +
  151 + Each version is given a distinguishing version number. If the
  152 +Library as you received it specifies that a certain numbered version
  153 +of the GNU Lesser General Public License "or any later version"
  154 +applies to it, you have the option of following the terms and
  155 +conditions either of that published version or of any later version
  156 +published by the Free Software Foundation. If the Library as you
  157 +received it does not specify a version number of the GNU Lesser
  158 +General Public License, you may choose any version of the GNU Lesser
  159 +General Public License ever published by the Free Software Foundation.
  160 +
  161 + If the Library as you received it specifies that a proxy can decide
  162 +whether future versions of the GNU Lesser General Public License shall
  163 +apply, that proxy's public statement of acceptance of any version is
  164 +permanent authorization for you to choose that version for the
  165 +Library.
... ...
lexSemantics/makefile 0 → 100644
  1 +OCAMLC=ocamlc
  2 +OCAMLOPT=ocamlopt
  3 +OCAMLDEP=ocamldep
  4 +INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I +eniam
  5 +OCAMLFLAGS=$(INCLUDES) -g
  6 +OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa eniam-plWordnet.cmxa eniam-walenty.cmxa #eniam-lexSemantics.cmxa
  7 +INSTALLDIR=`ocamlc -where`/eniam
  8 +
  9 +SOURCES= ENIAMlexSemanticsTypes.ml ENIAMlexSemantics.ml
  10 +
  11 +all: eniam-lexSemantics.cma eniam-lexSemantics.cmxa
  12 +
  13 +install: all
  14 + mkdir -p $(INSTALLDIR)
  15 + cp eniam-lexSemantics.cmxa eniam-lexSemantics.a eniam-lexSemantics.cma $(INSTALLDIR)
  16 + cp ENIAMlexSemanticsTypes.cmi ENIAMlexSemantics.cmi $(INSTALLDIR)
  17 + cp ENIAMlexSemanticsTypes.cmx ENIAMlexSemantics.cmx $(INSTALLDIR)
  18 + mkdir -p /usr/share/eniam/lexSemantics
  19 + cp resources/* /usr/share/eniam/lexSemantics
  20 + ln -s /usr/share/eniam/lexSemantics/proper_names_20160104.tab /usr/share/eniam/lexSemantics/proper_names.tab
  21 + ln -s /usr/share/eniam/lexSemantics/proper_names_sgjp_polimorf_20151020.tab /usr/share/eniam/lexSemantics/proper_names_sgjp_polimorf.tab
  22 +
  23 +eniam-lexSemantics.cma: $(SOURCES)
  24 + ocamlc -linkall -a -o eniam-lexSemantics.cma $(OCAMLFLAGS) $^
  25 +
  26 +eniam-lexSemantics.cmxa: $(SOURCES)
  27 + ocamlopt -linkall -a -o eniam-lexSemantics.cmxa $(INCLUDES) $^
  28 +
  29 +test: test.ml
  30 + $(OCAMLOPT) -o test $(OCAMLOPTFLAGS) $(SOURCES) test.ml
  31 +
  32 +.SUFFIXES: .mll .mly .ml .mli .cmo .cmi .cmx
  33 +
  34 +.mll.ml:
  35 + ocamllex $<
  36 +
  37 +.mly.mli:
  38 + ocamlyacc $<
  39 +
  40 +.mly.ml:
  41 + ocamlyacc $<
  42 +
  43 +.ml.cmo:
  44 + $(OCAMLC) $(OCAMLFLAGS) -c $<
  45 +
  46 +.mli.cmi:
  47 + $(OCAMLC) $(OCAMLFALGS) -c $<
  48 +
  49 +.ml.cmx:
  50 + $(OCAMLOPT) $(OCAMLOPTFLAGS) -c $<
  51 +
  52 +clean:
  53 + rm -f *~ *.cm[aoix] *.o *.so *.cmxa *.a test
... ...
resources/SGJP/README renamed to lexSemantics/resources/README
1   -Files in this folder were created on the basis of
  1 +The file proper_names_sgjp_polimorf_20151020.tab in this folder was created on the basis of
  2 +
2 3 SGJP: Grammatical Dictionary of Polish, version 20151020
3 4 Copyright © 2007–2015 Zygmunt Saloni, Włodzimierz Gruszczyński, Marcin
4 5 Woliński, Robert Wołosz, Danuta Skowrońska
... ...
resources/proper_names_20160104.tab renamed to lexSemantics/resources/proper_names_20160104.tab
resources/SGJP/proper_names_sgjp_polimorf_20151020.tab renamed to lexSemantics/resources/proper_names_sgjp_polimorf_20151020.tab
lexSemantics/test.ml 0 → 100644
  1 +(*
  2 + * ENIAMlexSemantics is a library that assigns tokens with lexicosemantic information.
  3 + * Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
  4 + * Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
  5 + *
  6 + * This library is free software: you can redistribute it and/or modify
  7 + * it under the terms of the GNU Lesser General Public License as published by
  8 + * the Free Software Foundation, either version 3 of the License, or
  9 + * (at your option) any later version.
  10 + *
  11 + * This library is distributed in the hope that it will be useful,
  12 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14 + * GNU Lesser General Public License for more details.
  15 + *
  16 + * You should have received a copy of the GNU Lesser General Public License
  17 + * along with this program. If not, see <http://www.gnu.org/licenses/>.
  18 + *)
  19 +
  20 +(* Sentences used by the built-in test run below; entries inside comment blocks are currently disabled. *)
  21 +let test_strings = [
  22 + "Szpak frunie zimą.";
  23 + "Kot miauczy w październiku.";
  24 +(* "a gdybym miałem";
  25 + "A Gdy Miałem";
  26 + "GDY MIAŁEM";
  27 + "I II III IV V VI VII VIII IX X MCXIV MXC";
  28 + "Kiedy Piotr Prabucki, przewodniczący Komisji Budżetu PeKaO";
  29 + "25 idzie 20.";
  30 + "Kot. Kot. kot.";
  31 + "25.";
  32 + "25.888.231";
  33 + "Ala 25.888.231.111 ma.";
  34 + "Ala 25.888.031,011.";
  35 + "Ala -25.888.031,011.";
  36 + "Ala -25 .";
  37 + "Ala -1° C 3° ciepła 20—30°C od 180° do 260°C około 6° poniżej horyzontu.";
  38 + "Ala 22-25 .";
  39 + "Ala 22.5.2000-25.5.2001 .";*)
  40 +(* "Np. Ala.";*)
  41 + (* "w. dom.";
  42 + "tzn.";
  43 + "c.d.n."; *)
  44 +(* "Arabia Saudyjska biegnie.";
  45 + "Cauchy'ego ONZ-owska biegnie.";*)
  46 + (* "TE-cie E-e.";
  47 + "MS-DOS-owska CI-cie KRRi-cie UJ-ocie UJ-OCIE.";
  48 + "rock'n'rollowy d’Alembertowi staro-cerkiewno-słowiańskimi"; *)
  49 +(* "Tom idzie.";*)
  50 + (* "Miałem miał."; *)
  51 +(* "Szpak śpiewa.";
  52 + "Ala ma kota.";
  53 + "Ale mają kota:"*)
  54 + ]
  55 +
  56 +let _ =
  57 + print_endline "Testy wbudowane";
  58 + Xlist.iter test_strings (fun s ->
  59 + print_endline ("\nTEST: " ^ s);
  60 + let text,tokens = ENIAMsubsyntax.parse_text s in
  61 + let lex_sems = ENIAMlexSemantics.assign tokens text in
  62 + (* print_endline (ENIAMtokenizer.xml_of tokens); *)
  63 + Xlist.iter tokens (fun token -> print_endline (ENIAMtokenizer.string_of 0 token)));
  64 +(* print_endline "Testy użytkownika.";
  65 + print_endline "Wpisz tekst i naciśnij ENTER, pusty tekst kończy.";
  66 + let s = ref (read_line ()) in
  67 + while !s <> "" do
  68 + let tokens = ENIAMtokenizer.parse !s in
  69 + (* print_endline (ENIAMtokenizer.xml_of tokens); *)
  70 + Xlist.iter tokens (fun token -> print_endline (ENIAMtokenizer.string_of 0 token));
  71 + print_endline "Wpisz tekst i naciśnij ENTER, pusty tekst kończy.";
  72 + s := read_line ()
  73 + done;*)
  74 + ()
... ...
tokenizer/ENIAMtokenizerTypes.ml
... ... @@ -42,7 +42,7 @@ type token =
42 42  
43 43 (* Tekst reprezentuję jako zbiór obiektów typu token_record zawierających
44 44 informacje o poszczególnych tokenach *)
45   -and token_record = {
  45 +type token_record = {
46 46 orth: string; (* sekwencja znaków pierwotnego tekstu składająca się na token *)
47 47 corr_orth: string; (* sekwencja znaków pierwotnego tekstu składająca się na token z poprawionymi błędami *)
48 48 beg: int; (* pozycja początkowa tokenu względem początku akapitu *)
... ...