wstępna wersja biblioteki eniam-lexSemantics-1.0

Wojciech Jaworski
1 parent d3e13e84
Showing 14 changed files with 880 additions and 13 deletions
integration/README
integration/makefile
integration/test
lexSemantics/ENIAMlexSemantics.ml
pre/preSemantics.ml → lexSemantics/ENIAMlexSemanticsData.ml
lexSemantics/ENIAMlexSemanticsTypes.ml
lexSemantics/README
lexSemantics/lgpl-3.0.txt
lexSemantics/makefile
resources/SGJP/README → lexSemantics/resources/README
resources/proper_names_20160104.tab → lexSemantics/resources/proper_names_20160104.tab
resources/SGJP/proper_names_sgjp_polimorf_20151020.tab → lexSemantics/resources/proper_names_sgjp_polimorf_20151020.tab
lexSemantics/test.ml
tokenizer/ENIAMtokenizerTypes.ml
@@ -8,8 +8,9 @@ Install
  
 ENIAMintegration requires OCaml version 4.02.3 compiler
 together with Xlib library version 3.1 or later,
-ENIAMtokenizer library version 1.0, ENIAMmorphology library version 1.0
-and ENIAMsubsyntax library version 1.0.
+ENIAMtokenizer library version 1.0, ENIAMmorphology library version 1.0,
+ENIAMsubsyntax library version 1.0, ENIAMwalenty library version 1.0
+and ENIAMplWordnet library version 1.0.
  
 In order to install type:
  
@@ -3,7 +3,7 @@ OCAMLOPT=ocamlopt
 OCAMLDEP=ocamldep
 INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I +eniam
 OCAMLFLAGS=$(INCLUDES) -g
-OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa
+OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa eniam-integration.cmxa
 INSTALLDIR=`ocamlc -where`/eniam
  
 SOURCES= ENIAM_CONLL.ml ENIAMpreIntegration.ml
@@ -25,7 +25,7 @@ eniam-integration.cmxa: $(SOURCES)
 	ocamlopt -linkall -a -o eniam-integration.cmxa $(INCLUDES) $^
  
 test: test.ml
-	$(OCAMLOPT) -o test $(OCAMLOPTFLAGS) $(SOURCES) test.ml
+	$(OCAMLOPT) -o test $(OCAMLOPTFLAGS) test.ml
  
 concraft_test: concraft_test.ml
 	$(OCAMLOPT) -o concraft_test $(OCAMLOPTFLAGS) concraft_test.ml
@@ -51,4 +51,4 @@ concraft_test: concraft_test.ml
 	$(OCAMLOPT) $(OCAMLOPTFLAGS) -c $<
  
 clean:
-	rm -f *~ *.cm[aoix] *.o *.so *.cmxa *.a test
+	rm -f *~ *.cm[aoix] *.o *.so *.cmxa *.a test concraft_test
+(*
+ *  ENIAMlexSemantics is a library that assigns tokens with lexicosemantic information.
+ *  Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
+ *  Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
+ *
+ *  This library is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU Lesser General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *)
+
+open ENIAMtokenizerTypes
+open ENIAMlexSemanticsTypes
+open Xstd
+
+(* Folding step used when reading the proper-name tables: receives the map
+   built so far and one row of fields.
+   A row must consist of exactly two fields, [lemma; types], where types is a
+   list of labels separated by the literal character '|'.  The labels are
+   stored under lemma with StringMap.add_inc; the callback passed to it
+   (presumably invoked when lemma is already present) puts the new labels in
+   front of the ones recorded earlier.
+   Any other row shape raises Failure "proper_names: <fields>". *)
+let load_proper_name proper fields =
+  match fields with
+  | [lemma; types] ->
+      let new_types = Str.split (Str.regexp "|") types in
+      StringMap.add_inc proper lemma new_types (fun known -> new_types @ known)
+  | other -> failwith ("proper_names: " ^ String.concat " " other)
+
+(* Map from lemma to the list of its proper-name types, built once when the
+   module is initialised by folding load_proper_name over both resource files.
+   The second fold starts from the map produced by the first one; starting it
+   from StringMap.empty again would silently drop every entry read from
+   proper_names_filename. *)
+let proper_names =
+  let proper = File.fold_tab proper_names_filename StringMap.empty load_proper_name in
+  File.fold_tab proper_names_filename2 proper load_proper_name
+
+(* Returns the elements of l that are not structurally equal to s.
+   Kept elements are consed onto the accumulator as the fold visits them, so
+   the result is not guaranteed to preserve the order of l. *)
+let remove l s =
+  let keep_unless_equal acc t =
+    if s = t then acc else t :: acc in
+  Xlist.fold l [] keep_unless_equal
+
+(* Rewrites slot i of tokens in place when token record t turns out to be a
+   proper name.  Only Lemma tokens are considered:
+   - if the lemma is a key of proper_names, the slot receives a copy of t whose
+     token is Proper carrying the types found in the map and whose attrs no
+     longer contain "notvalidated proper";
+   - otherwise, if attrs contain "notvalidated proper", the slot receives a
+     copy of t whose token is Proper with an empty type list (attrs untouched);
+   - in every other case nothing is written.
+   Returns unit. *)
+let find_proper_names tokens i t =
+  match t.token with
+  | Lemma(lemma,pos,interp) ->
+      if StringMap.mem proper_names lemma then begin
+        let types = StringMap.find proper_names lemma in
+        let cleaned_attrs = remove t.attrs "notvalidated proper" in
+        let updated = {t with token=Proper(lemma,pos,interp,types); attrs=cleaned_attrs} in
+        ExtArray.set tokens i updated
+      end else if Xlist.mem t.attrs "notvalidated proper" then begin
+        let updated = {t with token=Proper(lemma,pos,interp,[])} in
+        ExtArray.set tokens i updated
+      end
+  | _ -> ()
+
+(* Looks up senses for token record t through ENIAMplWordnet:
+   Lemma tokens are queried by lemma and part of speech, Proper tokens by the
+   list carried in their last component; every other token yields [].
+   FIXME: senses containing 'się' *)
+let find_senses t =
+  match t.token with
+  | Proper(_,_,_,types) -> ENIAMplWordnet.find_proper_senses types
+  | Lemma(lemma,pos,_) -> ENIAMplWordnet.find_senses lemma pos
+  | _ -> []
+
+
+(* Builds a lex_sem entry for every token with index 1 .. size-1.
+   For each index i the token is first passed to find_proper_names, which may
+   overwrite slot i of tokens with a Proper token; the slot is then read again
+   so that find_senses sees the rewritten token rather than the stale Lemma.
+   An initial empty_lex_sem is added before the loop (presumably to occupy
+   index 0), and each later ExtArray.add must return the index of the token it
+   belongs to, otherwise Failure "assign_semantic_valence" is raised.
+   The argument text is currently unused.
+   NOTE(review): lex_sems is filled but never returned, so the only effect
+   visible to callers is the in-place update of tokens; returning lex_sems
+   would change the function's type and is left for a separate change. *)
+let assign tokens text =
+  let lex_sems = ExtArray.make (ExtArray.size tokens) empty_lex_sem in
+  let _ = ExtArray.add lex_sems empty_lex_sem in
+  Int.iter 1 (ExtArray.size tokens - 1) (fun i ->
+    find_proper_names tokens i (ExtArray.get tokens i);
+    (* re-read: find_proper_names may have replaced the token at index i *)
+    let token = ExtArray.get tokens i in
+    let senses = find_senses token in
+    let lex_sem = {empty_lex_sem with senses=senses} in
+    let j = ExtArray.add lex_sems lex_sem in
+    if j <> i then failwith "assign_semantic_valence")
+
+(*
+(*   print_endline "a14"; *)
+  let paths = assign_valence paths in
+(*   print_endline "a15"; *)
+(*   print_endline "a16"; *)
+  let paths = disambiguate_senses paths in
+  let paths = assign_simplified_valence paths in
+  let paths = PreSemantics.assign_semantics paths in
+(*   print_endline "a16"; *)
+
+
+
+
+
+let assign_valence paths =
+  let lexemes = Xlist.fold paths StringMap.empty (fun lexemes t ->
+    match t.token with
+      Lemma(lemma,pos,_) ->
+        StringMap.add_inc lexemes lemma (StringSet.singleton pos) (fun set -> StringSet.add set pos)
+    | Proper(lemma,pos,_,_) ->
+        let pos = match pos with
+          "subst" -> "psubst"
+        | "depr" -> "pdepr"
+        | _ -> pos (*failwith ("assign_valence: Proper " ^ pos ^ " " ^ lemma)*) in
+        StringMap.add_inc lexemes lemma (StringSet.singleton pos) (fun set -> StringSet.add set pos) (* nazwy własne mają przypisywaną domyślną walencję rzeczowników *)
+    | _ -> lexemes) in
+  let valence = WalFrames.find_frames lexemes in
+  List.rev (Xlist.rev_map paths (fun t ->
+    match t.token with
+      Lemma(lemma,pos,_) -> {t with valence=try Xlist.rev_map (StringMap.find (StringMap.find valence lemma) pos) (fun frame -> 0,frame) with Not_found -> []}
+    | Proper(lemma,pos,interp,_) -> {t with valence=(try Xlist.rev_map (StringMap.find (StringMap.find valence lemma)
+                                                         (if pos = "subst" || pos = "depr" then "p" ^ pos else pos)) (fun frame -> 0,frame) with Not_found -> [](*failwith ("assign_valence: Proper(" ^ lemma ^ "," ^ pos ^ ")")*));
+                                            token=Lemma(lemma,pos,interp)}
+    | _ -> t))
+
+(**********************************************************************************)
+
+(* let prepare_indexes (paths,_) =
+  let set = Xlist.fold paths IntSet.empty (fun set t ->
+    IntSet.add (IntSet.add set t.beg) t.next) in
+  let map,last = Xlist.fold (Xlist.sort (IntSet.to_list set) compare) (IntMap.empty,0) (fun (map,n) x ->
+    IntMap.add map x n, n+1) in
+  List.rev (Xlist.rev_map paths (fun t ->
+    {t with lnode=IntMap.find map t.beg; rnode=IntMap.find map t.next})), last - 1 *)
+
+let get_prefs_schema prefs schema =
+  Xlist.fold schema prefs (fun prefs t ->
+    Xlist.fold t.WalTypes.sel_prefs prefs StringSet.add)
+
+let map_prefs_schema senses schema =
+  Xlist.map schema (fun t ->
+    if Xlist.mem t.WalTypes.morfs (WalTypes.Phrase WalTypes.Pro) || Xlist.mem t.WalTypes.morfs (WalTypes.Phrase WalTypes.ProNG) then t else
+    {t with WalTypes.sel_prefs = Xlist.fold t.WalTypes.sel_prefs [] (fun l s ->
+      if StringSet.mem senses s then s :: l else l)})
+
+let disambiguate_senses paths =
+  let prefs = Xlist.fold paths (StringSet.singleton "ALL") (fun prefs t ->
+    Xlist.fold t.valence prefs (fun prefs -> function
+      _,WalTypes.Frame(_,schema) -> get_prefs_schema prefs schema
+    | _,WalTypes.LexFrame(_,_,_,schema) -> get_prefs_schema prefs schema
+    | _,WalTypes.ComprepFrame(_,_,_,schema) -> get_prefs_schema prefs schema)) in
+  let hipero = Xlist.fold paths (StringSet.singleton "ALL") (fun hipero t ->
+    Xlist.fold t.senses hipero (fun hipero (_,l,_) ->
+      Xlist.fold l hipero StringSet.add)) in
+  let senses = StringSet.intersection prefs hipero in
+  let is_zero = StringSet.mem hipero "0" in
+  let senses = if is_zero then StringSet.add senses "0" else senses in
+  Xlist.map paths (fun t ->
+    {t with valence = if is_zero then t.valence else
+        Xlist.map t.valence (function
+          n,WalTypes.Frame(a,schema) -> n,WalTypes.Frame(a,map_prefs_schema senses schema)
+        | n,WalTypes.LexFrame(s,p,r,schema) -> n,WalTypes.LexFrame(s,p,r,map_prefs_schema senses schema)
+        | n,WalTypes.ComprepFrame(s,p,r,schema) -> n,WalTypes.ComprepFrame(s,p,r,map_prefs_schema senses schema));
+      senses = Xlist.map t.senses (fun (s,l,w) ->
+        s, List.rev (Xlist.fold l [] (fun l s -> if StringSet.mem senses s then s :: l else l)),w)})
+
+(*let single_sense (paths,last) =
+  List.rev (Xlist.rev_map paths (fun t ->
+    let sense =
+      if t.senses = [] then [] else
+      [Xlist.fold t.senses ("",[],-.max_float) (fun (max_meaning,max_hipero,max_weight) (meaning,hipero,weight) ->
+        if max_weight >= weight then max_meaning,max_hipero,max_weight else meaning,hipero,weight)] in
+    {t with senses=sense})), last*)
+
+open WalTypes
+
+(*let single_schema schemata =
+  let map = Xlist.fold schemata StringMap.empty (fun map schema ->
+    let t = WalStringOf.schema (List.sort compare (Xlist.fold schema [] (fun l s ->
+      if s.gf <> ARG && s.gf <> ADJUNCT then {s with role=""; role_attr=""; sel_prefs=[]} :: l else
+      if s.cr <> [] || s.ce <> [] then {s with role=""; role_attr=""; sel_prefs=[]} :: l else l))) in
+    StringMap.add_inc map t [schema] (fun l -> schema :: l)) in
+  StringMap.fold map [] (fun l _ schemata ->
+    let map = Xlist.fold schemata StringMap.empty (fun map schema ->
+      Xlist.fold schema map (fun map s ->
+        let t = WalStringOf.schema [{s with role=""; role_attr=""; sel_prefs=[]}] in
+        StringMap.add_inc map t [s] (fun l -> s :: l))) in
+    let schema = StringMap.fold map [] (fun schema _ l ->
+      let s = List.hd l in
+      {s with sel_prefs=Xlist.fold s.sel_prefs [] (fun l t -> if t = "0" || t = "T" then t :: l else l)} :: schema) in
+    schema :: l)*)
+
+let remove_meaning = function
+    DefaultAtrs(m,r,o,neg,p,a) -> DefaultAtrs([],r,o,neg,p,a)
+  | EmptyAtrs m -> EmptyAtrs []
+  | NounAtrs(m,nsyn,s(*,typ*)) -> NounAtrs([],nsyn,s(*,typ*))
+  | AdjAtrs(m,c,adjsyn(*,adjsem,typ*)) -> AdjAtrs([],c,adjsyn(*,adjsem,typ*))
+  | PersAtrs(m,le,neg,mo,t,au,a) -> PersAtrs([],le,neg,mo,t,au,a)
+  | GerAtrs(m,le,neg,a) -> GerAtrs([],le,neg,a)
+  | NonPersAtrs(m,le,role,role_attr,neg,a) -> NonPersAtrs([],le,role,role_attr,neg,a)
+  | _ -> failwith "remove_meaning"
+
+
+(*let single_frame (paths,last) =
+  List.rev (Xlist.rev_map paths (fun t ->
+    let lex_frames,frames = Xlist.fold t.valence ([],StringMap.empty) (fun (lex_frames,frames) -> function
+        Frame(attrs,schema) ->
+          let attrs = remove_meaning attrs in
+          lex_frames, StringMap.add_inc frames (WalStringOf.frame_atrs attrs) (attrs,[schema]) (fun (_,l) -> attrs, schema :: l)
+      | frame -> frame :: lex_frames, frames) in
+    let frames = StringMap.fold frames lex_frames (fun frames _ (attrs,schemata) ->
+      Xlist.fold (single_schema schemata) frames (fun frames frame -> Frame(attrs,frame) :: frames)) in
+    {t with valence=frames})), last    *)
+
+let simplify_position_verb l = function (* FIXME: dodać czyszczenie E Pro *)
+    Phrase(NP(Case "dat")) -> l
+  | Phrase(NP(Case "inst")) -> l
+  | Phrase(PrepNP _) -> l
+  | Phrase(PrepAdjP _) -> l
+  | Phrase(NumP (Case "dat")) -> l
+  | Phrase(NumP (Case "inst")) -> l
+  | Phrase(PrepNumP _) -> l
+  | Phrase(ComprepNP _) -> l
+  | Phrase(ComparNP _) -> l
+  | Phrase(ComparPP _) -> l
+  | Phrase(IP) -> l
+  | Phrase(CP _) -> l
+  | Phrase(NCP(Case "dat",_,_)) -> l
+  | Phrase(NCP(Case "inst",_,_)) -> l
+  | Phrase(PrepNCP _) -> l
+(*   | Phrase(PadvP) -> l *)
+  | Phrase(AdvP) -> l
+  | Phrase(PrepP) -> l
+  | Phrase(Or) -> l
+  | Phrase(Qub) -> l
+  | Phrase(Adja) -> l
+  | Phrase(Inclusion) -> l
+  | Phrase Pro -> Phrase Null :: l
+  | t -> t :: l
+
+let simplify_position_noun l = function
+    Phrase(NP(Case "gen")) -> l
+  | Phrase(NP(Case "nom")) -> l
+  | Phrase(NP(CaseAgr)) -> l
+  | Phrase(PrepNP _) -> l
+  | Phrase(AdjP AllAgr) -> l
+  | Phrase(NumP (Case "gen")) -> l
+  | Phrase(NumP (Case "nom")) -> l
+  | Phrase(NumP (CaseAgr)) -> l
+  | Phrase(PrepNumP _) -> l
+  | Phrase(ComprepNP _) -> l
+  | Phrase(ComparNP _) -> l
+  | Phrase(ComparPP _) -> l
+  | Phrase(IP) -> l
+  | Phrase(NCP(Case "gen",_,_)) -> l
+  | Phrase(PrepNCP _) -> l
+  | Phrase(PrepP) -> l
+  | Phrase(Qub) -> l
+  | Phrase(Adja) -> l
+  | Phrase(Inclusion) -> l
+  | Phrase Pro -> Phrase Null :: l
+  | t -> t :: l
+
+let simplify_position_adj l = function
+    Phrase(AdvP) -> l
+  | t -> t :: l
+
+let simplify_position_adv l = function
+    Phrase(AdvP) -> l
+  | t -> t :: l
+
+
+let simplify_position pos l s =
+  let morfs = match pos with
+      "verb" -> List.rev (Xlist.fold s.morfs [] simplify_position_verb)
+    | "noun" -> List.rev (Xlist.fold s.morfs [] simplify_position_noun)
+    | "adj" -> List.rev (Xlist.fold s.morfs [] simplify_position_adj)
+    | "adv" -> List.rev (Xlist.fold s.morfs [] simplify_position_adv)
+    | _ -> s.morfs in
+  match morfs with
+    [] -> l
+  | [Phrase Null] -> l
+  | _ -> {s with morfs=morfs} :: l
+
+let simplify_schemata pos schemata =
+  let schemata = Xlist.fold schemata StringMap.empty (fun schemata (schema,frame) ->
+    let schema = List.sort compare (Xlist.fold schema [] (fun l s ->
+      let s = {s with role=""; role_attr=""; sel_prefs=[]; cr=[]; ce=[]; morfs=List.sort compare s.morfs} in
+      if s.gf <> ARG && s.gf <> ADJUNCT then s :: l else
+(*       if s.cr <> [] || s.ce <> [] then s :: l else  *)
+      simplify_position pos l s)) in
+    StringMap.add_inc schemata (WalStringOf.schema schema) (schema,[frame]) (fun (_,frames) -> schema, frame :: frames)) in
+  StringMap.fold schemata [] (fun l _ s -> s :: l)
+
+(* FIXME: problem with ComprepNP and PrepNCP *)
+(* FIXME: problem when the same token occurs in several paths *)
+let generate_verb_prep_adjuncts preps =
+  Xlist.map preps (fun (lemma,case) -> WalFrames.verb_prep_adjunct_schema_field lemma case)
+
+let generate_verb_comprep_adjuncts compreps =
+  Xlist.map compreps (fun lemma -> WalFrames.verb_comprep_adjunct_schema_field lemma)
+
+let generate_verb_compar_adjuncts compars =
+  Xlist.map compars (fun lemma -> WalFrames.verb_compar_adjunct_schema_field lemma)
+
+let generate_noun_prep_adjuncts preps =
+  WalFrames.noun_prep_adjunct_schema_field preps
+
+let generate_noun_compar_adjuncts compars =
+  WalFrames.noun_compar_adjunct_schema_field compars
+
+let generate_adj_compar_adjuncts compars =
+  WalFrames.noun_compar_adjunct_schema_field compars
+
+let compars = StringSet.of_list ["jak";"jako";"niż";"niczym";"niby";"co"]
+
+let generate_prep_adjunct_tokens paths =
+  let map = Xlist.fold paths StringMap.empty (fun map t ->
+    match t.token with
+      Lemma(lemma,"prep",interp) ->
+        let map = if lemma = "po" then StringMap.add map "po:postp" ("po","postp") else map in
+        if StringSet.mem compars lemma then map else
+        Xlist.fold interp map (fun map -> function
+          [cases] -> Xlist.fold cases map (fun map case -> StringMap.add map (lemma ^ ":" ^ case) (lemma,case))
+        | [cases;_] -> Xlist.fold cases map (fun map case -> StringMap.add map (lemma ^ ":" ^ case) (lemma,case))
+        | _ -> map)
+    | _ -> map) in
+  StringMap.fold map [] (fun l _ v -> v :: l)
+
+let generate_comprep_adjunct_tokens paths =
+  let lemmas = Xlist.fold paths StringSet.empty (fun lemmas t ->
+    match t.token with
+      Lemma(lemma,_,_) -> StringSet.add lemmas lemma
+    | _ -> lemmas) in
+  StringMap.fold WalFrames.comprep_reqs [] (fun compreps comprep reqs ->
+    let b = Xlist.fold reqs true (fun b s -> b && StringSet.mem lemmas s) in
+    if b then comprep :: compreps else compreps)
+
+let generate_compar_adjunct_tokens paths =
+  let set = Xlist.fold paths StringSet.empty (fun set t ->
+    match t.token with
+      Lemma(lemma,"prep",interp) ->
+        if not (StringSet.mem compars lemma) then set else
+        StringSet.add set lemma
+    | _ -> set) in
+  StringSet.to_list set
+
+let is_measure = function
+    NounAtrs(_,_,Common "measure") -> true
+  | _ -> false
+
+let assign_simplified_valence paths =
+  let preps = generate_prep_adjunct_tokens paths in
+  let compreps = generate_comprep_adjunct_tokens paths in
+  let compars = generate_compar_adjunct_tokens paths in
+  let verb_prep_adjuncts = generate_verb_prep_adjuncts preps in
+  let verb_comprep_adjuncts = generate_verb_comprep_adjuncts compreps in
+  let verb_compar_adjuncts = generate_verb_compar_adjuncts compars in
+  let noun_prep_adjuncts = generate_noun_prep_adjuncts preps compreps in
+  let noun_compar_adjuncts = generate_noun_compar_adjuncts compars in
+  let adj_compar_adjuncts = generate_adj_compar_adjuncts compars in
+  let verb_adjuncts = WalFrames.verb_adjuncts_simp @ verb_prep_adjuncts @ verb_comprep_adjuncts @ verb_compar_adjuncts in
+  let noun_adjuncts = WalFrames.noun_adjuncts_simp @ [noun_prep_adjuncts] @ [noun_compar_adjuncts] in
+  let noun_measure_adjuncts = WalFrames.noun_measure_adjuncts_simp @ [noun_prep_adjuncts] @ [noun_compar_adjuncts] in
+  let adj_adjuncts = WalFrames.adj_adjuncts_simp @ [adj_compar_adjuncts] in
+  let adv_adjuncts = WalFrames.adv_adjuncts_simp @ [adj_compar_adjuncts] in
+  List.rev (Xlist.rev_map paths (fun t ->
+    let pos = match t.token with
+        Lemma(_,pos,_) -> WalFrames.simplify_pos pos
+      | _ -> "" in
+    let lex_frames,frames = Xlist.fold t.valence ([],StringMap.empty) (fun (lex_frames,frames) -> function
+        _,(Frame(attrs,schema) as frame) ->
+          let attrs = remove_meaning attrs in
+          lex_frames, StringMap.add_inc frames (WalStringOf.frame_atrs attrs) (attrs,[schema,frame]) (fun (_,l) -> attrs, (schema,frame) :: l)
+      | _,frame -> frame :: lex_frames, frames) in
+    let simp_frames,full_frames,n = Xlist.fold lex_frames ([],[],1) (fun (simp_frames,full_frames,n) frame ->
+      (n,frame) :: simp_frames, (n,frame) :: full_frames, n+1) in
+    let simp_frames,full_frames,_ = StringMap.fold frames (simp_frames,full_frames,n) (fun (simp_frames,full_frames,n) _ (attrs,schemata) ->
+      Xlist.fold (simplify_schemata pos schemata) (simp_frames,full_frames,n) (fun (simp_frames,full_frames,n) (schema,frames) ->
+        let schema = match pos with
+            "verb" -> schema @ verb_adjuncts
+          | "noun" -> schema @ (if is_measure attrs then noun_measure_adjuncts else noun_adjuncts)
+          | "adj" -> schema @ adj_adjuncts
+          | "adv" -> schema @ adv_adjuncts
+          | _ -> schema in
+        (n,Frame(attrs,schema)) :: simp_frames,
+        Xlist.fold frames full_frames (fun full_frames frame -> (n,frame) :: full_frames),
+        n+1)) in
+    {t with simple_valence=simp_frames; valence=full_frames}))
+
+(* FIXME: add to the valence the selectional preferences of the governors of symbols: day, hour, figure, etc. *)
+(* FIXME: check whether the valence of proper names is handled correctly. *)
+
+(* let first_id = 1 (* id=0 jest zarezerwowane dla pro; FIXME: czy to jest jeszcze aktualne? *)
+
+let add_ids (paths,last) next_id =
+  let paths,next_id = Xlist.fold ((*List.rev*) paths) ([],next_id) (fun (paths,id) t ->
+    {t with id=id} :: paths, id+1) in
+  (paths,last),next_id *)
+
+
+
+let parse query =
+(*   print_endline "a1"; *)
+  let l = Xunicode.classified_chars_of_utf8_string query in
+(*   print_endline "a2"; *)
+  let l = PreTokenizer.tokenize l in
+(*   print_endline "a3"; *)
+  let l = PrePatterns.normalize_tokens [] l in
+(*   print_endline "a4"; *)
+  let l = PrePatterns.find_replacement_patterns l in
+(*   print_endline "a5"; *)
+  let l = PrePatterns.remove_spaces [] l in
+  let l = PrePatterns.find_abr_patterns PreAcronyms.abr_patterns l in
+  let l = PrePatterns.normalize_tokens [] l in
+(*   print_endline "a6"; *)
+  let paths = PrePaths.translate_into_paths l in
+(*   print_endline "a7"; *)
+  let paths = PrePaths.lemmatize paths in
+(*   print_endline "a8"; *)
+  let paths,_ = PreMWE.process paths in
+(*   print_endline "a12"; *)
+  let paths = find_proper_names paths in
+(*   print_endline "a13"; *)
+  let paths = modify_weights paths in
+  let paths = translate_digs paths in
+  let paths = assign_senses paths in
+(*   print_endline "a14"; *)
+  let paths = assign_valence paths in
+(*   print_endline "a15"; *)
+  let paths = combine_interps paths in
+(*   print_endline "a16"; *)
+  let paths = disambiguate_senses paths in
+  let paths = assign_simplified_valence paths in
+  let paths = PreSemantics.assign_semantics paths in
+(*   print_endline "a16"; *)
+  let paths = select_tokens paths in
+(*   print_endline "a17"; *)
+(*  let paths = if !single_sense_flag then single_sense paths else paths in
+  let paths = if !single_frame_flag then single_frame paths else paths in*)
+  (*let paths, next_id = add_ids paths next_id in
+  let paths = prepare_indexes paths in*)
+(*   print_endline "a18"; *)
+  paths(*, next_id*)
+(*     print_endline (PrePaths.to_string paths);     *)
+(*   let paths =
+    if PrePaths.no_possible_path (PrePaths.map paths PreLemmatization.remove_postags) then
+      PrePaths.map paths process_ign
+    else paths in
+  let paths = PrePaths.map paths PreLemmatization.remove_postags in
+  let paths = PreCaseShift.manage_lower_upper_case paths in (* FIXME: niepotrzebnie powiększa pierwszy token (przymiotniki partykuły itp.) *)
+  let paths = PreLemmatization.combine_interps paths in
+(*     print_endline (PrePaths.to_string paths);     *)*)
+
+let parse_conll tokens dep_paths = (* FIXME: sprawdzić, czy zachowana jest kolejność elementów paths !!! *)
+  let paths = List.rev (Int.fold 1 (Array.length dep_paths - 1) [] (fun paths conll_id ->
+    let id,_,_ = dep_paths.(conll_id) in
+    ExtArray.get tokens id :: paths)) in
+  (* print_endline "a12"; *)
+  let paths = find_proper_names paths in
+  (*   print_endline "a13"; *)
+  let paths = modify_weights paths in
+  let paths = PreWordnet.assign_senses paths in
+  (*   print_endline "a14"; *)
+    (* let paths = combine_interps paths in (* FIXME: to powinno też działać dla Proper *) *)
+  (*   print_endline "a15"; *)
+  let paths = assign_valence paths in
+  (*   print_endline "a16"; *)
+  let paths = disambiguate_senses paths in
+  let paths = assign_simplified_valence paths in
+  let paths = PreSemantics.assign_semantics paths in
+  (*   print_endline "a16"; *)
+  let _ = Xlist.fold paths 1 (fun conll_id t ->
+    let id,_,_ = dep_paths.(conll_id) in
+    ExtArray.set tokens id t;
+    conll_id + 1) in
+  ()
+*)
 (*
- *  ENIAM: Categorial Syntactic-Semantic Parser for Polish
+ *  ENIAMlexSemantics is a library that assigns tokens with lexicosemantic information.
  *  Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
  *  Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
  *
- *  This program is free software: you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
+ *  This library is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU Lesser General Public License as published by
  *  the Free Software Foundation, either version 3 of the License, or
  *  (at your option) any later version.
  *
- *  This program is distributed in the hope that it will be useful,
+ *  This library is distributed in the hope that it will be useful,
  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
+ *  GNU Lesser General Public License for more details.
  *
- *  You should have received a copy of the GNU General Public License
+ *  You should have received a copy of the GNU Lesser General Public License
  *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
  *)
  
+(*
+ *  ENIAMlexSemantics is a library that assigns tokens with lexicosemantic information.
+ *  Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
+ *  Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
+ *
+ *  This library is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU Lesser General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *)
+
+open ENIAMtokenizerTypes
+open Xstd
+
+(* Record of five grammatical labels, each stored as a plain string.
+   It is the type of the field e of lex_sem; empty_labels sets every field
+   to the empty string. *)
+type labels = {
+  number: string;
+  case: string;
+  gender: string;
+  person: string;
+  aspect: string;
+  }
+
+(* Variant stored in the semantics field of lex_sem; empty_lex_sem uses Normal.
+   The components of each PrepSemantics tuple are named in the trailing
+   comment on that constructor.  Two further constructors are kept below in
+   commented-out form. *)
+type semantics =
+    Normal
+  | Special of string list
+(*  | SpecialNoun of type_arg list * type_term
+  | SpecialMod of string * (type_arg list * type_term)*)
+  | PrepSemantics of (string * string * StringSet.t * string list) list (* role,role_attr,hipero,sel_prefs *)
+
+(* Lexico-semantic information kept for a single token.
+   valence and simple_valence pair an integer with an ENIAMwalTypes.frame.
+   Each element of senses is a triple; presumably (meaning, hypernyms, weight),
+   as the commented-out single_sense code in ENIAMlexSemantics.ml names
+   them - TODO confirm. *)
+type lex_sem = {
+  e: labels;
+  valence: (int * ENIAMwalTypes.frame) list;
+  simple_valence: (int * ENIAMwalTypes.frame) list;
+  senses: (string * string list * float) list;
+  lroles: string * string;
+  semantics: semantics;
+  }
+
+(* Default labels value: every field is the empty string. *)
+let empty_labels =
+  {number=""; case=""; gender=""; person=""; aspect=""}
+
+(* Default lex_sem value: empty labels, no frames, no senses, a pair of empty
+   strings for lroles and Normal semantics. *)
+let empty_lex_sem = {
+  e = empty_labels;
+  valence = [];
+  simple_valence = [];
+  senses = [];
+  lroles = ("", "");
+  semantics = Normal;
+  }
+
+(* Locations of the two proper-name tables, both under the lexSemantics
+   subdirectory of resource_path (resource_path is not defined in this file;
+   presumably it comes from the opened ENIAMtokenizerTypes - TODO confirm).
+   NOTE(review): the resource files shipped with this library carry date
+   suffixes (proper_names_sgjp_polimorf_20151020.tab, proper_names_20160104.tab)
+   while the names below do not - confirm that the install step renames them. *)
+let proper_names_filename = resource_path ^ "/lexSemantics/proper_names_sgjp_polimorf.tab"
+let proper_names_filename2 = resource_path ^ "/lexSemantics/proper_names.tab"
+ENIAMlexSemantics Version 1.0 :
+-------------------------------
+
+ENIAMlexSemantics is a library that assigns lexicosemantic information to tokens.
+It recognizes named entities and assigns thematic roles,
+senses, valence and other semantic information to tokens.
+
+Install
+-------
+
+ENIAMlexSemantics requires OCaml version 4.02.3 compiler
+together with Xlib library version 3.1 or later,
+ENIAMtokenizer library version 1.0, ENIAMmorphology library version 1.0,
+ENIAMsubsyntax library version 1.0, ENIAMwalenty library version 1.0
+and ENIAMplWordnet library version 1.0.
+
+In order to install type:
+
+make install
+
+By default, ENIAMlexSemantics is installed in the 'ocamlc -where'/eniam directory.
+You can change it by editing the makefile.
+
+In order to test the library type:
+make test
+./test
+
+By default ENIAMlexSemantics looks for resources in the /usr/share/eniam directory.
+However, this behaviour may be changed by setting and exporting the ENIAM_RESOURCE_PATH
+environment variable.
+
+Credits
+-------
+Copyright © 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
+Copyright © 2016 Institute of Computer Science Polish Academy of Sciences
+
+The library uses the following licensed resources:
+
+SGJP: Grammatical Dictionary of Polish, version 20151020
+Copyright © 2007–2015 Zygmunt Saloni, Włodzimierz Gruszczyński, Marcin
+Woliński, Robert Wołosz, Danuta Skowrońska
+http://sgjp.pl
+
+Licence
+-------
+
+This library is free software: you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+                   GNU LESSER GENERAL PUBLIC LICENSE
+                       Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+
+  This version of the GNU Lesser General Public License incorporates
+the terms and conditions of version 3 of the GNU General Public
+License, supplemented by the additional permissions listed below.
+
+  0. Additional Definitions.
+
+  As used herein, "this License" refers to version 3 of the GNU Lesser
+General Public License, and the "GNU GPL" refers to version 3 of the GNU
+General Public License.
+
+  "The Library" refers to a covered work governed by this License,
+other than an Application or a Combined Work as defined below.
+
+  An "Application" is any work that makes use of an interface provided
+by the Library, but which is not otherwise based on the Library.
+Defining a subclass of a class defined by the Library is deemed a mode
+of using an interface provided by the Library.
+
+  A "Combined Work" is a work produced by combining or linking an
+Application with the Library.  The particular version of the Library
+with which the Combined Work was made is also called the "Linked
+Version".
+
+  The "Minimal Corresponding Source" for a Combined Work means the
+Corresponding Source for the Combined Work, excluding any source code
+for portions of the Combined Work that, considered in isolation, are
+based on the Application, and not on the Linked Version.
+
+  The "Corresponding Application Code" for a Combined Work means the
+object code and/or source code for the Application, including any data
+and utility programs needed for reproducing the Combined Work from the
+Application, but excluding the System Libraries of the Combined Work.
+
+  1. Exception to Section 3 of the GNU GPL.
+
+  You may convey a covered work under sections 3 and 4 of this License
+without being bound by section 3 of the GNU GPL.
+
+  2. Conveying Modified Versions.
+
+  If you modify a copy of the Library, and, in your modifications, a
+facility refers to a function or data to be supplied by an Application
+that uses the facility (other than as an argument passed when the
+facility is invoked), then you may convey a copy of the modified
+version:
+
+   a) under this License, provided that you make a good faith effort to
+   ensure that, in the event an Application does not supply the
+   function or data, the facility still operates, and performs
+   whatever part of its purpose remains meaningful, or
+
+   b) under the GNU GPL, with none of the additional permissions of
+   this License applicable to that copy.
+
+  3. Object Code Incorporating Material from Library Header Files.
+
+  The object code form of an Application may incorporate material from
+a header file that is part of the Library.  You may convey such object
+code under terms of your choice, provided that, if the incorporated
+material is not limited to numerical parameters, data structure
+layouts and accessors, or small macros, inline functions and templates
+(ten or fewer lines in length), you do both of the following:
+
+   a) Give prominent notice with each copy of the object code that the
+   Library is used in it and that the Library and its use are
+   covered by this License.
+
+   b) Accompany the object code with a copy of the GNU GPL and this license
+   document.
+
+  4. Combined Works.
+
+  You may convey a Combined Work under terms of your choice that,
+taken together, effectively do not restrict modification of the
+portions of the Library contained in the Combined Work and reverse
+engineering for debugging such modifications, if you also do each of
+the following:
+
+   a) Give prominent notice with each copy of the Combined Work that
+   the Library is used in it and that the Library and its use are
+   covered by this License.
+
+   b) Accompany the Combined Work with a copy of the GNU GPL and this license
+   document.
+
+   c) For a Combined Work that displays copyright notices during
+   execution, include the copyright notice for the Library among
+   these notices, as well as a reference directing the user to the
+   copies of the GNU GPL and this license document.
+
+   d) Do one of the following:
+
+       0) Convey the Minimal Corresponding Source under the terms of this
+       License, and the Corresponding Application Code in a form
+       suitable for, and under terms that permit, the user to
+       recombine or relink the Application with a modified version of
+       the Linked Version to produce a modified Combined Work, in the
+       manner specified by section 6 of the GNU GPL for conveying
+       Corresponding Source.
+
+       1) Use a suitable shared library mechanism for linking with the
+       Library.  A suitable mechanism is one that (a) uses at run time
+       a copy of the Library already present on the user's computer
+       system, and (b) will operate properly with a modified version
+       of the Library that is interface-compatible with the Linked
+       Version.
+
+   e) Provide Installation Information, but only if you would otherwise
+   be required to provide such information under section 6 of the
+   GNU GPL, and only to the extent that such information is
+   necessary to install and execute a modified version of the
+   Combined Work produced by recombining or relinking the
+   Application with a modified version of the Linked Version. (If
+   you use option 4d0, the Installation Information must accompany
+   the Minimal Corresponding Source and Corresponding Application
+   Code. If you use option 4d1, you must provide the Installation
+   Information in the manner specified by section 6 of the GNU GPL
+   for conveying Corresponding Source.)
+
+  5. Combined Libraries.
+
+  You may place library facilities that are a work based on the
+Library side by side in a single library together with other library
+facilities that are not Applications and are not covered by this
+License, and convey such a combined library under terms of your
+choice, if you do both of the following:
+
+   a) Accompany the combined library with a copy of the same work based
+   on the Library, uncombined with any other library facilities,
+   conveyed under the terms of this License.
+
+   b) Give prominent notice with the combined library that part of it
+   is a work based on the Library, and explaining where to find the
+   accompanying uncombined form of the same work.
+
+  6. Revised Versions of the GNU Lesser General Public License.
+
+  The Free Software Foundation may publish revised and/or new versions
+of the GNU Lesser General Public License from time to time. Such new
+versions will be similar in spirit to the present version, but may
+differ in detail to address new problems or concerns.
+
+  Each version is given a distinguishing version number. If the
+Library as you received it specifies that a certain numbered version
+of the GNU Lesser General Public License "or any later version"
+applies to it, you have the option of following the terms and
+conditions either of that published version or of any later version
+published by the Free Software Foundation. If the Library as you
+received it does not specify a version number of the GNU Lesser
+General Public License, you may choose any version of the GNU Lesser
+General Public License ever published by the Free Software Foundation.
+
+  If the Library as you received it specifies that a proxy can decide
+whether future versions of the GNU Lesser General Public License shall
+apply, that proxy's public statement of acceptance of any version is
+permanent authorization for you to choose that version for the
+Library.
+OCAMLC=ocamlc
+OCAMLOPT=ocamlopt
+OCAMLDEP=ocamldep
+INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I +eniam
+OCAMLFLAGS=$(INCLUDES) -g
+OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa eniam-plWordnet.cmxa eniam-walenty.cmxa #eniam-lexSemantics.cmxa
+INSTALLDIR=`ocamlc -where`/eniam
+
+SOURCES= ENIAMlexSemanticsTypes.ml ENIAMlexSemantics.ml
+
+all: eniam-lexSemantics.cma eniam-lexSemantics.cmxa
+
+# Install the compiled library into the OCaml eniam directory and the data
+# files into /usr/share/eniam/lexSemantics. Version-independent names
+# (proper_names.tab, proper_names_sgjp_polimorf.tab) are provided as symlinks
+# to the dated resource files; -f makes the target safe to run repeatedly.
+install: all
+	mkdir -p $(INSTALLDIR)
+	cp eniam-lexSemantics.cmxa eniam-lexSemantics.a eniam-lexSemantics.cma $(INSTALLDIR)
+	cp ENIAMlexSemanticsTypes.cmi ENIAMlexSemantics.cmi $(INSTALLDIR)
+	cp ENIAMlexSemanticsTypes.cmx ENIAMlexSemantics.cmx $(INSTALLDIR)
+	mkdir -p /usr/share/eniam/lexSemantics
+	cp resources/*  /usr/share/eniam/lexSemantics
+	ln -sf /usr/share/eniam/lexSemantics/proper_names_20160104.tab /usr/share/eniam/lexSemantics/proper_names.tab
+	ln -sf /usr/share/eniam/lexSemantics/proper_names_sgjp_polimorf_20151020.tab /usr/share/eniam/lexSemantics/proper_names_sgjp_polimorf.tab
+
+eniam-lexSemantics.cma: $(SOURCES)
+	ocamlc -linkall -a -o eniam-lexSemantics.cma $(OCAMLFLAGS) $^
+
+eniam-lexSemantics.cmxa: $(SOURCES)
+	ocamlopt -linkall -a -o eniam-lexSemantics.cmxa $(INCLUDES) $^
+
+test: test.ml
+	$(OCAMLOPT) -o test $(OCAMLOPTFLAGS) $(SOURCES) test.ml
+
+.SUFFIXES: .mll .mly .ml .mli .cmo .cmi .cmx
+
+.mll.ml:
+	ocamllex $<
+
+.mly.mli:
+	ocamlyacc $<
+
+.mly.ml:
+	ocamlyacc $<
+
+.ml.cmo:
+	$(OCAMLC) $(OCAMLFLAGS) -c $<
+
+# Compile an interface file with the same flags (include paths, -g) that the
+# .ml.cmo rule uses.
+.mli.cmi:
+	$(OCAMLC) $(OCAMLFLAGS) -c $<
+
+.ml.cmx:
+	$(OCAMLOPT) $(OCAMLOPTFLAGS) -c $<
+
+clean:
+	rm -f *~ *.cm[aoix] *.o *.so *.cmxa *.a test
-Files in this folder were created on the basis of 
+File proper_names_sgjp_polimorf_20151020.tab in this folder was created on the basis of
+
 SGJP: Grammatical Dictionary of Polish, version 20151020
 Copyright © 2007–2015 Zygmunt Saloni, Włodzimierz Gruszczyński, Marcin
 Woliński, Robert Wołosz, Danuta Skowrońska
+(*
+ *  ENIAMlexSemantics is a library that assigns tokens with lexicosemantic information.
+ *  Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
+ *  Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
+ *
+ *  This library is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU Lesser General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *)
+
+
+let test_strings = [
+  "Szpak frunie zimą.";
+  "Kot miauczy w październiku.";
+(*  "a gdybym miałem";
+  "A Gdy Miałem";
+  "GDY MIAŁEM";
+  "I II III IV V VI VII VIII IX X MCXIV MXC";
+  "Kiedy Piotr Prabucki, przewodniczący Komisji Budżetu PeKaO";
+  "25 idzie 20.";
+  "Kot. Kot. kot.";
+  "25.";
+  "25.888.231";
+  "Ala 25.888.231.111 ma.";
+  "Ala 25.888.031,011.";
+  "Ala -25.888.031,011.";
+  "Ala -25 .";
+  "Ala -1° C  3° ciepła 20—30°C od 180° do 260°C  około 6° poniżej horyzontu.";
+  "Ala 22-25 .";
+  "Ala 22.5.2000-25.5.2001 .";*)
+(*  "Np. Ala.";*)
+  (* "w. dom.";
+  "tzn.";
+  "c.d.n."; *)
+(*  "Arabia Saudyjska biegnie.";
+  "Cauchy'ego ONZ-owska biegnie.";*)
+  (* "TE-cie E-e.";
+  "MS-DOS-owska CI-cie KRRi-cie UJ-ocie UJ-OCIE.";
+  "rock'n'rollowy d’Alembertowi staro-cerkiewno-słowiańskimi"; *)
+(*  "Tom idzie.";*)
+  (* "Miałem miał."; *)
+(*  "Szpak śpiewa.";
+  "Ala ma kota.";
+  "Ale mają kota:"*)
+  ]
+
+(* Test entry point: every sentence in test_strings is parsed with
+   ENIAMsubsyntax.parse_text, passed through ENIAMlexSemantics.assign, and the
+   tokens are printed one per line. *)
+let _ =
+  print_endline "Testy wbudowane";
+  Xlist.iter test_strings (fun s ->
+    print_endline ("\nTEST: " ^ s);
+    let text,tokens = ENIAMsubsyntax.parse_text s in
+    (* The result of assign is not printed here; the call is only exercised.
+       The leading underscore silences the unused-variable warning. *)
+    let _lex_sems = ENIAMlexSemantics.assign tokens text in
+    (* print_endline (ENIAMtokenizer.xml_of tokens); *)
+    Xlist.iter tokens (fun token -> print_endline (ENIAMtokenizer.string_of 0 token)));
+(*  print_endline "Testy użytkownika.";
+  print_endline "Wpisz tekst i naciśnij ENTER, pusty tekst kończy.";
+  let s = ref (read_line ()) in
+  while !s <> "" do
+    let tokens = ENIAMtokenizer.parse !s in
+    (* print_endline (ENIAMtokenizer.xml_of tokens); *)
+    Xlist.iter tokens (fun token -> print_endline (ENIAMtokenizer.string_of 0 token));
+    print_endline "Wpisz tekst i naciśnij ENTER, pusty tekst kończy.";
+    s := read_line ()
+  done;*)
+  ()
@@ -42,7 +42,7 @@ type token =
  
 (* Tekst reprezentuję jako zbiór obiektów typu token_record zawierających
    informacje o poszczególnych tokenach *)
-and token_record = {
+type token_record = {
   orth: string;		(* sekwencja znaków pierwotnego tekstu składająca się na token *)
   corr_orth: string; (* sekwencja znaków pierwotnego tekstu składająca się na token z poprawionymi błędami *)
   beg: int; 		(* pozycja początkowa tokenu względem początku akapitu *)