diff --git a/integration/README b/integration/README index 4937f70..6532b32 100644 --- a/integration/README +++ b/integration/README @@ -8,8 +8,9 @@ Install ENIAMintegration requires OCaml version 4.02.3 compiler together with Xlib library version 3.1 or later, -ENIAMtokenizer library version 1.0, ENIAMmorphology library version 1.0 -and ENIAMsubsyntax library version 1.0. +ENIAMtokenizer library version 1.0, ENIAMmorphology library version 1.0, +ENIAMsubsyntax library version 1.0, ENIAMwalenty library version 1.0 +and ENIAMplWordnet library version 1.0. In order to install type: diff --git a/integration/makefile b/integration/makefile index 2aabc38..2d417cc 100755 --- a/integration/makefile +++ b/integration/makefile @@ -3,7 +3,7 @@ OCAMLOPT=ocamlopt OCAMLDEP=ocamldep INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I +eniam OCAMLFLAGS=$(INCLUDES) -g -OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa +OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa eniam-integration.cmxa INSTALLDIR=`ocamlc -where`/eniam SOURCES= ENIAM_CONLL.ml ENIAMpreIntegration.ml @@ -25,7 +25,7 @@ eniam-integration.cmxa: $(SOURCES) ocamlopt -linkall -a -o eniam-integration.cmxa $(INCLUDES) $^ test: test.ml - $(OCAMLOPT) -o test $(OCAMLOPTFLAGS) $(SOURCES) test.ml + $(OCAMLOPT) -o test $(OCAMLOPTFLAGS) test.ml concraft_test: concraft_test.ml $(OCAMLOPT) -o concraft_test $(OCAMLOPTFLAGS) concraft_test.ml @@ -51,4 +51,4 @@ concraft_test: concraft_test.ml $(OCAMLOPT) $(OCAMLOPTFLAGS) -c $< clean: - rm -f *~ *.cm[aoix] *.o *.so *.cmxa *.a test + rm -f *~ *.cm[aoix] *.o *.so *.cmxa *.a test concraft_test diff --git a/integration/test b/integration/test deleted file mode 100755 index b2e8dbe..0000000 --- a/integration/test +++ /dev/null diff --git 
a/lexSemantics/ENIAMlexSemantics.ml b/lexSemantics/ENIAMlexSemantics.ml new file mode 100644 index 0000000..3da5010 --- /dev/null +++ b/lexSemantics/ENIAMlexSemantics.ml @@ -0,0 +1,456 @@ +(* + * ENIAMlexSemantics is a library that assigns tokens with lexicosemantic information. + * Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl> + * Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences + * + * This library is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ *) + +open ENIAMtokenizerTypes +open ENIAMlexSemanticsTypes +open Xstd + +let load_proper_name proper = function (* folds one [lemma; types] row into the map, splitting types on |; any other row shape raises Failure *) + [lemma; types] -> + let types = Str.split (Str.regexp "|") types in + StringMap.add_inc proper lemma types (fun types2 -> types @ types2) + | l -> failwith ("proper_names: " ^ String.concat " " l) + +let proper_names = (* map from lemma to types, accumulated over both proper-name tables *) + let proper = File.fold_tab proper_names_filename StringMap.empty load_proper_name in + let proper = File.fold_tab proper_names_filename2 proper load_proper_name in (* seeded with the first table; seeding with StringMap.empty discarded it *) + proper + +let remove l s = (* drops every element equal to s; the result comes out in reverse order *) + Xlist.fold l [] (fun l t -> + if s = t then l else t :: l) + +let find_proper_names tokens i t = (* replaces tokens[i] in place when a Lemma is recognised as a proper name; other tokens are left untouched *) + match t.token with + Lemma(lemma,pos,interp) -> + if StringMap.mem proper_names lemma then + let t = {t with token=Proper(lemma,pos,interp,StringMap.find proper_names lemma); + attrs=remove t.attrs "notvalidated proper"} in + ExtArray.set tokens i t else + if Xlist.mem t.attrs "notvalidated proper" then + let t = {t with token=Proper(lemma,pos,interp,[])} in + ExtArray.set tokens i t + | _ -> () + +let find_senses t = (* FIXME: senses containing 'się' *) + match t.token with + Lemma(lemma,pos,_) -> ENIAMplWordnet.find_senses lemma pos + | Proper(_,_,_,senses) -> ENIAMplWordnet.find_proper_senses senses + | _ -> [] + + +let assign tokens text = (* NOTE(review): lex_sems is filled but is not the final expression and text is unused; confirm whether lex_sems should be returned *) + let lex_sems = ExtArray.make (ExtArray.size tokens) empty_lex_sem in + let _ = ExtArray.add lex_sems empty_lex_sem in + Int.iter 1 (ExtArray.size tokens - 1) (fun i -> + let token = ExtArray.get tokens i in + find_proper_names tokens i token; + let senses = find_senses token in (* NOTE(review): token was read before find_proper_names updated tokens[i]; confirm the stale value is intended *) + let lex_sem = {empty_lex_sem with senses=senses} in + let j = ExtArray.add lex_sems lex_sem in + if j <> i then failwith "assign_semantic_valence") + +(* +(* print_endline "a14"; *) + let paths = assign_valence paths in +(* print_endline "a15"; *) +(* print_endline "a16"; *) + let paths = disambiguate_senses paths in + let paths = assign_simplified_valence paths in + let paths = PreSemantics.assign_semantics paths in 
+(* print_endline "a16"; *) + + + + + +let assign_valence paths = + let lexemes = Xlist.fold paths StringMap.empty (fun lexemes t -> + match t.token with + Lemma(lemma,pos,_) -> + StringMap.add_inc lexemes lemma (StringSet.singleton pos) (fun set -> StringSet.add set pos) + | Proper(lemma,pos,_,_) -> + let pos = match pos with + "subst" -> "psubst" + | "depr" -> "pdepr" + | _ -> pos (*failwith ("assign_valence: Proper " ^ pos ^ " " ^ lemma)*) in + StringMap.add_inc lexemes lemma (StringSet.singleton pos) (fun set -> StringSet.add set pos) (* nazwy własne mają przypisywaną domyślną walencję rzeczowników *) + | _ -> lexemes) in + let valence = WalFrames.find_frames lexemes in + List.rev (Xlist.rev_map paths (fun t -> + match t.token with + Lemma(lemma,pos,_) -> {t with valence=try Xlist.rev_map (StringMap.find (StringMap.find valence lemma) pos) (fun frame -> 0,frame) with Not_found -> []} + | Proper(lemma,pos,interp,_) -> {t with valence=(try Xlist.rev_map (StringMap.find (StringMap.find valence lemma) + (if pos = "subst" || pos = "depr" then "p" ^ pos else pos)) (fun frame -> 0,frame) with Not_found -> [](*failwith ("assign_valence: Proper(" ^ lemma ^ "," ^ pos ^ ")")*)); + token=Lemma(lemma,pos,interp)} + | _ -> t)) + +(**********************************************************************************) + +(* let prepare_indexes (paths,_) = + let set = Xlist.fold paths IntSet.empty (fun set t -> + IntSet.add (IntSet.add set t.beg) t.next) in + let map,last = Xlist.fold (Xlist.sort (IntSet.to_list set) compare) (IntMap.empty,0) (fun (map,n) x -> + IntMap.add map x n, n+1) in + List.rev (Xlist.rev_map paths (fun t -> + {t with lnode=IntMap.find map t.beg; rnode=IntMap.find map t.next})), last - 1 *) + +let get_prefs_schema prefs schema = + Xlist.fold schema prefs (fun prefs t -> + Xlist.fold t.WalTypes.sel_prefs prefs StringSet.add) + +let map_prefs_schema senses schema = + Xlist.map schema (fun t -> + if Xlist.mem t.WalTypes.morfs (WalTypes.Phrase WalTypes.Pro) || 
Xlist.mem t.WalTypes.morfs (WalTypes.Phrase WalTypes.ProNG) then t else + {t with WalTypes.sel_prefs = Xlist.fold t.WalTypes.sel_prefs [] (fun l s -> + if StringSet.mem senses s then s :: l else l)}) + +let disambiguate_senses paths = + let prefs = Xlist.fold paths (StringSet.singleton "ALL") (fun prefs t -> + Xlist.fold t.valence prefs (fun prefs -> function + _,WalTypes.Frame(_,schema) -> get_prefs_schema prefs schema + | _,WalTypes.LexFrame(_,_,_,schema) -> get_prefs_schema prefs schema + | _,WalTypes.ComprepFrame(_,_,_,schema) -> get_prefs_schema prefs schema)) in + let hipero = Xlist.fold paths (StringSet.singleton "ALL") (fun hipero t -> + Xlist.fold t.senses hipero (fun hipero (_,l,_) -> + Xlist.fold l hipero StringSet.add)) in + let senses = StringSet.intersection prefs hipero in + let is_zero = StringSet.mem hipero "0" in + let senses = if is_zero then StringSet.add senses "0" else senses in + Xlist.map paths (fun t -> + {t with valence = if is_zero then t.valence else + Xlist.map t.valence (function + n,WalTypes.Frame(a,schema) -> n,WalTypes.Frame(a,map_prefs_schema senses schema) + | n,WalTypes.LexFrame(s,p,r,schema) -> n,WalTypes.LexFrame(s,p,r,map_prefs_schema senses schema) + | n,WalTypes.ComprepFrame(s,p,r,schema) -> n,WalTypes.ComprepFrame(s,p,r,map_prefs_schema senses schema)); + senses = Xlist.map t.senses (fun (s,l,w) -> + s, List.rev (Xlist.fold l [] (fun l s -> if StringSet.mem senses s then s :: l else l)),w)}) + +(*let single_sense (paths,last) = + List.rev (Xlist.rev_map paths (fun t -> + let sense = + if t.senses = [] then [] else + [Xlist.fold t.senses ("",[],-.max_float) (fun (max_meaning,max_hipero,max_weight) (meaning,hipero,weight) -> + if max_weight >= weight then max_meaning,max_hipero,max_weight else meaning,hipero,weight)] in + {t with senses=sense})), last*) + +open WalTypes + +(*let single_schema schemata = + let map = Xlist.fold schemata StringMap.empty (fun map schema -> + let t = WalStringOf.schema (List.sort compare 
(Xlist.fold schema [] (fun l s -> + if s.gf <> ARG && s.gf <> ADJUNCT then {s with role=""; role_attr=""; sel_prefs=[]} :: l else + if s.cr <> [] || s.ce <> [] then {s with role=""; role_attr=""; sel_prefs=[]} :: l else l))) in + StringMap.add_inc map t [schema] (fun l -> schema :: l)) in + StringMap.fold map [] (fun l _ schemata -> + let map = Xlist.fold schemata StringMap.empty (fun map schema -> + Xlist.fold schema map (fun map s -> + let t = WalStringOf.schema [{s with role=""; role_attr=""; sel_prefs=[]}] in + StringMap.add_inc map t [s] (fun l -> s :: l))) in + let schema = StringMap.fold map [] (fun schema _ l -> + let s = List.hd l in + {s with sel_prefs=Xlist.fold s.sel_prefs [] (fun l t -> if t = "0" || t = "T" then t :: l else l)} :: schema) in + schema :: l)*) + +let remove_meaning = function + DefaultAtrs(m,r,o,neg,p,a) -> DefaultAtrs([],r,o,neg,p,a) + | EmptyAtrs m -> EmptyAtrs [] + | NounAtrs(m,nsyn,s(*,typ*)) -> NounAtrs([],nsyn,s(*,typ*)) + | AdjAtrs(m,c,adjsyn(*,adjsem,typ*)) -> AdjAtrs([],c,adjsyn(*,adjsem,typ*)) + | PersAtrs(m,le,neg,mo,t,au,a) -> PersAtrs([],le,neg,mo,t,au,a) + | GerAtrs(m,le,neg,a) -> GerAtrs([],le,neg,a) + | NonPersAtrs(m,le,role,role_attr,neg,a) -> NonPersAtrs([],le,role,role_attr,neg,a) + | _ -> failwith "remove_meaning" + + +(*let single_frame (paths,last) = + List.rev (Xlist.rev_map paths (fun t -> + let lex_frames,frames = Xlist.fold t.valence ([],StringMap.empty) (fun (lex_frames,frames) -> function + Frame(attrs,schema) -> + let attrs = remove_meaning attrs in + lex_frames, StringMap.add_inc frames (WalStringOf.frame_atrs attrs) (attrs,[schema]) (fun (_,l) -> attrs, schema :: l) + | frame -> frame :: lex_frames, frames) in + let frames = StringMap.fold frames lex_frames (fun frames _ (attrs,schemata) -> + Xlist.fold (single_schema schemata) frames (fun frames frame -> Frame(attrs,frame) :: frames)) in + {t with valence=frames})), last *) + +let simplify_position_verb l = function (* FIXME: dodać czyszczenie E Pro *) + 
Phrase(NP(Case "dat")) -> l + | Phrase(NP(Case "inst")) -> l + | Phrase(PrepNP _) -> l + | Phrase(PrepAdjP _) -> l + | Phrase(NumP (Case "dat")) -> l + | Phrase(NumP (Case "inst")) -> l + | Phrase(PrepNumP _) -> l + | Phrase(ComprepNP _) -> l + | Phrase(ComparNP _) -> l + | Phrase(ComparPP _) -> l + | Phrase(IP) -> l + | Phrase(CP _) -> l + | Phrase(NCP(Case "dat",_,_)) -> l + | Phrase(NCP(Case "inst",_,_)) -> l + | Phrase(PrepNCP _) -> l +(* | Phrase(PadvP) -> l *) + | Phrase(AdvP) -> l + | Phrase(PrepP) -> l + | Phrase(Or) -> l + | Phrase(Qub) -> l + | Phrase(Adja) -> l + | Phrase(Inclusion) -> l + | Phrase Pro -> Phrase Null :: l + | t -> t :: l + +let simplify_position_noun l = function + Phrase(NP(Case "gen")) -> l + | Phrase(NP(Case "nom")) -> l + | Phrase(NP(CaseAgr)) -> l + | Phrase(PrepNP _) -> l + | Phrase(AdjP AllAgr) -> l + | Phrase(NumP (Case "gen")) -> l + | Phrase(NumP (Case "nom")) -> l + | Phrase(NumP (CaseAgr)) -> l + | Phrase(PrepNumP _) -> l + | Phrase(ComprepNP _) -> l + | Phrase(ComparNP _) -> l + | Phrase(ComparPP _) -> l + | Phrase(IP) -> l + | Phrase(NCP(Case "gen",_,_)) -> l + | Phrase(PrepNCP _) -> l + | Phrase(PrepP) -> l + | Phrase(Qub) -> l + | Phrase(Adja) -> l + | Phrase(Inclusion) -> l + | Phrase Pro -> Phrase Null :: l + | t -> t :: l + +let simplify_position_adj l = function + Phrase(AdvP) -> l + | t -> t :: l + +let simplify_position_adv l = function + Phrase(AdvP) -> l + | t -> t :: l + + +let simplify_position pos l s = + let morfs = match pos with + "verb" -> List.rev (Xlist.fold s.morfs [] simplify_position_verb) + | "noun" -> List.rev (Xlist.fold s.morfs [] simplify_position_noun) + | "adj" -> List.rev (Xlist.fold s.morfs [] simplify_position_adj) + | "adv" -> List.rev (Xlist.fold s.morfs [] simplify_position_adv) + | _ -> s.morfs in + match morfs with + [] -> l + | [Phrase Null] -> l + | _ -> {s with morfs=morfs} :: l + +let simplify_schemata pos schemata = + let schemata = Xlist.fold schemata StringMap.empty (fun schemata 
(schema,frame) -> + let schema = List.sort compare (Xlist.fold schema [] (fun l s -> + let s = {s with role=""; role_attr=""; sel_prefs=[]; cr=[]; ce=[]; morfs=List.sort compare s.morfs} in + if s.gf <> ARG && s.gf <> ADJUNCT then s :: l else +(* if s.cr <> [] || s.ce <> [] then s :: l else *) + simplify_position pos l s)) in + StringMap.add_inc schemata (WalStringOf.schema schema) (schema,[frame]) (fun (_,frames) -> schema, frame :: frames)) in + StringMap.fold schemata [] (fun l _ s -> s :: l) + +(* FIXME: problem ComprepNP i PrepNCP *) +(* FIXME: problem gdy ten sam token występuje w kilku ścieżkach *) +let generate_verb_prep_adjuncts preps = + Xlist.map preps (fun (lemma,case) -> WalFrames.verb_prep_adjunct_schema_field lemma case) + +let generate_verb_comprep_adjuncts compreps = + Xlist.map compreps (fun lemma -> WalFrames.verb_comprep_adjunct_schema_field lemma) + +let generate_verb_compar_adjuncts compars = + Xlist.map compars (fun lemma -> WalFrames.verb_compar_adjunct_schema_field lemma) + +let generate_noun_prep_adjuncts preps = + WalFrames.noun_prep_adjunct_schema_field preps + +let generate_noun_compar_adjuncts compars = + WalFrames.noun_compar_adjunct_schema_field compars + +let generate_adj_compar_adjuncts compars = + WalFrames.noun_compar_adjunct_schema_field compars + +let compars = StringSet.of_list ["jak";"jako";"niż";"niczym";"niby";"co"] + +let generate_prep_adjunct_tokens paths = + let map = Xlist.fold paths StringMap.empty (fun map t -> + match t.token with + Lemma(lemma,"prep",interp) -> + let map = if lemma = "po" then StringMap.add map "po:postp" ("po","postp") else map in + if StringSet.mem compars lemma then map else + Xlist.fold interp map (fun map -> function + [cases] -> Xlist.fold cases map (fun map case -> StringMap.add map (lemma ^ ":" ^ case) (lemma,case)) + | [cases;_] -> Xlist.fold cases map (fun map case -> StringMap.add map (lemma ^ ":" ^ case) (lemma,case)) + | _ -> map) + | _ -> map) in + StringMap.fold map [] (fun l _ v -> v 
:: l) + +let generate_comprep_adjunct_tokens paths = + let lemmas = Xlist.fold paths StringSet.empty (fun lemmas t -> + match t.token with + Lemma(lemma,_,_) -> StringSet.add lemmas lemma + | _ -> lemmas) in + StringMap.fold WalFrames.comprep_reqs [] (fun compreps comprep reqs -> + let b = Xlist.fold reqs true (fun b s -> b && StringSet.mem lemmas s) in + if b then comprep :: compreps else compreps) + +let generate_compar_adjunct_tokens paths = + let set = Xlist.fold paths StringSet.empty (fun set t -> + match t.token with + Lemma(lemma,"prep",interp) -> + if not (StringSet.mem compars lemma) then set else + StringSet.add set lemma + | _ -> set) in + StringSet.to_list set + +let is_measure = function + NounAtrs(_,_,Common "measure") -> true + | _ -> false + +let assign_simplified_valence paths = + let preps = generate_prep_adjunct_tokens paths in + let compreps = generate_comprep_adjunct_tokens paths in + let compars = generate_compar_adjunct_tokens paths in + let verb_prep_adjuncts = generate_verb_prep_adjuncts preps in + let verb_comprep_adjuncts = generate_verb_comprep_adjuncts compreps in + let verb_compar_adjuncts = generate_verb_compar_adjuncts compars in + let noun_prep_adjuncts = generate_noun_prep_adjuncts preps compreps in + let noun_compar_adjuncts = generate_noun_compar_adjuncts compars in + let adj_compar_adjuncts = generate_adj_compar_adjuncts compars in + let verb_adjuncts = WalFrames.verb_adjuncts_simp @ verb_prep_adjuncts @ verb_comprep_adjuncts @ verb_compar_adjuncts in + let noun_adjuncts = WalFrames.noun_adjuncts_simp @ [noun_prep_adjuncts] @ [noun_compar_adjuncts] in + let noun_measure_adjuncts = WalFrames.noun_measure_adjuncts_simp @ [noun_prep_adjuncts] @ [noun_compar_adjuncts] in + let adj_adjuncts = WalFrames.adj_adjuncts_simp @ [adj_compar_adjuncts] in + let adv_adjuncts = WalFrames.adv_adjuncts_simp @ [adj_compar_adjuncts] in + List.rev (Xlist.rev_map paths (fun t -> + let pos = match t.token with + Lemma(_,pos,_) -> 
WalFrames.simplify_pos pos + | _ -> "" in + let lex_frames,frames = Xlist.fold t.valence ([],StringMap.empty) (fun (lex_frames,frames) -> function + _,(Frame(attrs,schema) as frame) -> + let attrs = remove_meaning attrs in + lex_frames, StringMap.add_inc frames (WalStringOf.frame_atrs attrs) (attrs,[schema,frame]) (fun (_,l) -> attrs, (schema,frame) :: l) + | _,frame -> frame :: lex_frames, frames) in + let simp_frames,full_frames,n = Xlist.fold lex_frames ([],[],1) (fun (simp_frames,full_frames,n) frame -> + (n,frame) :: simp_frames, (n,frame) :: full_frames, n+1) in + let simp_frames,full_frames,_ = StringMap.fold frames (simp_frames,full_frames,n) (fun (simp_frames,full_frames,n) _ (attrs,schemata) -> + Xlist.fold (simplify_schemata pos schemata) (simp_frames,full_frames,n) (fun (simp_frames,full_frames,n) (schema,frames) -> + let schema = match pos with + "verb" -> schema @ verb_adjuncts + | "noun" -> schema @ (if is_measure attrs then noun_measure_adjuncts else noun_adjuncts) + | "adj" -> schema @ adj_adjuncts + | "adv" -> schema @ adv_adjuncts + | _ -> schema in + (n,Frame(attrs,schema)) :: simp_frames, + Xlist.fold frames full_frames (fun full_frames frame -> (n,frame) :: full_frames), + n+1)) in + {t with simple_valence=simp_frames; valence=full_frames})) + +(* FIXME: dodać do walencji preferencje selekcyjne nadrzędników symboli: dzień, godzina, rysunek itp. *) +(* FIXME: sprawdzić czy walencja nazw własnych jest dobrze zrobiona. *) + +(* let first_id = 1 (* id=0 jest zarezerwowane dla pro; FIXME: czy to jest jeszcze aktualne? 
*) + +let add_ids (paths,last) next_id = + let paths,next_id = Xlist.fold ((*List.rev*) paths) ([],next_id) (fun (paths,id) t -> + {t with id=id} :: paths, id+1) in + (paths,last),next_id *) + + + +let parse query = +(* print_endline "a1"; *) + let l = Xunicode.classified_chars_of_utf8_string query in +(* print_endline "a2"; *) + let l = PreTokenizer.tokenize l in +(* print_endline "a3"; *) + let l = PrePatterns.normalize_tokens [] l in +(* print_endline "a4"; *) + let l = PrePatterns.find_replacement_patterns l in +(* print_endline "a5"; *) + let l = PrePatterns.remove_spaces [] l in + let l = PrePatterns.find_abr_patterns PreAcronyms.abr_patterns l in + let l = PrePatterns.normalize_tokens [] l in +(* print_endline "a6"; *) + let paths = PrePaths.translate_into_paths l in +(* print_endline "a7"; *) + let paths = PrePaths.lemmatize paths in +(* print_endline "a8"; *) + let paths,_ = PreMWE.process paths in +(* print_endline "a12"; *) + let paths = find_proper_names paths in +(* print_endline "a13"; *) + let paths = modify_weights paths in + let paths = translate_digs paths in + let paths = assign_senses paths in +(* print_endline "a14"; *) + let paths = assign_valence paths in +(* print_endline "a15"; *) + let paths = combine_interps paths in +(* print_endline "a16"; *) + let paths = disambiguate_senses paths in + let paths = assign_simplified_valence paths in + let paths = PreSemantics.assign_semantics paths in +(* print_endline "a16"; *) + let paths = select_tokens paths in +(* print_endline "a17"; *) +(* let paths = if !single_sense_flag then single_sense paths else paths in + let paths = if !single_frame_flag then single_frame paths else paths in*) + (*let paths, next_id = add_ids paths next_id in + let paths = prepare_indexes paths in*) +(* print_endline "a18"; *) + paths(*, next_id*) +(* print_endline (PrePaths.to_string paths); *) +(* let paths = + if PrePaths.no_possible_path (PrePaths.map paths PreLemmatization.remove_postags) then + PrePaths.map paths 
process_ign + else paths in + let paths = PrePaths.map paths PreLemmatization.remove_postags in + let paths = PreCaseShift.manage_lower_upper_case paths in (* FIXME: niepotrzebnie powiększa pierwszy token (przymiotniki partykuły itp.) *) + let paths = PreLemmatization.combine_interps paths in +(* print_endline (PrePaths.to_string paths); *)*) + +let parse_conll tokens dep_paths = (* FIXME: sprawdzić, czy zachowana jest kolejność elementów paths !!! *) + let paths = List.rev (Int.fold 1 (Array.length dep_paths - 1) [] (fun paths conll_id -> + let id,_,_ = dep_paths.(conll_id) in + ExtArray.get tokens id :: paths)) in + (* print_endline "a12"; *) + let paths = find_proper_names paths in + (* print_endline "a13"; *) + let paths = modify_weights paths in + let paths = PreWordnet.assign_senses paths in + (* print_endline "a14"; *) + (* let paths = combine_interps paths in (* FIXME: to powinno też działać dla Proper *) *) + (* print_endline "a15"; *) + let paths = assign_valence paths in + (* print_endline "a16"; *) + let paths = disambiguate_senses paths in + let paths = assign_simplified_valence paths in + let paths = PreSemantics.assign_semantics paths in + (* print_endline "a16"; *) + let _ = Xlist.fold paths 1 (fun conll_id t -> + let id,_,_ = dep_paths.(conll_id) in + ExtArray.set tokens id t; + conll_id + 1) in + () +*) diff --git a/pre/preSemantics.ml b/lexSemantics/ENIAMlexSemanticsData.ml index 90a4a00..47827cc 100644 --- a/pre/preSemantics.ml +++ b/lexSemantics/ENIAMlexSemanticsData.ml @@ -1,19 +1,19 @@ (* - * ENIAM: Categorial Syntactic-Semantic Parser for Polish + * ENIAMlexSemantics is a library that assigns tokens with lexicosemantic information. 
* Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl> * Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by + * This library is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * - * This program is distributed in the hope that it will be useful, + * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * GNU Lesser General Public License for more details. * - * You should have received a copy of the GNU General Public License + * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. *) diff --git a/lexSemantics/ENIAMlexSemanticsTypes.ml b/lexSemantics/ENIAMlexSemanticsTypes.ml new file mode 100644 index 0000000..6d6ee05 --- /dev/null +++ b/lexSemantics/ENIAMlexSemanticsTypes.ml @@ -0,0 +1,60 @@ +(* + * ENIAMlexSemantics is a library that assigns tokens with lexicosemantic information. + * Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl> + * Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences + * + * This library is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + *) + +open ENIAMtokenizerTypes +open Xstd + +type labels = { + number: string; + case: string; + gender: string; + person: string; + aspect: string; + } + +type semantics = + Normal + | Special of string list +(* | SpecialNoun of type_arg list * type_term + | SpecialMod of string * (type_arg list * type_term)*) + | PrepSemantics of (string * string * StringSet.t * string list) list (* role,role_attr,hipero,sel_prefs *) + +type lex_sem = { + e: labels; + valence: (int * ENIAMwalTypes.frame) list; + simple_valence: (int * ENIAMwalTypes.frame) list; + senses: (string * string list * float) list; + lroles: string * string; + semantics: semantics; + } + +let empty_labels = { + number=""; + case=""; + gender=""; + person=""; + aspect=""; + } + +let empty_lex_sem = { + e=empty_labels; valence=[]; simple_valence=[]; senses=[]; + lroles="",""; semantics=Normal} + +let proper_names_filename = resource_path ^ "/lexSemantics/proper_names_sgjp_polimorf.tab" +let proper_names_filename2 = resource_path ^ "/lexSemantics/proper_names.tab" diff --git a/lexSemantics/README b/lexSemantics/README new file mode 100644 index 0000000..9678665 --- /dev/null +++ b/lexSemantics/README @@ -0,0 +1,57 @@ +ENIAMlexSemantics Version 1.0 : +----------------------- + +ENIAMlexSemantics is a library that assigns tokens with lexicosemantic information. +It recognizes named entities and assigns thematic roles, +senses, valence and other semantic information to tokens. 
+ +Install +------- + +ENIAMlexSemantics requires OCaml version 4.02.3 compiler +together with Xlib library version 3.1 or later, +ENIAMtokenizer library version 1.0, ENIAMmorphology library version 1.0, +ENIAMsubsyntax, ENIAMwalenty and ENIAMplWordnet libraries version 1.0. + +In order to install type: + +make install + +By default, ENIAMlexSemantics is installed in the 'ocamlc -where'/eniam directory. +You can change it by editing the makefile. + +In order to test the library type: +make test +./test + +By default ENIAMlexSemantics looks for resources in the /usr/share/eniam directory. +However this behaviour may be changed by setting and exporting the ENIAM_RESOURCE_PATH +environment variable. + +Credits +------- +Copyright © 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl> +Copyright © 2016 Institute of Computer Science Polish Academy of Sciences + +The library uses the following licensed resources: + +SGJP: Grammatical Dictionary of Polish, version 20151020 +Copyright © 2007–2015 Zygmunt Saloni, Włodzimierz Gruszczyński, Marcin +Woliński, Robert Wołosz, Danuta Skowrońska +http://sgjp.pl + +Licence +------- + +This library is free software: you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with this program. If not, see <http://www.gnu.org/licenses/>. 
diff --git a/lexSemantics/lgpl-3.0.txt b/lexSemantics/lgpl-3.0.txt new file mode 100644 index 0000000..65c5ca8 --- /dev/null +++ b/lexSemantics/lgpl-3.0.txt @@ -0,0 +1,165 @@ + GNU LESSER GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/> + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + + This version of the GNU Lesser General Public License incorporates +the terms and conditions of version 3 of the GNU General Public +License, supplemented by the additional permissions listed below. + + 0. Additional Definitions. + + As used herein, "this License" refers to version 3 of the GNU Lesser +General Public License, and the "GNU GPL" refers to version 3 of the GNU +General Public License. + + "The Library" refers to a covered work governed by this License, +other than an Application or a Combined Work as defined below. + + An "Application" is any work that makes use of an interface provided +by the Library, but which is not otherwise based on the Library. +Defining a subclass of a class defined by the Library is deemed a mode +of using an interface provided by the Library. + + A "Combined Work" is a work produced by combining or linking an +Application with the Library. The particular version of the Library +with which the Combined Work was made is also called the "Linked +Version". + + The "Minimal Corresponding Source" for a Combined Work means the +Corresponding Source for the Combined Work, excluding any source code +for portions of the Combined Work that, considered in isolation, are +based on the Application, and not on the Linked Version. + + The "Corresponding Application Code" for a Combined Work means the +object code and/or source code for the Application, including any data +and utility programs needed for reproducing the Combined Work from the +Application, but excluding the System Libraries of the Combined Work. 
+ + 1. Exception to Section 3 of the GNU GPL. + + You may convey a covered work under sections 3 and 4 of this License +without being bound by section 3 of the GNU GPL. + + 2. Conveying Modified Versions. + + If you modify a copy of the Library, and, in your modifications, a +facility refers to a function or data to be supplied by an Application +that uses the facility (other than as an argument passed when the +facility is invoked), then you may convey a copy of the modified +version: + + a) under this License, provided that you make a good faith effort to + ensure that, in the event an Application does not supply the + function or data, the facility still operates, and performs + whatever part of its purpose remains meaningful, or + + b) under the GNU GPL, with none of the additional permissions of + this License applicable to that copy. + + 3. Object Code Incorporating Material from Library Header Files. + + The object code form of an Application may incorporate material from +a header file that is part of the Library. You may convey such object +code under terms of your choice, provided that, if the incorporated +material is not limited to numerical parameters, data structure +layouts and accessors, or small macros, inline functions and templates +(ten or fewer lines in length), you do both of the following: + + a) Give prominent notice with each copy of the object code that the + Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the object code with a copy of the GNU GPL and this license + document. + + 4. Combined Works. 
+ + You may convey a Combined Work under terms of your choice that, +taken together, effectively do not restrict modification of the +portions of the Library contained in the Combined Work and reverse +engineering for debugging such modifications, if you also do each of +the following: + + a) Give prominent notice with each copy of the Combined Work that + the Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the Combined Work with a copy of the GNU GPL and this license + document. + + c) For a Combined Work that displays copyright notices during + execution, include the copyright notice for the Library among + these notices, as well as a reference directing the user to the + copies of the GNU GPL and this license document. + + d) Do one of the following: + + 0) Convey the Minimal Corresponding Source under the terms of this + License, and the Corresponding Application Code in a form + suitable for, and under terms that permit, the user to + recombine or relink the Application with a modified version of + the Linked Version to produce a modified Combined Work, in the + manner specified by section 6 of the GNU GPL for conveying + Corresponding Source. + + 1) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (a) uses at run time + a copy of the Library already present on the user's computer + system, and (b) will operate properly with a modified version + of the Library that is interface-compatible with the Linked + Version. + + e) Provide Installation Information, but only if you would otherwise + be required to provide such information under section 6 of the + GNU GPL, and only to the extent that such information is + necessary to install and execute a modified version of the + Combined Work produced by recombining or relinking the + Application with a modified version of the Linked Version. 
(If + you use option 4d0, the Installation Information must accompany + the Minimal Corresponding Source and Corresponding Application + Code. If you use option 4d1, you must provide the Installation + Information in the manner specified by section 6 of the GNU GPL + for conveying Corresponding Source.) + + 5. Combined Libraries. + + You may place library facilities that are a work based on the +Library side by side in a single library together with other library +facilities that are not Applications and are not covered by this +License, and convey such a combined library under terms of your +choice, if you do both of the following: + + a) Accompany the combined library with a copy of the same work based + on the Library, uncombined with any other library facilities, + conveyed under the terms of this License. + + b) Give prominent notice with the combined library that part of it + is a work based on the Library, and explaining where to find the + accompanying uncombined form of the same work. + + 6. Revised Versions of the GNU Lesser General Public License. + + The Free Software Foundation may publish revised and/or new versions +of the GNU Lesser General Public License from time to time. Such new +versions will be similar in spirit to the present version, but may +differ in detail to address new problems or concerns. + + Each version is given a distinguishing version number. If the +Library as you received it specifies that a certain numbered version +of the GNU Lesser General Public License "or any later version" +applies to it, you have the option of following the terms and +conditions either of that published version or of any later version +published by the Free Software Foundation. If the Library as you +received it does not specify a version number of the GNU Lesser +General Public License, you may choose any version of the GNU Lesser +General Public License ever published by the Free Software Foundation. 
+ + If the Library as you received it specifies that a proxy can decide +whether future versions of the GNU Lesser General Public License shall +apply, that proxy's public statement of acceptance of any version is +permanent authorization for you to choose that version for the +Library. diff --git a/lexSemantics/makefile b/lexSemantics/makefile new file mode 100644 index 0000000..59a4704 --- /dev/null +++ b/lexSemantics/makefile @@ -0,0 +1,53 @@ +OCAMLC=ocamlc +OCAMLOPT=ocamlopt +OCAMLDEP=ocamldep +INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I +eniam +OCAMLFLAGS=$(INCLUDES) -g +OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa eniam-plWordnet.cmxa eniam-walenty.cmxa #eniam-lexSemantics.cmxa +INSTALLDIR=`ocamlc -where`/eniam + +SOURCES= ENIAMlexSemanticsTypes.ml ENIAMlexSemantics.ml + +all: eniam-lexSemantics.cma eniam-lexSemantics.cmxa + +install: all + mkdir -p $(INSTALLDIR) + cp eniam-lexSemantics.cmxa eniam-lexSemantics.a eniam-lexSemantics.cma $(INSTALLDIR) + cp ENIAMlexSemanticsTypes.cmi ENIAMlexSemantics.cmi $(INSTALLDIR) + cp ENIAMlexSemanticsTypes.cmx ENIAMlexSemantics.cmx $(INSTALLDIR) + mkdir -p /usr/share/eniam/lexSemantics + cp resources/* /usr/share/eniam/lexSemantics + ln -sf /usr/share/eniam/lexSemantics/proper_names_20160104.tab /usr/share/eniam/lexSemantics/proper_names.tab + ln -sf /usr/share/eniam/lexSemantics/proper_names_sgjp_polimorf_20151020.tab /usr/share/eniam/lexSemantics/proper_names_sgjp_polimorf.tab + +eniam-lexSemantics.cma: $(SOURCES) + ocamlc -linkall -a -o eniam-lexSemantics.cma $(OCAMLFLAGS) $^ + +eniam-lexSemantics.cmxa: $(SOURCES) + ocamlopt -linkall -a -o eniam-lexSemantics.cmxa $(INCLUDES) $^ + +test: test.ml + $(OCAMLOPT) -o test $(OCAMLOPTFLAGS) $(SOURCES) test.ml + +.SUFFIXES: .mll .mly .ml .mli .cmo .cmi .cmx + +.mll.ml: + ocamllex $< + +.mly.mli: + ocamlyacc $< + +.mly.ml: + ocamlyacc $< + +.ml.cmo: +
$(OCAMLC) $(OCAMLFLAGS) -c $< + +.mli.cmi: + $(OCAMLC) $(OCAMLFLAGS) -c $< + +.ml.cmx: + $(OCAMLOPT) $(OCAMLOPTFLAGS) -c $< + +clean: + rm -f *~ *.cm[aoix] *.o *.so *.cmxa *.a test diff --git a/resources/SGJP/README b/lexSemantics/resources/README index cbc51d3..4ec5ba5 100644 --- a/resources/SGJP/README +++ b/lexSemantics/resources/README @@ -1,4 +1,5 @@ -Files in this folder were created on the basis of +File proper_names_sgjp_polimorf_20151020.tab in this folder was created on the basis of + SGJP: Grammatical Dictionary of Polish, version 20151020 Copyright © 2007–2015 Zygmunt Saloni, Włodzimierz Gruszczyński, Marcin Woliński, Robert Wołosz, Danuta Skowrońska diff --git a/resources/proper_names_20160104.tab b/lexSemantics/resources/proper_names_20160104.tab index a0a3125..a0a3125 100644 --- a/resources/proper_names_20160104.tab +++ b/lexSemantics/resources/proper_names_20160104.tab diff --git a/resources/SGJP/proper_names_sgjp_polimorf_20151020.tab b/lexSemantics/resources/proper_names_sgjp_polimorf_20151020.tab index 51df848..51df848 100644 --- a/resources/SGJP/proper_names_sgjp_polimorf_20151020.tab +++ b/lexSemantics/resources/proper_names_sgjp_polimorf_20151020.tab diff --git a/lexSemantics/test.ml b/lexSemantics/test.ml new file mode 100644 index 0000000..19d0415 --- /dev/null +++ b/lexSemantics/test.ml @@ -0,0 +1,74 @@ +(* + * ENIAMlexSemantics is a library that assigns tokens with lexicosemantic information. + * Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl> + * Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences + * + * This library is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version.
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + *) + + +let test_strings = [ + "Szpak frunie zimą."; + "Kot miauczy w październiku."; +(* "a gdybym miałem"; + "A Gdy Miałem"; + "GDY MIAŁEM"; + "I II III IV V VI VII VIII IX X MCXIV MXC"; + "Kiedy Piotr Prabucki, przewodniczący Komisji Budżetu PeKaO"; + "25 idzie 20."; + "Kot. Kot. kot."; + "25."; + "25.888.231"; + "Ala 25.888.231.111 ma."; + "Ala 25.888.031,011."; + "Ala -25.888.031,011."; + "Ala -25 ."; + "Ala -1° C 3° ciepła 20—30°C od 180° do 260°C około 6° poniżej horyzontu."; + "Ala 22-25 ."; + "Ala 22.5.2000-25.5.2001 .";*) +(* "Np. Ala.";*) + (* "w. dom."; + "tzn."; + "c.d.n."; *) +(* "Arabia Saudyjska biegnie."; + "Cauchy'ego ONZ-owska biegnie.";*) + (* "TE-cie E-e."; + "MS-DOS-owska CI-cie KRRi-cie UJ-ocie UJ-OCIE."; + "rock'n'rollowy d’Alembertowi staro-cerkiewno-słowiańskimi"; *) +(* "Tom idzie.";*) + (* "Miałem miał."; *) +(* "Szpak śpiewa."; + "Ala ma kota."; + "Ale mają kota:"*) + ] + +let () = + print_endline "Testy wbudowane"; + Xlist.iter test_strings (fun s -> + print_endline ("\nTEST: " ^ s); + let text,tokens = ENIAMsubsyntax.parse_text s in + let _lex_sems = ENIAMlexSemantics.assign tokens text in + (* print_endline (ENIAMtokenizer.xml_of tokens); *) + Xlist.iter tokens (fun token -> print_endline (ENIAMtokenizer.string_of 0 token))); +(* print_endline "Testy użytkownika."; + print_endline "Wpisz tekst i naciśnij ENTER, pusty tekst kończy."; + let s = ref (read_line ()) in + while !s <> "" do + let tokens = ENIAMtokenizer.parse !s in + (* print_endline (ENIAMtokenizer.xml_of tokens); *) + Xlist.iter tokens (fun token -> print_endline
(ENIAMtokenizer.string_of 0 token)); + print_endline "Wpisz tekst i naciśnij ENTER, pusty tekst kończy."; + s := read_line () + done;*) + () diff --git a/tokenizer/ENIAMtokenizerTypes.ml b/tokenizer/ENIAMtokenizerTypes.ml index c644bb9..3031d51 100644 --- a/tokenizer/ENIAMtokenizerTypes.ml +++ b/tokenizer/ENIAMtokenizerTypes.ml @@ -42,7 +42,7 @@ type token = (* Tekst reprezentuję jako zbiór obiektów typu token_record zawierających informacje o poszczególnych tokenach *) -and token_record = { +type token_record = { orth: string; (* sekwencja znaków pierwotnego tekstu składająca się na token *) corr_orth: string; (* sekwencja znaków pierwotnego tekstu składająca się na token z poprawionymi błędami *) beg: int; (* pozycja początkowa tokenu względem początku akapitu *)