ENIAMwalReduce.ml 12.2 KB
(*
 *  ENIAMlexSemantics is a library that assigns tokens with lexicosemantic information.
 *  Copyright (C) 2016-2017 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
 *  Copyright (C) 2016-2017 Institute of Computer Science Polish Academy of Sciences
 *
 *  This library is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *)

open ENIAMwalTypes
open Xstd

let create_phrase_reqs s (reqs,noreqs) = function
  | PrepNP(prep,_) -> StringMap.add_inc reqs s (StringSet.singleton prep) (fun set -> StringSet.add set prep), noreqs
  | PrepAdjP(prep,_) -> StringMap.add_inc reqs s (StringSet.singleton prep) (fun set -> StringSet.add set prep), noreqs
  | PrepNCP(prep,_,_,_) -> StringMap.add_inc reqs s (StringSet.singleton prep) (fun set -> StringSet.add set prep), noreqs
  | ComparP(prep,_) -> StringMap.add_inc reqs s (StringSet.singleton prep) (fun set -> StringSet.add set prep), noreqs
  | FixedP(prep) -> StringMap.add_inc reqs s (StringSet.singleton prep) (fun set -> StringSet.add set prep), noreqs
  | SimpleLexArg(lex,_) -> StringMap.add_inc reqs s (StringSet.singleton lex) (fun set -> StringSet.add set lex), noreqs
  | LexArg(_,lex,_) -> StringMap.add_inc reqs s (StringSet.singleton lex) (fun set -> StringSet.add set lex), noreqs
  | MorfId _ -> failwith "create_phrase_reqs"
  | _ -> reqs, StringSet.add noreqs s

let create_phrase_reqs2 s (reqs,noreqs) = function
  | PrepNP(prep,_) -> IntMap.add_inc reqs s (StringSet.singleton prep) (fun set -> StringSet.add set prep), noreqs
  | PrepAdjP(prep,_) -> IntMap.add_inc reqs s (StringSet.singleton prep) (fun set -> StringSet.add set prep), noreqs
  | PrepNCP(prep,_,_,_) -> IntMap.add_inc reqs s (StringSet.singleton prep) (fun set -> StringSet.add set prep), noreqs
  | ComparP(prep,_) -> IntMap.add_inc reqs s (StringSet.singleton prep) (fun set -> StringSet.add set prep), noreqs
  | FixedP(prep) -> IntMap.add_inc reqs s (StringSet.singleton prep) (fun set -> StringSet.add set prep), noreqs
  | SimpleLexArg(lex,_) -> IntMap.add_inc reqs s (StringSet.singleton lex) (fun set -> StringSet.add set lex), noreqs
  | LexArg(_,lex,_) -> IntMap.add_inc reqs s (StringSet.singleton lex) (fun set -> StringSet.add set lex), noreqs
  | MorfId _ -> failwith "create_phrase_reqs2"
  | _ -> reqs, IntSet.add noreqs s

let create_comprep_reqs entries =
  let reqs,noreqs,reqs2 = Entries.fold entries (StringMap.empty,StringSet.empty,StringMap.empty) (fun (reqs,noreqs,reqs2) _ lemma -> function
        ComprepNPEntry(s,NoRestr,[p]) ->
        let reqs,noreqs = Xlist.fold p.morfs (reqs,noreqs) (create_phrase_reqs s) in
        reqs,noreqs,StringMap.add_inc reqs2 s (StringSet.singleton lemma) (fun set -> StringSet.add set lemma)
      | ComprepNPEntry(s,NoRestr,_) -> reqs, StringSet.add noreqs s, reqs2
      | ComprepNPEntry _ -> failwith "create_comprep_reqs"
      | _ -> reqs,noreqs,reqs2) in
  StringMap.fold reqs StringMap.empty (fun reqs s l ->
      if StringSet.mem noreqs s then reqs else StringMap.add reqs s l),reqs2

let create_lexarg_reqs entries =
  let reqs,noreqs = Entries.fold entries (IntMap.empty,IntSet.empty) (fun (reqs,noreqs) _ _ -> function
        LexEntry(id,_,_,NoRestr,[p]) -> Xlist.fold p.morfs (reqs,noreqs) (create_phrase_reqs2 id)
      | LexEntry(id,_,_,NoRestr,_) -> reqs, IntSet.add noreqs id
      | _ -> reqs,noreqs) in
  IntMap.fold reqs IntMap.empty (fun reqs s l ->
      if IntSet.mem noreqs s then reqs else IntMap.add reqs s l)

let comprep_reqs,comprep_reqs2 = create_comprep_reqs ENIAMwalParser.entries
let lexarg_reqs = create_lexarg_reqs ENIAMwalParser.entries

(* let rec assign_pro_args schema =
  Xlist.map schema (fun s ->
      let morfs = match s.morfs with
          (E p) :: l -> E Pro :: (E p) :: l
        | [LexPhrase _] as morfs -> morfs
        | [Phrase(FixedP _)] as morfs -> morfs
        | [Phrase(Lex _)] as morfs -> morfs
        (*    | [Phrase Refl] as morfs -> morfs
              | [Phrase Recip] as morfs -> morfs*)
        | Phrase Null :: _ as morfs -> morfs
        | Phrase Pro :: _ as morfs -> morfs
        | morfs -> if s.gf <> SUBJ && s.cr = [] && s.ce = [] then (Phrase Null) :: morfs else (Phrase Pro) :: morfs in (* FIXME: ustalić czy są inne przypadki uzgodnienia *)
      (*     let morfs = assign_pro_args_lex morfs in *) (* bez pro wewnątrz leksykalizacji *)
      {s with morfs=morfs}) *)

(*let assign_pro_args_lex morfs =
  Xlist.map morfs (function
      Lex(morf,specs,lex,restr) -> LexN(morf,specs,lex,assign_pro_args_restr restr)
    | LexNum(morf,lex1,lex2,restr) -> LexNum(morf,lex1,lex2,assign_pro_args_restr restr)
    | LexCompar(morf,l) -> LexCompar(morf,make_gfs_lex l)
    | morf -> morf)

  and assign_pro_args_restr = function
    Natr -> Natr
  | Ratr1 schema -> Ratr1(assign_pro_args schema)
  | Atr1 schema -> Atr1(assign_pro_args schema)
  | Ratr schema -> Ratr(assign_pro_args schema)
  | Atr schema -> Atr(assign_pro_args schema)*)

exception ImpossibleSchema

let rec reduce_comp test_lexemes = function
    Comp s -> if test_lexemes s then Comp s else raise Not_found
  | Zeby -> if test_lexemes "żeby" || test_lexemes "że" then Zeby else raise Not_found
  | Gdy -> if test_lexemes "gdy" || test_lexemes "gdyby" then Gdy else raise Not_found
  | CompUndef -> failwith "reduce_comp"

let reduce_phrase (test_comprep_reqs,test_comprep_reqs2,test_lexarg_reqs,test_lexemes) = function
  | PrepNP(prep,case) as phrase -> if test_lexemes prep then phrase else raise Not_found
  | PrepAdjP(prep,case) as phrase -> if test_lexemes prep then phrase else raise Not_found
  | ComprepNP(prep) as phrase  -> if test_comprep_reqs prep && test_comprep_reqs2 prep then phrase else raise Not_found
  | ComparP(prep,case) as phrase  -> if test_lexemes prep then phrase else raise Not_found
  | CP(ctype,comp) -> CP(ctype,reduce_comp test_lexemes comp)
  | NCP(case,ctype,comp) -> if test_lexemes "to" then NCP(case,ctype,reduce_comp test_lexemes comp) else raise Not_found
  | PrepNCP(prep,case,ctype,comp) -> if test_lexemes prep && test_lexemes "to" then PrepNCP(prep,case,ctype,reduce_comp test_lexemes comp) else raise Not_found
  | SimpleLexArg(lemma,_) as phrase  -> if test_lexemes lemma then phrase else raise Not_found
  | LexArg(id,lemma,_) as phrase  -> if test_lexemes lemma && test_lexarg_reqs id then phrase else raise Not_found
  | FixedP lemma as phrase  -> if test_lexemes lemma then phrase else raise Not_found
  | phrase -> phrase

let rec reduce_morfs tests = function
    [] -> []
  | morf :: l -> (try [reduce_phrase tests morf]
                  with Not_found -> []) @ reduce_morfs tests l

let rec reduce_schema2 tests = function
    [] -> []
  | s :: l ->
    let morfs = reduce_morfs tests s.morfs in
    if morfs = [] then reduce_schema2 tests l else
      {s with morfs=morfs} :: reduce_schema2 tests l

let rec reduce_schema tests = function
    [] -> []
  | s :: l ->
    let morfs = reduce_morfs tests s.morfs in
    if morfs = [] then raise ImpossibleSchema else
      {s with morfs=morfs} :: reduce_schema tests l

let reduce_entries lexemes entries =
  StringMap.map entries (fun entries ->
      StringSet.fold lexemes StringMap.empty (fun reduced lemma ->
          try StringMap.add reduced lemma (StringMap.find entries lemma)
          with Not_found -> reduced))

let merge_schema phrases schema =
  Xlist.map schema (fun p ->
      let morfs = List.flatten (Xlist.map p.morfs (function
            MorfId id -> (try IntMap.find phrases id with Not_found -> failwith "merge_schema")
          | _ -> failwith "merge_schema")) in
      {p with morfs=morfs})

let merge_entries phrases entries =
  Entries.map entries (fun _ _ (opinion,neg,pred,aspect,schema) ->
      opinion,neg,pred,aspect,merge_schema phrases schema)

let merge_entries_conn phrases meanings entries =
  Entries.map entries (fun _ _ (sopinion,fopinion,meaning_ids,neg,pred,aspect,schema) ->
      let meanings = Xlist.map meaning_ids (fun id ->
          try IntMap.find meanings id with Not_found -> failwith "merge_entries_conn") in
      sopinion,fopinion,meanings,neg,pred,aspect,merge_schema phrases schema)

let create_tests comprep_reqs comprep_reqs2 lexarg_reqs lexemes =
  (fun s ->
     if StringMap.mem comprep_reqs s then
       not (StringSet.is_empty (StringSet.intersection (StringMap.find comprep_reqs s) lexemes))
     else true),
  (fun s ->
     if StringMap.mem comprep_reqs s then
       not (StringSet.is_empty (StringSet.intersection (StringMap.find comprep_reqs2 s) lexemes))
     else true),
  (fun s ->
     if IntMap.mem lexarg_reqs s then
       not (StringSet.is_empty (StringSet.intersection (IntMap.find lexarg_reqs s) lexemes))
     else true),
  StringSet.mem lexemes


let select_entries_full phrases entries schemata connected meanings comprep_reqs comprep_reqs2 lexarg_reqs lexemes =
  let tests = create_tests comprep_reqs comprep_reqs2 lexarg_reqs lexemes in
  let entries = reduce_entries lexemes entries in
  let schemata = reduce_entries lexemes schemata in
  let connected = reduce_entries lexemes connected in
  let schemata = merge_entries phrases schemata in
  let entries = Entries.flatten_map entries (fun _ _ entry ->
    try (match entry with
          | LexEntry(id,lemma,pos,NoRestr,schema) -> [LexEntry(id,lemma,pos,NoRestr,reduce_schema tests schema)]
          | ComprepNPEntry(s,NoRestr,schema) -> [ComprepNPEntry(s,NoRestr,reduce_schema tests schema)]
          |  _ -> [entry])
    with ImpossibleSchema -> []) in
  let schemata = Entries.map schemata (fun _ _ (opinion,neg,pred,aspect,schema) ->
      opinion,neg,pred,aspect,reduce_schema2 tests schema) in
  let connected = merge_entries_conn phrases meanings connected in
  entries,schemata,connected

let select_all_entries phrases entries schemata connected meanings =
  let schemata = merge_entries phrases schemata in
  let connected = merge_entries_conn phrases meanings connected in
  entries,schemata,connected

let select_entries lexemes =
  select_entries_full ENIAMwalParser.phrases ENIAMwalParser.entries ENIAMwalParser.schemata
    ENIAMwalParser.connected ENIAMwalParser.meanings comprep_reqs comprep_reqs2 lexarg_reqs lexemes

(* let entries,schemata,connected =
  (* let lexemes = StringSet.of_list ["Ala"; "ma"; "kot"] in *)
  let lexemes = StringSet.of_list ["dorastać"; "dorobić"; "po"; "bok"; "na"] in
  select_entries ENIAMwalParser.phrases ENIAMwalParser.entries ENIAMwalParser.schemata
    ENIAMwalParser.connected ENIAMwalParser.meanings comprep_reqs comprep_reqs2 lexarg_reqs lexemes *)

(* TODO
   - uwzględnienie cech morfoskładniowych - np usunięcie schematów wymagających negacji, gdy nie ma "nie"
   - dodanie adjunctów - pamiętać o padvp
*)
(* TODO
   - leksykalizacje bez schema
*)

(* let _ =
  StringMap.iter comprep_reqs (fun s set ->
      Printf.printf "%s: %s\n" s (String.concat " " (StringSet.to_list set))) *)

(* let _ =
  StringMap.iter comprep_reqs2 (fun s set ->
      Printf.printf "%s: %s\n" s (String.concat " " (StringSet.to_list set))) *)

(* let _ =
  IntMap.iter lexarg_reqs (fun s set ->
      Printf.printf "%d: %s\n" s (String.concat " " (StringSet.to_list set))) *)

(* let _ =
  Entries.iter entries (fun pos lemma entry ->
      Printf.printf "%s\t%s\t%s\n" pos lemma (ENIAMwalStringOf.lex_entry entry));
  Entries.iter schemata (fun pos lemma (_,_,_,_,schema) ->
      Printf.printf "%s\t%s\t%s\n" pos lemma (ENIAMwalStringOf.schema schema));
  Xlist.iter (Entries.find ENIAMwalParser.schemata "verb" "dorobić") (fun (_,_,_,_,schema) ->
      let schema = merge_schema ENIAMwalParser.phrases schema in
      Printf.printf "%s\n" (ENIAMwalStringOf.schema schema));
  Xlist.iter (Entries.find ENIAMwalParser.schemata "verb" "dorastać") (fun (_,_,_,_,schema) ->
      let schema = merge_schema ENIAMwalParser.phrases schema in
      Printf.printf "%s\n" (ENIAMwalStringOf.schema schema));
  () *)