ENIAMtokenizer.ml 9.5 KB
(*
 *  ENIAMtokenizer, a tokenizer for Polish
 *  Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
 *  Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
 *
 *  This library is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *)

open Xstd
open ENIAMtokenizerTypes
open ENIAMinflexion

let load_set filename set =
  Xlist.fold (File.load_lines filename) set StringSet.add

let initialize () =
  ENIAMinflexion.initialize ();
  ENIAMacronyms.mte_patterns := ENIAMacronyms.load_mte_patterns ();
  ENIAMurl.top_level_domains := ENIAMurl.load_top_level_domains ();
  known_lemmata := File.catch_no_file (load_set known_lemmata_filename) StringSet.empty;
  known_orths := File.catch_no_file (load_set known_orths_filename) StringSet.empty;
  known_lemmata := File.catch_no_file (load_set user_known_lemmata_filename) !known_lemmata;
  known_orths := File.catch_no_file (load_set user_known_orths_filename) !known_orths;
  known_lemmata :=
    Xlist.fold !theories_paths !known_lemmata (fun set path ->
      File.catch_no_file (load_set (path ^ "/known_lemmata.tab")) set);
  known_orths :=
    Xlist.fold !theories_paths !known_orths (fun set path ->
      File.catch_no_file (load_set (path ^ "/known_orths.tab")) set);
  ()

let string_of =
  ENIAMtokens.string_of_tokens

let get_ontological_category lemma pos tags =
  [StringSet.mem !known_lemmata lemma,false,false,"X",tags] (* FIXME: todo *)

(* let lemmatize_strings l =
  let l1,l2 = Xlist.fold l ([],[]) (fun (l1,l2) (s,obs,lem) ->
    let l = ENIAMinflexion.get_interpretations s in
    Xlist.fold l (l1,l2) (fun (l1,l2) i ->
      if i.ENIAMinflexion.status = ENIAMinflexion.LemmaVal ||
         i.ENIAMinflexion.status = ENIAMinflexion.LemmaAlt then (i,obs,lem) :: l1, l2 else
      if StringSet.mem !known_lemmata i.ENIAMinflexion.lemma then (i,obs,lem) :: l1, l2 else l1, (i,obs,lem) :: l2)) in
  if l1 = [] then l2,false else l1,true *)

let calculate_priority2 star has_agl_suf equal_case =
  if has_agl_suf || (not equal_case) || star = ENIAMmorphologyTypes.Aux2 then max_int else 3

let calculate_priority is_in_lexicon lemma_in_sgjp is_validated_by_sgjp has_no_sgjp_tag star has_poss_ndm_tag has_agl_suf equal_case =
  match is_in_lexicon,lemma_in_sgjp,is_validated_by_sgjp,has_no_sgjp_tag,(star = ENIAMmorphologyTypes.Ndm),has_poss_ndm_tag with
    true,true,true,_,_,_ -> 1
  | true,true,false,true,_,_ -> 1
  | true,true,false,false,_,_ -> calculate_priority2 star has_agl_suf equal_case
  | true,false,_,_,true,true -> 1
  | true,false,_,_,true,false -> calculate_priority2 star has_agl_suf equal_case
  | true,false,_,_,false,_ -> 1
  | false,_,true,_,_,_ -> 2
  | false,_,false,_,_,_ -> calculate_priority2 star has_agl_suf equal_case

let lemmatize_strings2 lemma pos tags best_prior best_l is_validated_by_sgjp star has_agl_suf has_equal_cases =
  Xlist.fold (get_ontological_category lemma pos tags) (best_prior,best_l) (fun (best_prior,best_l) (is_in_lexicon,has_no_sgjp_tag,has_poss_ndm_tag,cat,tags) ->
    let prior = calculate_priority is_in_lexicon
      (StringSet.mem !ENIAMinflexion.lemmata lemma) is_validated_by_sgjp has_no_sgjp_tag
      star has_poss_ndm_tag has_agl_suf has_equal_cases in
    if prior > best_prior then best_prior,best_l else
    if prior < best_prior then prior,[(*obs,lem,*)lemma,pos,tags,cat] else
    best_prior,((*obs,lem,*)lemma,pos,tags,cat) :: best_l)

let lemmatize_strings has_agl_suf l =
  Xlist.fold l (1000,[]) (fun (best_prior,best_l) (s,obs,lem) ->
    Xlist.fold (get_interpretations s) (best_prior,best_l) (fun (best_prior,best_l) i ->
      let is_validated_by_sgjp = i.status = LemmaVal || i.status = LemmaAlt in
      Xlist.fold (ENIAMtagset.parse i.interp) (best_prior,best_l) (fun (best_prior,best_l) (pos,tags) ->  (* zakładam, że tagi generowane przez analizator morfologiczny są poprawne i nie mają _ *)
        lemmatize_strings2 i.lemma pos tags best_prior best_l is_validated_by_sgjp i.star has_agl_suf (obs = lem))))

let lemmatize_token has_agl_suf = function
    SmallLetter(uc,lc) -> lemmatize_strings has_agl_suf [uc,(SL : letter_size),(CL : letter_size);lc,SL,SL]
  | CapLetter(uc,lc) -> lemmatize_strings has_agl_suf [uc,(CL : letter_size),(CL : letter_size);lc,CL,SL]
  | AllSmall(uc,fc,lc) -> lemmatize_strings has_agl_suf [uc,(AS : letter_size),AC;fc,AS,FC;lc,AS,AS]
  | AllCap(uc,fc,lc) -> lemmatize_strings has_agl_suf [uc,(AC : letter_size),AC;fc,AC,FC;lc,AC,AS]
  | FirstCap(uc,fc,lc) -> lemmatize_strings has_agl_suf [uc,(FC : letter_size),AC;fc,FC,FC;lc,FC,AS]
  | SomeCap(uc,orth,lc) -> lemmatize_strings has_agl_suf [uc,(SC : letter_size),AC;orth,SC,SC;lc,SC,AS]
  | Lemma(lemma,pos,interp) ->
      Xlist.fold interp (1000,[]) (fun (best_prior,best_l) tags ->
        lemmatize_strings2 lemma pos tags best_prior best_l true ENIAMmorphologyTypes.Star has_agl_suf true)
  | t -> 1000,[]

let is_known_orth = function
    SmallLetter(uc,lc) -> true
  | CapLetter(uc,lc) -> true
  | AllSmall(uc,fc,lc) -> StringSet.mem !known_orths uc || StringSet.mem !known_orths fc || StringSet.mem !known_orths lc
  | AllCap(uc,fc,lc) -> StringSet.mem !known_orths uc || StringSet.mem !known_orths fc || StringSet.mem !known_orths lc
  | FirstCap(uc,fc,lc) -> StringSet.mem !known_orths uc || StringSet.mem !known_orths fc || StringSet.mem !known_orths lc
  | SomeCap(uc,orth,lc) -> StringSet.mem !known_orths uc || StringSet.mem !known_orths orth || StringSet.mem !known_orths lc
  | Lemma _ -> false
  | t -> true

let rec lemmatize_rec = function
    Token t ->
      (* print_endline ("lemmatize_rec: " ^ t.orth ^ " " ^ ENIAMtokens.string_of_token t.token); *)
      (*let l,b = lemmatize_token (Xlist.mem t.attrs HasAglSuffix) t.token in
      if Xlist.mem t.attrs HasAglSuffix && not b then Variant[],false else
      let t = {t with attrs=Xlist.remove_all t.attrs HasAglSuffix} in
      let attrs = if b then t.attrs else LemmNotVal :: t.attrs in
      let l = List.flatten (Xlist.rev_map l (fun (i,obs,lem) ->
        if (not b) && obs <> lem then [] else
        Xlist.rev_map (ENIAMtagset.parse i.ENIAMinflexion.interp) (fun (pos,tags) ->  (* zakładam, że tagi generowane przez analizator morfologiczny są poprawne i nie mają _ *)
          Token{t with token=Lemma(i.ENIAMinflexion.lemma,pos,[tags]); attrs=attrs}))) in (* FIXME obs,lem,i.ENIAMinflexion.star,i.ENIAMinflexion.freq,i.ENIAMinflexion.tags *)
      if is_known_orth t.token then Variant(Token t :: l), true else Variant l, b*)
      let has_agl_suf = Xlist.mem t.attrs HasAglSuffix in
      let prior,l = lemmatize_token has_agl_suf t.token in
      let t = {t with attrs=Xlist.remove_all t.attrs HasAglSuffix} in
      let attrs = if prior <= 2 then t.attrs else LemmNotVal :: t.attrs in
      let l = Xlist.rev_map l (fun ((*obs,lem,*)lemma,pos,tags,cat) ->
        Token{t with token=Lemma(lemma,pos,[tags]); attrs=attrs}) in
      if is_known_orth t.token && not has_agl_suf then [Token t],Variant l,prior  else [],Variant l,prior
  | Seq l ->
      (try
        let l,prior = Xlist.fold l ([],0) (fun (l,prior) t ->
          let t1,t2,prior2 = lemmatize_rec t in
          match t1, t2 with
            [], Variant[] -> raise Not_found
          | [t], Variant[] -> t :: l, prior
          | [], _ -> t2 :: l, max prior prior2
          | _ -> failwith "lemmatize_rec") in
        if prior = 0 then [Seq(List.rev l)], Variant[], 1000
        else [], Seq(List.rev l), prior
      with Not_found -> [],Variant[],1000)
       (* if t1 <>
          if t = Variant[] then raise Not_found else
          t :: l, b && b2) in
        Seq(List.rev l), b
      with Not_found -> Variant[],false)*)
  | Variant l ->
      let l1,l2,prior = Xlist.fold l ([],[],1000) (fun (l1,l2,best_prior) t ->
        let t1,t2,prior = lemmatize_rec t in
        if prior > best_prior then t1 @ l1,l2,best_prior else
        if prior < best_prior then t1 @ l1,[t2],prior else
        t1 @ l1,t2 :: l2,best_prior) in
      l1,Variant l2,prior
      (* let l1,l2 = Xlist.fold l ([],[]) (fun (l1,l2) t ->
        let t,b = lemmatize_rec t in
        if t = Variant[] then l1,l2 else
        if b then t :: l1, l2 else l1, t :: l2) in
      if l1 = [] then Variant l2,false else Variant l1,true *)

let lemmatize l =
  List.rev (Xlist.rev_map l (fun t ->
    let t1,t2,_ = lemmatize_rec t in
    Variant (t2 :: t1)))
    (* fst (lemmatize_rec t))) *)

let parse query =
  let l = Xunicode.classified_chars_of_utf8_string query in
  let l = ENIAMtokens.tokenize l in
  let l = ENIAMpatterns.normalize_tokens [] l in
  let l = ENIAMpatterns.find_replacement_patterns l in
  let l = ENIAMpatterns.remove_spaces [] l in
  let l = ENIAMpatterns.find_abr_patterns ENIAMacronyms.abr_patterns l in
  (* let l = ENIAMpatterns.find_abr_patterns ENIAMacronyms.query_patterns l in *)
  let l = ENIAMpatterns.normalize_tokens [] l in
  let l = lemmatize l in
  let l = ENIAMpatterns.normalize_tokens [] l in
  (* let l = ENIAMpatterns.process_interpunction [] l in
  let l = ENIAMpatterns.normalize_tokens [] l in *)
  (* let l = ENIAMpatterns.manage_query_boundaries l in *)
  l