(* preProcessing.ml *)

(*
 *  ENIAM: Categorial Syntactic-Semantic Parser for Polish
 *  Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
 *  Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *)

open Xstd
open PreTypes

(* Notes:
how should num:comp be processed?
is an uncountable noun used in the plural still uncountable, e.g. "Wody szumią."?
a dictionary of multi-word expressions and of proper names has to be built
the dictionary of uncountable nouns has to be verified
processing of compound numerals, e.g. "dwadzieścia jeden" (twenty-one), "jedna druga" (one half)
*)


(**********************************************************************************)

(* Dictionary of proper names, built once when the module is initialised.
   The files named by Paths.proper_names_filename and
   Paths.proper_names_filename2 are both read.  Empty lines and lines starting
   with '#' are skipped; every other line must have the form
   "lemma<TAB>type1|type2|...".  The result maps a lemma to the list of its
   types; when a lemma occurs on several lines the type lists are concatenated.
   Raises Failure "proper_names: <line>" for a line that does not consist of
   exactly two tab-separated fields. *)
let proper_names =
  let tab = Str.regexp "\t" in
  let bar = Str.regexp "|" in
  let read_lines filename =
    Str.split_delim (Str.regexp "\n") (File.load_file filename) in
  let l = read_lines Paths.proper_names_filename in
  let l2 = read_lines Paths.proper_names_filename2 in
  let add_line proper line =
    if String.length line = 0 || String.get line 0 = '#' then proper else
    match Str.split_delim tab line with
      [lemma; types] ->
        let types = Str.split bar types in
        StringMap.add_inc proper lemma types (fun types2 -> types @ types2)
    | _ -> failwith ("proper_names: " ^ line) in
  (* entries of the second file are processed before those of the first one *)
  Xlist.fold (l2 @ l) StringMap.empty add_line

(* [remove l s] returns the list [l] without the elements structurally equal
   to [s].  The remaining elements keep their original relative order (the
   previous fold-based implementation reversed them as a side effect). *)
let remove l s =
  List.filter (fun t -> t <> s) l

(* Turns Lemma tokens into Proper tokens.
   - A lemma found in [proper_names] becomes Proper with the types stored in
     the dictionary, and the attribute "notvalidated proper" is removed.
   - A lemma absent from the dictionary but carrying the attribute
     "notvalidated proper" becomes Proper with an empty type list.
   - Every other token is returned unchanged.
   The order of [paths] is preserved and [last] is passed through. *)
let find_proper_names (paths,last) =
  let mark t =
    match t.token with
      Lemma(lemma,pos,interp) ->
        let known =
          try Some (StringMap.find proper_names lemma) with Not_found -> None in
        (match known with
          Some types ->
            {t with token=Proper(lemma,pos,interp,types);
                    attrs=remove t.attrs "notvalidated proper"}
        | None ->
            if Xlist.mem t.attrs "notvalidated proper" then
              {t with token=Proper(lemma,pos,interp,[])}
            else t)
    | _ -> t in
  List.rev (Xlist.rev_map paths mark), last

(**********************************************************************************)

(* Ordering on string lists that ignores the order of elements: both lists
   are sorted before being compared with the polymorphic [compare]. *)
module OrderedStringList = struct

  type t = string list

  let compare x y =
    let canonical l = Xlist.sort l compare in
    compare (canonical x) (canonical y)

end

(* Ordering on lists of string lists that ignores the order of the outer
   list: the outer lists are sorted (inner lists are compared as they are)
   before being compared with the polymorphic [compare]. *)
module OrderedStringListList = struct

  type t = string list list

  let compare x y =
    let canonical l = Xlist.sort l compare in
    compare (canonical x) (canonical y)

end

(* Maps and sets whose keys are compared with the orderings defined in
   OrderedStringList and OrderedStringListList. *)
module StringListMap = Xmap.Make(OrderedStringList)
module StringListListMap = Xmap.Make(OrderedStringListList)
module StringListListSet = Xset.Make(OrderedStringListList)

(* Tree built by make_tree from a list of interpretations:
   T maps a string list (the first tag position of an interpretation) to the
   subtree built from the remaining positions; S is a leaf holding a set of
   strings. *)
type tree = T of tree StringListMap.t | S of StringSet.t

(* [single_tags interp] is true iff [interp] is non-empty and its first
   element is a one-element list. *)
let single_tags interp =
  match interp with
  | [] -> false
  | first :: _ ->
      (match first with
       | [_] -> true
       | _ -> false)

(* Builds a [tree] from a list of interpretations, each interpretation being
   a list of tag positions (string lists).
   - When the first interpretation has a single position ([single_tags]),
     the result is a leaf holding every string found anywhere in [interp].
   - Otherwise interpretations are grouped by their first position and a
     subtree is built recursively from the remaining positions of each group.
   An empty interpretation in the second case makes List.tl / List.hd fail. *)
let rec make_tree interp =
  match single_tags interp with
  | true -> S (StringSet.of_list (List.flatten (List.flatten interp)))
  | false ->
      let grouped = Xlist.fold interp StringListMap.empty (fun acc tags ->
        let rest = List.tl tags in
        let first = List.hd tags in
        StringListMap.add_inc acc first [rest] (fun others -> rest :: others)) in
      T (StringListMap.map grouped make_tree)

(* True iff at least one value stored in [map] is a leaf (S). *)
let is_s_tree map =
  let step found _ tree =
    match tree with
    | S _ -> true
    | T _ -> found in
  StringListListMap.fold map false step

(* Folds [f] over the leaves of a tree.  [rev] accumulates, in reverse, the
   keys on the path from the root; for every leaf [f] receives the
   accumulator, the path in root-to-leaf order and the leaf's string set. *)
let rec fold_tree_rec rev s f tree =
  match tree with
  | T children ->
      StringListMap.fold children s (fun acc tag subtree ->
        fold_tree_rec (tag :: rev) acc f subtree)
  | S leaf -> f s (List.rev rev) leaf

(* [fold_tree tree s f] folds [f] over all leaves of [tree] starting from [s]. *)
let fold_tree tree s f = fold_tree_rec [] s f tree

(* Worker of combine_interp.  [map] associates an already processed tail of
   tag positions with the tree of the positions still to be processed.
   - When [map] contains a leaf, every entry must be a leaf: each one yields
     an interpretation made of the sorted leaf values followed by its tail.
     A non-leaf entry raises Failure "combine_interps_rec".
   - Otherwise every leaf of every tree is moved to the front of its tail,
     the paths leading to equal (values :: tail) keys are collected, rebuilt
     into trees with make_tree, and the function recurses. *)
let rec combine_interps_rec map =
  let sorted_values set = Xlist.sort (StringSet.to_list set) compare in
  if is_s_tree map then
    StringListListMap.fold map [] (fun acc tail_tags tree ->
      match tree with
      | S values -> (sorted_values values :: tail_tags) :: acc
      | T _ -> failwith "combine_interps_rec")
  else
    let regrouped =
      StringListListMap.fold map StringListListMap.empty (fun acc tail_tags tree ->
        fold_tree tree acc (fun acc head_tags values ->
          let key = sorted_values values :: tail_tags in
          StringListListMap.add_inc acc key [head_tags] (fun l -> head_tags :: l))) in
    combine_interps_rec (StringListListMap.map regrouped make_tree)

(* Merges a list of interpretations: the tree built from [interp] is
   registered under the empty tail and handed to combine_interps_rec. *)
let combine_interp interp =
  let root = make_tree interp in
  combine_interps_rec (StringListListMap.add StringListListMap.empty [] root)

(* Parts of speech whose interpretations combine_interps merges with
   combine_interp; for any other part of speech the interpretations are only
   deduplicated. *)
let combine_pos = StringSet.of_list ["subst"; "depr"; "ppron12"; "ppron3"; "siebie"; "adj"; "num"; "ger"; "praet"; "fin"; "impt"; "imps"; "pcon"; "ppas"; "pact";
  "inf"; "bedzie"; "aglt"; "winien"; "pant"; "prep"]

(* Normalises the interpretations of every Lemma token:
   - for "ppron12", interpretations with exactly 4 positions get a fifth
     position ["_"] appended;
   - for parts of speech listed in [combine_pos] the interpretations are
     merged with combine_interp, otherwise they are only deduplicated through
     a StringListListSet.
   Other tokens are returned unchanged; the order of [paths] is preserved. *)
let combine_interps (paths,last) =
  let pad_ppron12 tags =
    if Xlist.size tags = 4 then tags @ [["_"]] else tags in
  let normalise t =
    match t.token with
      Lemma(lemma,pos,interp) ->
        let padded =
          if pos = "ppron12" then Xlist.map interp pad_ppron12 else interp in
        let combined =
          if StringSet.mem combine_pos pos then combine_interp padded
          else StringListListSet.to_list (StringListListSet.of_list padded) in
        {t with token=Lemma(lemma,pos,combined)}
    | _ -> t in
  List.rev (Xlist.rev_map paths normalise), last

(**********************************************************************************)

(* Translates numeric tokens (Dig, RomanDig and Compound built from them)
   into Lemma or Proper tokens.
   - Categories "dig", "month", "minute", "2dig", "3dig", "pref3dig", the
     roman "month" and the "minute-interval" compound are left untouched.
   - Recognised categories and compounds become Lemma (numbers, intervals of
     numbers) or Proper (dates, hours, results, intervals, url, email) with
     the single empty interpretation [[]].
   - Any other Dig, RomanDig or Compound raises Failure; the message now
     reports both the value and the category of the offending token (the
     original code bound the value under the name [cat] and never printed
     the category).
   Tokens of other kinds are returned unchanged. *)
let translate_digs (paths,last) =
  Xlist.map paths (fun t ->
    match t.token with
      Dig(_,"dig") -> t
    | Dig(lemma,"intnum") -> {t with token=Lemma(lemma,"intnum",[[]])}
    | Dig(lemma,"realnum") -> {t with token=Lemma(lemma,"realnum",[[]])}
    | Dig(lemma,"year") -> {t with token=Proper(lemma,"year",[[]],["rok"])}
    | Dig(_,"month") -> t (*{t with token=Proper(lemma,"month",[[]],["miesiąc"])}*)
    | Dig(lemma,"hour") -> {t with token=Proper(lemma,"hour",[[]],["godzina"])}
    | Dig(lemma,"day") -> {t with token=Proper(lemma,"day",[[]],["dzień"])}
    | Dig(_,"minute") -> t (*{t with token=Proper(lemma,"minute",[[]],["minuta"])}*)
    | Dig(_,"2dig") -> t
    | Dig(_,"3dig") -> t
    | Dig(_,"pref3dig") -> t
    | RomanDig(lemma,"roman") -> {t with token=Lemma(lemma,"roman",[[]])}
    | RomanDig(_,"month") -> t (*{t with token=Proper(lemma,"symbol",[[]],["month"]); attrs="roman" :: t.attrs}*)
    | Dig(lemma,"ordnum") -> {t with token=Lemma(lemma,"ordnum",[[]])}
    | Compound("date",[Dig(d,"day");Dig(m,"month");Dig(y,"year")]) -> {t with token=Proper(d ^ "." ^ m ^ "." ^ y,"date",[[]],["data"])}
    | Compound("date",[Dig(d,"day");RomanDig(m,"month");Dig(y,"year")]) -> {t with token=Proper(d ^ "." ^ m ^ "." ^ y,"date",[[]],["data"])}
    | Compound("date",[Dig(d,"day");Dig(m,"month");Dig(y,"2dig")]) -> {t with token=Proper(d ^ "." ^ m ^ "." ^ y,"date",[[]],["data"])}
    | Compound("date",[Dig(d,"day");RomanDig(m,"month");Dig(y,"2dig")]) -> {t with token=Proper(d ^ "." ^ m ^ "." ^ y,"date",[[]],["data"])}
    | Compound("day-month",[Dig(d,"day");Dig(m,"month")]) -> {t with token=Proper(d ^ "." ^ m,"day-month",[[]],["data"])}
    | Compound("hour-minute",[Dig(h,"hour");Dig(m,"minute")]) -> {t with token=Proper(h ^ ":" ^ m,"hour-minute",[[]],["godzina"])}
    | Compound("match-result",[Dig(x,"intnum");Dig(y,"intnum")]) -> {t with token=Proper(x ^ ":" ^ y,"match-result",[[]],["rezultat"])}
    | Compound("intnum-interval",[Dig(x,"intnum");Dig(y,"intnum")]) -> {t with token=Lemma(x ^ "-" ^ y,"intnum-interval",[[]])}
    | Compound("roman-interval",[RomanDig(x,"roman");RomanDig(y,"roman")]) -> {t with token=Lemma(x ^ "-" ^ y,"roman-interval",[[]])}
    | Compound("realnum-interval",[Dig(x,"realnum");Dig(y,"realnum")]) -> {t with token=Lemma(x ^ "-" ^ y,"realnum-interval",[[]])}
    | Compound("realnum-interval",[Dig(x,"intnum");Dig(y,"realnum")]) -> {t with token=Lemma(x ^ "-" ^ y,"realnum-interval",[[]])}
    | Compound("realnum-interval",[Dig(x,"realnum");Dig(y,"intnum")]) -> {t with token=Lemma(x ^ "-" ^ y,"realnum-interval",[[]])}
    | Compound("date-interval",[Compound("date",[Dig(d1,"day");Dig(m1,"month");Dig(y1,"year")]);
        Compound("date",[Dig(d2,"day");Dig(m2,"month");Dig(y2,"year")])]) -> {t with token=Proper(d1 ^ "." ^ m1 ^ "." ^ y1 ^ "-" ^ d2 ^ "." ^ m2 ^ "." ^ y2,"date-interval",[[]],["interwał"])}
    | Compound("day-month-interval",[Compound("day-month",[Dig(d1,"day");Dig(m1,"month")]);
        Compound("day-month",[Dig(d2,"day");Dig(m2,"month")])]) -> {t with token=Proper(d1 ^ "." ^ m1 ^ "-" ^ d2 ^ "." ^ m2,"day-month-interval",[[]],["interwał"])}
    | Compound("day-interval",[Dig(d1,"day");Dig(d2,"day")]) -> {t with token=Proper(d1 ^ "-" ^ d2,"day-interval",[[]],["interwał"])}
    | Compound("month-interval",[Dig(m1,"month");Dig(m2,"month")]) -> {t with token=Proper(m1 ^ "-" ^ m2,"month-interval",[[]],["interwał"])}
    | Compound("month-interval",[RomanDig(m1,"month");RomanDig(m2,"month")]) -> {t with token=Proper(m1 ^ "-" ^ m2,"month-interval",[[]],["interwał"]); attrs="roman" :: t.attrs}
    | Compound("year-interval",[Dig(y1,"year");Dig(y2,"year")]) -> {t with token=Proper(y1 ^ "-" ^ y2,"year-interval",[[]],["interwał"])}
    | Compound("year-interval",[Dig(y1,"year");Dig(y2,"2dig")]) -> {t with token=Proper(y1 ^ "-" ^ y2,"year-interval",[[]],["interwał"])}
    | Compound("hour-minute-interval",[Compound("hour-minute",[Dig(h1,"hour");Dig(m1,"minute")]);Compound("hour-minute",[Dig(h2,"hour");Dig(m2,"minute")])]) ->
       {t with token=Proper(h1 ^ ":" ^ m1 ^ "-" ^ h2 ^ ":" ^ m2,"hour-minute-interval",[[]],["interwał"])}
    | Compound("hour-interval",[Dig(h1,"hour");Dig(h2,"hour")]) -> {t with token=Proper(h1 ^ "-" ^ h2,"hour-interval",[[]],["interwał"])}
    | Compound("minute-interval",[Dig(_,"minute");Dig(_,"minute")]) -> t (*{t with token=Proper(m1 ^ "-" ^ m2,"minute-interval",[[]],["interwał"])}*)
    | Dig(lemma,"url") -> {t with token=Proper(lemma,"url",[[]],["url"])}
    | Dig(lemma,"email") -> {t with token=Proper(lemma,"email",[[]],["email"])}
    (* unknown categories: the original message prefix is kept and the
       category is appended so the failure can actually be diagnosed *)
    | Dig(lemma,cat) -> failwith ("translate_digs: Dig " ^ lemma ^ " (" ^ cat ^ ")")
    | RomanDig(lemma,cat) -> failwith ("translate_digs: Romandig " ^ lemma ^ " (" ^ cat ^ ")")
    | Compound _ as token -> failwith ("translate_digs: " ^ PreTokenizer.string_of_token token)
    | _ -> t), last

(* Looks up valence frames for every Lemma and Proper token.
   1. The (lemma, part of speech) pairs present in [paths] are collected;
      for Proper tokens "subst" and "depr" are looked up as "psubst" and
      "pdepr" (proper names get the default valence of nouns).
   2. WalFrames.find_frames is called once for the whole collection.
   3. Every Lemma / Proper token receives its frames, each paired with the
      number 0; a missing entry yields an empty valence.  Proper tokens are
      turned back into Lemma tokens, keeping their original part of speech.
   Other tokens are returned unchanged; the order of [paths] is preserved. *)
let assign_valence (paths,last) =
  let proper_pos pos =
    match pos with
      "subst" -> "psubst"
    | "depr" -> "pdepr"
    | _ -> pos in
  let register lexemes lemma pos =
    StringMap.add_inc lexemes lemma (StringSet.singleton pos) (fun set -> StringSet.add set pos) in
  let lexemes = Xlist.fold paths StringMap.empty (fun lexemes t ->
    match t.token with
      Lemma(lemma,pos,_) -> register lexemes lemma pos
    | Proper(lemma,pos,_,_) -> register lexemes lemma (proper_pos pos)
    | _ -> lexemes) in
  let valence = WalFrames.find_frames lexemes in
  let frames_of lemma pos =
    try
      Xlist.rev_map (StringMap.find (StringMap.find valence lemma) pos) (fun frame -> 0,frame)
    with Not_found -> [] in
  let assign t =
    match t.token with
      Lemma(lemma,pos,_) -> {t with valence=frames_of lemma pos}
    | Proper(lemma,pos,interp,_) ->
        {t with valence=frames_of lemma (proper_pos pos);
                token=Lemma(lemma,pos,interp)}
    | _ -> t in
  List.rev (Xlist.rev_map paths assign), last

(**********************************************************************************)

(* Renumbers token boundaries with consecutive integers starting at 0.
   All [beg] and [next] values occurring in [paths] are collected, sorted and
   numbered; every token gets [lnode] / [rnode] set to the numbers of its
   [beg] / [next].  The second component of the result is the highest number
   assigned (-1 when [paths] is empty).  The incoming [last] is ignored. *)
let prepare_indexes (paths,_) =
  let positions = Xlist.fold paths IntSet.empty (fun acc t ->
    let acc = IntSet.add acc t.beg in
    IntSet.add acc t.next) in
  let sorted = Xlist.sort (IntSet.to_list positions) compare in
  let index,count = Xlist.fold sorted (IntMap.empty,0) (fun (index,n) x ->
    IntMap.add index x n, n+1) in
  let renumber t =
    {t with lnode=IntMap.find index t.beg; rnode=IntMap.find index t.next} in
  List.rev (Xlist.rev_map paths renumber), count - 1

(* Keeps only Interp, Other2 and Lemma tokens, in their original order.
   A Proper token raises Failure "select_tokens"; tokens of any other kind
   (e.g. Dig, RomanDig, Compound) are dropped. *)
let select_tokens (paths,last) =
  let keep acc t =
    match t.token with
      Interp _ | Other2 _ | Lemma _ -> t :: acc
    | Proper _ -> failwith "select_tokens"
    | _ -> acc in
  List.rev (Xlist.fold paths [] keep), last

(* Adds to the set [prefs] every selectional preference ([sel_prefs]) found
   in the positions of [schema]. *)
let get_prefs_schema prefs schema =
  Xlist.fold schema prefs (fun acc position ->
    Xlist.fold position.WalTypes.sel_prefs acc (fun acc pref ->
      StringSet.add acc pref))

(* Restricts the selectional preferences of every position of [schema] to
   those contained in the set [senses].  Positions whose [morfs] contain
   Phrase Pro or Phrase ProNG are left untouched.  The kept preferences come
   out in reverse order, as in the original fold-based implementation. *)
let map_prefs_schema senses schema =
  let is_pro position =
    Xlist.mem position.WalTypes.morfs (WalTypes.Phrase WalTypes.Pro)
    || Xlist.mem position.WalTypes.morfs (WalTypes.Phrase WalTypes.ProNG) in
  Xlist.map schema (fun position ->
    if is_pro position then position else
    let kept =
      List.filter (fun s -> StringSet.mem senses s) position.WalTypes.sel_prefs in
    {position with WalTypes.sel_prefs = List.rev kept})

(* Keeps only the senses that are both required by some valence frame and
   offered by some token.
   - [prefs]: "ALL" plus every selectional preference of every frame in the
     valence of every token.
   - [hipero]: "ALL" plus every string from the second component of every
     entry of [t.senses].
   - The retained set is their intersection, plus "0" when "0" occurs in
     [hipero].
   Frame schemata are restricted with map_prefs_schema (skipped altogether
   when "0" occurs in [hipero]) and the string lists of [t.senses] are
   filtered to the retained set, keeping their order. *)
let disambiguate_senses (paths,last) =
  let schema_of = function
      WalTypes.Frame(_,schema) -> schema
    | WalTypes.LexFrame(_,_,_,schema) -> schema
    | WalTypes.ComprepFrame(_,_,_,schema) -> schema in
  let prefs = Xlist.fold paths (StringSet.singleton "ALL") (fun prefs t ->
    Xlist.fold t.valence prefs (fun prefs (_,frame) ->
      get_prefs_schema prefs (schema_of frame))) in
  let hipero = Xlist.fold paths (StringSet.singleton "ALL") (fun hipero t ->
    Xlist.fold t.senses hipero (fun hipero (_,l,_) ->
      Xlist.fold l hipero (fun hipero s -> StringSet.add hipero s))) in
  let is_zero = StringSet.mem hipero "0" in
  let common = StringSet.intersection prefs hipero in
  let retained = if is_zero then StringSet.add common "0" else common in
  let restrict_frame (n,frame) =
    match frame with
      WalTypes.Frame(a,schema) ->
        n,WalTypes.Frame(a,map_prefs_schema retained schema)
    | WalTypes.LexFrame(s,p,r,schema) ->
        n,WalTypes.LexFrame(s,p,r,map_prefs_schema retained schema)
    | WalTypes.ComprepFrame(s,p,r,schema) ->
        n,WalTypes.ComprepFrame(s,p,r,map_prefs_schema retained schema) in
  let restrict_sense (s,l,w) =
    s, List.filter (fun x -> StringSet.mem retained x) l, w in
  Xlist.map paths (fun t ->
    {t with valence = if is_zero then t.valence else Xlist.map t.valence restrict_frame;
            senses = Xlist.map t.senses restrict_sense}), last

(* Loads lemma frequencies from [filename].  Empty lines and lines starting
   with '#' are skipped; every other line must have the form
   "count<TAB>lemma<TAB>cat".  The result maps "lemma<TAB>cat" to
   log10 (count + 1).
   Raises Failure "load_lemma_frequencies: <line>" for a line that does not
   have exactly three tab-separated fields; float_of_string fails when
   [count] is not a number. *)
let load_lemma_frequencies filename =
  let tab = Str.regexp "\t" in
  let lines = Str.split_delim (Str.regexp "\n") (File.load_file filename) in
  let add_line map line =
    if String.length line = 0 || String.get line 0 = '#' then map else
    match Str.split_delim tab line with
      [count; lemma; cat] ->
        let weight = log10 (float_of_string count +. 1.) in
        StringMap.add map (lemma ^ "\t" ^ cat) weight
    | _ -> failwith ("load_lemma_frequencies: " ^ line) in
  Xlist.fold lines StringMap.empty add_line

(* Frequency table read from Paths.lemma_frequencies_filename when the module
   is initialised; keys are "lemma<TAB>cat", values are log10 (count + 1). *)
let lemma_frequencies = load_lemma_frequencies Paths.lemma_frequencies_filename

(* Adjusts the weight of every token:
   - each recognised attribute subtracts a fixed penalty
     ("token not found" 25, "lemma not validated" 20,
      "notvalidated proper" 1, "lemmatized as lowercase" 0.1);
   - for Lemma and Proper tokens the value stored in [lemma_frequencies]
     under "lemma<TAB>cat" is added when such an entry exists.
   The order of [paths] is preserved. *)
let modify_weights (paths,last) =
  let penalize w attr =
    match attr with
      "token not found" -> w -. 25.
    | "lemma not validated" -> w -. 20.
    | "notvalidated proper" -> w -. 1.
    | "lemmatized as lowercase" -> w -. 0.1
    | _ -> w in
  let add_frequency w lemma cat =
    try w +. StringMap.find lemma_frequencies (lemma ^ "\t" ^ cat)
    with Not_found -> w in
  let reweight t =
    let w = Xlist.fold t.attrs t.weight penalize in
    let w = match t.token with
        Lemma(lemma,cat,_) | Proper(lemma,cat,_,_) -> add_frequency w lemma cat
      | _ -> w in
    {t with weight = w} in
  List.rev (Xlist.fold paths [] (fun acc t -> reweight t :: acc)),last

(*let single_sense (paths,last) =
  List.rev (Xlist.rev_map paths (fun t ->
    let sense =
      if t.senses = [] then [] else
      [Xlist.fold t.senses ("",[],-.max_float) (fun (max_meaning,max_hipero,max_weight) (meaning,hipero,weight) ->
        if max_weight >= weight then max_meaning,max_hipero,max_weight else meaning,hipero,weight)] in
    {t with senses=sense})), last*)

open WalTypes

(*let single_schema schemata =
  let map = Xlist.fold schemata StringMap.empty (fun map schema ->
    let t = WalStringOf.schema (List.sort compare (Xlist.fold schema [] (fun l s ->
      if s.gf <> ARG && s.gf <> ADJUNCT then {s with role=""; role_attr=""; sel_prefs=[]} :: l else
      if s.cr <> [] || s.ce <> [] then {s with role=""; role_attr=""; sel_prefs=[]} :: l else l))) in
    StringMap.add_inc map t [schema] (fun l -> schema :: l)) in
  StringMap.fold map [] (fun l _ schemata ->
    let map = Xlist.fold schemata StringMap.empty (fun map schema ->
      Xlist.fold schema map (fun map s ->
        let t = WalStringOf.schema [{s with role=""; role_attr=""; sel_prefs=[]}] in
        StringMap.add_inc map t [s] (fun l -> s :: l))) in
    let schema = StringMap.fold map [] (fun schema _ l ->
      let s = List.hd l in
      {s with sel_prefs=Xlist.fold s.sel_prefs [] (fun l t -> if t = "0" || t = "T" then t :: l else l)} :: schema) in
    schema :: l)*)

(* Returns the frame attributes with their first component replaced by the
   empty list; all other components are copied.
   Raises Failure "remove_meaning" for a constructor not listed below. *)
let remove_meaning attrs =
  match attrs with
  | DefaultAtrs(_,r,o,neg,p,a) -> DefaultAtrs([],r,o,neg,p,a)
  | EmptyAtrs _ -> EmptyAtrs []
  | NounAtrs(_,nsyn,s(*,typ*)) -> NounAtrs([],nsyn,s(*,typ*))
  | AdjAtrs(_,c,adjsyn(*,adjsem,typ*)) -> AdjAtrs([],c,adjsyn(*,adjsem,typ*))
  | PersAtrs(_,le,neg,mo,t,au,a) -> PersAtrs([],le,neg,mo,t,au,a)
  | GerAtrs(_,le,neg,a) -> GerAtrs([],le,neg,a)
  | NonPersAtrs(_,le,role,role_attr,neg,a) -> NonPersAtrs([],le,role,role_attr,neg,a)
  | _ -> failwith "remove_meaning"


(*let single_frame (paths,last) =
  List.rev (Xlist.rev_map paths (fun t ->
    let lex_frames,frames = Xlist.fold t.valence ([],StringMap.empty) (fun (lex_frames,frames) -> function
        Frame(attrs,schema) ->
          let attrs = remove_meaning attrs in
          lex_frames, StringMap.add_inc frames (WalStringOf.frame_atrs attrs) (attrs,[schema]) (fun (_,l) -> attrs, schema :: l)
      | frame -> frame :: lex_frames, frames) in
    let frames = StringMap.fold frames lex_frames (fun frames _ (attrs,schemata) ->
      Xlist.fold (single_schema schemata) frames (fun frames frame -> Frame(attrs,frame) :: frames)) in
    {t with valence=frames})), last    *)

(* Fold step used for verb positions: realisations listed in the first case
   are dropped, Phrase Pro is replaced by Phrase Null, anything else is
   consed onto the accumulator [l] unchanged.
   FIXME: add cleaning of E Pro *)
let simplify_position_verb l morf =
  match morf with
  | Phrase(NP(Case "dat"))
  | Phrase(NP(Case "inst"))
  | Phrase(PrepNP _)
  | Phrase(PrepAdjP _)
  | Phrase(NumP (Case "dat"))
  | Phrase(NumP (Case "inst"))
  | Phrase(PrepNumP _)
  | Phrase(ComprepNP _)
  | Phrase(ComparNP _)
  | Phrase(ComparPP _)
  | Phrase(IP)
  | Phrase(CP _)
  | Phrase(NCP(Case "dat",_,_))
  | Phrase(NCP(Case "inst",_,_))
  | Phrase(PrepNCP _)
(*   | Phrase(PadvP) *)
  | Phrase(AdvP)
  | Phrase(PrepP)
  | Phrase(Or)
  | Phrase(Qub)
  | Phrase(Adja)
  | Phrase(Inclusion) -> l
  | Phrase Pro -> Phrase Null :: l
  | other -> other :: l

(* Fold step used for noun positions: realisations listed in the first case
   are dropped, Phrase Pro is replaced by Phrase Null, anything else is
   consed onto the accumulator [l] unchanged. *)
let simplify_position_noun l morf =
  match morf with
  | Phrase(NP(Case "gen"))
  | Phrase(NP(Case "nom"))
  | Phrase(NP(CaseAgr))
  | Phrase(PrepNP _)
  | Phrase(AdjP AllAgr)
  | Phrase(NumP (Case "gen"))
  | Phrase(NumP (Case "nom"))
  | Phrase(NumP (CaseAgr))
  | Phrase(PrepNumP _)
  | Phrase(ComprepNP _)
  | Phrase(ComparNP _)
  | Phrase(ComparPP _)
  | Phrase(IP)
  | Phrase(NCP(Case "gen",_,_))
  | Phrase(PrepNCP _)
  | Phrase(PrepP)
  | Phrase(Qub)
  | Phrase(Adja)
  | Phrase(Inclusion) -> l
  | Phrase Pro -> Phrase Null :: l
  | other -> other :: l

(* Fold step used for adjective positions: Phrase AdvP is dropped, any other
   realisation is consed onto the accumulator [l]. *)
let simplify_position_adj l morf =
  match morf with
  | Phrase(AdvP) -> l
  | other -> other :: l

(* Fold step used for adverb positions: Phrase AdvP is dropped, any other
   realisation is consed onto the accumulator [l]. *)
let simplify_position_adv l morf =
  match morf with
  | Phrase(AdvP) -> l
  | other -> other :: l


(* Simplifies the realisations ([morfs]) of the schema position [s] with the
   rule set selected by [pos] ("verb", "noun", "adj", "adv"; any other value
   leaves them untouched) and conses the resulting position onto [l].
   A position left with no realisation, or with Phrase Null only, is dropped. *)
let simplify_position pos l s =
  let simplify_with step = List.rev (Xlist.fold s.morfs [] step) in
  let morfs =
    match pos with
      "verb" -> simplify_with simplify_position_verb
    | "noun" -> simplify_with simplify_position_noun
    | "adj" -> simplify_with simplify_position_adj
    | "adv" -> simplify_with simplify_position_adv
    | _ -> s.morfs in
  match morfs with
    [] | [Phrase Null] -> l
  | _ -> {s with morfs=morfs} :: l

(* Simplifies a list of (schema, frame) pairs and groups the frames that end
   up with the same simplified schema.
   Every position has role, role_attr, sel_prefs, cr and ce cleared and its
   morfs sorted; positions whose gf is ARG or ADJUNCT are additionally passed
   through simplify_position (which may drop them).  The simplified schema is
   sorted and its WalStringOf.schema rendering is the grouping key.
   Returns the list of (simplified schema, frames sharing it). *)
let simplify_schemata pos schemata =
  let strip s =
    {s with role=""; role_attr=""; sel_prefs=[]; cr=[]; ce=[]; morfs=List.sort compare s.morfs} in
  let simplify_schema schema =
    List.sort compare (Xlist.fold schema [] (fun l s ->
      let s = strip s in
      if s.gf = ARG || s.gf = ADJUNCT then simplify_position pos l s
      else s :: l)) in
  let grouped = Xlist.fold schemata StringMap.empty (fun grouped (schema,frame) ->
    let schema = simplify_schema schema in
    StringMap.add_inc grouped (WalStringOf.schema schema) (schema,[frame])
      (fun (_,frames) -> schema, frame :: frames)) in
  StringMap.fold grouped [] (fun l _ entry -> entry :: l)

(* FIXME: problem with ComprepNP and PrepNCP *)
(* FIXME: problem when the same token occurs in several paths *)
(* One verb adjunct schema field per (preposition lemma, case) pair. *)
let generate_verb_prep_adjuncts preps =
  let field (lemma,case) = WalFrames.verb_prep_adjunct_schema_field lemma case in
  Xlist.map preps field

(* One verb adjunct schema field per complex preposition lemma. *)
let generate_verb_comprep_adjuncts compreps =
  let field lemma = WalFrames.verb_comprep_adjunct_schema_field lemma in
  Xlist.map compreps field

(* One verb adjunct schema field per comparative lemma. *)
let generate_verb_compar_adjuncts compars =
  let field lemma = WalFrames.verb_compar_adjunct_schema_field lemma in
  Xlist.map compars field

(* Thin wrapper around WalFrames.noun_prep_adjunct_schema_field.  It is
   partially applied here: assign_simplified_valence calls it with two
   arguments (preps and compreps). *)
let generate_noun_prep_adjuncts preps =
  WalFrames.noun_prep_adjunct_schema_field preps

(* Thin wrapper around WalFrames.noun_compar_adjunct_schema_field. *)
let generate_noun_compar_adjuncts compars =
  WalFrames.noun_compar_adjunct_schema_field compars

(* Adjunct schema field for comparatives attached to adjectives (and, in
   assign_simplified_valence, adverbs).
   NOTE(review): this calls the *noun* comparative schema field - confirm
   that this is intended and not a copy-paste leftover. *)
let generate_adj_compar_adjuncts compars =
  WalFrames.noun_compar_adjunct_schema_field compars

(* Lemmas that generate_prep_adjunct_tokens excludes from the prepositions
   and that generate_compar_adjunct_tokens collects instead. *)
let compars = StringSet.of_list ["jak";"jako";"niż";"niczym";"niby";"co"]

(* Collects the distinct (lemma, case) pairs of the "prep" Lemma tokens in
   [paths].
   - The lemma "po" additionally contributes the pair ("po","postp").
   - Lemmas belonging to [compars] contribute no case pairs.
   - Cases are read from the first position of interpretations that have one
     or two positions; other interpretations are ignored. *)
let generate_prep_adjunct_tokens paths =
  let add_cases lemma map cases =
    Xlist.fold cases map (fun map case ->
      StringMap.add map (lemma ^ ":" ^ case) (lemma,case)) in
  let collected = Xlist.fold paths StringMap.empty (fun map t ->
    match t.token with
      Lemma(lemma,"prep",interp) ->
        let map =
          if lemma = "po" then StringMap.add map "po:postp" ("po","postp")
          else map in
        if StringSet.mem compars lemma then map else
        Xlist.fold interp map (fun map tags ->
          match tags with
            [cases] | [cases;_] -> add_cases lemma map cases
          | _ -> map)
    | _ -> map) in
  StringMap.fold collected [] (fun l _ pair -> pair :: l)

(* Returns the keys of WalFrames.comprep_reqs all of whose required lemmas
   occur among the Lemma tokens of [paths]. *)
let generate_comprep_adjunct_tokens paths =
  let lemmas = Xlist.fold paths StringSet.empty (fun acc t ->
    match t.token with
      Lemma(lemma,_,_) -> StringSet.add acc lemma
    | _ -> acc) in
  let all_present reqs =
    List.for_all (fun s -> StringSet.mem lemmas s) reqs in
  StringMap.fold WalFrames.comprep_reqs [] (fun compreps comprep reqs ->
    if all_present reqs then comprep :: compreps else compreps)

(* Returns the distinct lemmas of the "prep" Lemma tokens in [paths] that
   belong to [compars]. *)
let generate_compar_adjunct_tokens paths =
  let found = Xlist.fold paths StringSet.empty (fun acc t ->
    match t.token with
      Lemma(lemma,"prep",_) ->
        if StringSet.mem compars lemma then StringSet.add acc lemma else acc
    | _ -> acc) in
  StringSet.to_list found

(* True iff the frame attributes are NounAtrs whose third component is
   Common "measure". *)
let is_measure attrs =
  match attrs with
  | NounAtrs(_,_,Common "measure") -> true
  | _ -> false

(* Computes, for every token, a simplified valence ([simple_valence]) and
   renumbers the full valence ([valence]) so that every full frame carries
   the number of the simplified frame it was collapsed into.
   Returns the updated paths (order preserved) together with [last]. *)
let assign_simplified_valence (paths,last) =
  (* prepositions, complex prepositions and comparatives present in the input;
     the local [compars] (a list) shadows the top-level set of the same name *)
  let preps = generate_prep_adjunct_tokens paths in
  let compreps = generate_comprep_adjunct_tokens paths in
  let compars = generate_compar_adjunct_tokens paths in
  (* adjunct schema fields derived from them *)
  let verb_prep_adjuncts = generate_verb_prep_adjuncts preps in
  let verb_comprep_adjuncts = generate_verb_comprep_adjuncts compreps in
  let verb_compar_adjuncts = generate_verb_compar_adjuncts compars in
  let noun_prep_adjuncts = generate_noun_prep_adjuncts preps compreps in
  let noun_compar_adjuncts = generate_noun_compar_adjuncts compars in
  let adj_compar_adjuncts = generate_adj_compar_adjuncts compars in
  (* complete adjunct lists appended to simplified schemata, per part of speech *)
  let verb_adjuncts = WalFrames.verb_adjuncts_simp @ verb_prep_adjuncts @ verb_comprep_adjuncts @ verb_compar_adjuncts in
  let noun_adjuncts = WalFrames.noun_adjuncts_simp @ [noun_prep_adjuncts] @ [noun_compar_adjuncts] in
  let noun_measure_adjuncts = WalFrames.noun_measure_adjuncts_simp @ [noun_prep_adjuncts] @ [noun_compar_adjuncts] in
  let adj_adjuncts = WalFrames.adj_adjuncts_simp @ [adj_compar_adjuncts] in
  let adv_adjuncts = WalFrames.adv_adjuncts_simp @ [adj_compar_adjuncts] in
  List.rev (Xlist.rev_map paths (fun t ->
    (* simplified part of speech; "" for tokens that are not Lemma *)
    let pos = match t.token with
        Lemma(_,pos,_) -> WalFrames.simplify_pos pos
      | _ -> "" in
    (* split the valence: Frame entries are grouped by their attributes with
       the first component cleared (remove_meaning); all other frames go to
       [lex_frames]; the numbers previously attached to frames are discarded *)
    let lex_frames,frames = Xlist.fold t.valence ([],StringMap.empty) (fun (lex_frames,frames) -> function
        _,(Frame(attrs,schema) as frame) ->
          let attrs = remove_meaning attrs in
          lex_frames, StringMap.add_inc frames (WalStringOf.frame_atrs attrs) (attrs,[schema,frame]) (fun (_,l) -> attrs, (schema,frame) :: l)
      | _,frame -> frame :: lex_frames, frames) in
    (* frames from [lex_frames] are numbered from 1 and copied unchanged to
       both the simplified and the full valence *)
    let simp_frames,full_frames,n = Xlist.fold lex_frames ([],[],1) (fun (simp_frames,full_frames,n) frame ->
      (n,frame) :: simp_frames, (n,frame) :: full_frames, n+1) in
    (* every distinct simplified schema of every attribute group gets the next
       number; the original frames that produced it receive the same number *)
    let simp_frames,full_frames,_ = StringMap.fold frames (simp_frames,full_frames,n) (fun (simp_frames,full_frames,n) _ (attrs,schemata) ->
      Xlist.fold (simplify_schemata pos schemata) (simp_frames,full_frames,n) (fun (simp_frames,full_frames,n) (schema,frames) ->
        (* append the adjuncts matching the part of speech *)
        let schema = match pos with
            "verb" -> schema @ verb_adjuncts
          | "noun" -> schema @ (if is_measure attrs then noun_measure_adjuncts else noun_adjuncts)
          | "adj" -> schema @ adj_adjuncts
          | "adv" -> schema @ adv_adjuncts
          | _ -> schema in
        (n,Frame(attrs,schema)) :: simp_frames,
        Xlist.fold frames full_frames (fun full_frames frame -> (n,frame) :: full_frames),
        n+1)) in
    {t with simple_valence=simp_frames; valence=full_frames})), last

(* FIXME: add to the valence the selectional preferences of the superordinates of symbols: dzień (day), godzina (hour), rysunek (figure), etc. *)
(* FIXME: check whether the valence of proper names is handled correctly. *)

(* Assigns consecutive ids, starting at 1, to the tokens in the order in
   which they appear in [paths] (id=0 is reserved for pro).
   Returns ((paths, last), next_id) where [next_id] is the first unused id.
   Note that the returned token list is in reverse order with respect to the
   input, because the fold conses onto the accumulator and no List.rev is
   applied. *)
let add_ids (paths,last) =
  let number (acc,id) t = {t with id=id} :: acc, id+1 in
  let numbered,next_id = Xlist.fold paths ([],1) number in
  (numbered,last),next_id

(* Runs the whole preprocessing pipeline on the string [query].
   The steps are applied strictly in the order written below; each one
   consumes the result of the previous one.
   Returns ((paths, last), next_id): the token list with the value computed
   by prepare_indexes, and the first unused token id computed by add_ids. *)
let parse query =
(*   print_endline "a1"; *)
  (* character classification and tokenization *)
  let l = Xunicode.classified_chars_of_utf8_string query in
(*   print_endline "a2"; *)
  let l = PreTokenizer.tokenize l in
(*   print_endline "a3"; *)
  (* pattern-based rewriting of the token list *)
  let l = PrePatterns.normalize_tokens [] l in
(*   print_endline "a4"; *)
  let l = PrePatterns.find_replacement_patterns l in
(*   print_endline "a5"; *)
  let l = PrePatterns.remove_spaces [] l in
  let l = PrePatterns.find_abr_patterns PreAcronyms.abr_patterns l in
  let l = PrePatterns.normalize_tokens [] l in
(*   print_endline "a6"; *)
  (* from here on the data is a (token list, last) pair *)
  let paths = PrePaths.translate_into_paths l in
(*   print_endline "a7"; *)
  let paths = PrePaths.lemmatize paths in
(*   print_endline "a8"; *)
  let paths = PreMWE.process paths in
(*   print_endline "a12"; *)
  let paths = find_proper_names paths in
(*   print_endline "a13"; *)
  let paths = modify_weights paths in
  let paths = translate_digs paths in
  let paths = PreWordnet.assign_senses paths in
(*   print_endline "a14"; *)
  let paths = combine_interps paths in (* FIXME: this should also work for Proper *)
(*   print_endline "a15"; *)
  let paths = assign_valence paths in
(*   print_endline "a16"; *)
  let paths = disambiguate_senses paths in
  let paths = assign_simplified_valence paths in
  let paths = PreSemantics.assign_semantics paths in
(*   print_endline "a16"; *)
  let paths = select_tokens paths in
(*   print_endline "a17"; *)
(*  let paths = if !single_sense_flag then single_sense paths else paths in
  let paths = if !single_frame_flag then single_frame paths else paths in*)
  (* ids are assigned before the boundary renumbering done by prepare_indexes *)
  let paths, next_id = add_ids paths in
  let paths = prepare_indexes paths in
(*   print_endline "a18"; *)
  paths, next_id
(*     print_endline (PrePaths.to_string paths);     *)
(*   let paths =
    if PrePaths.no_possible_path (PrePaths.map paths PreLemmatization.remove_postags) then
      PrePaths.map paths process_ign
    else paths in
  let paths = PrePaths.map paths PreLemmatization.remove_postags in
  let paths = PreCaseShift.manage_lower_upper_case paths in (* FIXME: niepotrzebnie powiększa pierwszy token (przymiotniki partykuły itp.) *)
  let paths = PreLemmatization.combine_interps paths in
(*     print_endline (PrePaths.to_string paths);     *)*)

(* Detects sentences in the token list with PreSentences.find_sentences and
   extracts them from the paragraph text [par] with
   PreSentences.extract_sentences.
   Returns (sentences, next_id) with the id counter updated by
   find_sentences. *)
let split_into_sentences par paths last next_id =
  let paths,last,next_id = PreSentences.find_sentences (paths,last) next_id in
  let sentences = PreSentences.extract_sentences par (paths,last) in
  sentences, next_id
  (* [{pid="";pbeg=(-1); plen=(-1); psentence=StructSentence(paths,last,next_id)}] *)

(* Preprocesses a RawText: the query is echoed on standard output, split on
   "\n" into paragraphs, and every paragraph is parsed and divided into
   sentences.  The result pairs the raw text with its structured form.
   Raises Failure "parse_text: not implemented" for any other kind of text. *)
let parse_text = function
    RawText query ->
      print_endline query;
      let parse_paragraph par =
        let (paths,last : PreTypes.token_record list * int), next_id = parse par in
        let sentences, next_id = split_into_sentences par paths last next_id in
        AltParagraph[Raw,RawParagraph par; Struct,StructParagraph(sentences,next_id)] in
      let paragraphs = Xlist.map (Xstring.split "\n" query) parse_paragraph in
      AltText[Raw,RawText query; Struct,StructText paragraphs]
  | _ -> failwith "parse_text: not implemented"

(* Request loop of one client connection.
   Each iteration reads one marshalled [text] value from [in_chan], runs
   parse_text on it and writes back a marshalled triple
   (result, error message, elapsed wall-clock seconds).
   - On success the error message is "".
   - When parse_text (or writing the answer) raises, the triple
     (RawText "", printed exception, 0.) is sent instead.
   The loop ends when the client sends RawText "" or, newly, when the client
   closes the connection: previously the End_of_file raised by
   Marshal.from_channel escaped the loop as an uncaught exception. *)
let rec main_loop in_chan out_chan =
  let request =
    try Some (Marshal.from_channel in_chan : text)
    with End_of_file -> None in
  match request with
    None -> ()
  | Some query ->
      if query = RawText "" then () else (
        (try
          let utime0 = Unix.gettimeofday () in
          let text = parse_text query in
          let utime2 = Unix.gettimeofday () in
          Marshal.to_channel out_chan (text,"",utime2 -. utime0) []
        with e ->
          (* deliberate catch-all: every failure is reported to the client *)
          Marshal.to_channel out_chan (RawText "",Printexc.to_string e,0.) []);
        flush out_chan;
        main_loop in_chan out_chan)

(* let _ = main_loop stdin stdout *)

(* Address the server listens on: any local interface, port Paths.pre_port. *)
let sockaddr = Unix.ADDR_INET(Unix.inet_addr_any,Paths.pre_port)

(* Program entry point: compact the heap after the dictionaries have been
   loaded, announce readiness and serve every connection with main_loop. *)
let () =
  Gc.compact ();
  print_endline "Ready!";
  Unix.establish_server main_loop sockaddr