ENIAMmstFeatures.ml 1.88 KB
open Xstd
open ENIAMmstModel
open ENIAMtokenizerTypes
open ENIAM_LCGtypes

type disamb_info = {
  tree: node array;
  tokens: token_env ExtArray.t;
  paths: (int * int) IntMap.t
}

let create_dist_str (data: disamb_info) (parent: node) (child: node) =
  let (lp, rp) = IntMap.find data.paths parent.id in
  let (lc, rc) = IntMap.find data.paths child.id in
  let is_ra = (ExtArray.get data.tokens parent.id).beg >
              (ExtArray.get data.tokens child.id).beg in
  let arc_str =
    if is_ra then
      "RA"
    else
      "LA" in
  let dist =
    if is_ra then
      lp - rc
    else
      rp - lc in
  let dist_str =
    if dist > 10 then
      "10"
    else if dist > 5 then
      "5"
    else
      string_of_int (dist - 1) in
  "&" ^ arc_str ^ "&" ^ dist_str



let apply_features features fv =
  List.fold_left (|>) fv features

let add_linear_features f_type (obs: string array) first second distStr fv =
  fv

let add_two_obs_features prefix item1F1 item1F2 item2F1 item2F2 distStr fv =
  let add_diststr str = [str; str^"*"^distStr] in
  let flist = List.map ((^) prefix)[
      "2FF1="^item1F1;
      "2FF1="^item1F1^" "^item1F2;
      "2FF1="^item1F1^" "^item1F2^" "^item2F2;
      "2FF1="^item1F1^" "^item1F2^" "^item2F2^" "^item2F1;
      "2FF2="^item1F1^" "^item2F1;
      "2FF3="^item1F1^" "^item2F2;
      "2FF4="^item1F2^" "^item2F1^" "^item2F2;
      "2FF5="^item1F2^" "^item2F2;
      "2FF6="^item2F1^" "^item2F2;
      "2FF7="^item1F2;
      "2FF8="^item2F1;
      "2FF9="^item2F2;
    ] in
  let funs = List.map (MST_Model.add_feature) (List.flatten (List.map add_diststr flist)) in
  apply_features funs fv

let score_edge (data: disamb_info) (parent: node) (child: node) =
  let fv = MST_Model.empty_fv in
  let dist_str = create_dist_str data parent child in
  let fv = add_two_obs_features "HCC"
      parent.lemma parent.pos child.lemma child.pos dist_str fv in
  MST_Model.score_fv fv