ENIAMmstFeatures.ml 2.5 KB
open Xstd
open ENIAMmstModel
open ENIAMtokenizerTypes
open ENIAM_LCGtypes

type disamb_info = {
  tree: node array;
  tokens: token_env ExtArray.t;
  paths: (int * int) IntMap.t
}

let create_dist_str (data: disamb_info) (parent: node) (child: node) =
  try
    let (lp, rp) = IntMap.find data.paths parent.id in
    let (lc, rc) = IntMap.find data.paths child.id in
    let is_ra = (ExtArray.get data.tokens parent.id).beg >
                (ExtArray.get data.tokens child.id).beg in
    let arc_str =
      if is_ra then
        "RA"
      else
        "LA" in
    let dist =
      if is_ra then
        lp - rc
      else
        lc - rp in
    if dist < 0 then ""
    else
      let dist_str =
        if dist > 10 then
          "10"
        else if dist > 5 then
          "5"
        else
          string_of_int (dist - 1) in
      "&" ^ arc_str ^ "&" ^ dist_str
  with
    _ -> ""

let apply_features features fv =
  List.fold_left (|>) fv features

let add_linear_features f_type (obs: string array) first second distStr fv =
  fv

let add_two_obs_features prefix item1F1 item1F2 item2F1 item2F2 distStr fv =
  let add_diststr str =
    if distStr != "" then
      [str; str^"*"^distStr]
    else
      [str] in
  let flist = List.map ((^) prefix)[
      "2FF1="^item1F1;
      "2FF1="^item1F1^" "^item1F2;
      "2FF1="^item1F1^" "^item1F2^" "^item2F2;
      "2FF1="^item1F1^" "^item1F2^" "^item2F2^" "^item2F1;
      "2FF2="^item1F1^" "^item2F1;
      "2FF3="^item1F1^" "^item2F2;
      "2FF4="^item1F2^" "^item2F1^" "^item2F2;
      "2FF5="^item1F2^" "^item2F2;
      "2FF6="^item2F1^" "^item2F2;
      "2FF7="^item1F2;
      "2FF8="^item2F1;
      "2FF9="^item2F2;
    ] in
  let funs = List.map (MST_Model.add_feature) (List.flatten (List.map add_diststr flist)) in
  apply_features funs fv

let correct_for_frequency (score: float) (parent: node) (child: node) =
  let case =
    if child.pos = "subst" then
      match List.find (fun attr -> fst attr = "CASE") child.attrs |> snd with
        Val s -> Some s
      | _ -> failwith "correct_for_frequency, case attribute not Val"
    else
      None in
  let freq_score = Freq_Model.get_freq_score child.lemma child.pos case in
  score +. 4.0 *. freq_score

let score_edge (data: disamb_info) (parent: node) (child: node) =
  let fv = MST_Model.empty_fv in
  let dist_str = create_dist_str data parent child in
  let fv = add_two_obs_features "HCC"
      parent.lemma parent.pos child.lemma child.pos dist_str fv in
  let score = MST_Model.score_fv fv in
  correct_for_frequency score parent child