suffix_tree.ml 3.65 KB
open Xstd

type star = Productive | Star | Ndm

type rule = {star: star; pref: string; find: string; set: string; tags: (string * string) list;
  interp: string; id: string; freq: int}

let parse_star = function
    "" -> Productive
  | "*" -> Star
  | "ndm" -> Ndm
  | s -> failwith "parse_star"

let string_of_star = function
    Productive -> ""
  | Star -> "*"
  | Ndm -> "ndm"

let print_rule file rule =
  Printf.fprintf file "%s\t%d\t%s\t%s\t%s\t%s\t%s\n" rule.id rule.freq (string_of_star rule.star)
    rule.pref rule.find rule.set rule.interp

module CharTrees = struct

  type t = M of t CharMap.t * rule list

  let empty = M(CharMap.empty,[])

  let rec add_path_rules rule orth_suf i (M(map,rules)) =
    if i = -1 then M(map,rule :: rules) else
    let tree = try CharMap.find map (String.get orth_suf i) with Not_found -> empty in
    let tree = add_path_rules rule orth_suf (i-1) tree in
    M(CharMap.add map (String.get orth_suf i) tree,rules)

  let create_char_tree rules =
    let tree = Xlist.fold rules empty (fun tree rule ->
      add_path_rules rule rule.find (String.length rule.find - 1) tree) in
    tree

  let create rules =
    let prefix_map = Xlist.fold rules StringMap.empty (fun prefix_map rule ->
      StringMap.add_inc prefix_map rule.pref [rule] (fun l -> rule :: l)) in
    StringMap.fold prefix_map [] (fun trees prefix rules -> (prefix, create_char_tree rules) :: trees)

  let rec find_rec l i orth (M(map,rules)) =
    if i = 0 then Xlist.fold rules l (fun l rule -> ("", rule) :: l) else
    let l = try find_rec l (i-1) orth (CharMap.find map (String.get orth (i-1))) with Not_found -> l in
    Xlist.fold rules l (fun l rule -> (String.sub orth 0 i, rule) :: l)

  let find trees orth =
    Xlist.fold trees [] (fun found (pref,tree) ->
      (* print_endline pref; *)
      if Xstring.check_prefix pref orth then (
        let orth = Xstring.cut_prefix pref orth in
        (* printf "%s %d " orth (Xlist.size found); *)
        let found = find_rec found (String.length orth) orth tree in
        (* printf "%d\n%!" (Xlist.size found); *)
        (* Xlist.iter found (fun (stem,rule) -> printf "F %s\t%s\n" stem (string_of_rule rule)); *)
        found)
      else found)

  let add_char c rule =
    let s = String.make 1 c in
    {rule with find=s ^ rule.find; set=s ^ rule.set}

  let rec disjoint_rec super (M(map,rules)) =
    let rules = rules @ super in
    if CharMap.is_empty map then M(map,rules) else
    M(CharMap.mapi map (fun c tree ->
      disjoint_rec (Xlist.rev_map rules (add_char c)) tree),[])

  let disjoint trees =
    Xlist.rev_map trees (fun (pref,tree) ->
      pref, disjoint_rec [] tree)

  let rec print_rules_rec file (M(map,rules)) =
    Xlist.iter rules (print_rule file);
    CharMap.iter map (fun _ tree -> print_rules_rec file tree)

  let print_rules filename trees =
    File.file_out filename (fun file ->
      Xlist.iter trees (fun (_,tree) ->
        print_rules_rec file tree))

end

let load_freq_rules filename =
  File.fold_tab filename [] (fun rules -> function
    [id; freq; star; pref; find; set; interp] ->
       {id=id; freq=int_of_string freq; star=parse_star star; pref=pref; find=find; set=set;
        tags=[]; interp=interp} :: rules
  | _ -> failwith "load_freq_rules")

let load_rules resource_path rules_filename =
  let rules = load_freq_rules (resource_path ^ rules_filename) in
  let rules = CharTrees.create rules in
  let rules = CharTrees.disjoint rules in
  rules

let rules_filename = "freq_rules.tab"
let adj_rules_filename = "adj_freq_rules.tab"
let resource_path = "../resources/SGJP/"

let _ =
  let rules = load_rules resource_path adj_rules_filename in
  CharTrees.print_rules "results/disjoint_rules_adj" rules