freqListInterps.ml 8.39 KB
open Xstd

let uniq lst =
        let seen = Hashtbl.create (List.length lst) in
        List.filter (fun x -> let tmp = not (Hashtbl.mem seen x) in
                Hashtbl.replace seen x ();
                tmp) lst

let rec join delim lst =
        if lst = [] then ""
        else List.hd lst ^ (if List.tl lst = [] then ""
                            else delim ^ join delim (List.tl lst))

let slice lst b e =
        let rec slice_step lst b e accm =
        if lst = [] || b >= e then []
        else if e = 0 then accm
        else if b > 0 then slice_step (List.tl lst) (b-1) (e-1) accm
        else if b = 0 then
                slice_step (List.tl lst) (b-1) (e-1) (List.hd lst :: accm)
        else slice_step lst 0 e accm
        in
        List.rev (slice_step lst b e [])

let str_contains str frag =
        try (Str.search_forward (Str.regexp frag) str 0; true)
        with Not_found -> false

let longest_str lst =
        let rec longest_str_step lst win winlen =
                if lst = [] then win
                else if String.length (List.hd lst) > winlen 
                then longest_str_step (List.tl lst) (List.hd lst)
                                                (String.length (List.hd lst))
                else longest_str_step (List.tl lst) win winlen
        in
        longest_str_step lst "" 0

let shortest_str lst =
        let rec shortest_str_step lst win winlen =
                if lst = [] then win
                else if String.length (List.hd lst) < winlen 
                then shortest_str_step (List.tl lst) (List.hd lst)
                                                (String.length (List.hd lst))
                else shortest_str_step (List.tl lst) win winlen
        in
        shortest_str_step lst "" 999999

let most_samechars lst patrn =
        (* Return a string from lst that shares of the same characters in its
         * beginning with patrn. If no characters are the same, return the 
         * patrn. *)
        let rec samechars s1 s2 cnt =
                if s1 = "" || s2 = "" then cnt
                else if String.get s1 0 = String.get s2 0
                then samechars (String.sub s1 0 ((String.length s1)-1))
                (String.sub s1 0 ((String.length s1)-1)) (cnt+1)
                else cnt
        in
        let rec most_samechars_step lst patrn win winscr =
                if lst = [] then win
                else let score = (samechars (List.hd lst) patrn 0) in
                if score > winscr 
                then most_samechars_step (List.tl lst) patrn (List.hd lst) score
                else most_samechars_step (List.tl lst) patrn win winscr
        in
        most_samechars_step lst patrn patrn 0

let strrev s =
        let rec strrev_step s ns =
                if s = "" then ns
                else strrev_step (String.sub s 1 ((String.length s)-1))
                                ((String.make 1 (String.get s 0)) ^ ns)
        in
        strrev_step s ""

let print_strlst lst =
        Printf.printf "[";
        Xlist.iter
        lst
        (fun e -> Printf.printf " %s " e);
        Printf.printf "]"

let variants_of tag = 
        (* variants_of takes a tag as string and returns all its variants as a
         * list of strings, unfolding all the variants *)
        let rec fold_as_str = fun l ->
                if l = [] then "" else (List.hd l)
                (* add the colon if needed: *)
                ^ (if (List.tl l) = [] then (fold_as_str (List.tl l))
                else (":" ^ fold_as_str (List.tl l)))
        in
        uniq
        (Xlist.rev_map
                (Xlist.multiply_list
                        (List.rev (* tag elems got reversed earlier *) 
                                (Xlist.rev_map
                                (Xstring.split ":" (Str.global_replace
                                        (Str.regexp "n[0-9]") "n" tag))
                                (fun elem ->
                                Xstring.split "\." elem))))
                fold_as_str)

let create_interp_map fname =
        (* create_interp takes an fname and returns a string map, indexed by
         * short tags, to the long (full) tags *)
        let interps = File.load_tab fname (function [_; tag; _] -> tag
        | [] -> failwith "Empty entry in the interp file"
        | _::_ -> failwith "Malformatted entry in the interp file")
        in
        Xlist.fold interps StringMap.empty
        (fun smap tag ->
                let tag = Str.global_replace (Str.regexp ("\(n[0-9]\.?\)+"))
                        "n" tag in
                (Xlist.fold
                (variants_of tag)
                smap
                (fun smap short_tag ->
                        ((StringMap.add_inc smap short_tag [tag]
                        (fun l -> tag :: l))))))

type sgjp_entry = { sg_orth: string; sg_lemma: string; sg_interp: string}

let create_sgjp_map fname =
        (* load SGJP from the fname, as a map indexed by word forms *)
        let clean_lemma lm = List.hd (Xstring.split ":" lm) in
        Xlist.fold
        (File.load_tab fname
        (function [sg_orth; sg_lemma; sg_interp; _] ->
                {sg_orth=sg_orth; sg_lemma=clean_lemma sg_lemma;
                sg_interp=((*shorten_tag *)sg_interp)}
        | [] -> failwith "Empty entry in SGJP file"
        | _::_ -> {sg_orth=""; sg_lemma=""; sg_interp=""}))
        StringMap.empty
        (fun smap etr ->
                ((StringMap.add_inc smap etr.sg_orth [etr]
                (fun l -> etr :: l))))

type freq_entry = { orth:string; lemma:string; interp:string; frq:string;
        compos:string; sgjp_status:string; word_type:string; corr:string }

let check_sgjp_interp etr sgjp =
        (* return interpretation for the entry, if possible, using sgjp (map) *)
        try (let retrieved = (StringMap.find sgjp etr.orth) in
            if List.length retrieved = 1
            && (List.hd retrieved).sg_lemma = etr.lemma
            then (List.hd retrieved).sg_interp
            else "AMBIG-" ^ etr.interp)
        with Not_found -> "AMBIG-" ^ etr.interp

let simplify_qub tg =
        (* change qub:nwok/wok into plain qub *)
        try (Str.search_forward (Str.regexp "qub") tg 0;
             List.hd(Xstring.split ":" tg))
        with Not_found -> tg

let resolve_num etr inps =
        if not (str_contains etr.interp "num:") then etr
        else if (Str.string_match (Str.regexp "^[123456789IVXLCM]+$")
        etr.orth 0)
        then { etr with interp=longest_str inps }
        else { etr with interp=shortest_str inps }

let resolve_verb etr inps =
        let verbs = Xlist.rev_map ["praet:"; "imps:"; "imp"; "fin"; "inf";
        "ger:"; "pact:"; "ppas:"] (str_contains etr.interp) in
        let truth x = if x = true then true else false in
        if List.exists truth verbs
        then {etr with interp=strrev (most_samechars (Xlist.rev_map inps strrev)
                                                (strrev (etr.interp)))}
        else etr

let generalize etr interp_map sgjp_map =
        (* given an entry from frequency list, try to assign it a generalized
         * tag using interp_map and sgjp_map *)
        try (match uniq (StringMap.find interp_map ((*shorten_tag *)etr.interp))
        with
        | [] -> etr
        | h::[] -> { etr with interp=h }
        | inps -> resolve_verb (resolve_num
                        { etr with interp=(check_sgjp_interp etr sgjp_map) }
                        inps) inps)
        with Not_found -> Printf.printf "not found %s\n" etr.interp;
        {etr with interp= (simplify_qub etr.interp)}

let print_freq fname lst =
        let out = open_out fname in
        (Xlist.iter
        lst
        (fun etr -> Printf.fprintf out "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n"
        etr.orth etr.lemma etr.interp etr.frq etr.compos etr.sgjp_status
        etr.word_type etr.corr);
        close_out out)

let _ =
        print_freq "../resources/NKJP1M/NKJP1M-generalized-frequency.tab"
          (let interp_map = (create_interp_map "data/interps_general.tab") in
          let sgjp_map = create_sgjp_map "../../NLP resources/sgjp-20160724.tab"
          in
          let freq = File.load_tab
          "../resources/NKJP1M/NKJP1M-tagged-frequency.tab"
          (function [o;l;i;f;c;s;w;cr] -> { orth=o; lemma=l; interp=i; frq=f;
          compos=c; sgjp_status=s; word_type=w; corr=cr }
          | [] -> failwith "Empty entry in the freq file"
          | _::_ -> failwith "Malformatted entry in the freq file") in
          Xlist.rev_map freq
          (fun etr -> generalize etr interp_map sgjp_map))