freqUtils.ml 11 KB
(* Blame Szymon Rutkowski - szymon@szymonrutkowski.pl - Dec 2016-Jan 2017. *)

open Xstd

(* utils *)

let uniq lst =
        let seen = Hashtbl.create (List.length lst) in
        List.filter (fun x -> let tmp = not (Hashtbl.mem seen x) in
                Hashtbl.replace seen x ();
                tmp) lst

let rec join delim lst =
        (* Concatenate list of strings into one string, delimited by delim *)
        if lst = [] then ""
        else List.hd lst ^ (if List.tl lst = [] then ""
                            else delim ^ join delim (List.tl lst))

let slice lst b e =
        (* Given a list, return a slice from b to e *)
        let rec slice_step lst b e accm =
        if lst = [] || b >= e then []
        else if e = 0 then accm
        else if b > 0 then slice_step (List.tl lst) (b-1) (e-1) accm
        else if b = 0 then
                slice_step (List.tl lst) (b-1) (e-1) (List.hd lst :: accm)
        else slice_step lst 0 e accm
        in
        List.rev (slice_step lst b e [])

let str_contains str frag =
        try (Str.search_forward (Str.regexp frag) str 0; true)
        with Not_found -> false

let longest_str lst =
        let rec longest_str_step lst win winlen =
                if lst = [] then win
                else if String.length (List.hd lst) > winlen 
                then longest_str_step (List.tl lst) (List.hd lst)
                                                (String.length (List.hd lst))
                else longest_str_step (List.tl lst) win winlen
        in
        longest_str_step lst "" 0

let shortest_str lst =
        let rec shortest_str_step lst win winlen =
                if lst = [] then win
                else if String.length (List.hd lst) < winlen 
                then shortest_str_step (List.tl lst) (List.hd lst)
                                                (String.length (List.hd lst))
                else shortest_str_step (List.tl lst) win winlen
        in
        shortest_str_step lst "" 999999

let most_samechars lst patrn =
        (* Return a string from lst that shares of the same characters in its
         * beginning with patrn. If no characters are the same, return the 
         * patrn. *)
        let rec samechars s1 s2 cnt =
                if s1 = "" || s2 = "" then cnt
                else if String.get s1 0 = String.get s2 0
                then samechars (String.sub s1 0 ((String.length s1)-1))
                (String.sub s1 0 ((String.length s1)-1)) (cnt+1)
                else cnt
        in
        let rec most_samechars_step lst patrn win winscr =
                if lst = [] then win
                else let score = (samechars (List.hd lst) patrn 0) in
                if score > winscr 
                then most_samechars_step (List.tl lst) patrn (List.hd lst) score
                else most_samechars_step (List.tl lst) patrn win winscr
        in
        most_samechars_step lst patrn patrn 0

let prefer pfs lst =
        (* Return first element from pfs that was found in lst. Throw Not_found
         * if nothing was found. pfs can't contain empty string. *)
        if lst = [] || pfs = [] then raise Not_found
        (* for each element from pfs, try to find it on lst *)
        else let findings = List.map
        (fun p -> try (List.find (fun x -> x = p) lst) with Not_found -> "")
        pfs
        in
        (* try to find some non-false finding: *)
        List.find (fun f -> String.length f > 0) findings
        (* Not_found uncatched /\ *)

let strrev s =
        (* Reverse a string. *)
        let rec strrev_step s ns =
                if s = "" then ns
                else strrev_step (String.sub s 1 ((String.length s)-1))
                                ((String.make 1 (String.get s 0)) ^ ns)
        in
        strrev_step s ""

let print_strlst lst =
        Printf.printf "[";
        Xlist.iter
        lst
        (fun e -> Printf.printf " %s " e);
        Printf.printf "]"

let strmap_contains m e = try (StringMap.find m e; true)
                                with Not_found -> false

let map_from_list lst idx_fun =
        (* return a StringMap, containing all entries sorted by string hash
         * obtained by applying idx_fun to given element of lst *)
        Xlist.fold lst StringMap.empty
        (fun m elem -> StringMap.add_inc m (idx_fun elem) [elem]
                (fun l -> elem :: l))

(* working code *)

let neutx_to_neut tag = Str.global_replace
                        (Str.regexp ("\(n[0-9]\.?\)*n[0-9]"))
                        "n" tag

let lists_of_tag tag =
                (Xlist.rev_map
                (Xstring.split ":" (neutx_to_neut tag))
                (fun elem ->
                Xstring.split "\." elem))

let variants_of tag = 
        (* variants_of takes a tag as string and returns all its variants as a
         * list of strings, unfolding all the variants *)
        let rec fold_as_str = fun l ->
                if l = [] then "" else (List.hd l)
                (* add the colon if needed: *)
                ^ (if (List.tl l) = [] then (fold_as_str (List.tl l))
                else (":" ^ fold_as_str (List.tl l)))
        in
        uniq
        (Xlist.rev_map
                (Xlist.multiply_list
                        (List.rev (* tag elems got reversed earlier *) 
                        (lists_of_tag tag)))
                fold_as_str)

let regexp_of_tag tg =
        Str.regexp
        (join ":" (Xlist.rev_map (lists_of_tag tg)
        (fun variants -> "\(\(" ^ (join "\|" variants) ^ "\)\.?\)+")))

let create_interp_map itp_list =
        (* create_interp takes itp_list and creates a string map, indexed by
         * short tags, to the long (full) tags *)
        Xlist.fold itp_list StringMap.empty
        (fun smap tag ->
                let tag = neutx_to_neut tag in
                (Xlist.fold
                (variants_of tag)
                smap
                (fun smap short_tag ->
                        (StringMap.add_inc smap short_tag [tag]
                        (fun l -> tag :: l)))))

type sgjp_entry = { sg_orth: string; sg_lemma: string; sg_interp: string}

let create_sgjp_map_nohyph fname =
        (* load SGJP from the fname, as a map indexed by word forms, removing
         * all entries that contain a hyphen *)
        let clean_lemma lm = List.hd (Xstring.split ":" lm) in
        map_from_list
        (List.filter (fun s -> not (str_contains s.sg_orth "-"))
                (File.load_tab fname
                (function [sg_orth; sg_lemma; sg_interp; _; _] ->
                        {sg_orth=sg_orth; sg_lemma=clean_lemma sg_lemma;
                        sg_interp=(neutx_to_neut sg_interp)}
                | [] -> failwith "Empty entry in SGJP file"
                | _::_ -> {sg_orth=""; sg_lemma=""; sg_interp=""})))
        (fun etr -> etr.sg_orth)

type freq_entry = { orth:string; lemma:string; interp:string; frq:string;
        compos:string; sgjp_status:string; word_type:string; corr:string;
        rule_id:string; cat:string }

let check_sgjp_interp etr sgjp =
        (* return interpretation for the entry, if possible, using sgjp (map) *)
        try (let retrieved = (StringMap.find sgjp etr.orth) in
            if List.length retrieved = 1
            && (List.hd retrieved).sg_lemma = etr.lemma
            then (List.hd retrieved).sg_interp
            else "AMBIG-" ^ etr.interp)
        with Not_found -> "AMBIG-" ^ etr.interp

let print_list fname lst =
        (let out = open_out fname in
        (Xlist.iter lst (fun etr -> output_string out etr));
        close_out out)

let freq_etr_rules etr =
        (* Given a freq entry, return a list of matching freq_rule records, as
         * in rules.ml (load_freq_rules) *)
        Xlist.filter
                (Rules.CharTrees.find Inflexion.rules etr.orth)
                (fun o -> match o with
                (stem, rl) -> ((stem ^ rl.set) = etr.lemma)
                && (Str.string_match (regexp_of_tag etr.interp)
                        (neutx_to_neut rl.interp) 0))

let cat_of_tag tag = match List.hd (Xstring.split ":" tag) with
        | "subst" -> "noun" | "depr" -> "noun"
        | "adj" -> "adj" | "adja" -> "adj" | "adjc" -> "adj" | "adjp" -> "adj"
        | "adv" -> "adv"
        | "inf" -> "verb" | "praet"-> "verb" | "fin" -> "verb"
        | "ppas" -> "verb" | "pact" -> "verb" | "pacta" -> "verb"
        | "impt" -> "verb" | "imps" -> "verb" | "pcon" -> "verb"
        | "pant" -> "verb" | "ger" -> "verb"
        | "bedzie" -> "other" | "pred"-> "other" | "prep" -> "other"
        | "num" -> "other" | "aglt" -> "other" | "winien" -> "other"
        | "qub" -> "other" | "brev" -> "other" | "comp" -> "other"
        | "interj" -> "other" | "burk" -> "other" | "numcol" -> "other"
        | "conj" -> "other" | "ppron12" -> "other" | "ppron3" -> "other"
        | "ppron2" -> "other" | "ppron1" -> "other" | "ppron" -> "other"
        | "interp" -> "other" | "xxx" -> "other" | "siebie" -> "other"
        | "cond" -> "cond"
        | _ -> failwith
        (Printf.sprintf "unknown part of speech in %s" tag)

let tag_cats lst =
        Xlist.rev_map lst (fun etr -> {etr with cat = cat_of_tag etr.interp })

let sum_list_freq lst =
        Xlist.fold lst 0 (fun t etr -> t+(int_of_string etr.frq))

let map_interp_given_cat freq =
        (* map_interp_given_cat prepares a map from cat's to map of interps to
         * their (normalized->probability) scores *)
        let normalize scoretable =
                StringMap.fold scoretable StringMap.empty
                (fun sctb key vals ->
                        (* count total frequency for this cat *)
                        let total = float_of_int
                                (StringMap.fold vals 0 (fun t _ f -> t + f))
                        in
                        (StringMap.add sctb key
                                (* normalize for each interp *)
                                (StringMap.fold vals StringMap.empty
                                (fun newvals v s -> StringMap.add newvals v
                                 ((float_of_int s) /. total)))))
        in
        normalize
        (Xlist.fold freq StringMap.empty
        (fun m etr -> StringMap.add_inc m etr.cat
                (* create an empty map, and add the first entry: *)
                (StringMap.add StringMap.empty etr.interp
                        (int_of_string etr.frq))
                (* if map exists, add or increment entry for this interp: *)
                (fun itps -> StringMap.add_inc itps etr.interp
                          (int_of_string etr.frq)
                          (fun score -> score+(int_of_string etr.frq)))))

let is_root_form_tag tg =
        (* consult "Narodowy Korpus Języka Polskiego", Warszawa 2012, p. 69-70 *)
        if str_contains tg "^subst:sg:nom" then true
        else if str_contains tg "^subst:pl:nom.*:p[0-9]" then true
        else if str_contains tg "^\(adj.?|num|numcol\):sg:nom.*m1" then true
        else if str_contains tg "^ppron.*nom" then true
        else if str_contains tg "^inf" then true
        else if str_contains tg "^winien:sg.*m1" then true
        else if str_contains tg "^\(siebie|pred|prep|conj|comp|interj|burk|qub|xxx|interp\)"
        then true
        else false