freqListInterps.ml 14.5 KB
(* Blame Szymon Rutkowski - szymon@szymonrutkowski.pl - Dec 2016. *)

open Xstd

(* utils *)

let uniq lst =
        let seen = Hashtbl.create (List.length lst) in
        List.filter (fun x -> let tmp = not (Hashtbl.mem seen x) in
                Hashtbl.replace seen x ();
                tmp) lst

let rec join delim lst =
        (* Concatenate list of strings into one string, delimited by delim *)
        if lst = [] then ""
        else List.hd lst ^ (if List.tl lst = [] then ""
                            else delim ^ join delim (List.tl lst))

let slice lst b e =
        (* Given a list, return a slice from b to e *)
        let rec slice_step lst b e accm =
        if lst = [] || b >= e then []
        else if e = 0 then accm
        else if b > 0 then slice_step (List.tl lst) (b-1) (e-1) accm
        else if b = 0 then
                slice_step (List.tl lst) (b-1) (e-1) (List.hd lst :: accm)
        else slice_step lst 0 e accm
        in
        List.rev (slice_step lst b e [])

let str_contains str frag =
        try (Str.search_forward (Str.regexp frag) str 0; true)
        with Not_found -> false

let longest_str lst =
        let rec longest_str_step lst win winlen =
                if lst = [] then win
                else if String.length (List.hd lst) > winlen 
                then longest_str_step (List.tl lst) (List.hd lst)
                                                (String.length (List.hd lst))
                else longest_str_step (List.tl lst) win winlen
        in
        longest_str_step lst "" 0

let shortest_str lst =
        let rec shortest_str_step lst win winlen =
                if lst = [] then win
                else if String.length (List.hd lst) < winlen 
                then shortest_str_step (List.tl lst) (List.hd lst)
                                                (String.length (List.hd lst))
                else shortest_str_step (List.tl lst) win winlen
        in
        shortest_str_step lst "" 999999

let most_samechars lst patrn =
        (* Return a string from lst that shares of the same characters in its
         * beginning with patrn. If no characters are the same, return the 
         * patrn. *)
        let rec samechars s1 s2 cnt =
                if s1 = "" || s2 = "" then cnt
                else if String.get s1 0 = String.get s2 0
                then samechars (String.sub s1 0 ((String.length s1)-1))
                (String.sub s1 0 ((String.length s1)-1)) (cnt+1)
                else cnt
        in
        let rec most_samechars_step lst patrn win winscr =
                if lst = [] then win
                else let score = (samechars (List.hd lst) patrn 0) in
                if score > winscr 
                then most_samechars_step (List.tl lst) patrn (List.hd lst) score
                else most_samechars_step (List.tl lst) patrn win winscr
        in
        most_samechars_step lst patrn patrn 0

let prefer pfs lst =
        (* Return first element from pfs that was found in lst. Throw Not_found
         * if nothing was found. pfs can't contain empty string. *)
        if lst = [] || pfs = [] then raise Not_found
        (* for each element from pfs, try to find it on lst *)
        else let findings = List.map
        (fun p -> try (List.find (fun x -> x = p) lst) with Not_found -> "")
        pfs
        in
        (* try to find some non-false finding: *)
        List.find (fun f -> String.length f > 0) findings
        (* Not_found uncatched /\ *)

let strrev s =
        (* Reverse a string. *)
        let rec strrev_step s ns =
                if s = "" then ns
                else strrev_step (String.sub s 1 ((String.length s)-1))
                                ((String.make 1 (String.get s 0)) ^ ns)
        in
        strrev_step s ""

let print_strlst lst =
        Printf.printf "[";
        Xlist.iter
        lst
        (fun e -> Printf.printf " %s " e);
        Printf.printf "]"

let strmap_contains m e = try (StringMap.find m e; true)
                                with Not_found -> false

(* working code *)

let neutx_to_neut tag = Str.global_replace
                        (Str.regexp ("\(n[0-9]\.?\)*n[0-9]"))
                        "n" tag

let lists_of_tag tag =
                (Xlist.rev_map
                (Xstring.split ":" (neutx_to_neut tag))
                (fun elem ->
                Xstring.split "\." elem))

let variants_of tag = 
        (* variants_of takes a tag as string and returns all its variants as a
         * list of strings, unfolding all the variants *)
        let rec fold_as_str = fun l ->
                if l = [] then "" else (List.hd l)
                (* add the colon if needed: *)
                ^ (if (List.tl l) = [] then (fold_as_str (List.tl l))
                else (":" ^ fold_as_str (List.tl l)))
        in
        uniq
        (Xlist.rev_map
                (Xlist.multiply_list
                        (List.rev (* tag elems got reversed earlier *) 
                        (lists_of_tag tag)))
                fold_as_str)

let regexp_of_tag tg =
        Str.regexp
        (join ":" (Xlist.rev_map (lists_of_tag tg)
        (fun variants -> "\(\(" ^ (join "\|" variants) ^ "\)\.?\)+")))

let create_interp_map fname =
        (* create_interp takes an fname and returns a string map, indexed by
         * short tags, to the long (full) tags *)
        let interps = File.load_tab fname (function [_; tag; _] -> tag
        | [] -> failwith "Empty entry in the interp file"
        | _::_ -> failwith "Malformatted entry in the interp file")
        in
        Xlist.fold interps StringMap.empty
        (fun smap tag ->
                let tag = neutx_to_neut tag in
                (Xlist.fold
                (variants_of tag)
                smap
                (fun smap short_tag ->
                        (StringMap.add_inc smap short_tag [tag]
                        (fun l -> tag :: l)))))

type sgjp_entry = { sg_orth: string; sg_lemma: string; sg_interp: string}

let create_sgjp_map fname =
        (* load SGJP from the fname, as a map indexed by word forms *)
        let clean_lemma lm = List.hd (Xstring.split ":" lm) in
        Xlist.fold
        (File.load_tab fname
        (function [sg_orth; sg_lemma; sg_interp; _] ->
                {sg_orth=sg_orth; sg_lemma=clean_lemma sg_lemma;
                sg_interp=((*shorten_tag *)sg_interp)}
        | [] -> failwith "Empty entry in SGJP file"
        | _::_ -> {sg_orth=""; sg_lemma=""; sg_interp=""}))
        StringMap.empty
        (fun smap etr ->
                ((StringMap.add_inc smap etr.sg_orth [etr]
                (fun l -> etr :: l))))

type freq_entry = { orth:string; lemma:string; interp:string; frq:string;
        compos:string; sgjp_status:string; word_type:string; corr:string;
        rule_id:string; cat:string }

let check_sgjp_interp etr sgjp =
        (* return interpretation for the entry, if possible, using sgjp (map) *)
        try (let retrieved = (StringMap.find sgjp etr.orth) in
            if List.length retrieved = 1
            && (List.hd retrieved).sg_lemma = etr.lemma
            then (List.hd retrieved).sg_interp
            else "AMBIG-" ^ etr.interp)
        with Not_found -> "AMBIG-" ^ etr.interp

let simplify_qub tg =
        (* change qub:nwok/wok into plain qub *)
        try (Str.search_forward (Str.regexp "qub") tg 0;
             List.hd(Xstring.split ":" tg))
        with Not_found -> tg

let resolve_num etr inps =
        (* if word form of etr is purely numerical, assign to it the broadest
         * interpretation *)
        if not (str_contains etr.interp "num:") then etr
        else if (Str.string_match (Str.regexp "^[123456789IVXLCM]+$")
        etr.orth 0)
        then { etr with interp=longest_str inps }
        else { etr with interp=shortest_str inps }

let resolve_verb etr inps =
        (* choose interpretation of a verb that shares most chars with the
         * original one (should resolve perf.imperf issues etc.) *)
        let verbs = Xlist.rev_map ["praet:"; "imps:"; "imp"; "fin"; "inf";
        "ger:"; "pact:"; "ppas:"] (str_contains etr.interp) in
        let truth x = if x = true then true else false in
        if List.exists truth verbs
        then {etr with interp=strrev (most_samechars (Xlist.rev_map inps strrev)
                                                (strrev (etr.interp)))}
        else etr

let generalize etr interp_map sgjp_map =
        (* given an entry from frequency list, try to assign it a generalized
         * tag using interp_map and sgjp_map *)
        try (match uniq (StringMap.find interp_map (etr.interp))
        with
        | [] -> etr
        | h::[] -> { etr with interp=h }
        | inps -> resolve_verb (resolve_num
                        { etr with interp=(check_sgjp_interp etr sgjp_map) }
                        inps) inps)
        with Not_found -> (* Printf.printf "not found %s\n" etr.interp;*)
        {etr with interp= (simplify_qub etr.interp)}

let print_freq fname lst =
        let out = open_out fname in
        (Xlist.iter
        lst
        (fun etr -> Printf.fprintf out "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n"
        etr.orth etr.lemma etr.interp etr.frq etr.compos etr.sgjp_status
        etr.word_type etr.corr etr.rule_id etr.cat);
        close_out out)

let merge_entries lst =
        (* Given a list of freq entries, merge those that have the same (orth,
         * lemma, interp) triple if lowercased, summing the frequencies *)
        let lwr = Xunicode.lowercase_utf8_string in
        let etr_pnts = Xlist.fold lst StringMap.empty
        (fun m etr -> StringMap.add_inc m
                (lwr (join "~" [etr.orth; etr.lemma; etr.interp]))
                [etr] (fun l -> etr :: l))
        in
        StringMap.fold etr_pnts []
        (fun l k v -> let e = (List.hd v) in
        {orth=(lwr e.orth); lemma=(lwr e.lemma); interp=e.interp;
         (* sum the frequencies *)
         frq=string_of_int
                 (Xlist.fold v 0 (fun cnt vl -> cnt+(int_of_string vl.frq)));
         (* prefer the most general value of those parameters, among present in
          * all merged entries: *)
         compos=prefer ["COMPOS"; "COMPOS-ndm"; "COMPOS-*"; "COMPOS-ALT";
         "COMPOS-LWR"; "COMPOS-LWR-ndm"; "COMPOS-LWR-*"; "COMPOS-LWR-ALT";
         "NCOMPOS"] (Xlist.rev_map v (fun e -> e.compos));
         sgjp_status=prefer ["SGJP-EXACT"; "SGJP-LMM-UNCAPITAL";
         "SGJP-LMM-CAPITAL"; "SGJP-LMM-LOWER"; "SGJP-BTH-LOWER"; "NON-SGJP"]
         (Xlist.rev_map v (fun e -> e.sgjp_status));
         word_type=prefer ["CW"; "NCH"; "EXT"; "SYMB"; "ACRO"; "PN"; "SPEC";
         "NEOL"; "COMPD"; "WEB"] (Xlist.rev_map v (fun e -> e.word_type));
         corr=prefer ["CORR"; "CERR"; "DIAL"; "PHON"; "ERR"; "TAGD"; "PLTAN";
         "TAGE"; "ERR-TAGE"; "CERR-TAGE"] (Xlist.rev_map v (fun e -> e.corr));
         rule_id=""; cat=""}
         :: l)

let tag_rules alt_fname lst =
        (* tag freq entries with their rule ids - loading the exceptions from
         * alt_fname - currently rules ambiguity are resolved by choosing the
         * most specific rule *)
        let alts =
                Xlist.fold
                        (File.load_tab alt_fname (fun [orth; _; _] -> orth))
                        StringMap.empty (fun m e -> StringMap.add m e "")
        in
        Xlist.rev_map lst
        (fun etr ->
                if strmap_contains alts etr.orth then {etr with rule_id="ALT"}
                else if etr.compos="NCOMPOS" then etr
                else let rls = Xlist.filter
                        (Rules.CharTrees.find Inflexion.rules etr.orth)
                        (fun o -> match o with
                        (stem, rl) -> ((stem ^ rl.set) = etr.lemma)
                        && (Str.string_match (regexp_of_tag etr.interp)
                                (neutx_to_neut rl.interp) 0))
                in
                if List.length rls = 0 then
                (if etr.compos!="NCOMPOS" then
                        (Printf.printf "can't find rule for COMPOS %s\n" etr.orth;
                        {etr with compos="NCOMPOS"})
                else etr)
                (* extract the first rule from the list: *)
                else let hdrule = (match List.hd rls with (_, rl) -> rl) in
                (* if it's the only one, just return it: *)
                if List.length rls = 1 then {etr with rule_id=hdrule.id}
                else (Printf.printf "too many rules match %s (%d)\n"
                        etr.orth (List.length rls);
                      (* select the rule with the longest "set" (agglutinant) *)
                      {etr with rule_id = (Xlist.fold rls hdrule 
                              (fun choice opt ->
                               match opt with (_, opt) ->
                               if String.length choice.set<String.length opt.set
                               then opt else choice)).id}))

let tag_cats lst =
        Xlist.rev_map lst
        (fun etr ->
        {etr with cat = match List.hd (Xstring.split ":" etr.interp) with
        | "subst" -> "noun" | "depr" -> "noun"
        | "adj" -> "adj" | "adja" -> "adj" | "adjc" -> "adj" | "adjp" -> "adj"
        | "adv" -> "adv"
        | "inf" -> "verb" | "praet"-> "verb" | "fin" -> "verb"
        | "ppas" -> "verb" | "pact" -> "verb" | "pacta" -> "verb"
        | "impt" -> "verb" | "imps" -> "verb" | "pcon" -> "verb"
        | "pant" -> "verb" | "ger" -> "verb"
        | "bedzie" -> "other" | "pred"-> "other" | "prep" -> "other"
        | "num" -> "other" | "aglt" -> "other" | "winien" -> "other"
        | "qub" -> "other" | "brev" -> "other" | "comp" -> "other"
        | "interj" -> "other" | "burk" -> "other" | "numcol" -> "other"
        | "conj" -> "other" | "ppron12" -> "other" | "ppron3" -> "other"
        | "interp" -> "other" | "xxx" -> "other" | "siebie" -> "other"
        | "cond" -> "cond"
        | _ -> failwith
        (Printf.sprintf "unknown part of speech in %s" etr.interp)})

let _ =
        print_freq "../resources/NKJP1M/NKJP1M-generalized-frequency.tab"
        (tag_cats
        (tag_rules "../resources/SGJP/alt.tab"
        (merge_entries
          (let interp_map = (create_interp_map "data/interps_general.tab") in
          let sgjp_map = create_sgjp_map "../../NLP resources/sgjp-20160724.tab"
          in
          let freq = File.load_tab
          "../resources/NKJP1M/NKJP1M-tagged-frequency.tab"
          (function [o;l;i;f;c;s;w;cr] -> { orth=o; lemma=l; interp=i; frq=f;
          compos=c; sgjp_status=s; word_type=w; corr=cr; rule_id=""; cat=""}
          | [] -> failwith "Empty entry in the freq file"
          | _::_ -> failwith "Malformatted entry in the freq file") in
          Xlist.rev_map freq
          (fun etr -> generalize etr interp_map sgjp_map)))))