Commit 697802d0bc55d0205f03a03baa6acda8e30f9320

Authored by Szymon Rutkowski
1 parent f06abf44

rozpoczęcie generalizacji tagów na liście frekwencyjnej

Too many changes to show.

To preserve performance only 2 of 3 files are displayed.

morphology/freq-process.lisp 0 → 100644
  1 +(declaim (optimize (debug 3)))
  2 +
  3 +;;; general utility functions
  4 +
  5 +(defun split-str-1 (string &optional (separator " ") (r nil))
  6 + (let ((n (position separator string
  7 + :from-end t
  8 + :test #'(lambda (x y)
  9 + (find y x :test #'string=)))))
  10 + (if n
  11 + (split-str-1 (subseq string 0 n) separator (cons (subseq string (1+ n)) r))
  12 + (cons string r))))
  13 +
  14 +(defun split-str (string &optional (separator " "))
  15 + (split-str-1 string separator))
  16 +
  17 +(defun strconc (&rest elems)
  18 + "STRCONC concatenates all its arguments into one string."
  19 + (apply #'concatenate (append (list 'string) elems)))
  20 +
  21 +(defun argmax (fun args)
  22 + (let ((win (cons 0 nil)))
  23 + (dolist (a args (cdr win))
  24 + (let ((score (funcall fun a)))
  25 + (if (>= score (car win)) (setf win (cons score a)))))))
  26 +
  27 +;;; local utility functions
  28 +
  29 +(defun load-tab (fpath)
  30 + "LOAD-TAB loads entries from the file and returns them as list of lists (entries)."
  31 + (nreverse (with-open-file (fl fpath)
  32 + (labels
  33 + ((tabs->list (ln) (split-str ln (string #\tab)))
  34 + (load-tab-1 (done-tab) (let ((ln (read-line fl nil))) ; throw nil on eof
  35 + (if ln
  36 + (cons (tabs->list ln) (load-tab-1 done-tab))
  37 + done-tab))))
  38 + (load-tab-1 nil)))))
  39 +
  40 +(defun load-tab-imp (fpath)
  41 + "LOAD-TAB-IMP loads entries from the file and returns them as list of lists (entries), and does
  42 + so in imperative way that doesn't upset the SBCL stack guard."
  43 + (nreverse (with-open-file (fl fpath)
  44 + (labels
  45 + ((tabs->list (ln) (split-str ln (string #\tab)))
  46 + (load-tab-1 () (let ((ln (read-line fl nil))) ; throw nil on eof
  47 + (if ln
  48 + (tabs->list ln)
  49 + nil))))
  50 + (do ((result nil)
  51 + (ln (load-tab-1) (load-tab-1)))
  52 + ((null ln) result)
  53 + (setf result (cons ln result)))))))
  54 +
  55 +(defun load-dic (fpath)
  56 + "TAB->DIC loads a tab file into a hashmap, indexed by the first elements of element lists. The earlier
  57 + keys are **overwritten** by the latest one."
  58 + (with-open-file (fl fpath)
  59 + (labels
  60 + ((tabs->list (ln) (split-str ln (string #\tab)))
  61 + (load-tab-1 () (let ((ln (read-line fl nil))) ; throw nil on eof
  62 + (if ln
  63 + (tabs->list ln)
  64 + nil))))
  65 + (do ((dic (make-hash-table :test #'equal))
  66 + (ln (load-tab-1) (load-tab-1)))
  67 + ((null ln) dic)
  68 + (setf (gethash (first ln) dic) ln)))))
  69 +
  70 +(defun tab-to-file (tab fpath)
  71 + "TAB-TO-FILE gets a list obtained from LOAD-TAB and prints its content to a fil in FPATH, in the tab
  72 + format."
  73 + (labels ((tabify (etr)
  74 + (apply #'strconc (butlast (apply #'append ; skip the last element (\tab)
  75 + (mapcar (lambda (x) (list x (string #\tab)))
  76 + etr))))))
  77 + (with-open-file (file fpath :direction :output)
  78 + (dolist (etr tab)
  79 + (princ (format nil "~a~%" (tabify etr)) file)))))
  80 +
  81 +(defun interp-tag (tag-str)
  82 + "INTERP-TAG takes a string and returns a list, containing tag elements as string, or
  83 + as lists, in case of an alternative."
  84 + (mapcar (lambda (x) (funcall
  85 + (lambda (y) (if (= 1 (length y)) (car y) y))
  86 + (split-str x ".")))
  87 + (split-str tag-str ":")))
  88 +
  89 +(defun tag-matches-p (tag1 tag2)
  90 + "TAG-MATCHES-P tests whether tag1 in contained by/matches tag2."
  91 + (and (= (length tag1) (length tag2))
  92 + (every #'identity ; just ensure they're true
  93 + (mapcar (lambda (t1 t2)
  94 + (if (listp t2)
  95 + (if (listp t1)
  96 + (every (lambda (x) (position x t2 :test #'equal)) t1)
  97 + (position t1 t2 :test #'equal))
  98 + (equalp t1 t2)))
  99 + tag1
  100 + tag2))))
  101 +
  102 +;;; the meat
  103 +
  104 +(defun remove-irreg (tab-list)
  105 + "REMOVE-IRREG takes a list taken from load-tab and removes the adj ... pos entries that don't
  106 + have com counterparts."
  107 + (labels ((maybe-mark-suspect (entry tl)
  108 + (let ((tg (interp-tag (second entry))))
  109 + (if (and (equal (car tg) "adj")
  110 + (equal (car (last tg)) "pos"))
  111 + ;; look for com counterpart in the next iteration:
  112 + (remove-irreg-1 tl entry)
  113 + ;; the basic-basic case:
  114 + (cons entry
  115 + (remove-irreg-1 tl nil)))))
  116 + (remove-irreg-1 (tl suspect) ; suspect is an entry as list, or nil
  117 + (if (null tl)
  118 + nil
  119 + (if suspect
  120 + ;; if suspect, keep the entries only when they have a "com" counterpart
  121 + (funcall (lambda (tg)
  122 + (if (and (equal (car tg) "adj")
  123 + (equal (car (last tg)) "com")
  124 + (tag-matches-p ; cut the last tag - com/pos
  125 + (butlast (interp-tag (second suspect)))
  126 + (butlast tg)))
  127 + (cons suspect (cons (car tl) (remove-irreg-1 (cdr tl) nil)))
  128 + ;; this may also be suspect:
  129 + (maybe-mark-suspect (car tl) (cdr tl))))
  130 + (interp-tag (second (car tl))))
  131 + ;; if not suspect, try if this entry might be another suspect (ie. is a
  132 + ;; pos adj)
  133 + (maybe-mark-suspect (car tl) (cdr tl))))))
  134 + (remove-irreg-1 tab-list nil)))
  135 +
  136 +(defun accom-tags (freq-list patt-list)
  137 + "ACCOM-TAGS returns a FREQ-LIST version were tags are expanded according to the patterns from
  138 + PATT-LIST."
  139 + ;; interp tags in patterns only once
  140 + (let ((patt-lookup (mapcar (lambda (patt) (interp-tag (second patt)))
  141 + patt-list)))
  142 + (mapcar
  143 + (lambda (etr)
  144 + (let* ((etr-tag (interp-tag (third etr)))
  145 + (matches (mapcar (lambda (patt-interp) (tag-matches-p etr-tag
  146 + patt-interp))
  147 + patt-lookup))
  148 + (cnt (count t matches)))
  149 + (cond
  150 + ((= cnt 1) (funcall (lambda (x y) (setf (third x) y) x)
  151 + (copy-seq etr)
  152 + (second (nth (position t matches)
  153 + patt-list))))
  154 + ((= cnt 0) ;(format wrn-strm "WARN: No known classification of ~a" etr-tag)
  155 + etr)
  156 + (t
  157 + (funcall (lambda (x y) (setf (third x) y) x)
  158 + (copy-seq etr)
  159 + (strconc "AMBIG-" (third etr)))))))
  160 + ;; choose the longest (broadest) interpretation
  161 +;;; (let* ((counter 0)
  162 +;;; ;; extract all the matches to the interps variable
  163 +;;; (interps (mapcar (lambda (x) (nth x patt-list))
  164 +;;; (remove nil
  165 +;;; (mapcar (lambda (y) (prog1 (if y counter nil)
  166 +;;; (incf counter)))
  167 +;;; matches)))))
  168 +;;; (format nil "Ambigous classification of ~a: ~a, the longest chosen"
  169 +;;; etr-tag interps)
  170 +;;; (funcall (lambda (x y) (setf (third x) y) x)
  171 +;;; (copy-seq etr)
  172 +;;; (second (argmax (lambda (x) (length (second x)))
  173 +;;; patt-list))))))))
  174 + freq-list)))
  175 +
  176 +(defun fill-with-dic (freq-list dic)
  177 + (mapcar
  178 + (lambda (etr)
  179 + (if (not (equal (third etr) "AMBIG"))
  180 + etr
  181 + (if (gethash (first etr) dic)
  182 + (funcall (lambda (x y) (setf (third x) y) x)
  183 + (copy-seq etr)
  184 + (third (gethash (first etr) dic)))
  185 + (warn (format nil "Cannot find interpretation for ~a" (first etr))))))
  186 + freq-list))
  187 +
  188 +(defun exec ()
  189 + (tab-to-file
  190 + ;; (fill-with-dic
  191 + (accom-tags (load-tab-imp "resources/NKJP1M/NKJP1M-tagged-frequency.tab")
  192 + (remove-irreg (load-tab-imp "morphology/data/interps.tab")))
  193 + ;; (load-dic "../NLP resources/sgjp-20160724.tab"))
  194 + "resources/NKJP1M/NKJP1M-general-frequency.tab"))
... ...
resources/NKJP1M/NKJP-tagged-frequency-tagset.txt
... ... @@ -58,7 +58,6 @@ CW - common word.
58 58 SPEC - word of limited usage (scientific, vocational), but excluding slang.
59 59 NEOL - neologism.
60 60 EXT - external, non-Polish word.
61   -UNK - unknown, all the others (maybe new categories will be created for them).
62 61  
63 62 8. Correctness.
64 63 Possible values are:
... ...