Commit 697802d0bc55d0205f03a03baa6acda8e30f9320
1 parent
f06abf44
rozpoczęcie generalizacji tagów na liście frekwencyjnej
Showing
3 changed files
with
183376 additions
and
1 deletions
Too many changes to show.
To preserve performance only 2 of 3 files are displayed.
morphology/freq-process.lisp
0 → 100644
1 | +(declaim (optimize (debug 3))) | |
2 | + | |
3 | +;;; general utility functions | |
4 | + | |
5 | +(defun split-str-1 (string &optional (separator " ") (r nil)) | |
6 | + (let ((n (position separator string | |
7 | + :from-end t | |
8 | + :test #'(lambda (x y) | |
9 | + (find y x :test #'string=))))) | |
10 | + (if n | |
11 | + (split-str-1 (subseq string 0 n) separator (cons (subseq string (1+ n)) r)) | |
12 | + (cons string r)))) | |
13 | + | |
14 | +(defun split-str (string &optional (separator " ")) | |
15 | + (split-str-1 string separator)) | |
16 | + | |
17 | +(defun strconc (&rest elems) | |
18 | + "STRCONC concatenates all its arguments into one string." | |
19 | + (apply #'concatenate (append (list 'string) elems))) | |
20 | + | |
21 | +(defun argmax (fun args) | |
22 | + (let ((win (cons 0 nil))) | |
23 | + (dolist (a args (cdr win)) | |
24 | + (let ((score (funcall fun a))) | |
25 | + (if (>= score (car win)) (setf win (cons score a))))))) | |
26 | + | |
27 | +;;; local utility functions | |
28 | + | |
29 | +(defun load-tab (fpath) | |
30 | + "LOAD-TAB loads entries from the file and returns them as list of lists (entries)." | |
31 | + (nreverse (with-open-file (fl fpath) | |
32 | + (labels | |
33 | + ((tabs->list (ln) (split-str ln (string #\tab))) | |
34 | + (load-tab-1 (done-tab) (let ((ln (read-line fl nil))) ; throw nil on eof | |
35 | + (if ln | |
36 | + (cons (tabs->list ln) (load-tab-1 done-tab)) | |
37 | + done-tab)))) | |
38 | + (load-tab-1 nil))))) | |
39 | + | |
40 | +(defun load-tab-imp (fpath) | |
41 | + "LOAD-TAB-IMP loads entries from the file and returns them as list of lists (entries), and does | |
42 | + so in imperative way that doesn't upset the SBCL stack guard." | |
43 | + (nreverse (with-open-file (fl fpath) | |
44 | + (labels | |
45 | + ((tabs->list (ln) (split-str ln (string #\tab))) | |
46 | + (load-tab-1 () (let ((ln (read-line fl nil))) ; throw nil on eof | |
47 | + (if ln | |
48 | + (tabs->list ln) | |
49 | + nil)))) | |
50 | + (do ((result nil) | |
51 | + (ln (load-tab-1) (load-tab-1))) | |
52 | + ((null ln) result) | |
53 | + (setf result (cons ln result))))))) | |
54 | + | |
55 | +(defun load-dic (fpath) | |
56 | + "TAB->DIC loads a tab file into a hashmap, indexed by the first elements of element lists. The earlier | |
57 | + keys are **overwritten** by the latest one." | |
58 | + (with-open-file (fl fpath) | |
59 | + (labels | |
60 | + ((tabs->list (ln) (split-str ln (string #\tab))) | |
61 | + (load-tab-1 () (let ((ln (read-line fl nil))) ; throw nil on eof | |
62 | + (if ln | |
63 | + (tabs->list ln) | |
64 | + nil)))) | |
65 | + (do ((dic (make-hash-table :test #'equal)) | |
66 | + (ln (load-tab-1) (load-tab-1))) | |
67 | + ((null ln) dic) | |
68 | + (setf (gethash (first ln) dic) ln))))) | |
69 | + | |
70 | +(defun tab-to-file (tab fpath) | |
71 | + "TAB-TO-FILE gets a list obtained from LOAD-TAB and prints its content to a fil in FPATH, in the tab | |
72 | + format." | |
73 | + (labels ((tabify (etr) | |
74 | + (apply #'strconc (butlast (apply #'append ; skip the last element (\tab) | |
75 | + (mapcar (lambda (x) (list x (string #\tab))) | |
76 | + etr)))))) | |
77 | + (with-open-file (file fpath :direction :output) | |
78 | + (dolist (etr tab) | |
79 | + (princ (format nil "~a~%" (tabify etr)) file))))) | |
80 | + | |
81 | +(defun interp-tag (tag-str) | |
82 | + "INTERP-TAG takes a string and returns a list, containing tag elements as string, or | |
83 | + as lists, in case of an alternative." | |
84 | + (mapcar (lambda (x) (funcall | |
85 | + (lambda (y) (if (= 1 (length y)) (car y) y)) | |
86 | + (split-str x "."))) | |
87 | + (split-str tag-str ":"))) | |
88 | + | |
89 | +(defun tag-matches-p (tag1 tag2) | |
90 | + "TAG-MATCHES-P tests whether tag1 in contained by/matches tag2." | |
91 | + (and (= (length tag1) (length tag2)) | |
92 | + (every #'identity ; just ensure they're true | |
93 | + (mapcar (lambda (t1 t2) | |
94 | + (if (listp t2) | |
95 | + (if (listp t1) | |
96 | + (every (lambda (x) (position x t2 :test #'equal)) t1) | |
97 | + (position t1 t2 :test #'equal)) | |
98 | + (equalp t1 t2))) | |
99 | + tag1 | |
100 | + tag2)))) | |
101 | + | |
102 | +;;; the meat | |
103 | + | |
104 | +(defun remove-irreg (tab-list) | |
105 | + "REMOVE-IRREG takes a list taken from load-tab and removes the adj ... pos entries that don't | |
106 | + have com counterparts." | |
107 | + (labels ((maybe-mark-suspect (entry tl) | |
108 | + (let ((tg (interp-tag (second entry)))) | |
109 | + (if (and (equal (car tg) "adj") | |
110 | + (equal (car (last tg)) "pos")) | |
111 | + ;; look for com counterpart in the next iteration: | |
112 | + (remove-irreg-1 tl entry) | |
113 | + ;; the basic-basic case: | |
114 | + (cons entry | |
115 | + (remove-irreg-1 tl nil))))) | |
116 | + (remove-irreg-1 (tl suspect) ; suspect is an entry as list, or nil | |
117 | + (if (null tl) | |
118 | + nil | |
119 | + (if suspect | |
120 | + ;; if suspect, keep the entries only when they have a "com" counterpart | |
121 | + (funcall (lambda (tg) | |
122 | + (if (and (equal (car tg) "adj") | |
123 | + (equal (car (last tg)) "com") | |
124 | + (tag-matches-p ; cut the last tag - com/pos | |
125 | + (butlast (interp-tag (second suspect))) | |
126 | + (butlast tg))) | |
127 | + (cons suspect (cons (car tl) (remove-irreg-1 (cdr tl) nil))) | |
128 | + ;; this may also be suspect: | |
129 | + (maybe-mark-suspect (car tl) (cdr tl)))) | |
130 | + (interp-tag (second (car tl)))) | |
131 | + ;; if not suspect, try if this entry might be another suspect (ie. is a | |
132 | + ;; pos adj) | |
133 | + (maybe-mark-suspect (car tl) (cdr tl)))))) | |
134 | + (remove-irreg-1 tab-list nil))) | |
135 | + | |
136 | +(defun accom-tags (freq-list patt-list) | |
137 | + "ACCOM-TAGS returns a FREQ-LIST version were tags are expanded according to the patterns from | |
138 | + PATT-LIST." | |
139 | + ;; interp tags in patterns only once | |
140 | + (let ((patt-lookup (mapcar (lambda (patt) (interp-tag (second patt))) | |
141 | + patt-list))) | |
142 | + (mapcar | |
143 | + (lambda (etr) | |
144 | + (let* ((etr-tag (interp-tag (third etr))) | |
145 | + (matches (mapcar (lambda (patt-interp) (tag-matches-p etr-tag | |
146 | + patt-interp)) | |
147 | + patt-lookup)) | |
148 | + (cnt (count t matches))) | |
149 | + (cond | |
150 | + ((= cnt 1) (funcall (lambda (x y) (setf (third x) y) x) | |
151 | + (copy-seq etr) | |
152 | + (second (nth (position t matches) | |
153 | + patt-list)))) | |
154 | + ((= cnt 0) ;(format wrn-strm "WARN: No known classification of ~a" etr-tag) | |
155 | + etr) | |
156 | + (t | |
157 | + (funcall (lambda (x y) (setf (third x) y) x) | |
158 | + (copy-seq etr) | |
159 | + (strconc "AMBIG-" (third etr))))))) | |
160 | + ;; choose the longest (broadest) interpretation | |
161 | +;;; (let* ((counter 0) | |
162 | +;;; ;; extract all the matches to the interps variable | |
163 | +;;; (interps (mapcar (lambda (x) (nth x patt-list)) | |
164 | +;;; (remove nil | |
165 | +;;; (mapcar (lambda (y) (prog1 (if y counter nil) | |
166 | +;;; (incf counter))) | |
167 | +;;; matches))))) | |
168 | +;;; (format nil "Ambigous classification of ~a: ~a, the longest chosen" | |
169 | +;;; etr-tag interps) | |
170 | +;;; (funcall (lambda (x y) (setf (third x) y) x) | |
171 | +;;; (copy-seq etr) | |
172 | +;;; (second (argmax (lambda (x) (length (second x))) | |
173 | +;;; patt-list)))))))) | |
174 | + freq-list))) | |
175 | + | |
176 | +(defun fill-with-dic (freq-list dic) | |
177 | + (mapcar | |
178 | + (lambda (etr) | |
179 | + (if (not (equal (third etr) "AMBIG")) | |
180 | + etr | |
181 | + (if (gethash (first etr) dic) | |
182 | + (funcall (lambda (x y) (setf (third x) y) x) | |
183 | + (copy-seq etr) | |
184 | + (third (gethash (first etr) dic))) | |
185 | + (warn (format nil "Cannot find interpretation for ~a" (first etr)))))) | |
186 | + freq-list)) | |
187 | + | |
188 | +(defun exec () | |
189 | + (tab-to-file | |
190 | + ;; (fill-with-dic | |
191 | + (accom-tags (load-tab-imp "resources/NKJP1M/NKJP1M-tagged-frequency.tab") | |
192 | + (remove-irreg (load-tab-imp "morphology/data/interps.tab"))) | |
193 | + ;; (load-dic "../NLP resources/sgjp-20160724.tab")) | |
194 | + "resources/NKJP1M/NKJP1M-general-frequency.tab")) | |
... | ... |
resources/NKJP1M/NKJP-tagged-frequency-tagset.txt
... | ... | @@ -58,7 +58,6 @@ CW - common word. |
58 | 58 | SPEC - word of limited usage (scientific, vocational), but excluding slang. |
59 | 59 | NEOL - neologism. |
60 | 60 | EXT - external, non-Polish word. |
61 | -UNK - unknown, all the others (maybe new categories will be created for them). | |
62 | 61 | |
63 | 62 | 8. Correctness. |
64 | 63 | Possible values are: |
... | ... |