Commit 3634f1f2dfd8df37bc3f0fcd6e094c9eddfa81b2
1 parent
13ac1d32
code cleanup
Showing
1 changed file
with
54 additions
and
835 deletions
TrainingAndEval.ipynb
... | ... | @@ -2,23 +2,10 @@ |
2 | 2 | "cells": [ |
3 | 3 | { |
4 | 4 | "cell_type": "code", |
5 | - "execution_count": 1, | |
5 | + "execution_count": null, | |
6 | 6 | "id": "97d0c9ab", |
7 | 7 | "metadata": {}, |
8 | - "outputs": [ | |
9 | - { | |
10 | - "name": "stderr", | |
11 | - "output_type": "stream", | |
12 | - "text": [ | |
13 | - "2023-04-11 11:17:29.095631: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", | |
14 | - "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", | |
15 | - "2023-04-11 11:17:29.331444: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", | |
16 | - "2023-04-11 11:17:30.167497: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory\n", | |
17 | - "2023-04-11 11:17:30.167593: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory\n", | |
18 | - "2023-04-11 11:17:30.167603: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.\n" | |
19 | - ] | |
20 | - } | |
21 | - ], | |
8 | + "outputs": [], | |
22 | 9 | "source": [ |
23 | 10 | "import importlib\n", |
24 | 11 | "\n", |
... | ... | @@ -40,7 +27,7 @@ |
40 | 27 | }, |
41 | 28 | { |
42 | 29 | "cell_type": "code", |
43 | - "execution_count": 2, | |
30 | + "execution_count": null, | |
44 | 31 | "id": "c41d6630", |
45 | 32 | "metadata": {}, |
46 | 33 | "outputs": [], |
... | ... | @@ -51,36 +38,10 @@ |
51 | 38 | }, |
52 | 39 | { |
53 | 40 | "cell_type": "code", |
54 | - "execution_count": 3, | |
41 | + "execution_count": null, | |
55 | 42 | "id": "f30d7b7c", |
56 | 43 | "metadata": {}, |
57 | - "outputs": [ | |
58 | - { | |
59 | - "name": "stdout", | |
60 | - "output_type": "stream", | |
61 | - "text": [ | |
62 | - "1 Physical GPUs, 1 Logical GPUs\n" | |
63 | - ] | |
64 | - }, | |
65 | - { | |
66 | - "name": "stderr", | |
67 | - "output_type": "stream", | |
68 | - "text": [ | |
69 | - "2023-04-11 11:17:31.717262: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", | |
70 | - "2023-04-11 11:17:31.762533: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", | |
71 | - "2023-04-11 11:17:31.763529: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", | |
72 | - "2023-04-11 11:17:31.765670: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", | |
73 | - "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", | |
74 | - "2023-04-11 11:17:31.769196: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", | |
75 | - "2023-04-11 11:17:31.770058: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", | |
76 | - "2023-04-11 11:17:31.770816: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", | |
77 | - "2023-04-11 11:17:32.722287: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", | |
78 | - "2023-04-11 11:17:32.723281: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", | |
79 | - "2023-04-11 11:17:32.724062: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", | |
80 | - "2023-04-11 11:17:32.724846: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1616] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 20480 MB memory: -> device: 0, name: NVIDIA A100 80GB PCIe, pci bus id: 0000:00:05.0, compute capability: 8.0\n" | |
81 | - ] | |
82 | - } | |
83 | - ], | |
44 | + "outputs": [], | |
84 | 45 | "source": [ |
85 | 46 | "# https://www.tensorflow.org/guide/gpu\n", |
86 | 47 | "gpus = tf.config.list_physical_devices('GPU')\n", |
... | ... | @@ -98,33 +59,12 @@ |
98 | 59 | }, |
99 | 60 | { |
100 | 61 | "cell_type": "code", |
101 | - "execution_count": 4, | |
62 | + "execution_count": null, | |
102 | 63 | "id": "89afdb1e", |
103 | 64 | "metadata": { |
104 | 65 | "scrolled": true |
105 | 66 | }, |
106 | - "outputs": [ | |
107 | - { | |
108 | - "name": "stdout", | |
109 | - "output_type": "stream", | |
110 | - "text": [ | |
111 | - "/device:GPU:0\n", | |
112 | - "2.10.0\n" | |
113 | - ] | |
114 | - }, | |
115 | - { | |
116 | - "name": "stderr", | |
117 | - "output_type": "stream", | |
118 | - "text": [ | |
119 | - "2023-04-11 11:17:32.739308: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", | |
120 | - "2023-04-11 11:17:32.740224: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", | |
121 | - "2023-04-11 11:17:32.740975: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", | |
122 | - "2023-04-11 11:17:32.741809: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", | |
123 | - "2023-04-11 11:17:32.742586: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", | |
124 | - "2023-04-11 11:17:32.743322: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1616] Created device /device:GPU:0 with 20480 MB memory: -> device: 0, name: NVIDIA A100 80GB PCIe, pci bus id: 0000:00:05.0, compute capability: 8.0\n" | |
125 | - ] | |
126 | - } | |
127 | - ], | |
67 | + "outputs": [], | |
128 | 68 | "source": [ |
129 | 69 | "print(tf.test.gpu_device_name())\n", |
130 | 70 | "print(tf.__version__)" |
... | ... | @@ -132,63 +72,22 @@ |
132 | 72 | }, |
133 | 73 | { |
134 | 74 | "cell_type": "code", |
135 | - "execution_count": 5, | |
75 | + "execution_count": null, | |
136 | 76 | "id": "2b0ab576", |
137 | 77 | "metadata": {}, |
138 | - "outputs": [ | |
139 | - { | |
140 | - "name": "stderr", | |
141 | - "output_type": "stream", | |
142 | - "text": [ | |
143 | - "Found cached dataset pdb_c_beta (/home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1)\n" | |
144 | - ] | |
145 | - }, | |
146 | - { | |
147 | - "data": { | |
148 | - "application/vnd.jupyter.widget-view+json": { | |
149 | - "model_id": "55f181333dc44c7a811c515cc55c4988", | |
150 | - "version_major": 2, | |
151 | - "version_minor": 0 | |
152 | - }, | |
153 | - "text/plain": [ | |
154 | - " 0%| | 0/3 [00:00<?, ?it/s]" | |
155 | - ] | |
156 | - }, | |
157 | - "metadata": {}, | |
158 | - "output_type": "display_data" | |
159 | - } | |
160 | - ], | |
78 | + "outputs": [], | |
161 | 79 | "source": [ |
162 | 80 | "pdbc_dataset = load_dataset('pdb_c_beta')" |
163 | 81 | ] |
164 | 82 | }, |
165 | 83 | { |
166 | 84 | "cell_type": "code", |
167 | - "execution_count": 6, | |
85 | + "execution_count": null, | |
168 | 86 | "id": "2f4c317a", |
169 | 87 | "metadata": { |
170 | 88 | "scrolled": true |
171 | 89 | }, |
172 | - "outputs": [ | |
173 | - { | |
174 | - "name": "stderr", | |
175 | - "output_type": "stream", | |
176 | - "text": [ | |
177 | - "Loading cached processed dataset at /home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1/cache-ff2490f308f7f25b.arrow\n", | |
178 | - "Loading cached processed dataset at /home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1/cache-cbb40b0e978ab6ee.arrow\n", | |
179 | - "Loading cached processed dataset at /home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1/cache-3facbd810991cd6c.arrow\n", | |
180 | - "Loading cached processed dataset at /home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1/cache-e54a8628e59de21f.arrow\n", | |
181 | - "Loading cached processed dataset at /home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1/cache-9692de6b8224e758.arrow\n", | |
182 | - "Loading cached processed dataset at /home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1/cache-4042ffa1dc5d9323.arrow\n", | |
183 | - "Loading cached processed dataset at /home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1/cache-fb250709424f85ec.arrow\n", | |
184 | - "Loading cached processed dataset at /home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1/cache-1f6ce0a488a89d56.arrow\n", | |
185 | - "Loading cached processed dataset at /home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1/cache-2ae4daf5101c7aa2.arrow\n", | |
186 | - "Loading cached processed dataset at /home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1/cache-a1686820d15bcf04.arrow\n", | |
187 | - "Loading cached processed dataset at /home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1/cache-fe2c12481861f4bd.arrow\n", | |
188 | - "Loading cached processed dataset at /home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1/cache-da5a875c385c3570.arrow\n" | |
189 | - ] | |
190 | - } | |
191 | - ], | |
90 | + "outputs": [], | |
192 | 91 | "source": [ |
193 | 92 | "import importlib\n", |
194 | 93 | "\n", |
... | ... | @@ -203,20 +102,10 @@ |
203 | 102 | }, |
204 | 103 | { |
205 | 104 | "cell_type": "code", |
206 | - "execution_count": 7, | |
105 | + "execution_count": null, | |
207 | 106 | "id": "de1966ed", |
208 | 107 | "metadata": {}, |
209 | - "outputs": [ | |
210 | - { | |
211 | - "name": "stderr", | |
212 | - "output_type": "stream", | |
213 | - "text": [ | |
214 | - "Loading cached processed dataset at /home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1/cache-1dfcf507d62f6da8.arrow\n", | |
215 | - "Loading cached processed dataset at /home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1/cache-264c0111246b25c1.arrow\n", | |
216 | - "Loading cached processed dataset at /home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1/cache-6a40675124a412f0.arrow\n" | |
217 | - ] | |
218 | - } | |
219 | - ], | |
108 | + "outputs": [], | |
220 | 109 | "source": [ |
221 | 110 | "features = pdbc_dataset_spines['train'].features\n", |
222 | 111 | "pdbc_dataset_spines_cont = pdbc_dataset_spines.filter(\n", |
... | ... | @@ -226,41 +115,17 @@ |
226 | 115 | }, |
227 | 116 | { |
228 | 117 | "cell_type": "code", |
229 | - "execution_count": 8, | |
118 | + "execution_count": null, | |
230 | 119 | "id": "33ff295b", |
231 | 120 | "metadata": {}, |
232 | - "outputs": [ | |
233 | - { | |
234 | - "data": { | |
235 | - "text/plain": [ | |
236 | - "DatasetDict({\n", | |
237 | - " train: Dataset({\n", | |
238 | - " features: ['corp_id', 'sent_id', 'tokens', 'lemmas', 'cposes', 'poses', 'tags', 'heads', 'deprels', 'nonterminals', 'spines', 'anchors', 'anchor_hs'],\n", | |
239 | - " num_rows: 15903\n", | |
240 | - " })\n", | |
241 | - " validation: Dataset({\n", | |
242 | - " features: ['corp_id', 'sent_id', 'tokens', 'lemmas', 'cposes', 'poses', 'tags', 'heads', 'deprels', 'nonterminals', 'spines', 'anchors', 'anchor_hs'],\n", | |
243 | - " num_rows: 1980\n", | |
244 | - " })\n", | |
245 | - " test: Dataset({\n", | |
246 | - " features: ['corp_id', 'sent_id', 'tokens', 'lemmas', 'cposes', 'poses', 'tags', 'heads', 'deprels', 'nonterminals', 'spines', 'anchors', 'anchor_hs'],\n", | |
247 | - " num_rows: 1990\n", | |
248 | - " })\n", | |
249 | - "})" | |
250 | - ] | |
251 | - }, | |
252 | - "execution_count": 8, | |
253 | - "metadata": {}, | |
254 | - "output_type": "execute_result" | |
255 | - } | |
256 | - ], | |
121 | + "outputs": [], | |
257 | 122 | "source": [ |
258 | 123 | "pdbc_dataset_spines_cont" |
259 | 124 | ] |
260 | 125 | }, |
261 | 126 | { |
262 | 127 | "cell_type": "code", |
263 | - "execution_count": 9, | |
128 | + "execution_count": null, | |
264 | 129 | "id": "a8ddbc1f", |
265 | 130 | "metadata": {}, |
266 | 131 | "outputs": [], |
... | ... | @@ -270,7 +135,7 @@ |
270 | 135 | }, |
271 | 136 | { |
272 | 137 | "cell_type": "code", |
273 | - "execution_count": 10, | |
138 | + "execution_count": null, | |
274 | 139 | "id": "8029594b", |
275 | 140 | "metadata": {}, |
276 | 141 | "outputs": [], |
... | ... | @@ -288,30 +153,24 @@ |
288 | 153 | }, |
289 | 154 | { |
290 | 155 | "cell_type": "code", |
291 | - "execution_count": 36, | |
156 | + "execution_count": null, | |
292 | 157 | "id": "be8e93fa", |
293 | 158 | "metadata": {}, |
294 | 159 | "outputs": [], |
295 | 160 | "source": [ |
296 | - "def crop(dataset, n):\n", | |
297 | - " return dataset.filter(lambda example: len(example['tokens']) <= n)\n", | |
298 | - "\n", | |
299 | 161 | "spines_pdbc = ClassificationTask(\n", |
300 | 162 | " 'spines_pdbc',\n", |
301 | 163 | " pdbc_dataset_spines,\n", |
302 | - " #crop(pdbc_dataset, 6),\n", | |
303 | 164 | ")\n", |
304 | 165 | "\n", |
305 | 166 | "spines_pdbc_cont = ClassificationTask(\n", |
306 | 167 | " 'spines_pdbc_cont',\n", |
307 | 168 | " pdbc_dataset_spines_cont,\n", |
308 | - " #crop(pdbc_dataset, 6),\n", | |
309 | 169 | ")\n", |
310 | 170 | "\n", |
311 | 171 | "spines_pdbc_compressed = ClassificationTask(\n", |
312 | 172 | " 'spines_pdbc_compressed',\n", |
313 | 173 | " pdbc_dataset_spines_compressed,\n", |
314 | - " #crop(pdbc_dataset, 6),\n", | |
315 | 174 | ")\n", |
316 | 175 | "\n", |
317 | 176 | "TASK = spines_pdbc_compressed\n", |
... | ... | @@ -320,7 +179,7 @@ |
320 | 179 | }, |
321 | 180 | { |
322 | 181 | "cell_type": "code", |
323 | - "execution_count": 37, | |
182 | + "execution_count": null, | |
324 | 183 | "id": "7824fcee", |
325 | 184 | "metadata": {}, |
326 | 185 | "outputs": [], |
... | ... | @@ -330,56 +189,12 @@ |
330 | 189 | }, |
331 | 190 | { |
332 | 191 | "cell_type": "code", |
333 | - "execution_count": 38, | |
192 | + "execution_count": null, | |
334 | 193 | "id": "1eb5f41a", |
335 | 194 | "metadata": { |
336 | 195 | "scrolled": false |
337 | 196 | }, |
338 | - "outputs": [ | |
339 | - { | |
340 | - "name": "stdout", | |
341 | - "output_type": "stream", | |
342 | - "text": [ | |
343 | - "Loading BERT tokenizer...\n" | |
344 | - ] | |
345 | - }, | |
346 | - { | |
347 | - "name": "stderr", | |
348 | - "output_type": "stream", | |
349 | - "text": [ | |
350 | - "Loading cached processed dataset at /home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1/cache-49fe5b05228c3588.arrow\n" | |
351 | - ] | |
352 | - }, | |
353 | - { | |
354 | - "name": "stdout", | |
355 | - "output_type": "stream", | |
356 | - "text": [ | |
357 | - "Preprocessing the dataset for BERT...\n" | |
358 | - ] | |
359 | - }, | |
360 | - { | |
361 | - "data": { | |
362 | - "application/vnd.jupyter.widget-view+json": { | |
363 | - "model_id": "5f108b00fcab4db8a610f24ae03b7308", | |
364 | - "version_major": 2, | |
365 | - "version_minor": 0 | |
366 | - }, | |
367 | - "text/plain": [ | |
368 | - " 0%| | 0/2211 [00:00<?, ?ex/s]" | |
369 | - ] | |
370 | - }, | |
371 | - "metadata": {}, | |
372 | - "output_type": "display_data" | |
373 | - }, | |
374 | - { | |
375 | - "name": "stderr", | |
376 | - "output_type": "stream", | |
377 | - "text": [ | |
378 | - "Loading cached processed dataset at /home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1/cache-b8e2900fbd9615fd.arrow\n", | |
379 | - "You're using a HerbertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n" | |
380 | - ] | |
381 | - } | |
382 | - ], | |
197 | + "outputs": [], | |
383 | 198 | "source": [ |
384 | 199 | "trainer = training.Trainer(\n", |
385 | 200 | " MODEL,\n", |
... | ... | @@ -398,21 +213,10 @@ |
398 | 213 | }, |
399 | 214 | { |
400 | 215 | "cell_type": "code", |
401 | - "execution_count": 39, | |
216 | + "execution_count": null, | |
402 | 217 | "id": "276708cc", |
403 | 218 | "metadata": {}, |
404 | - "outputs": [ | |
405 | - { | |
406 | - "data": { | |
407 | - "text/plain": [ | |
408 | - "('keras_fit_logs_spines_pdbc_compressed', 'models_spines_pdbc_compressed')" | |
409 | - ] | |
410 | - }, | |
411 | - "execution_count": 39, | |
412 | - "metadata": {}, | |
413 | - "output_type": "execute_result" | |
414 | - } | |
415 | - ], | |
219 | + "outputs": [], | |
416 | 220 | "source": [ |
417 | 221 | "log_dir = f'keras_fit_logs_{TASK.name}'\n", |
418 | 222 | "model_dir = f'models_{TASK.name}'\n", |
... | ... | @@ -422,51 +226,12 @@ |
422 | 226 | }, |
423 | 227 | { |
424 | 228 | "cell_type": "code", |
425 | - "execution_count": 40, | |
229 | + "execution_count": null, | |
426 | 230 | "id": "e8ccde06", |
427 | 231 | "metadata": { |
428 | 232 | "scrolled": false |
429 | 233 | }, |
430 | - "outputs": [ | |
431 | - { | |
432 | - "name": "stdout", | |
433 | - "output_type": "stream", | |
434 | - "text": [ | |
435 | - "The tensorboard extension is already loaded. To reload it, use:\n", | |
436 | - " %reload_ext tensorboard\n", | |
437 | - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", | |
438 | - "To disable this warning, you can either:\n", | |
439 | - "\t- Avoid using `tokenizers` before the fork if possible\n", | |
440 | - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" | |
441 | - ] | |
442 | - }, | |
443 | - { | |
444 | - "data": { | |
445 | - "text/html": [ | |
446 | - "\n", | |
447 | - " <iframe id=\"tensorboard-frame-83a6a03964d4187a\" width=\"100%\" height=\"800\" frameborder=\"0\">\n", | |
448 | - " </iframe>\n", | |
449 | - " <script>\n", | |
450 | - " (function() {\n", | |
451 | - " const frame = document.getElementById(\"tensorboard-frame-83a6a03964d4187a\");\n", | |
452 | - " const url = new URL(\"/\", window.location);\n", | |
453 | - " const port = 6004;\n", | |
454 | - " if (port) {\n", | |
455 | - " url.port = port;\n", | |
456 | - " }\n", | |
457 | - " frame.src = url;\n", | |
458 | - " })();\n", | |
459 | - " </script>\n", | |
460 | - " " | |
461 | - ], | |
462 | - "text/plain": [ | |
463 | - "<IPython.core.display.HTML object>" | |
464 | - ] | |
465 | - }, | |
466 | - "metadata": {}, | |
467 | - "output_type": "display_data" | |
468 | - } | |
469 | - ], | |
234 | + "outputs": [], | |
470 | 235 | "source": [ |
471 | 236 | "%load_ext tensorboard\n", |
472 | 237 | "! killall tensorboard\n", |
... | ... | @@ -476,21 +241,12 @@ |
476 | 241 | }, |
477 | 242 | { |
478 | 243 | "cell_type": "code", |
479 | - "execution_count": 41, | |
244 | + "execution_count": null, | |
480 | 245 | "id": "a5b0da64", |
481 | 246 | "metadata": { |
482 | 247 | "scrolled": true |
483 | 248 | }, |
484 | - "outputs": [ | |
485 | - { | |
486 | - "name": "stdout", | |
487 | - "output_type": "stream", | |
488 | - "text": [ | |
489 | - "CPU times: user 6 µs, sys: 1 µs, total: 7 µs\n", | |
490 | - "Wall time: 15.7 µs\n" | |
491 | - ] | |
492 | - } | |
493 | - ], | |
249 | + "outputs": [], | |
494 | 250 | "source": [ |
495 | 251 | "%%time\n", |
496 | 252 | "\n", |
... | ... | @@ -505,45 +261,10 @@ |
505 | 261 | }, |
506 | 262 | { |
507 | 263 | "cell_type": "code", |
508 | - "execution_count": 42, | |
509 | - "id": "e42b2bd4", | |
510 | - "metadata": {}, | |
511 | - "outputs": [], | |
512 | - "source": [ | |
513 | - "#import importlib\n", | |
514 | - "#from neural_parser import hybrid_tree_utils\n", | |
515 | - "#importlib.reload(hybrid_tree_utils)\n", | |
516 | - "#from neural_parser import data_utils\n", | |
517 | - "#importlib.reload(data_utils)\n", | |
518 | - "#from neural_parser import constituency_parser\n", | |
519 | - "#importlib.reload(constituency_parser)" | |
520 | - ] | |
521 | - }, | |
522 | - { | |
523 | - "cell_type": "code", | |
524 | - "execution_count": 43, | |
264 | + "execution_count": null, | |
525 | 265 | "id": "2f65dead", |
526 | 266 | "metadata": {}, |
527 | - "outputs": [ | |
528 | - { | |
529 | - "name": "stdout", | |
530 | - "output_type": "stream", | |
531 | - "text": [ | |
532 | - "created 3 classifier(s)\n" | |
533 | - ] | |
534 | - }, | |
535 | - { | |
536 | - "name": "stderr", | |
537 | - "output_type": "stream", | |
538 | - "text": [ | |
539 | - "Some layers from the model checkpoint at models_spines_pdbc_compressed/model were not used when initializing TFBertForMultiTargetTokenClassification: ['dropout_73']\n", | |
540 | - "- This IS expected if you are initializing TFBertForMultiTargetTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", | |
541 | - "- This IS NOT expected if you are initializing TFBertForMultiTargetTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", | |
542 | - "All the layers of TFBertForMultiTargetTokenClassification were initialized from the model checkpoint at models_spines_pdbc_compressed/model.\n", | |
543 | - "If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMultiTargetTokenClassification for predictions without further training.\n" | |
544 | - ] | |
545 | - } | |
546 | - ], | |
267 | + "outputs": [], | |
547 | 268 | "source": [ |
548 | 269 | "if not TRAIN:\n", |
549 | 270 | " from neural_parser import constituency_parser\n", |
... | ... | @@ -552,7 +273,7 @@ |
552 | 273 | }, |
553 | 274 | { |
554 | 275 | "cell_type": "code", |
555 | - "execution_count": 44, | |
276 | + "execution_count": null, | |
556 | 277 | "id": "24edee79", |
557 | 278 | "metadata": {}, |
558 | 279 | "outputs": [], |
... | ... | @@ -565,41 +286,10 @@ |
565 | 286 | }, |
566 | 287 | { |
567 | 288 | "cell_type": "code", |
568 | - "execution_count": 45, | |
289 | + "execution_count": null, | |
569 | 290 | "id": "4a7cd10b", |
570 | 291 | "metadata": {}, |
571 | - "outputs": [ | |
572 | - { | |
573 | - "name": "stdout", | |
574 | - "output_type": "stream", | |
575 | - "text": [ | |
576 | - "1/1 [==============================] - 10s 10s/step\n" | |
577 | - ] | |
578 | - }, | |
579 | - { | |
580 | - "data": { | |
581 | - "text/plain": [ | |
582 | - "[(['Miał', 'em', 'kotka', '.'],\n", | |
583 | - " {'spines': ['ROOT_S_VP_V', '<EMPTY>', 'NP_N', 'Punct'],\n", | |
584 | - " 'anchors': ['<ROOT>', 'V', 'S', 'ROOT'],\n", | |
585 | - " 'anchor_hs': ['<ROOT>', '1', '1', '1']}),\n", | |
586 | - " (['Wlazł', 'kotek', 'na', 'płotek', 'i', 'mruga', '.'],\n", | |
587 | - " {'spines': ['VP_V',\n", | |
588 | - " 'NP_N',\n", | |
589 | - " 'PrepNP_Prep',\n", | |
590 | - " 'NP_N',\n", | |
591 | - " 'ROOT_S_VP_Conj',\n", | |
592 | - " 'VP_V',\n", | |
593 | - " 'Punct'],\n", | |
594 | - " 'anchors': ['VP', 'S', 'VP', 'PrepNP', '<ROOT>', 'VP', 'ROOT'],\n", | |
595 | - " 'anchor_hs': ['1', '1', '2', '1', '<ROOT>', '1', '1']})]" | |
596 | - ] | |
597 | - }, | |
598 | - "execution_count": 45, | |
599 | - "metadata": {}, | |
600 | - "output_type": "execute_result" | |
601 | - } | |
602 | - ], | |
292 | + "outputs": [], | |
603 | 293 | "source": [ |
604 | 294 | "parser.parse(sentences)" |
605 | 295 | ] |
... | ... | @@ -616,21 +306,10 @@ |
616 | 306 | }, |
617 | 307 | { |
618 | 308 | "cell_type": "code", |
619 | - "execution_count": 46, | |
309 | + "execution_count": null, | |
620 | 310 | "id": "4ac4b9df", |
621 | 311 | "metadata": {}, |
622 | - "outputs": [ | |
623 | - { | |
624 | - "data": { | |
625 | - "text/plain": [ | |
626 | - "<module 'neural_parser.constants' from '/home/kkrasnowska/neural-parsing/ICCS/neural_parser/constants.py'>" | |
627 | - ] | |
628 | - }, | |
629 | - "execution_count": 46, | |
630 | - "metadata": {}, | |
631 | - "output_type": "execute_result" | |
632 | - } | |
633 | - ], | |
312 | + "outputs": [], | |
634 | 313 | "source": [ |
635 | 314 | "from neural_parser import hybrid_tree_utils\n", |
636 | 315 | "importlib.reload(hybrid_tree_utils)\n", |
... | ... | @@ -640,86 +319,12 @@ |
640 | 319 | }, |
641 | 320 | { |
642 | 321 | "cell_type": "code", |
643 | - "execution_count": 47, | |
644 | - "id": "d1b28792", | |
645 | - "metadata": {}, | |
646 | - "outputs": [], | |
647 | - "source": [ | |
648 | - "from spacy import displacy\n", | |
649 | - "\n", | |
650 | - "def to_deps(tokens, deprels, heads):\n", | |
651 | - " deps = {'words' : [], 'arcs' : []}\n", | |
652 | - " for i, (token, deprel, head) in enumerate(zip(tokens, deprels, heads)):\n", | |
653 | - " deps['words'].append({'text' : token, 'tag' : 'X'})\n", | |
654 | - " if head >= 0:\n", | |
655 | - " d = 'left' if head > i else 'right'\n", | |
656 | - " start, end = sorted((i, head))\n", | |
657 | - " deps['arcs'].append({'start' : start, 'end' : end, 'label' : deprel, 'dir' : d})\n", | |
658 | - " return deps\n", | |
659 | - "\n", | |
660 | - "def display_deps(tokens, deprels, heads):\n", | |
661 | - " displacy.render(to_deps(tokens, deprels, heads), manual=True, options={'distance' : 80})\n", | |
662 | - " \n", | |
663 | - "import urllib.parse\n", | |
664 | - "import json\n", | |
665 | - "\n", | |
666 | - "def show_tree(tree):\n", | |
667 | - " tree_json = json.dumps(hybrid_tree_utils.tree2dict(tree)['tree'])\n", | |
668 | - " src = f'http://127.0.0.1:8010/?tree={urllib.parse.quote(tree_json)}'\n", | |
669 | - " display(IFrame(src, 950, 550))" | |
670 | - ] | |
671 | - }, | |
672 | - { | |
673 | - "cell_type": "code", | |
674 | - "execution_count": 48, | |
322 | + "execution_count": null, | |
675 | 323 | "id": "9f443569", |
676 | 324 | "metadata": { |
677 | 325 | "scrolled": true |
678 | 326 | }, |
679 | - "outputs": [ | |
680 | - { | |
681 | - "name": "stdout", | |
682 | - "output_type": "stream", | |
683 | - "text": [ | |
684 | - "2211\n", | |
685 | - "2205\n", | |
686 | - "['Całuję', '.']\n" | |
687 | - ] | |
688 | - }, | |
689 | - { | |
690 | - "data": { | |
691 | - "text/plain": [ | |
692 | - "{'heads': [None, 0],\n", | |
693 | - " 'deprels': ['ROOT', 'punct'],\n", | |
694 | - " 'spines': ['ROOT_S_VP_V', 'Punct'],\n", | |
695 | - " 'anchors': ['<ROOT>', 'ROOT'],\n", | |
696 | - " 'anchor_hs': ['<ROOT>', '1']}" | |
697 | - ] | |
698 | - }, | |
699 | - "metadata": {}, | |
700 | - "output_type": "display_data" | |
701 | - }, | |
702 | - { | |
703 | - "name": "stdout", | |
704 | - "output_type": "stream", | |
705 | - "text": [ | |
706 | - "['Drzemał', '.']\n" | |
707 | - ] | |
708 | - }, | |
709 | - { | |
710 | - "data": { | |
711 | - "text/plain": [ | |
712 | - "{'heads': [None, 0],\n", | |
713 | - " 'deprels': ['ROOT', 'punct'],\n", | |
714 | - " 'spines': ['ROOT_S_VP_V', 'Punct'],\n", | |
715 | - " 'anchors': ['<ROOT>', 'ROOT'],\n", | |
716 | - " 'anchor_hs': ['<ROOT>', '1']}" | |
717 | - ] | |
718 | - }, | |
719 | - "metadata": {}, | |
720 | - "output_type": "display_data" | |
721 | - } | |
722 | - ], | |
327 | + "outputs": [], | |
723 | 328 | "source": [ |
724 | 329 | "HDR = [\n", |
725 | 330 | " 'heads', 'deprels',\n", |
... | ... | @@ -753,49 +358,10 @@ |
753 | 358 | }, |
754 | 359 | { |
755 | 360 | "cell_type": "code", |
756 | - "execution_count": 49, | |
361 | + "execution_count": null, | |
757 | 362 | "id": "3f53c039", |
758 | 363 | "metadata": {}, |
759 | - "outputs": [ | |
760 | - { | |
761 | - "name": "stdout", | |
762 | - "output_type": "stream", | |
763 | - "text": [ | |
764 | - "70/70 [==============================] - 17s 152ms/step\n", | |
765 | - "69/69 [==============================] - 12s 168ms/step\n", | |
766 | - "['Całuję', '.']\n" | |
767 | - ] | |
768 | - }, | |
769 | - { | |
770 | - "data": { | |
771 | - "text/plain": [ | |
772 | - "{'spines': ['ROOT_S_VP_V', 'Punct'],\n", | |
773 | - " 'anchors': ['<ROOT>', 'ROOT'],\n", | |
774 | - " 'anchor_hs': ['<ROOT>', '1']}" | |
775 | - ] | |
776 | - }, | |
777 | - "metadata": {}, | |
778 | - "output_type": "display_data" | |
779 | - }, | |
780 | - { | |
781 | - "name": "stdout", | |
782 | - "output_type": "stream", | |
783 | - "text": [ | |
784 | - "['Drzemał', '.']\n" | |
785 | - ] | |
786 | - }, | |
787 | - { | |
788 | - "data": { | |
789 | - "text/plain": [ | |
790 | - "{'spines': ['ROOT_S_VP_V', 'Punct'],\n", | |
791 | - " 'anchors': ['<ROOT>', 'ROOT'],\n", | |
792 | - " 'anchor_hs': ['<ROOT>', '1']}" | |
793 | - ] | |
794 | - }, | |
795 | - "metadata": {}, | |
796 | - "output_type": "display_data" | |
797 | - } | |
798 | - ], | |
364 | + "outputs": [], | |
799 | 365 | "source": [ |
800 | 366 | "def get_predicted_data(TOKENS_TRUE):\n", |
801 | 367 | " PARSED = parser.parse([' '.join(toks) for toks in TOKENS_TRUE])\n", |
... | ... | @@ -821,45 +387,10 @@ |
821 | 387 | }, |
822 | 388 | { |
823 | 389 | "cell_type": "code", |
824 | - "execution_count": 50, | |
390 | + "execution_count": null, | |
825 | 391 | "id": "17c1d9cb", |
826 | 392 | "metadata": {}, |
827 | - "outputs": [ | |
828 | - { | |
829 | - "name": "stdout", | |
830 | - "output_type": "stream", | |
831 | - "text": [ | |
832 | - "2211\n", | |
833 | - "2205\n", | |
834 | - "['Całuję', '.']\n"
835 | - ] | |
836 | - }, | |
837 | - { | |
838 | - "data": { | |
839 | - "text/plain": [ | |
840 | - "{'heads': [None, 0], 'deprels': ['root', 'punct']}" | |
841 | - ] | |
842 | - }, | |
843 | - "metadata": {}, | |
844 | - "output_type": "display_data" | |
845 | - }, | |
846 | - { | |
847 | - "name": "stdout", | |
848 | - "output_type": "stream", | |
849 | - "text": [ | |
850 | - "['Drzemał', '.']\n"
851 | - ] | |
852 | - }, | |
853 | - { | |
854 | - "data": { | |
855 | - "text/plain": [ | |
856 | - "{'heads': [None, 0], 'deprels': ['root', 'punct']}" | |
857 | - ] | |
858 | - }, | |
859 | - "metadata": {}, | |
860 | - "output_type": "display_data" | |
861 | - } | |
862 | - ], | |
393 | + "outputs": [], | |
863 | 394 | "source": [ |
864 | 395 | "import conllu\n", |
865 | 396 | "\n", |
... | ... | @@ -894,7 +425,7 @@ |
894 | 425 | }, |
895 | 426 | { |
896 | 427 | "cell_type": "code", |
897 | - "execution_count": 51, | |
428 | + "execution_count": null, | |
898 | 429 | "id": "004918c6", |
899 | 430 | "metadata": {}, |
900 | 431 | "outputs": [], |
... | ... | @@ -913,42 +444,22 @@ |
913 | 444 | "def tree2spans(tree, labeled=True, headed=False):\n", |
914 | 445 | " spans = []\n", |
915 | 446 | " _tree2spans(tree, spans, labeled=labeled, headed=headed)\n", |
916 | - " # TODO\n", | |
917 | - " #try:\n", | |
918 | - " # assert(len(spans) == len(set(spans)))\n", | |
919 | - " #except:\n", | |
920 | - " # show_tree(tree)\n", | |
921 | - " # (display(spans))\n", | |
922 | - " # 1/0\n", | |
923 | 447 | " return set(spans)" |
924 | 448 | ] |
925 | 449 | }, |
926 | 450 | { |
927 | 451 | "cell_type": "code", |
928 | - "execution_count": 52, | |
452 | + "execution_count": null, | |
929 | 453 | "id": "65d493ca", |
930 | 454 | "metadata": {}, |
931 | - "outputs": [ | |
932 | - { | |
933 | - "data": { | |
934 | - "text/plain": [ | |
935 | - "<module 'neural_parser.hybrid_tree_utils' from '/home/kkrasnowska/neural-parsing/ICCS/neural_parser/hybrid_tree_utils.py'>" | |
936 | - ] | |
937 | - }, | |
938 | - "execution_count": 52, | |
939 | - "metadata": {}, | |
940 | - "output_type": "execute_result" | |
941 | - } | |
942 | - ], | |
455 | + "outputs": [], | |
943 | 456 | "source": [ |
944 | - "from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score\n", | |
945 | - "\n", | |
946 | - "importlib.reload(hybrid_tree_utils)" | |
457 | + "from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score" | |
947 | 458 | ] |
948 | 459 | }, |
949 | 460 | { |
950 | 461 | "cell_type": "code", |
951 | - "execution_count": 53, | |
462 | + "execution_count": null, | |
952 | 463 | "id": "e5f88e76", |
953 | 464 | "metadata": { |
954 | 465 | "scrolled": false |
... | ... | @@ -964,18 +475,11 @@ |
964 | 475 | " key : {'true' : [], 'pred' : []} for key in ('heads', ('heads', 'deprels'))\n", |
965 | 476 | " }\n", |
966 | 477 | "\n", |
967 | - " k = 0\n", | |
968 | 478 | " i = 0\n", |
969 | 479 | " PROBLEM_TREES = []\n", |
970 | 480 | "\n", |
971 | 481 | " for toks, true, pred, combo in zip(tokens, tags_true, tags_pred, tags_combo):\n", |
972 | 482 | " \n", |
973 | - " #sent = ' '.join(toks)\n", | |
974 | - " #cats = HDR\n", | |
975 | - " #true = dict(zip(cats, zip(*true)))\n", | |
976 | - " #pred = dict(zip(cats, zip(*pred)))\n", | |
977 | - " #print('----------------------------')\n", | |
978 | - " #print(sent)\n", | |
979 | 483 | " dummy = {'lemmas' : ['_' for _ in toks], 'tags' : ['_' for _ in toks]}\n", |
980 | 484 | " true.update(dummy)\n", |
981 | 485 | " pred.update(dummy)\n", |
... | ... | @@ -994,12 +498,6 @@ |
994 | 498 | " print('=============================')\n", |
995 | 499 | " raise\n", |
996 | 500 | " tree_pred, problems = None, None\n", |
997 | - " #if 'reattach' in problems:\n", | |
998 | - " # show_tree(tree_pred)\n", | |
999 | - " \n", | |
1000 | - " #if pred['lemmas_corr'] != pred['lemmas']:\n", | |
1001 | - " # print(pred['lemmas_corr'])\n", | |
1002 | - " # print(pred['lemmas'])\n", | |
1003 | 501 | " \n", |
1004 | 502 | " for key, v in accuracies.items():\n", |
1005 | 503 | " if type(key) == str:\n", |
... | ... | @@ -1011,31 +509,11 @@ |
1011 | 509 | " \n", |
1012 | 510 | " spans_true = tree2spans(tree_true, labeled=labeled, headed=headed)\n", |
1013 | 511 | " spans_pred = tree2spans(tree_pred, labeled=labeled, headed=headed) if tree_pred else set()\n", |
1014 | - " if 'adwokata' in toks:\n", | |
1015 | - " print(spans_true)\n", | |
1016 | - " print(spans_pred)\n", | |
1017 | 512 | " tp = len(spans_true.intersection(spans_pred))\n", |
1018 | 513 | " P[0] += tp\n", |
1019 | 514 | " R[0] += tp\n", |
1020 | 515 | " P[1] += len(spans_pred)\n", |
1021 | 516 | " R[1] += len(spans_true)\n", |
1022 | - " leafs = tree_true.get_yield()\n", | |
1023 | - " discont = [leaf.from_index for leaf in leafs] != list(range(len(leafs)))\n", | |
1024 | - " #if k < 5 and len(toks) > 9 and [leaf.features['index'] for leaf in leafs] != list(range(len(leafs))):\n", | |
1025 | - " #if k < 5 and spans_combo != spans_true:\n", | |
1026 | - " #if k < 5 and not OK:\n", | |
1027 | - " #if discont and len(toks) > 12 and k < 0 and spans_pred == spans_true:\n", | |
1028 | - " if len(toks) == 8 and k < 0:\n", | |
1029 | - " print('GOLD TREE:')\n", | |
1030 | - " show_tree(tree_true)\n", | |
1031 | - " display(true)\n", | |
1032 | - " #display(_tree2dict(tree_true))\n", | |
1033 | - " print('PREDICTED TREE:')\n", | |
1034 | - " show_tree(tree_pred)\n", | |
1035 | - " display(pred)\n", | |
1036 | - " print('FP:', spans_pred - spans_true)\n", | |
1037 | - " print('FN:', spans_true - spans_pred)\n", | |
1038 | - " k += 1\n", | |
1039 | 517 | " i += 1\n", |
1040 | 518 | " \n", |
1041 | 519 | " p, r = P[0]/P[1], R[0]/R[1]\n", |
... | ... | @@ -1060,25 +538,12 @@ |
1060 | 538 | }, |
1061 | 539 | { |
1062 | 540 | "cell_type": "code", |
1063 | - "execution_count": 54, | |
541 | + "execution_count": null, | |
1064 | 542 | "id": "8f8a771a", |
1065 | 543 | "metadata": { |
1066 | 544 | "scrolled": false |
1067 | 545 | }, |
1068 | - "outputs": [ | |
1069 | - { | |
1070 | - "name": "stdout", | |
1071 | - "output_type": "stream", | |
1072 | - "text": [ | |
1073 | - "unlabeled{((3,), 'SPAN', False), ((2, 3), 'SPAN', False), ((4,), 'SPAN', False), ((0, 1, 2, 3, 4), 'SPAN', False), ((0, 1, 2, 3), 'SPAN', False), ((2,), 'SPAN', False), ((0, 1), 'SPAN', False)}\n", | |
1074 | - "{((3,), 'SPAN', False), ((2, 3), 'SPAN', False), ((4,), 'SPAN', False), ((0, 1, 2, 3, 4), 'SPAN', False), ((0, 1, 2, 3), 'SPAN', False), ((2,), 'SPAN', False), ((0, 1), 'SPAN', False)}\n", | |
1075 | - "non-headed{((2,), 'Prep', False), ((4,), 'Punct', False), ((2, 3), 'PrepNP', False), ((3,), 'N', False), ((0, 1, 2, 3), 'S', False), ((0, 1), 'VP', False), ((0, 1), 'V', False), ((3,), 'NP', False), ((0, 1, 2, 3, 4), 'ROOT', False)}\n", | |
1076 | - "{((2,), 'Prep', False), ((4,), 'Punct', False), ((2, 3), 'PrepNP', False), ((3,), 'N', False), ((0, 1, 2, 3), 'S', False), ((0, 1), 'VP', False), ((0, 1), 'V', False), ((3,), 'NP', False), ((0, 1, 2, 3, 4), 'ROOT', False)}\n", | |
1077 | - "headed{((0, 1, 2, 3), 'S', True), ((4,), 'Punct', False), ((0, 1), 'VP', True), ((2, 3), 'PrepNP', False), ((0, 1), 'V', True), ((3,), 'NP', False), ((0, 1, 2, 3, 4), 'ROOT', False), ((2,), 'Prep', True), ((3,), 'N', True)}\n", | |
1078 | - "{((0, 1, 2, 3), 'S', True), ((4,), 'Punct', False), ((0, 1), 'VP', True), ((2, 3), 'PrepNP', False), ((0, 1), 'V', True), ((3,), 'NP', False), ((0, 1, 2, 3, 4), 'ROOT', False), ((2,), 'Prep', True), ((3,), 'N', True)}\n" | |
1079 | - ] | |
1080 | - } | |
1081 | - ], | |
546 | + "outputs": [], | |
1082 | 547 | "source": [ |
1083 | 548 | "EVAL_DATA = {\n", |
1084 | 549 | " '1val' : (TOKENS_VAL, TAGS_VAL, TAGS_P_VAL, TAGS_C_VAL),\n", |
... | ... | @@ -1113,7 +578,7 @@ |
1113 | 578 | }, |
1114 | 579 | { |
1115 | 580 | "cell_type": "code", |
1116 | - "execution_count": 55, | |
581 | + "execution_count": null, | |
1117 | 582 | "id": "63192852", |
1118 | 583 | "metadata": {}, |
1119 | 584 | "outputs": [], |
... | ... | @@ -1123,7 +588,7 @@ |
1123 | 588 | }, |
1124 | 589 | { |
1125 | 590 | "cell_type": "code", |
1126 | - "execution_count": 56, | |
591 | + "execution_count": null, | |
1127 | 592 | "id": "78250b1b", |
1128 | 593 | "metadata": {}, |
1129 | 594 | "outputs": [], |
... | ... | @@ -1133,7 +598,7 @@ |
1133 | 598 | }, |
1134 | 599 | { |
1135 | 600 | "cell_type": "code", |
1136 | - "execution_count": 57, | |
601 | + "execution_count": null, | |
1137 | 602 | "id": "bba6ed15", |
1138 | 603 | "metadata": {}, |
1139 | 604 | "outputs": [], |
... | ... | @@ -1143,260 +608,20 @@ |
1143 | 608 | }, |
1144 | 609 | { |
1145 | 610 | "cell_type": "code", |
1146 | - "execution_count": 58, | |
611 | + "execution_count": null, | |
1147 | 612 | "id": "543377f8", |
1148 | 613 | "metadata": {}, |
1149 | - "outputs": [ | |
1150 | - { | |
1151 | - "data": { | |
1152 | - "text/html": [ | |
1153 | - "<div>\n", | |
1154 | - "<style scoped>\n", | |
1155 | - " .dataframe tbody tr th:only-of-type {\n", | |
1156 | - " vertical-align: middle;\n", | |
1157 | - " }\n", | |
1158 | - "\n", | |
1159 | - " .dataframe tbody tr th {\n", | |
1160 | - " vertical-align: top;\n", | |
1161 | - " }\n", | |
1162 | - "\n", | |
1163 | - " .dataframe thead th {\n", | |
1164 | - " text-align: right;\n", | |
1165 | - " }\n", | |
1166 | - "</style>\n", | |
1167 | - "<table border=\"1\" class=\"dataframe\">\n", | |
1168 | - " <thead>\n", | |
1169 | - " <tr style=\"text-align: right;\">\n", | |
1170 | - " <th></th>\n", | |
1171 | - " <th></th>\n", | |
1172 | - " <th></th>\n", | |
1173 | - " <th></th>\n", | |
1174 | - " <th>dataset</th>\n", | |
1175 | - " <th>measure_type</th>\n", | |
1176 | - " <th>measure</th>\n", | |
1177 | - " <th>value</th>\n", | |
1178 | - " </tr>\n", | |
1179 | - " <tr>\n", | |
1180 | - " <th>dataset</th>\n", | |
1181 | - " <th>measure</th>\n", | |
1182 | - " <th>measure_type</th>\n", | |
1183 | - " <th></th>\n", | |
1184 | - " <th></th>\n", | |
1185 | - " <th></th>\n", | |
1186 | - " <th></th>\n", | |
1187 | - " <th></th>\n", | |
1188 | - " </tr>\n", | |
1189 | - " </thead>\n", | |
1190 | - " <tbody>\n", | |
1191 | - " <tr>\n", | |
1192 | - " <th rowspan=\"9\" valign=\"top\">test</th>\n", | |
1193 | - " <th rowspan=\"3\" valign=\"top\">F1</th>\n", | |
1194 | - " <th>headed</th>\n", | |
1195 | - " <th>7</th>\n", | |
1196 | - " <td>test</td>\n", | |
1197 | - " <td>headed</td>\n", | |
1198 | - " <td>F1</td>\n", | |
1199 | - " <td>0.959192</td>\n", | |
1200 | - " </tr>\n", | |
1201 | - " <tr>\n", | |
1202 | - " <th>non-headed</th>\n", | |
1203 | - " <th>8</th>\n", | |
1204 | - " <td>test</td>\n", | |
1205 | - " <td>non-headed</td>\n", | |
1206 | - " <td>F1</td>\n", | |
1207 | - " <td>0.965236</td>\n", | |
1208 | - " </tr>\n", | |
1209 | - " <tr>\n", | |
1210 | - " <th>unlabeled</th>\n", | |
1211 | - " <th>15</th>\n", | |
1212 | - " <td>test</td>\n", | |
1213 | - " <td>unlabeled</td>\n", | |
1214 | - " <td>F1</td>\n", | |
1215 | - " <td>0.964436</td>\n", | |
1216 | - " </tr>\n", | |
1217 | - " <tr>\n", | |
1218 | - " <th rowspan=\"3\" valign=\"top\">P</th>\n", | |
1219 | - " <th>headed</th>\n", | |
1220 | - " <th>9</th>\n", | |
1221 | - " <td>test</td>\n", | |
1222 | - " <td>headed</td>\n", | |
1223 | - " <td>P</td>\n", | |
1224 | - " <td>0.959611</td>\n", | |
1225 | - " </tr>\n", | |
1226 | - " <tr>\n", | |
1227 | - " <th>non-headed</th>\n", | |
1228 | - " <th>6</th>\n", | |
1229 | - " <td>test</td>\n", | |
1230 | - " <td>non-headed</td>\n", | |
1231 | - " <td>P</td>\n", | |
1232 | - " <td>0.965658</td>\n", | |
1233 | - " </tr>\n", | |
1234 | - " <tr>\n", | |
1235 | - " <th>unlabeled</th>\n", | |
1236 | - " <th>13</th>\n", | |
1237 | - " <td>test</td>\n", | |
1238 | - " <td>unlabeled</td>\n", | |
1239 | - " <td>P</td>\n", | |
1240 | - " <td>0.964118</td>\n", | |
1241 | - " </tr>\n", | |
1242 | - " <tr>\n", | |
1243 | - " <th rowspan=\"3\" valign=\"top\">R</th>\n", | |
1244 | - " <th>headed</th>\n", | |
1245 | - " <th>2</th>\n", | |
1246 | - " <td>test</td>\n", | |
1247 | - " <td>headed</td>\n", | |
1248 | - " <td>R</td>\n", | |
1249 | - " <td>0.958773</td>\n", | |
1250 | - " </tr>\n", | |
1251 | - " <tr>\n", | |
1252 | - " <th>non-headed</th>\n", | |
1253 | - " <th>5</th>\n", | |
1254 | - " <td>test</td>\n", | |
1255 | - " <td>non-headed</td>\n", | |
1256 | - " <td>R</td>\n", | |
1257 | - " <td>0.964815</td>\n", | |
1258 | - " </tr>\n", | |
1259 | - " <tr>\n", | |
1260 | - " <th>unlabeled</th>\n", | |
1261 | - " <th>0</th>\n", | |
1262 | - " <td>test</td>\n", | |
1263 | - " <td>unlabeled</td>\n", | |
1264 | - " <td>R</td>\n", | |
1265 | - " <td>0.964754</td>\n", | |
1266 | - " </tr>\n", | |
1267 | - " <tr>\n", | |
1268 | - " <th rowspan=\"9\" valign=\"top\">val</th>\n", | |
1269 | - " <th rowspan=\"3\" valign=\"top\">F1</th>\n", | |
1270 | - " <th>headed</th>\n", | |
1271 | - " <th>14</th>\n", | |
1272 | - " <td>val</td>\n", | |
1273 | - " <td>headed</td>\n", | |
1274 | - " <td>F1</td>\n", | |
1275 | - " <td>0.957423</td>\n", | |
1276 | - " </tr>\n", | |
1277 | - " <tr>\n", | |
1278 | - " <th>non-headed</th>\n", | |
1279 | - " <th>4</th>\n", | |
1280 | - " <td>val</td>\n", | |
1281 | - " <td>non-headed</td>\n", | |
1282 | - " <td>F1</td>\n", | |
1283 | - " <td>0.963231</td>\n", | |
1284 | - " </tr>\n", | |
1285 | - " <tr>\n", | |
1286 | - " <th>unlabeled</th>\n", | |
1287 | - " <th>1</th>\n", | |
1288 | - " <td>val</td>\n", | |
1289 | - " <td>unlabeled</td>\n", | |
1290 | - " <td>F1</td>\n", | |
1291 | - " <td>0.962553</td>\n", | |
1292 | - " </tr>\n", | |
1293 | - " <tr>\n", | |
1294 | - " <th rowspan=\"3\" valign=\"top\">P</th>\n", | |
1295 | - " <th>headed</th>\n", | |
1296 | - " <th>10</th>\n", | |
1297 | - " <td>val</td>\n", | |
1298 | - " <td>headed</td>\n", | |
1299 | - " <td>P</td>\n", | |
1300 | - " <td>0.958145</td>\n", | |
1301 | - " </tr>\n", | |
1302 | - " <tr>\n", | |
1303 | - " <th>non-headed</th>\n", | |
1304 | - " <th>16</th>\n", | |
1305 | - " <td>val</td>\n", | |
1306 | - " <td>non-headed</td>\n", | |
1307 | - " <td>P</td>\n", | |
1308 | - " <td>0.963958</td>\n", | |
1309 | - " </tr>\n", | |
1310 | - " <tr>\n", | |
1311 | - " <th>unlabeled</th>\n", | |
1312 | - " <th>11</th>\n", | |
1313 | - " <td>val</td>\n", | |
1314 | - " <td>unlabeled</td>\n", | |
1315 | - " <td>P</td>\n", | |
1316 | - " <td>0.962762</td>\n", | |
1317 | - " </tr>\n", | |
1318 | - " <tr>\n", | |
1319 | - " <th rowspan=\"3\" valign=\"top\">R</th>\n", | |
1320 | - " <th>headed</th>\n", | |
1321 | - " <th>17</th>\n", | |
1322 | - " <td>val</td>\n", | |
1323 | - " <td>headed</td>\n", | |
1324 | - " <td>R</td>\n", | |
1325 | - " <td>0.956702</td>\n", | |
1326 | - " </tr>\n", | |
1327 | - " <tr>\n", | |
1328 | - " <th>non-headed</th>\n", | |
1329 | - " <th>12</th>\n", | |
1330 | - " <td>val</td>\n", | |
1331 | - " <td>non-headed</td>\n", | |
1332 | - " <td>R</td>\n", | |
1333 | - " <td>0.962505</td>\n", | |
1334 | - " </tr>\n", | |
1335 | - " <tr>\n", | |
1336 | - " <th>unlabeled</th>\n", | |
1337 | - " <th>3</th>\n", | |
1338 | - " <td>val</td>\n", | |
1339 | - " <td>unlabeled</td>\n", | |
1340 | - " <td>R</td>\n", | |
1341 | - " <td>0.962343</td>\n", | |
1342 | - " </tr>\n", | |
1343 | - " </tbody>\n", | |
1344 | - "</table>\n", | |
1345 | - "</div>" | |
1346 | - ], | |
1347 | - "text/plain": [ | |
1348 | - " dataset measure_type measure value\n", | |
1349 | - "dataset measure measure_type \n", | |
1350 | - "test F1 headed 7 test headed F1 0.959192\n", | |
1351 | - " non-headed 8 test non-headed F1 0.965236\n", | |
1352 | - " unlabeled 15 test unlabeled F1 0.964436\n", | |
1353 | - " P headed 9 test headed P 0.959611\n", | |
1354 | - " non-headed 6 test non-headed P 0.965658\n", | |
1355 | - " unlabeled 13 test unlabeled P 0.964118\n", | |
1356 | - " R headed 2 test headed R 0.958773\n", | |
1357 | - " non-headed 5 test non-headed R 0.964815\n", | |
1358 | - " unlabeled 0 test unlabeled R 0.964754\n", | |
1359 | - "val F1 headed 14 val headed F1 0.957423\n", | |
1360 | - " non-headed 4 val non-headed F1 0.963231\n", | |
1361 | - " unlabeled 1 val unlabeled F1 0.962553\n", | |
1362 | - " P headed 10 val headed P 0.958145\n", | |
1363 | - " non-headed 16 val non-headed P 0.963958\n", | |
1364 | - " unlabeled 11 val unlabeled P 0.962762\n", | |
1365 | - " R headed 17 val headed R 0.956702\n", | |
1366 | - " non-headed 12 val non-headed R 0.962505\n", | |
1367 | - " unlabeled 3 val unlabeled R 0.962343" | |
1368 | - ] | |
1369 | - }, | |
1370 | - "execution_count": 58, | |
1371 | - "metadata": {}, | |
1372 | - "output_type": "execute_result" | |
1373 | - } | |
1374 | - ], | |
614 | + "outputs": [], | |
1375 | 615 | "source": [ |
1376 | 616 | "results.groupby(['dataset', 'measure', 'measure_type'], group_keys=True).apply(lambda x: x)" |
1377 | 617 | ] |
1378 | 618 | }, |
1379 | 619 | { |
1380 | 620 | "cell_type": "code", |
1381 | - "execution_count": 59, | |
621 | + "execution_count": null, | |
1382 | 622 | "id": "0b5d3fe4", |
1383 | 623 | "metadata": {}, |
1384 | - "outputs": [ | |
1385 | - { | |
1386 | - "name": "stdout", | |
1387 | - "output_type": "stream", | |
1388 | - "text": [ | |
1389 | - "\\toprule\n", | |
1390 | - "& \\multicolumn{3}{c}{validation} & \\multicolumn{3}{c}{test} \\\\\n", | |
1391 | - "& precision & recall & F1 & precision & recall & F1 \\\\\n", | |
1392 | - "\\midrule\n", | |
1393 | - "1unlabeled & 96.28\\% & 96.23\\% & 96.26\\% & 96.41\\% & 96.48\\% & 96.44\\% \\\\\n", | |
1394 | - "2non-headed & 96.40\\% & 96.25\\% & 96.32\\% & 96.57\\% & 96.48\\% & 96.52\\% \\\\\n", | |
1395 | - "3headed & 95.81\\% & 95.67\\% & 95.74\\% & 95.96\\% & 95.88\\% & 95.92\\% \\\\\n", | |
1396 | - "\\bottomrule\n" | |
1397 | - ] | |
1398 | - } | |
1399 | - ], | |
624 | + "outputs": [], | |
1400 | 625 | "source": [ |
1401 | 626 | "for t in tex:\n", |
1402 | 627 | " print(t, end='')" |
... | ... | @@ -1444,10 +669,6 @@ |
1444 | 669 | " precisions = precision_score(TRUE, PRED, average=None)\n", |
1445 | 670 | " recalls = recall_score(TRUE, PRED, average=None)\n", |
1446 | 671 | " f1s = f1_score(TRUE, PRED, average=None)\n", |
1447 | - " #for v, p, r, f in sorted(zip(values, precisions, recalls, f1s), key=lambda x: -x[3]):\n", | |
1448 | - " # if v.endswith('formarzecz') or v.endswith('formaczas'):\n", | |
1449 | - " # spine = ' $\\\\rightarrow$ '.join(f'\\\\nt{{{n}}}' for n in v.split('_'))\n", | |
1450 | - " # print(f'{spine} & {100 * p:.2f}\\\\% & {100 * r:.2f}\\\\% & {100 * f:.2f}\\\\% \\\\\\\\')\n", | |
1451 | 672 | " \n", |
1452 | 673 | " ct_pre, cp_pre = Counter(), Counter()\n", |
1453 | 674 | " for val in values:\n", |
... | ... | @@ -1458,7 +679,6 @@ |
1458 | 679 | " rows = []\n", |
1459 | 680 | " \n", |
1460 | 681 | " for pre in ct_pre.keys():\n", |
1461 | - " # TODO\n", | |
1462 | 682 | " if pre == 'ign':\n", |
1463 | 683 | " continue\n", |
1464 | 684 | " if not cp_pre[pre] * ct_pre[pre]:\n", |
... | ... | @@ -1472,7 +692,6 @@ |
1472 | 692 | " spine = ' $\\\\rightarrow$ '.join(f'\\\\nt{{{n}}}' for n in v.split('_'))\n", |
1473 | 693 | " rws.append(f'{spine} & {100 * p:.2f}\\\\% & {100 * r:.2f}\\\\% & {100 * f:.2f}\\\\% & {ct[v]} \\\\\\\\')\n", |
1474 | 694 | " wp, wr = cp[v] / cp_pre[pre], ct[v] / ct_pre[pre]\n", |
1475 | - " #print(f' {v:36s} {100 * p:6.2f} {wp:7.3f} {100 * r:6.2f} {wr:7.3f}')\n", | |
1476 | 695 | " P += p * wp\n", |
1477 | 696 | " R += r * wr\n", |
1478 | 697 | " F = 2 * P * R / (P + R)\n", |
... | ... |