Commit 3634f1f2dfd8df37bc3f0fcd6e094c9eddfa81b2

Authored by Katarzyna Krasnowska
1 parent 13ac1d32

code cleanup

Showing 1 changed file with 54 additions and 835 deletions
TrainingAndEval.ipynb
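
Note: the cleanup below resets every "execution_count" to null and empties each cell's "outputs" list, which keeps machine-specific logs (TensorFlow device messages, HuggingFace cache paths, TensorBoard iframes) out of version control. A minimal sketch of how such stripping can be automated with nbformat; the file name comes from this commit, the rest is illustrative (a CLI equivalent is jupyter nbconvert --clear-output --inplace TrainingAndEval.ipynb):

    import nbformat

    # Load the notebook, drop stored outputs, reset execution counters, write back.
    nb = nbformat.read('TrainingAndEval.ipynb', as_version=4)
    for cell in nb.cells:
        if cell.cell_type == 'code':
            cell.outputs = []            # serialized as "outputs": []
            cell.execution_count = None  # serialized as "execution_count": null
    nbformat.write(nb, 'TrainingAndEval.ipynb')
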
... ... @@ -2,23 +2,10 @@
2 2 "cells": [
3 3 {
4 4 "cell_type": "code",
5   - "execution_count": 1,
  5 + "execution_count": null,
6 6 "id": "97d0c9ab",
7 7 "metadata": {},
8   - "outputs": [
9   - {
10   - "name": "stderr",
11   - "output_type": "stream",
12   - "text": [
13   - "2023-04-11 11:17:29.095631: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n",
14   - "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
15   - "2023-04-11 11:17:29.331444: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
16   - "2023-04-11 11:17:30.167497: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory\n",
17   - "2023-04-11 11:17:30.167593: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory\n",
18   - "2023-04-11 11:17:30.167603: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.\n"
19   - ]
20   - }
21   - ],
  8 + "outputs": [],
22 9 "source": [
23 10 "import importlib\n",
24 11 "\n",
... ... @@ -40,7 +27,7 @@
40 27 },
41 28 {
42 29 "cell_type": "code",
43   - "execution_count": 2,
  30 + "execution_count": null,
44 31 "id": "c41d6630",
45 32 "metadata": {},
46 33 "outputs": [],
... ... @@ -51,36 +38,10 @@
51 38 },
52 39 {
53 40 "cell_type": "code",
54   - "execution_count": 3,
  41 + "execution_count": null,
55 42 "id": "f30d7b7c",
56 43 "metadata": {},
57   - "outputs": [
58   - {
59   - "name": "stdout",
60   - "output_type": "stream",
61   - "text": [
62   - "1 Physical GPUs, 1 Logical GPUs\n"
63   - ]
64   - },
65   - {
66   - "name": "stderr",
67   - "output_type": "stream",
68   - "text": [
69   - "2023-04-11 11:17:31.717262: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
70   - "2023-04-11 11:17:31.762533: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
71   - "2023-04-11 11:17:31.763529: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
72   - "2023-04-11 11:17:31.765670: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n",
73   - "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
74   - "2023-04-11 11:17:31.769196: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
75   - "2023-04-11 11:17:31.770058: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
76   - "2023-04-11 11:17:31.770816: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
77   - "2023-04-11 11:17:32.722287: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
78   - "2023-04-11 11:17:32.723281: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
79   - "2023-04-11 11:17:32.724062: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
80   - "2023-04-11 11:17:32.724846: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1616] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 20480 MB memory: -> device: 0, name: NVIDIA A100 80GB PCIe, pci bus id: 0000:00:05.0, compute capability: 8.0\n"
81   - ]
82   - }
83   - ],
  44 + "outputs": [],
84 45 "source": [
85 46 "# https://www.tensorflow.org/guide/gpu\n",
86 47 "gpus = tf.config.list_physical_devices('GPU')\n",
... ... @@ -98,33 +59,12 @@
98 59 },
99 60 {
100 61 "cell_type": "code",
101   - "execution_count": 4,
  62 + "execution_count": null,
102 63 "id": "89afdb1e",
103 64 "metadata": {
104 65 "scrolled": true
105 66 },
106   - "outputs": [
107   - {
108   - "name": "stdout",
109   - "output_type": "stream",
110   - "text": [
111   - "/device:GPU:0\n",
112   - "2.10.0\n"
113   - ]
114   - },
115   - {
116   - "name": "stderr",
117   - "output_type": "stream",
118   - "text": [
119   - "2023-04-11 11:17:32.739308: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
120   - "2023-04-11 11:17:32.740224: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
121   - "2023-04-11 11:17:32.740975: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
122   - "2023-04-11 11:17:32.741809: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
123   - "2023-04-11 11:17:32.742586: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
124   - "2023-04-11 11:17:32.743322: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1616] Created device /device:GPU:0 with 20480 MB memory: -> device: 0, name: NVIDIA A100 80GB PCIe, pci bus id: 0000:00:05.0, compute capability: 8.0\n"
125   - ]
126   - }
127   - ],
  67 + "outputs": [],
128 68 "source": [
129 69 "print(tf.test.gpu_device_name())\n",
130 70 "print(tf.__version__)"
... ... @@ -132,63 +72,22 @@
132 72 },
133 73 {
134 74 "cell_type": "code",
135   - "execution_count": 5,
  75 + "execution_count": null,
136 76 "id": "2b0ab576",
137 77 "metadata": {},
138   - "outputs": [
139   - {
140   - "name": "stderr",
141   - "output_type": "stream",
142   - "text": [
143   - "Found cached dataset pdb_c_beta (/home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1)\n"
144   - ]
145   - },
146   - {
147   - "data": {
148   - "application/vnd.jupyter.widget-view+json": {
149   - "model_id": "55f181333dc44c7a811c515cc55c4988",
150   - "version_major": 2,
151   - "version_minor": 0
152   - },
153   - "text/plain": [
154   - " 0%| | 0/3 [00:00<?, ?it/s]"
155   - ]
156   - },
157   - "metadata": {},
158   - "output_type": "display_data"
159   - }
160   - ],
  78 + "outputs": [],
161 79 "source": [
162 80 "pdbc_dataset = load_dataset('pdb_c_beta')"
163 81 ]
164 82 },
165 83 {
166 84 "cell_type": "code",
167   - "execution_count": 6,
  85 + "execution_count": null,
168 86 "id": "2f4c317a",
169 87 "metadata": {
170 88 "scrolled": true
171 89 },
172   - "outputs": [
173   - {
174   - "name": "stderr",
175   - "output_type": "stream",
176   - "text": [
177   - "Loading cached processed dataset at /home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1/cache-ff2490f308f7f25b.arrow\n",
178   - "Loading cached processed dataset at /home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1/cache-cbb40b0e978ab6ee.arrow\n",
179   - "Loading cached processed dataset at /home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1/cache-3facbd810991cd6c.arrow\n",
180   - "Loading cached processed dataset at /home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1/cache-e54a8628e59de21f.arrow\n",
181   - "Loading cached processed dataset at /home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1/cache-9692de6b8224e758.arrow\n",
182   - "Loading cached processed dataset at /home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1/cache-4042ffa1dc5d9323.arrow\n",
183   - "Loading cached processed dataset at /home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1/cache-fb250709424f85ec.arrow\n",
184   - "Loading cached processed dataset at /home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1/cache-1f6ce0a488a89d56.arrow\n",
185   - "Loading cached processed dataset at /home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1/cache-2ae4daf5101c7aa2.arrow\n",
186   - "Loading cached processed dataset at /home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1/cache-a1686820d15bcf04.arrow\n",
187   - "Loading cached processed dataset at /home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1/cache-fe2c12481861f4bd.arrow\n",
188   - "Loading cached processed dataset at /home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1/cache-da5a875c385c3570.arrow\n"
189   - ]
190   - }
191   - ],
  90 + "outputs": [],
192 91 "source": [
193 92 "import importlib\n",
194 93 "\n",
... ... @@ -203,20 +102,10 @@
203 102 },
204 103 {
205 104 "cell_type": "code",
206   - "execution_count": 7,
  105 + "execution_count": null,
207 106 "id": "de1966ed",
208 107 "metadata": {},
209   - "outputs": [
210   - {
211   - "name": "stderr",
212   - "output_type": "stream",
213   - "text": [
214   - "Loading cached processed dataset at /home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1/cache-1dfcf507d62f6da8.arrow\n",
215   - "Loading cached processed dataset at /home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1/cache-264c0111246b25c1.arrow\n",
216   - "Loading cached processed dataset at /home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1/cache-6a40675124a412f0.arrow\n"
217   - ]
218   - }
219   - ],
  108 + "outputs": [],
220 109 "source": [
221 110 "features = pdbc_dataset_spines['train'].features\n",
222 111 "pdbc_dataset_spines_cont = pdbc_dataset_spines.filter(\n",
... ... @@ -226,41 +115,17 @@
226 115 },
227 116 {
228 117 "cell_type": "code",
229   - "execution_count": 8,
  118 + "execution_count": null,
230 119 "id": "33ff295b",
231 120 "metadata": {},
232   - "outputs": [
233   - {
234   - "data": {
235   - "text/plain": [
236   - "DatasetDict({\n",
237   - " train: Dataset({\n",
238   - " features: ['corp_id', 'sent_id', 'tokens', 'lemmas', 'cposes', 'poses', 'tags', 'heads', 'deprels', 'nonterminals', 'spines', 'anchors', 'anchor_hs'],\n",
239   - " num_rows: 15903\n",
240   - " })\n",
241   - " validation: Dataset({\n",
242   - " features: ['corp_id', 'sent_id', 'tokens', 'lemmas', 'cposes', 'poses', 'tags', 'heads', 'deprels', 'nonterminals', 'spines', 'anchors', 'anchor_hs'],\n",
243   - " num_rows: 1980\n",
244   - " })\n",
245   - " test: Dataset({\n",
246   - " features: ['corp_id', 'sent_id', 'tokens', 'lemmas', 'cposes', 'poses', 'tags', 'heads', 'deprels', 'nonterminals', 'spines', 'anchors', 'anchor_hs'],\n",
247   - " num_rows: 1990\n",
248   - " })\n",
249   - "})"
250   - ]
251   - },
252   - "execution_count": 8,
253   - "metadata": {},
254   - "output_type": "execute_result"
255   - }
256   - ],
  121 + "outputs": [],
257 122 "source": [
258 123 "pdbc_dataset_spines_cont"
259 124 ]
260 125 },
261 126 {
262 127 "cell_type": "code",
263   - "execution_count": 9,
  128 + "execution_count": null,
264 129 "id": "a8ddbc1f",
265 130 "metadata": {},
266 131 "outputs": [],
... ... @@ -270,7 +135,7 @@
270 135 },
271 136 {
272 137 "cell_type": "code",
273   - "execution_count": 10,
  138 + "execution_count": null,
274 139 "id": "8029594b",
275 140 "metadata": {},
276 141 "outputs": [],
... ... @@ -288,30 +153,24 @@
288 153 },
289 154 {
290 155 "cell_type": "code",
291   - "execution_count": 36,
  156 + "execution_count": null,
292 157 "id": "be8e93fa",
293 158 "metadata": {},
294 159 "outputs": [],
295 160 "source": [
296   - "def crop(dataset, n):\n",
297   - " return dataset.filter(lambda example: len(example['tokens']) <= n)\n",
298   - "\n",
299 161 "spines_pdbc = ClassificationTask(\n",
300 162 " 'spines_pdbc',\n",
301 163 " pdbc_dataset_spines,\n",
302   - " #crop(pdbc_dataset, 6),\n",
303 164 ")\n",
304 165 "\n",
305 166 "spines_pdbc_cont = ClassificationTask(\n",
306 167 " 'spines_pdbc_cont',\n",
307 168 " pdbc_dataset_spines_cont,\n",
308   - " #crop(pdbc_dataset, 6),\n",
309 169 ")\n",
310 170 "\n",
311 171 "spines_pdbc_compressed = ClassificationTask(\n",
312 172 " 'spines_pdbc_compressed',\n",
313 173 " pdbc_dataset_spines_compressed,\n",
314   - " #crop(pdbc_dataset, 6),\n",
315 174 ")\n",
316 175 "\n",
317 176 "TASK = spines_pdbc_compressed\n",
... ... @@ -320,7 +179,7 @@
320 179 },
321 180 {
322 181 "cell_type": "code",
323   - "execution_count": 37,
  182 + "execution_count": null,
324 183 "id": "7824fcee",
325 184 "metadata": {},
326 185 "outputs": [],
... ... @@ -330,56 +189,12 @@
330 189 },
331 190 {
332 191 "cell_type": "code",
333   - "execution_count": 38,
  192 + "execution_count": null,
334 193 "id": "1eb5f41a",
335 194 "metadata": {
336 195 "scrolled": false
337 196 },
338   - "outputs": [
339   - {
340   - "name": "stdout",
341   - "output_type": "stream",
342   - "text": [
343   - "Loading BERT tokenizer...\n"
344   - ]
345   - },
346   - {
347   - "name": "stderr",
348   - "output_type": "stream",
349   - "text": [
350   - "Loading cached processed dataset at /home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1/cache-49fe5b05228c3588.arrow\n"
351   - ]
352   - },
353   - {
354   - "name": "stdout",
355   - "output_type": "stream",
356   - "text": [
357   - "Preprocessing the dataset for BERT...\n"
358   - ]
359   - },
360   - {
361   - "data": {
362   - "application/vnd.jupyter.widget-view+json": {
363   - "model_id": "5f108b00fcab4db8a610f24ae03b7308",
364   - "version_major": 2,
365   - "version_minor": 0
366   - },
367   - "text/plain": [
368   - " 0%| | 0/2211 [00:00<?, ?ex/s]"
369   - ]
370   - },
371   - "metadata": {},
372   - "output_type": "display_data"
373   - },
374   - {
375   - "name": "stderr",
376   - "output_type": "stream",
377   - "text": [
378   - "Loading cached processed dataset at /home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1/cache-b8e2900fbd9615fd.arrow\n",
379   - "You're using a HerbertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n"
380   - ]
381   - }
382   - ],
  197 + "outputs": [],
383 198 "source": [
384 199 "trainer = training.Trainer(\n",
385 200 " MODEL,\n",
... ... @@ -398,21 +213,10 @@
398 213 },
399 214 {
400 215 "cell_type": "code",
401   - "execution_count": 39,
  216 + "execution_count": null,
402 217 "id": "276708cc",
403 218 "metadata": {},
404   - "outputs": [
405   - {
406   - "data": {
407   - "text/plain": [
408   - "('keras_fit_logs_spines_pdbc_compressed', 'models_spines_pdbc_compressed')"
409   - ]
410   - },
411   - "execution_count": 39,
412   - "metadata": {},
413   - "output_type": "execute_result"
414   - }
415   - ],
  219 + "outputs": [],
416 220 "source": [
417 221 "log_dir = f'keras_fit_logs_{TASK.name}'\n",
418 222 "model_dir = f'models_{TASK.name}'\n",
... ... @@ -422,51 +226,12 @@
422 226 },
423 227 {
424 228 "cell_type": "code",
425   - "execution_count": 40,
  229 + "execution_count": null,
426 230 "id": "e8ccde06",
427 231 "metadata": {
428 232 "scrolled": false
429 233 },
430   - "outputs": [
431   - {
432   - "name": "stdout",
433   - "output_type": "stream",
434   - "text": [
435   - "The tensorboard extension is already loaded. To reload it, use:\n",
436   - " %reload_ext tensorboard\n",
437   - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
438   - "To disable this warning, you can either:\n",
439   - "\t- Avoid using `tokenizers` before the fork if possible\n",
440   - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
441   - ]
442   - },
443   - {
444   - "data": {
445   - "text/html": [
446   - "\n",
447   - " <iframe id=\"tensorboard-frame-83a6a03964d4187a\" width=\"100%\" height=\"800\" frameborder=\"0\">\n",
448   - " </iframe>\n",
449   - " <script>\n",
450   - " (function() {\n",
451   - " const frame = document.getElementById(\"tensorboard-frame-83a6a03964d4187a\");\n",
452   - " const url = new URL(\"/\", window.location);\n",
453   - " const port = 6004;\n",
454   - " if (port) {\n",
455   - " url.port = port;\n",
456   - " }\n",
457   - " frame.src = url;\n",
458   - " })();\n",
459   - " </script>\n",
460   - " "
461   - ],
462   - "text/plain": [
463   - "<IPython.core.display.HTML object>"
464   - ]
465   - },
466   - "metadata": {},
467   - "output_type": "display_data"
468   - }
469   - ],
  234 + "outputs": [],
470 235 "source": [
471 236 "%load_ext tensorboard\n",
472 237 "! killall tensorboard\n",
... ... @@ -476,21 +241,12 @@
476 241 },
477 242 {
478 243 "cell_type": "code",
479   - "execution_count": 41,
  244 + "execution_count": null,
480 245 "id": "a5b0da64",
481 246 "metadata": {
482 247 "scrolled": true
483 248 },
484   - "outputs": [
485   - {
486   - "name": "stdout",
487   - "output_type": "stream",
488   - "text": [
489   - "CPU times: user 6 ยตs, sys: 1 ยตs, total: 7 ยตs\n",
490   - "Wall time: 15.7 ยตs\n"
491   - ]
492   - }
493   - ],
  249 + "outputs": [],
494 250 "source": [
495 251 "%%time\n",
496 252 "\n",
... ... @@ -505,45 +261,10 @@
505 261 },
506 262 {
507 263 "cell_type": "code",
508   - "execution_count": 42,
509   - "id": "e42b2bd4",
510   - "metadata": {},
511   - "outputs": [],
512   - "source": [
513   - "#import importlib\n",
514   - "#from neural_parser import hybrid_tree_utils\n",
515   - "#importlib.reload(hybrid_tree_utils)\n",
516   - "#from neural_parser import data_utils\n",
517   - "#importlib.reload(data_utils)\n",
518   - "#from neural_parser import constituency_parser\n",
519   - "#importlib.reload(constituency_parser)"
520   - ]
521   - },
522   - {
523   - "cell_type": "code",
524   - "execution_count": 43,
  264 + "execution_count": null,
525 265 "id": "2f65dead",
526 266 "metadata": {},
527   - "outputs": [
528   - {
529   - "name": "stdout",
530   - "output_type": "stream",
531   - "text": [
532   - "created 3 classifier(s)\n"
533   - ]
534   - },
535   - {
536   - "name": "stderr",
537   - "output_type": "stream",
538   - "text": [
539   - "Some layers from the model checkpoint at models_spines_pdbc_compressed/model were not used when initializing TFBertForMultiTargetTokenClassification: ['dropout_73']\n",
540   - "- This IS expected if you are initializing TFBertForMultiTargetTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
541   - "- This IS NOT expected if you are initializing TFBertForMultiTargetTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
542   - "All the layers of TFBertForMultiTargetTokenClassification were initialized from the model checkpoint at models_spines_pdbc_compressed/model.\n",
543   - "If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMultiTargetTokenClassification for predictions without further training.\n"
544   - ]
545   - }
546   - ],
  267 + "outputs": [],
547 268 "source": [
548 269 "if not TRAIN:\n",
549 270 " from neural_parser import constituency_parser\n",
... ... @@ -552,7 +273,7 @@
552 273 },
553 274 {
554 275 "cell_type": "code",
555   - "execution_count": 44,
  276 + "execution_count": null,
556 277 "id": "24edee79",
557 278 "metadata": {},
558 279 "outputs": [],
... ... @@ -565,41 +286,10 @@
565 286 },
566 287 {
567 288 "cell_type": "code",
568   - "execution_count": 45,
  289 + "execution_count": null,
569 290 "id": "4a7cd10b",
570 291 "metadata": {},
571   - "outputs": [
572   - {
573   - "name": "stdout",
574   - "output_type": "stream",
575   - "text": [
576   - "1/1 [==============================] - 10s 10s/step\n"
577   - ]
578   - },
579   - {
580   - "data": {
581   - "text/plain": [
582   - "[(['Miaล‚', 'em', 'kotka', '.'],\n",
583   - " {'spines': ['ROOT_S_VP_V', '<EMPTY>', 'NP_N', 'Punct'],\n",
584   - " 'anchors': ['<ROOT>', 'V', 'S', 'ROOT'],\n",
585   - " 'anchor_hs': ['<ROOT>', '1', '1', '1']}),\n",
586   - " (['Wlazล‚', 'kotek', 'na', 'pล‚otek', 'i', 'mruga', '.'],\n",
587   - " {'spines': ['VP_V',\n",
588   - " 'NP_N',\n",
589   - " 'PrepNP_Prep',\n",
590   - " 'NP_N',\n",
591   - " 'ROOT_S_VP_Conj',\n",
592   - " 'VP_V',\n",
593   - " 'Punct'],\n",
594   - " 'anchors': ['VP', 'S', 'VP', 'PrepNP', '<ROOT>', 'VP', 'ROOT'],\n",
595   - " 'anchor_hs': ['1', '1', '2', '1', '<ROOT>', '1', '1']})]"
596   - ]
597   - },
598   - "execution_count": 45,
599   - "metadata": {},
600   - "output_type": "execute_result"
601   - }
602   - ],
  292 + "outputs": [],
603 293 "source": [
604 294 "parser.parse(sentences)"
605 295 ]
... ... @@ -616,21 +306,10 @@
616 306 },
617 307 {
618 308 "cell_type": "code",
619   - "execution_count": 46,
  309 + "execution_count": null,
620 310 "id": "4ac4b9df",
621 311 "metadata": {},
622   - "outputs": [
623   - {
624   - "data": {
625   - "text/plain": [
626   - "<module 'neural_parser.constants' from '/home/kkrasnowska/neural-parsing/ICCS/neural_parser/constants.py'>"
627   - ]
628   - },
629   - "execution_count": 46,
630   - "metadata": {},
631   - "output_type": "execute_result"
632   - }
633   - ],
  312 + "outputs": [],
634 313 "source": [
635 314 "from neural_parser import hybrid_tree_utils\n",
636 315 "importlib.reload(hybrid_tree_utils)\n",
... ... @@ -640,86 +319,12 @@
640 319 },
641 320 {
642 321 "cell_type": "code",
643   - "execution_count": 47,
644   - "id": "d1b28792",
645   - "metadata": {},
646   - "outputs": [],
647   - "source": [
648   - "from spacy import displacy\n",
649   - "\n",
650   - "def to_deps(tokens, deprels, heads):\n",
651   - " deps = {'words' : [], 'arcs' : []}\n",
652   - " for i, (token, deprel, head) in enumerate(zip(tokens, deprels, heads)):\n",
653   - " deps['words'].append({'text' : token, 'tag' : 'X'})\n",
654   - " if head >= 0:\n",
655   - " d = 'left' if head > i else 'right'\n",
656   - " start, end = sorted((i, head))\n",
657   - " deps['arcs'].append({'start' : start, 'end' : end, 'label' : deprel, 'dir' : d})\n",
658   - " return deps\n",
659   - "\n",
660   - "def display_deps(tokens, deprels, heads):\n",
661   - " displacy.render(to_deps(tokens, deprels, heads), manual=True, options={'distance' : 80})\n",
662   - " \n",
663   - "import urllib.parse\n",
664   - "import json\n",
665   - "\n",
666   - "def show_tree(tree):\n",
667   - " tree_json = json.dumps(hybrid_tree_utils.tree2dict(tree)['tree'])\n",
668   - " src = f'http://127.0.0.1:8010/?tree={urllib.parse.quote(tree_json)}'\n",
669   - " display(IFrame(src, 950, 550))"
670   - ]
671   - },
672   - {
673   - "cell_type": "code",
674   - "execution_count": 48,
  322 + "execution_count": null,
675 323 "id": "9f443569",
676 324 "metadata": {
677 325 "scrolled": true
678 326 },
679   - "outputs": [
680   - {
681   - "name": "stdout",
682   - "output_type": "stream",
683   - "text": [
684   - "2211\n",
685   - "2205\n",
686   - "['Caล‚ujฤ™', '.']\n"
687   - ]
688   - },
689   - {
690   - "data": {
691   - "text/plain": [
692   - "{'heads': [None, 0],\n",
693   - " 'deprels': ['ROOT', 'punct'],\n",
694   - " 'spines': ['ROOT_S_VP_V', 'Punct'],\n",
695   - " 'anchors': ['<ROOT>', 'ROOT'],\n",
696   - " 'anchor_hs': ['<ROOT>', '1']}"
697   - ]
698   - },
699   - "metadata": {},
700   - "output_type": "display_data"
701   - },
702   - {
703   - "name": "stdout",
704   - "output_type": "stream",
705   - "text": [
706   - "['Drzemaล‚', '.']\n"
707   - ]
708   - },
709   - {
710   - "data": {
711   - "text/plain": [
712   - "{'heads': [None, 0],\n",
713   - " 'deprels': ['ROOT', 'punct'],\n",
714   - " 'spines': ['ROOT_S_VP_V', 'Punct'],\n",
715   - " 'anchors': ['<ROOT>', 'ROOT'],\n",
716   - " 'anchor_hs': ['<ROOT>', '1']}"
717   - ]
718   - },
719   - "metadata": {},
720   - "output_type": "display_data"
721   - }
722   - ],
  327 + "outputs": [],
723 328 "source": [
724 329 "HDR = [\n",
725 330 " 'heads', 'deprels',\n",
... ... @@ -753,49 +358,10 @@
753 358 },
754 359 {
755 360 "cell_type": "code",
756   - "execution_count": 49,
  361 + "execution_count": null,
757 362 "id": "3f53c039",
758 363 "metadata": {},
759   - "outputs": [
760   - {
761   - "name": "stdout",
762   - "output_type": "stream",
763   - "text": [
764   - "70/70 [==============================] - 17s 152ms/step\n",
765   - "69/69 [==============================] - 12s 168ms/step\n",
766   - "['Caล‚ujฤ™', '.']\n"
767   - ]
768   - },
769   - {
770   - "data": {
771   - "text/plain": [
772   - "{'spines': ['ROOT_S_VP_V', 'Punct'],\n",
773   - " 'anchors': ['<ROOT>', 'ROOT'],\n",
774   - " 'anchor_hs': ['<ROOT>', '1']}"
775   - ]
776   - },
777   - "metadata": {},
778   - "output_type": "display_data"
779   - },
780   - {
781   - "name": "stdout",
782   - "output_type": "stream",
783   - "text": [
784   - "['Drzemaล‚', '.']\n"
785   - ]
786   - },
787   - {
788   - "data": {
789   - "text/plain": [
790   - "{'spines': ['ROOT_S_VP_V', 'Punct'],\n",
791   - " 'anchors': ['<ROOT>', 'ROOT'],\n",
792   - " 'anchor_hs': ['<ROOT>', '1']}"
793   - ]
794   - },
795   - "metadata": {},
796   - "output_type": "display_data"
797   - }
798   - ],
  364 + "outputs": [],
799 365 "source": [
800 366 "def get_predicted_data(TOKENS_TRUE):\n",
801 367 " PARSED = parser.parse([' '.join(toks) for toks in TOKENS_TRUE])\n",
... ... @@ -821,45 +387,10 @@
821 387 },
822 388 {
823 389 "cell_type": "code",
824   - "execution_count": 50,
  390 + "execution_count": null,
825 391 "id": "17c1d9cb",
826 392 "metadata": {},
827   - "outputs": [
828   - {
829   - "name": "stdout",
830   - "output_type": "stream",
831   - "text": [
832   - "2211\n",
833   - "2205\n",
834   - "['Caล‚ujฤ™', '.']\n"
835   - ]
836   - },
837   - {
838   - "data": {
839   - "text/plain": [
840   - "{'heads': [None, 0], 'deprels': ['root', 'punct']}"
841   - ]
842   - },
843   - "metadata": {},
844   - "output_type": "display_data"
845   - },
846   - {
847   - "name": "stdout",
848   - "output_type": "stream",
849   - "text": [
850   - "['Drzemaล‚', '.']\n"
851   - ]
852   - },
853   - {
854   - "data": {
855   - "text/plain": [
856   - "{'heads': [None, 0], 'deprels': ['root', 'punct']}"
857   - ]
858   - },
859   - "metadata": {},
860   - "output_type": "display_data"
861   - }
862   - ],
  393 + "outputs": [],
863 394 "source": [
864 395 "import conllu\n",
865 396 "\n",
... ... @@ -894,7 +425,7 @@
894 425 },
895 426 {
896 427 "cell_type": "code",
897   - "execution_count": 51,
  428 + "execution_count": null,
898 429 "id": "004918c6",
899 430 "metadata": {},
900 431 "outputs": [],
... ... @@ -913,42 +444,22 @@
913 444 "def tree2spans(tree, labeled=True, headed=False):\n",
914 445 " spans = []\n",
915 446 " _tree2spans(tree, spans, labeled=labeled, headed=headed)\n",
916   - " # TODO\n",
917   - " #try:\n",
918   - " # assert(len(spans) == len(set(spans)))\n",
919   - " #except:\n",
920   - " # show_tree(tree)\n",
921   - " # (display(spans))\n",
922   - " # 1/0\n",
923 447 " return set(spans)"
924 448 ]
925 449 },
926 450 {
927 451 "cell_type": "code",
928   - "execution_count": 52,
  452 + "execution_count": null,
929 453 "id": "65d493ca",
930 454 "metadata": {},
931   - "outputs": [
932   - {
933   - "data": {
934   - "text/plain": [
935   - "<module 'neural_parser.hybrid_tree_utils' from '/home/kkrasnowska/neural-parsing/ICCS/neural_parser/hybrid_tree_utils.py'>"
936   - ]
937   - },
938   - "execution_count": 52,
939   - "metadata": {},
940   - "output_type": "execute_result"
941   - }
942   - ],
  455 + "outputs": [],
943 456 "source": [
944   - "from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score\n",
945   - "\n",
946   - "importlib.reload(hybrid_tree_utils)"
  457 + "from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score"
947 458 ]
948 459 },
949 460 {
950 461 "cell_type": "code",
951   - "execution_count": 53,
  462 + "execution_count": null,
952 463 "id": "e5f88e76",
953 464 "metadata": {
954 465 "scrolled": false
... ... @@ -964,18 +475,11 @@
964 475 " key : {'true' : [], 'pred' : []} for key in ('heads', ('heads', 'deprels'))\n",
965 476 " }\n",
966 477 "\n",
967   - " k = 0\n",
968 478 " i = 0\n",
969 479 " PROBLEM_TREES = []\n",
970 480 "\n",
971 481 " for toks, true, pred, combo in zip(tokens, tags_true, tags_pred, tags_combo):\n",
972 482 " \n",
973   - " #sent = ' '.join(toks)\n",
974   - " #cats = HDR\n",
975   - " #true = dict(zip(cats, zip(*true)))\n",
976   - " #pred = dict(zip(cats, zip(*pred)))\n",
977   - " #print('----------------------------')\n",
978   - " #print(sent)\n",
979 483 " dummy = {'lemmas' : ['_' for _ in toks], 'tags' : ['_' for _ in toks]}\n",
980 484 " true.update(dummy)\n",
981 485 " pred.update(dummy)\n",
... ... @@ -994,12 +498,6 @@
994 498 " print('=============================')\n",
995 499 " raise\n",
996 500 " tree_pred, problems = None, None\n",
997   - " #if 'reattach' in problems:\n",
998   - " # show_tree(tree_pred)\n",
999   - " \n",
1000   - " #if pred['lemmas_corr'] != pred['lemmas']:\n",
1001   - " # print(pred['lemmas_corr'])\n",
1002   - " # print(pred['lemmas'])\n",
1003 501 " \n",
1004 502 " for key, v in accuracies.items():\n",
1005 503 " if type(key) == str:\n",
... ... @@ -1011,31 +509,11 @@
1011 509 " \n",
1012 510 " spans_true = tree2spans(tree_true, labeled=labeled, headed=headed)\n",
1013 511 " spans_pred = tree2spans(tree_pred, labeled=labeled, headed=headed) if tree_pred else set()\n",
1014   - " if 'adwokata' in toks:\n",
1015   - " print(spans_true)\n",
1016   - " print(spans_pred)\n",
1017 512 " tp = len(spans_true.intersection(spans_pred))\n",
1018 513 " P[0] += tp\n",
1019 514 " R[0] += tp\n",
1020 515 " P[1] += len(spans_pred)\n",
1021 516 " R[1] += len(spans_true)\n",
1022   - " leafs = tree_true.get_yield()\n",
1023   - " discont = [leaf.from_index for leaf in leafs] != list(range(len(leafs)))\n",
1024   - " #if k < 5 and len(toks) > 9 and [leaf.features['index'] for leaf in leafs] != list(range(len(leafs))):\n",
1025   - " #if k < 5 and spans_combo != spans_true:\n",
1026   - " #if k < 5 and not OK:\n",
1027   - " #if discont and len(toks) > 12 and k < 0 and spans_pred == spans_true:\n",
1028   - " if len(toks) == 8 and k < 0:\n",
1029   - " print('GOLD TREE:')\n",
1030   - " show_tree(tree_true)\n",
1031   - " display(true)\n",
1032   - " #display(_tree2dict(tree_true))\n",
1033   - " print('PREDICTED TREE:')\n",
1034   - " show_tree(tree_pred)\n",
1035   - " display(pred)\n",
1036   - " print('FP:', spans_pred - spans_true)\n",
1037   - " print('FN:', spans_true - spans_pred)\n",
1038   - " k += 1\n",
1039 517 " i += 1\n",
1040 518 " \n",
1041 519 " p, r = P[0]/P[1], R[0]/R[1]\n",
... ... @@ -1060,25 +538,12 @@
1060 538 },
1061 539 {
1062 540 "cell_type": "code",
1063   - "execution_count": 54,
  541 + "execution_count": null,
1064 542 "id": "8f8a771a",
1065 543 "metadata": {
1066 544 "scrolled": false
1067 545 },
1068   - "outputs": [
1069   - {
1070   - "name": "stdout",
1071   - "output_type": "stream",
1072   - "text": [
1073   - "unlabeled{((3,), 'SPAN', False), ((2, 3), 'SPAN', False), ((4,), 'SPAN', False), ((0, 1, 2, 3, 4), 'SPAN', False), ((0, 1, 2, 3), 'SPAN', False), ((2,), 'SPAN', False), ((0, 1), 'SPAN', False)}\n",
1074   - "{((3,), 'SPAN', False), ((2, 3), 'SPAN', False), ((4,), 'SPAN', False), ((0, 1, 2, 3, 4), 'SPAN', False), ((0, 1, 2, 3), 'SPAN', False), ((2,), 'SPAN', False), ((0, 1), 'SPAN', False)}\n",
1075   - "non-headed{((2,), 'Prep', False), ((4,), 'Punct', False), ((2, 3), 'PrepNP', False), ((3,), 'N', False), ((0, 1, 2, 3), 'S', False), ((0, 1), 'VP', False), ((0, 1), 'V', False), ((3,), 'NP', False), ((0, 1, 2, 3, 4), 'ROOT', False)}\n",
1076   - "{((2,), 'Prep', False), ((4,), 'Punct', False), ((2, 3), 'PrepNP', False), ((3,), 'N', False), ((0, 1, 2, 3), 'S', False), ((0, 1), 'VP', False), ((0, 1), 'V', False), ((3,), 'NP', False), ((0, 1, 2, 3, 4), 'ROOT', False)}\n",
1077   - "headed{((0, 1, 2, 3), 'S', True), ((4,), 'Punct', False), ((0, 1), 'VP', True), ((2, 3), 'PrepNP', False), ((0, 1), 'V', True), ((3,), 'NP', False), ((0, 1, 2, 3, 4), 'ROOT', False), ((2,), 'Prep', True), ((3,), 'N', True)}\n",
1078   - "{((0, 1, 2, 3), 'S', True), ((4,), 'Punct', False), ((0, 1), 'VP', True), ((2, 3), 'PrepNP', False), ((0, 1), 'V', True), ((3,), 'NP', False), ((0, 1, 2, 3, 4), 'ROOT', False), ((2,), 'Prep', True), ((3,), 'N', True)}\n"
1079   - ]
1080   - }
1081   - ],
  546 + "outputs": [],
1082 547 "source": [
1083 548 "EVAL_DATA = {\n",
1084 549 " '1val' : (TOKENS_VAL, TAGS_VAL, TAGS_P_VAL, TAGS_C_VAL),\n",
... ... @@ -1113,7 +578,7 @@
1113 578 },
1114 579 {
1115 580 "cell_type": "code",
1116   - "execution_count": 55,
  581 + "execution_count": null,
1117 582 "id": "63192852",
1118 583 "metadata": {},
1119 584 "outputs": [],
... ... @@ -1123,7 +588,7 @@
1123 588 },
1124 589 {
1125 590 "cell_type": "code",
1126   - "execution_count": 56,
  591 + "execution_count": null,
1127 592 "id": "78250b1b",
1128 593 "metadata": {},
1129 594 "outputs": [],
... ... @@ -1133,7 +598,7 @@
1133 598 },
1134 599 {
1135 600 "cell_type": "code",
1136   - "execution_count": 57,
  601 + "execution_count": null,
1137 602 "id": "bba6ed15",
1138 603 "metadata": {},
1139 604 "outputs": [],
... ... @@ -1143,260 +608,20 @@
1143 608 },
1144 609 {
1145 610 "cell_type": "code",
1146   - "execution_count": 58,
  611 + "execution_count": null,
1147 612 "id": "543377f8",
1148 613 "metadata": {},
1149   - "outputs": [
1150   - {
1151   - "data": {
1152   - "text/html": [
1153   - "<div>\n",
1154   - "<style scoped>\n",
1155   - " .dataframe tbody tr th:only-of-type {\n",
1156   - " vertical-align: middle;\n",
1157   - " }\n",
1158   - "\n",
1159   - " .dataframe tbody tr th {\n",
1160   - " vertical-align: top;\n",
1161   - " }\n",
1162   - "\n",
1163   - " .dataframe thead th {\n",
1164   - " text-align: right;\n",
1165   - " }\n",
1166   - "</style>\n",
1167   - "<table border=\"1\" class=\"dataframe\">\n",
1168   - " <thead>\n",
1169   - " <tr style=\"text-align: right;\">\n",
1170   - " <th></th>\n",
1171   - " <th></th>\n",
1172   - " <th></th>\n",
1173   - " <th></th>\n",
1174   - " <th>dataset</th>\n",
1175   - " <th>measure_type</th>\n",
1176   - " <th>measure</th>\n",
1177   - " <th>value</th>\n",
1178   - " </tr>\n",
1179   - " <tr>\n",
1180   - " <th>dataset</th>\n",
1181   - " <th>measure</th>\n",
1182   - " <th>measure_type</th>\n",
1183   - " <th></th>\n",
1184   - " <th></th>\n",
1185   - " <th></th>\n",
1186   - " <th></th>\n",
1187   - " <th></th>\n",
1188   - " </tr>\n",
1189   - " </thead>\n",
1190   - " <tbody>\n",
1191   - " <tr>\n",
1192   - " <th rowspan=\"9\" valign=\"top\">test</th>\n",
1193   - " <th rowspan=\"3\" valign=\"top\">F1</th>\n",
1194   - " <th>headed</th>\n",
1195   - " <th>7</th>\n",
1196   - " <td>test</td>\n",
1197   - " <td>headed</td>\n",
1198   - " <td>F1</td>\n",
1199   - " <td>0.959192</td>\n",
1200   - " </tr>\n",
1201   - " <tr>\n",
1202   - " <th>non-headed</th>\n",
1203   - " <th>8</th>\n",
1204   - " <td>test</td>\n",
1205   - " <td>non-headed</td>\n",
1206   - " <td>F1</td>\n",
1207   - " <td>0.965236</td>\n",
1208   - " </tr>\n",
1209   - " <tr>\n",
1210   - " <th>unlabeled</th>\n",
1211   - " <th>15</th>\n",
1212   - " <td>test</td>\n",
1213   - " <td>unlabeled</td>\n",
1214   - " <td>F1</td>\n",
1215   - " <td>0.964436</td>\n",
1216   - " </tr>\n",
1217   - " <tr>\n",
1218   - " <th rowspan=\"3\" valign=\"top\">P</th>\n",
1219   - " <th>headed</th>\n",
1220   - " <th>9</th>\n",
1221   - " <td>test</td>\n",
1222   - " <td>headed</td>\n",
1223   - " <td>P</td>\n",
1224   - " <td>0.959611</td>\n",
1225   - " </tr>\n",
1226   - " <tr>\n",
1227   - " <th>non-headed</th>\n",
1228   - " <th>6</th>\n",
1229   - " <td>test</td>\n",
1230   - " <td>non-headed</td>\n",
1231   - " <td>P</td>\n",
1232   - " <td>0.965658</td>\n",
1233   - " </tr>\n",
1234   - " <tr>\n",
1235   - " <th>unlabeled</th>\n",
1236   - " <th>13</th>\n",
1237   - " <td>test</td>\n",
1238   - " <td>unlabeled</td>\n",
1239   - " <td>P</td>\n",
1240   - " <td>0.964118</td>\n",
1241   - " </tr>\n",
1242   - " <tr>\n",
1243   - " <th rowspan=\"3\" valign=\"top\">R</th>\n",
1244   - " <th>headed</th>\n",
1245   - " <th>2</th>\n",
1246   - " <td>test</td>\n",
1247   - " <td>headed</td>\n",
1248   - " <td>R</td>\n",
1249   - " <td>0.958773</td>\n",
1250   - " </tr>\n",
1251   - " <tr>\n",
1252   - " <th>non-headed</th>\n",
1253   - " <th>5</th>\n",
1254   - " <td>test</td>\n",
1255   - " <td>non-headed</td>\n",
1256   - " <td>R</td>\n",
1257   - " <td>0.964815</td>\n",
1258   - " </tr>\n",
1259   - " <tr>\n",
1260   - " <th>unlabeled</th>\n",
1261   - " <th>0</th>\n",
1262   - " <td>test</td>\n",
1263   - " <td>unlabeled</td>\n",
1264   - " <td>R</td>\n",
1265   - " <td>0.964754</td>\n",
1266   - " </tr>\n",
1267   - " <tr>\n",
1268   - " <th rowspan=\"9\" valign=\"top\">val</th>\n",
1269   - " <th rowspan=\"3\" valign=\"top\">F1</th>\n",
1270   - " <th>headed</th>\n",
1271   - " <th>14</th>\n",
1272   - " <td>val</td>\n",
1273   - " <td>headed</td>\n",
1274   - " <td>F1</td>\n",
1275   - " <td>0.957423</td>\n",
1276   - " </tr>\n",
1277   - " <tr>\n",
1278   - " <th>non-headed</th>\n",
1279   - " <th>4</th>\n",
1280   - " <td>val</td>\n",
1281   - " <td>non-headed</td>\n",
1282   - " <td>F1</td>\n",
1283   - " <td>0.963231</td>\n",
1284   - " </tr>\n",
1285   - " <tr>\n",
1286   - " <th>unlabeled</th>\n",
1287   - " <th>1</th>\n",
1288   - " <td>val</td>\n",
1289   - " <td>unlabeled</td>\n",
1290   - " <td>F1</td>\n",
1291   - " <td>0.962553</td>\n",
1292   - " </tr>\n",
1293   - " <tr>\n",
1294   - " <th rowspan=\"3\" valign=\"top\">P</th>\n",
1295   - " <th>headed</th>\n",
1296   - " <th>10</th>\n",
1297   - " <td>val</td>\n",
1298   - " <td>headed</td>\n",
1299   - " <td>P</td>\n",
1300   - " <td>0.958145</td>\n",
1301   - " </tr>\n",
1302   - " <tr>\n",
1303   - " <th>non-headed</th>\n",
1304   - " <th>16</th>\n",
1305   - " <td>val</td>\n",
1306   - " <td>non-headed</td>\n",
1307   - " <td>P</td>\n",
1308   - " <td>0.963958</td>\n",
1309   - " </tr>\n",
1310   - " <tr>\n",
1311   - " <th>unlabeled</th>\n",
1312   - " <th>11</th>\n",
1313   - " <td>val</td>\n",
1314   - " <td>unlabeled</td>\n",
1315   - " <td>P</td>\n",
1316   - " <td>0.962762</td>\n",
1317   - " </tr>\n",
1318   - " <tr>\n",
1319   - " <th rowspan=\"3\" valign=\"top\">R</th>\n",
1320   - " <th>headed</th>\n",
1321   - " <th>17</th>\n",
1322   - " <td>val</td>\n",
1323   - " <td>headed</td>\n",
1324   - " <td>R</td>\n",
1325   - " <td>0.956702</td>\n",
1326   - " </tr>\n",
1327   - " <tr>\n",
1328   - " <th>non-headed</th>\n",
1329   - " <th>12</th>\n",
1330   - " <td>val</td>\n",
1331   - " <td>non-headed</td>\n",
1332   - " <td>R</td>\n",
1333   - " <td>0.962505</td>\n",
1334   - " </tr>\n",
1335   - " <tr>\n",
1336   - " <th>unlabeled</th>\n",
1337   - " <th>3</th>\n",
1338   - " <td>val</td>\n",
1339   - " <td>unlabeled</td>\n",
1340   - " <td>R</td>\n",
1341   - " <td>0.962343</td>\n",
1342   - " </tr>\n",
1343   - " </tbody>\n",
1344   - "</table>\n",
1345   - "</div>"
1346   - ],
1347   - "text/plain": [
1348   - " dataset measure_type measure value\n",
1349   - "dataset measure measure_type \n",
1350   - "test F1 headed 7 test headed F1 0.959192\n",
1351   - " non-headed 8 test non-headed F1 0.965236\n",
1352   - " unlabeled 15 test unlabeled F1 0.964436\n",
1353   - " P headed 9 test headed P 0.959611\n",
1354   - " non-headed 6 test non-headed P 0.965658\n",
1355   - " unlabeled 13 test unlabeled P 0.964118\n",
1356   - " R headed 2 test headed R 0.958773\n",
1357   - " non-headed 5 test non-headed R 0.964815\n",
1358   - " unlabeled 0 test unlabeled R 0.964754\n",
1359   - "val F1 headed 14 val headed F1 0.957423\n",
1360   - " non-headed 4 val non-headed F1 0.963231\n",
1361   - " unlabeled 1 val unlabeled F1 0.962553\n",
1362   - " P headed 10 val headed P 0.958145\n",
1363   - " non-headed 16 val non-headed P 0.963958\n",
1364   - " unlabeled 11 val unlabeled P 0.962762\n",
1365   - " R headed 17 val headed R 0.956702\n",
1366   - " non-headed 12 val non-headed R 0.962505\n",
1367   - " unlabeled 3 val unlabeled R 0.962343"
1368   - ]
1369   - },
1370   - "execution_count": 58,
1371   - "metadata": {},
1372   - "output_type": "execute_result"
1373   - }
1374   - ],
  614 + "outputs": [],
1375 615 "source": [
1376 616 "results.groupby(['dataset', 'measure', 'measure_type'], group_keys=True).apply(lambda x: x)"
1377 617 ]
1378 618 },
1379 619 {
1380 620 "cell_type": "code",
1381   - "execution_count": 59,
  621 + "execution_count": null,
1382 622 "id": "0b5d3fe4",
1383 623 "metadata": {},
1384   - "outputs": [
1385   - {
1386   - "name": "stdout",
1387   - "output_type": "stream",
1388   - "text": [
1389   - "\\toprule\n",
1390   - "& \\multicolumn{3}{c}{validation} & \\multicolumn{3}{c}{test} \\\\\n",
1391   - "& precision & recall & F1 & precision & recall & F1 \\\\\n",
1392   - "\\midrule\n",
1393   - "1unlabeled & 96.28\\% & 96.23\\% & 96.26\\% & 96.41\\% & 96.48\\% & 96.44\\% \\\\\n",
1394   - "2non-headed & 96.40\\% & 96.25\\% & 96.32\\% & 96.57\\% & 96.48\\% & 96.52\\% \\\\\n",
1395   - "3headed & 95.81\\% & 95.67\\% & 95.74\\% & 95.96\\% & 95.88\\% & 95.92\\% \\\\\n",
1396   - "\\bottomrule\n"
1397   - ]
1398   - }
1399   - ],
  624 + "outputs": [],
1400 625 "source": [
1401 626 "for t in tex:\n",
1402 627 " print(t, end='')"
... ... @@ -1444,10 +669,6 @@
1444 669 " precisions = precision_score(TRUE, PRED, average=None)\n",
1445 670 " recalls = recall_score(TRUE, PRED, average=None)\n",
1446 671 " f1s = f1_score(TRUE, PRED, average=None)\n",
1447   - " #for v, p, r, f in sorted(zip(values, precisions, recalls, f1s), key=lambda x: -x[3]):\n",
1448   - " # if v.endswith('formarzecz') or v.endswith('formaczas'):\n",
1449   - " # spine = ' $\\\\rightarrow$ '.join(f'\\\\nt{{{n}}}' for n in v.split('_'))\n",
1450   - " # print(f'{spine} & {100 * p:.2f}\\\\% & {100 * r:.2f}\\\\% & {100 * f:.2f}\\\\% \\\\\\\\')\n",
1451 672 " \n",
1452 673 " ct_pre, cp_pre = Counter(), Counter()\n",
1453 674 " for val in values:\n",
... ... @@ -1458,7 +679,6 @@
1458 679 " rows = []\n",
1459 680 " \n",
1460 681 " for pre in ct_pre.keys():\n",
1461   - " # TODO\n",
1462 682 " if pre == 'ign':\n",
1463 683 " continue\n",
1464 684 " if not cp_pre[pre] * ct_pre[pre]:\n",
... ... @@ -1472,7 +692,6 @@
1472 692 " spine = ' $\\\\rightarrow$ '.join(f'\\\\nt{{{n}}}' for n in v.split('_'))\n",
1473 693 " rws.append(f'{spine} & {100 * p:.2f}\\\\% & {100 * r:.2f}\\\\% & {100 * f:.2f}\\\\% & {ct[v]} \\\\\\\\')\n",
1474 694 " wp, wr = cp[v] / cp_pre[pre], ct[v] / ct_pre[pre]\n",
1475   - " #print(f' {v:36s} {100 * p:6.2f} {wp:7.3f} {100 * r:6.2f} {wr:7.3f}')\n",
1476 695 " P += p * wp\n",
1477 696 " R += r * wr\n",
1478 697 " F = 2 * P * R / (P + R)\n",
... ...
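
For reference, the span-based scores computed by the evaluation loop kept above are micro-averaged: for each sentence the gold and predicted span sets (as returned by tree2spans) are intersected, and the running totals P[0], P[1], R[0], R[1] accumulate true positives, predicted spans and gold spans. A self-contained sketch of that accounting, assuming spans are hashable tuples like (indices, label, headed):

    def span_prf(pairs):
        # pairs: iterable of (gold_spans, predicted_spans) per sentence,
        # each a set of hashable span descriptors from tree2spans.
        tp = n_pred = n_true = 0
        for spans_true, spans_pred in pairs:
            tp += len(spans_true & spans_pred)  # spans present in both trees
            n_pred += len(spans_pred)
            n_true += len(spans_true)
        p, r = tp / n_pred, tp / n_true
        return p, r, 2 * p * r / (p + r)        # precision, recall, F1
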