diff --git a/TrainingAndEval.ipynb b/TrainingAndEval.ipynb index 753718e..1c6067e 100644 --- a/TrainingAndEval.ipynb +++ b/TrainingAndEval.ipynb @@ -2,23 +2,10 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "97d0c9ab", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2023-04-11 11:17:29.095631: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", - "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", - "2023-04-11 11:17:29.331444: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", - "2023-04-11 11:17:30.167497: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory\n", - "2023-04-11 11:17:30.167593: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory\n", - "2023-04-11 11:17:30.167603: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.\n" - ] - } - ], + "outputs": [], "source": [ "import importlib\n", "\n", @@ -40,7 +27,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "c41d6630", "metadata": {}, "outputs": [], @@ -51,36 +38,10 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "f30d7b7c", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1 Physical GPUs, 1 Logical GPUs\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2023-04-11 11:17:31.717262: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", - "2023-04-11 11:17:31.762533: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", - "2023-04-11 11:17:31.763529: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", - "2023-04-11 11:17:31.765670: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", - "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", - "2023-04-11 11:17:31.769196: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", - "2023-04-11 11:17:31.770058: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", - "2023-04-11 11:17:31.770816: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", - "2023-04-11 11:17:32.722287: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", - "2023-04-11 11:17:32.723281: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", - "2023-04-11 11:17:32.724062: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", - "2023-04-11 11:17:32.724846: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1616] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 20480 MB memory: -> device: 0, name: NVIDIA A100 80GB PCIe, pci bus id: 0000:00:05.0, compute capability: 8.0\n" - ] - } - ], + "outputs": [], "source": [ "# https://www.tensorflow.org/guide/gpu\n", "gpus = tf.config.list_physical_devices('GPU')\n", @@ -98,33 +59,12 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "89afdb1e", "metadata": { "scrolled": true }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "/device:GPU:0\n", - "2.10.0\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2023-04-11 11:17:32.739308: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", - "2023-04-11 11:17:32.740224: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", - "2023-04-11 11:17:32.740975: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", - "2023-04-11 11:17:32.741809: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", - "2023-04-11 11:17:32.742586: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", - "2023-04-11 11:17:32.743322: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1616] Created device /device:GPU:0 with 20480 MB memory: -> device: 0, name: NVIDIA A100 80GB PCIe, pci bus id: 0000:00:05.0, compute capability: 8.0\n" - ] - } - ], + "outputs": [], "source": [ "print(tf.test.gpu_device_name())\n", "print(tf.__version__)" @@ -132,63 +72,22 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "2b0ab576", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Found cached dataset pdb_c_beta (/home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1)\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "55f181333dc44c7a811c515cc55c4988", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/3 [00:00<?, ?it/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "pdbc_dataset = load_dataset('pdb_c_beta')" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "2f4c317a", "metadata": { "scrolled": true }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Loading cached processed dataset at /home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1/cache-ff2490f308f7f25b.arrow\n", - "Loading cached processed dataset at /home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1/cache-cbb40b0e978ab6ee.arrow\n", - "Loading cached processed dataset at /home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1/cache-3facbd810991cd6c.arrow\n", - "Loading cached processed dataset at /home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1/cache-e54a8628e59de21f.arrow\n", - "Loading cached processed dataset at /home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1/cache-9692de6b8224e758.arrow\n", - "Loading cached processed dataset at /home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1/cache-4042ffa1dc5d9323.arrow\n", - "Loading cached processed dataset at /home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1/cache-fb250709424f85ec.arrow\n", - "Loading cached processed dataset at /home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1/cache-1f6ce0a488a89d56.arrow\n", - "Loading cached processed dataset at /home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1/cache-2ae4daf5101c7aa2.arrow\n", - "Loading cached processed dataset at /home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1/cache-a1686820d15bcf04.arrow\n", - "Loading cached processed dataset at /home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1/cache-fe2c12481861f4bd.arrow\n", - "Loading cached processed dataset at /home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1/cache-da5a875c385c3570.arrow\n" - ] - } - ], + "outputs": [], "source": [ "import importlib\n", "\n", @@ -203,20 +102,10 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "de1966ed", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Loading cached processed dataset at /home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1/cache-1dfcf507d62f6da8.arrow\n", - "Loading cached processed dataset at /home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1/cache-264c0111246b25c1.arrow\n", - "Loading cached processed dataset at /home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1/cache-6a40675124a412f0.arrow\n" - ] - } - ], + "outputs": [], "source": [ "features = pdbc_dataset_spines['train'].features\n", "pdbc_dataset_spines_cont = pdbc_dataset_spines.filter(\n", @@ -226,41 +115,17 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "id": "33ff295b", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "DatasetDict({\n", - " train: Dataset({\n", - " features: ['corp_id', 'sent_id', 'tokens', 'lemmas', 'cposes', 'poses', 'tags', 'heads', 'deprels', 'nonterminals', 'spines', 'anchors', 'anchor_hs'],\n", - " num_rows: 15903\n", - " })\n", - " validation: Dataset({\n", - " features: ['corp_id', 'sent_id', 'tokens', 'lemmas', 'cposes', 'poses', 'tags', 'heads', 'deprels', 'nonterminals', 'spines', 'anchors', 'anchor_hs'],\n", - " num_rows: 1980\n", - " })\n", - " test: Dataset({\n", - " features: ['corp_id', 'sent_id', 'tokens', 'lemmas', 'cposes', 'poses', 'tags', 'heads', 'deprels', 'nonterminals', 'spines', 'anchors', 'anchor_hs'],\n", - " num_rows: 1990\n", - " })\n", - "})" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "pdbc_dataset_spines_cont" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "id": "a8ddbc1f", "metadata": {}, "outputs": [], @@ -270,7 +135,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "id": "8029594b", "metadata": {}, "outputs": [], @@ -288,30 +153,24 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": null, "id": "be8e93fa", "metadata": {}, "outputs": [], "source": [ - "def crop(dataset, n):\n", - " return dataset.filter(lambda example: len(example['tokens']) <= n)\n", - "\n", "spines_pdbc = ClassificationTask(\n", " 'spines_pdbc',\n", " pdbc_dataset_spines,\n", - " #crop(pdbc_dataset, 6),\n", ")\n", "\n", "spines_pdbc_cont = ClassificationTask(\n", " 'spines_pdbc_cont',\n", " pdbc_dataset_spines_cont,\n", - " #crop(pdbc_dataset, 6),\n", ")\n", "\n", "spines_pdbc_compressed = ClassificationTask(\n", " 'spines_pdbc_compressed',\n", " pdbc_dataset_spines_compressed,\n", - " #crop(pdbc_dataset, 6),\n", ")\n", "\n", "TASK = spines_pdbc_compressed\n", @@ -320,7 +179,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": null, "id": "7824fcee", "metadata": {}, "outputs": [], @@ -330,56 +189,12 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": null, "id": "1eb5f41a", "metadata": { "scrolled": false }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Loading BERT tokenizer...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Loading cached processed dataset at /home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1/cache-49fe5b05228c3588.arrow\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Preprocessing the dataset for BERT...\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "5f108b00fcab4db8a610f24ae03b7308", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/2211 [00:00<?, ?ex/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Loading cached processed dataset at /home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1/cache-b8e2900fbd9615fd.arrow\n", - "You're using a HerbertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n" - ] - } - ], + "outputs": [], "source": [ "trainer = training.Trainer(\n", " MODEL,\n", @@ -398,21 +213,10 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": null, "id": "276708cc", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "('keras_fit_logs_spines_pdbc_compressed', 'models_spines_pdbc_compressed')" - ] - }, - "execution_count": 39, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "log_dir = f'keras_fit_logs_{TASK.name}'\n", "model_dir = f'models_{TASK.name}'\n", @@ -422,51 +226,12 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": null, "id": "e8ccde06", "metadata": { "scrolled": false }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The tensorboard extension is already loaded. To reload it, use:\n", - " %reload_ext tensorboard\n", - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", - "To disable this warning, you can either:\n", - "\t- Avoid using `tokenizers` before the fork if possible\n", - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " <iframe id=\"tensorboard-frame-83a6a03964d4187a\" width=\"100%\" height=\"800\" frameborder=\"0\">\n", - " </iframe>\n", - " <script>\n", - " (function() {\n", - " const frame = document.getElementById(\"tensorboard-frame-83a6a03964d4187a\");\n", - " const url = new URL(\"/\", window.location);\n", - " const port = 6004;\n", - " if (port) {\n", - " url.port = port;\n", - " }\n", - " frame.src = url;\n", - " })();\n", - " </script>\n", - " " - ], - "text/plain": [ - "<IPython.core.display.HTML object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "%load_ext tensorboard\n", "! killall tensorboard\n", @@ -476,21 +241,12 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": null, "id": "a5b0da64", "metadata": { "scrolled": true }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 6 µs, sys: 1 µs, total: 7 µs\n", - "Wall time: 15.7 µs\n" - ] - } - ], + "outputs": [], "source": [ "%%time\n", "\n", @@ -505,45 +261,10 @@ }, { "cell_type": "code", - "execution_count": 42, - "id": "e42b2bd4", - "metadata": {}, - "outputs": [], - "source": [ - "#import importlib\n", - "#from neural_parser import hybrid_tree_utils\n", - "#importlib.reload(hybrid_tree_utils)\n", - "#from neural_parser import data_utils\n", - "#importlib.reload(data_utils)\n", - "#from neural_parser import constituency_parser\n", - "#importlib.reload(constituency_parser)" - ] - }, - { - "cell_type": "code", - "execution_count": 43, + "execution_count": null, "id": "2f65dead", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "created 3 classifier(s)\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Some layers from the model checkpoint at models_spines_pdbc_compressed/model were not used when initializing TFBertForMultiTargetTokenClassification: ['dropout_73']\n", - "- This IS expected if you are initializing TFBertForMultiTargetTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", - "- This IS NOT expected if you are initializing TFBertForMultiTargetTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", - "All the layers of TFBertForMultiTargetTokenClassification were initialized from the model checkpoint at models_spines_pdbc_compressed/model.\n", - "If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMultiTargetTokenClassification for predictions without further training.\n" - ] - } - ], + "outputs": [], "source": [ "if not TRAIN:\n", " from neural_parser import constituency_parser\n", @@ -552,7 +273,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": null, "id": "24edee79", "metadata": {}, "outputs": [], @@ -565,41 +286,10 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": null, "id": "4a7cd10b", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1/1 [==============================] - 10s 10s/step\n" - ] - }, - { - "data": { - "text/plain": [ - "[(['Miał', 'em', 'kotka', '.'],\n", - " {'spines': ['ROOT_S_VP_V', '<EMPTY>', 'NP_N', 'Punct'],\n", - " 'anchors': ['<ROOT>', 'V', 'S', 'ROOT'],\n", - " 'anchor_hs': ['<ROOT>', '1', '1', '1']}),\n", - " (['Wlazł', 'kotek', 'na', 'płotek', 'i', 'mruga', '.'],\n", - " {'spines': ['VP_V',\n", - " 'NP_N',\n", - " 'PrepNP_Prep',\n", - " 'NP_N',\n", - " 'ROOT_S_VP_Conj',\n", - " 'VP_V',\n", - " 'Punct'],\n", - " 'anchors': ['VP', 'S', 'VP', 'PrepNP', '<ROOT>', 'VP', 'ROOT'],\n", - " 'anchor_hs': ['1', '1', '2', '1', '<ROOT>', '1', '1']})]" - ] - }, - "execution_count": 45, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "parser.parse(sentences)" ] @@ -616,21 +306,10 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": null, "id": "4ac4b9df", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "<module 'neural_parser.constants' from '/home/kkrasnowska/neural-parsing/ICCS/neural_parser/constants.py'>" - ] - }, - "execution_count": 46, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "from neural_parser import hybrid_tree_utils\n", "importlib.reload(hybrid_tree_utils)\n", @@ -640,86 +319,12 @@ }, { "cell_type": "code", - "execution_count": 47, - "id": "d1b28792", - "metadata": {}, - "outputs": [], - "source": [ - "from spacy import displacy\n", - "\n", - "def to_deps(tokens, deprels, heads):\n", - " deps = {'words' : [], 'arcs' : []}\n", - " for i, (token, deprel, head) in enumerate(zip(tokens, deprels, heads)):\n", - " deps['words'].append({'text' : token, 'tag' : 'X'})\n", - " if head >= 0:\n", - " d = 'left' if head > i else 'right'\n", - " start, end = sorted((i, head))\n", - " deps['arcs'].append({'start' : start, 'end' : end, 'label' : deprel, 'dir' : d})\n", - " return deps\n", - "\n", - "def display_deps(tokens, deprels, heads):\n", - " displacy.render(to_deps(tokens, deprels, heads), manual=True, options={'distance' : 80})\n", - " \n", - "import urllib.parse\n", - "import json\n", - "\n", - "def show_tree(tree):\n", - " tree_json = json.dumps(hybrid_tree_utils.tree2dict(tree)['tree'])\n", - " src = f'http://127.0.0.1:8010/?tree={urllib.parse.quote(tree_json)}'\n", - " display(IFrame(src, 950, 550))" - ] - }, - { - "cell_type": "code", - "execution_count": 48, + "execution_count": null, "id": "9f443569", "metadata": { "scrolled": true }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2211\n", - "2205\n", - "['Całuję', '.']\n" - ] - }, - { - "data": { - "text/plain": [ - "{'heads': [None, 0],\n", - " 'deprels': ['ROOT', 'punct'],\n", - " 'spines': ['ROOT_S_VP_V', 'Punct'],\n", - " 'anchors': ['<ROOT>', 'ROOT'],\n", - " 'anchor_hs': ['<ROOT>', '1']}" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['Drzemał', '.']\n" - ] - }, - { - "data": { - "text/plain": [ - "{'heads': [None, 0],\n", - " 'deprels': ['ROOT', 'punct'],\n", - " 'spines': ['ROOT_S_VP_V', 'Punct'],\n", - " 'anchors': ['<ROOT>', 'ROOT'],\n", - " 'anchor_hs': ['<ROOT>', '1']}" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "HDR = [\n", " 'heads', 'deprels',\n", @@ -753,49 +358,10 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": null, "id": "3f53c039", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "70/70 [==============================] - 17s 152ms/step\n", - "69/69 [==============================] - 12s 168ms/step\n", - "['Całuję', '.']\n" - ] - }, - { - "data": { - "text/plain": [ - "{'spines': ['ROOT_S_VP_V', 'Punct'],\n", - " 'anchors': ['<ROOT>', 'ROOT'],\n", - " 'anchor_hs': ['<ROOT>', '1']}" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['Drzemał', '.']\n" - ] - }, - { - "data": { - "text/plain": [ - "{'spines': ['ROOT_S_VP_V', 'Punct'],\n", - " 'anchors': ['<ROOT>', 'ROOT'],\n", - " 'anchor_hs': ['<ROOT>', '1']}" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "def get_predicted_data(TOKENS_TRUE):\n", " PARSED = parser.parse([' '.join(toks) for toks in TOKENS_TRUE])\n", @@ -821,45 +387,10 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": null, "id": "17c1d9cb", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2211\n", - "2205\n", - "['Całuję', '.']\n" - ] - }, - { - "data": { - "text/plain": [ - "{'heads': [None, 0], 'deprels': ['root', 'punct']}" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['Drzemał', '.']\n" - ] - }, - { - "data": { - "text/plain": [ - "{'heads': [None, 0], 'deprels': ['root', 'punct']}" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "import conllu\n", "\n", @@ -894,7 +425,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": null, "id": "004918c6", "metadata": {}, "outputs": [], @@ -913,42 +444,22 @@ "def tree2spans(tree, labeled=True, headed=False):\n", " spans = []\n", " _tree2spans(tree, spans, labeled=labeled, headed=headed)\n", - " # TODO\n", - " #try:\n", - " # assert(len(spans) == len(set(spans)))\n", - " #except:\n", - " # show_tree(tree)\n", - " # (display(spans))\n", - " # 1/0\n", " return set(spans)" ] }, { "cell_type": "code", - "execution_count": 52, + "execution_count": null, "id": "65d493ca", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "<module 'neural_parser.hybrid_tree_utils' from '/home/kkrasnowska/neural-parsing/ICCS/neural_parser/hybrid_tree_utils.py'>" - ] - }, - "execution_count": 52, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score\n", - "\n", - "importlib.reload(hybrid_tree_utils)" + "from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score" ] }, { "cell_type": "code", - "execution_count": 53, + "execution_count": null, "id": "e5f88e76", "metadata": { "scrolled": false @@ -964,18 +475,11 @@ " key : {'true' : [], 'pred' : []} for key in ('heads', ('heads', 'deprels'))\n", " }\n", "\n", - " k = 0\n", " i = 0\n", " PROBLEM_TREES = []\n", "\n", " for toks, true, pred, combo in zip(tokens, tags_true, tags_pred, tags_combo):\n", " \n", - " #sent = ' '.join(toks)\n", - " #cats = HDR\n", - " #true = dict(zip(cats, zip(*true)))\n", - " #pred = dict(zip(cats, zip(*pred)))\n", - " #print('----------------------------')\n", - " #print(sent)\n", " dummy = {'lemmas' : ['_' for _ in toks], 'tags' : ['_' for _ in toks]}\n", " true.update(dummy)\n", " pred.update(dummy)\n", @@ -994,12 +498,6 @@ " print('=============================')\n", " raise\n", " tree_pred, problems = None, None\n", - " #if 'reattach' in problems:\n", - " # show_tree(tree_pred)\n", - " \n", - " #if pred['lemmas_corr'] != pred['lemmas']:\n", - " # print(pred['lemmas_corr'])\n", - " # print(pred['lemmas'])\n", " \n", " for key, v in accuracies.items():\n", " if type(key) == str:\n", @@ -1011,31 +509,11 @@ " \n", " spans_true = tree2spans(tree_true, labeled=labeled, headed=headed)\n", " spans_pred = tree2spans(tree_pred, labeled=labeled, headed=headed) if tree_pred else set()\n", - " if 'adwokata' in toks:\n", - " print(spans_true)\n", - " print(spans_pred)\n", " tp = len(spans_true.intersection(spans_pred))\n", " P[0] += tp\n", " R[0] += tp\n", " P[1] += len(spans_pred)\n", " R[1] += len(spans_true)\n", - " leafs = tree_true.get_yield()\n", - " discont = [leaf.from_index for leaf in leafs] != list(range(len(leafs)))\n", - " #if k < 5 and len(toks) > 9 and [leaf.features['index'] for leaf in leafs] != list(range(len(leafs))):\n", - " #if k < 5 and spans_combo != spans_true:\n", - " #if k < 5 and not OK:\n", - " #if discont and len(toks) > 12 and k < 0 and spans_pred == spans_true:\n", - " if len(toks) == 8 and k < 0:\n", - " print('GOLD TREE:')\n", - " show_tree(tree_true)\n", - " display(true)\n", - " #display(_tree2dict(tree_true))\n", - " print('PREDICTED TREE:')\n", - " show_tree(tree_pred)\n", - " display(pred)\n", - " print('FP:', spans_pred - spans_true)\n", - " print('FN:', spans_true - spans_pred)\n", - " k += 1\n", " i += 1\n", " \n", " p, r = P[0]/P[1], R[0]/R[1]\n", @@ -1060,25 +538,12 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": null, "id": "8f8a771a", "metadata": { "scrolled": false }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "unlabeled{((3,), 'SPAN', False), ((2, 3), 'SPAN', False), ((4,), 'SPAN', False), ((0, 1, 2, 3, 4), 'SPAN', False), ((0, 1, 2, 3), 'SPAN', False), ((2,), 'SPAN', False), ((0, 1), 'SPAN', False)}\n", - "{((3,), 'SPAN', False), ((2, 3), 'SPAN', False), ((4,), 'SPAN', False), ((0, 1, 2, 3, 4), 'SPAN', False), ((0, 1, 2, 3), 'SPAN', False), ((2,), 'SPAN', False), ((0, 1), 'SPAN', False)}\n", - "non-headed{((2,), 'Prep', False), ((4,), 'Punct', False), ((2, 3), 'PrepNP', False), ((3,), 'N', False), ((0, 1, 2, 3), 'S', False), ((0, 1), 'VP', False), ((0, 1), 'V', False), ((3,), 'NP', False), ((0, 1, 2, 3, 4), 'ROOT', False)}\n", - "{((2,), 'Prep', False), ((4,), 'Punct', False), ((2, 3), 'PrepNP', False), ((3,), 'N', False), ((0, 1, 2, 3), 'S', False), ((0, 1), 'VP', False), ((0, 1), 'V', False), ((3,), 'NP', False), ((0, 1, 2, 3, 4), 'ROOT', False)}\n", - "headed{((0, 1, 2, 3), 'S', True), ((4,), 'Punct', False), ((0, 1), 'VP', True), ((2, 3), 'PrepNP', False), ((0, 1), 'V', True), ((3,), 'NP', False), ((0, 1, 2, 3, 4), 'ROOT', False), ((2,), 'Prep', True), ((3,), 'N', True)}\n", - "{((0, 1, 2, 3), 'S', True), ((4,), 'Punct', False), ((0, 1), 'VP', True), ((2, 3), 'PrepNP', False), ((0, 1), 'V', True), ((3,), 'NP', False), ((0, 1, 2, 3, 4), 'ROOT', False), ((2,), 'Prep', True), ((3,), 'N', True)}\n" - ] - } - ], + "outputs": [], "source": [ "EVAL_DATA = {\n", " '1val' : (TOKENS_VAL, TAGS_VAL, TAGS_P_VAL, TAGS_C_VAL),\n", @@ -1113,7 +578,7 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": null, "id": "63192852", "metadata": {}, "outputs": [], @@ -1123,7 +588,7 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": null, "id": "78250b1b", "metadata": {}, "outputs": [], @@ -1133,7 +598,7 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": null, "id": "bba6ed15", "metadata": {}, "outputs": [], @@ -1143,260 +608,20 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": null, "id": "543377f8", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "<div>\n", - "<style scoped>\n", - " .dataframe tbody tr th:only-of-type {\n", - " vertical-align: middle;\n", - " }\n", - "\n", - " .dataframe tbody tr th {\n", - " vertical-align: top;\n", - " }\n", - "\n", - " .dataframe thead th {\n", - " text-align: right;\n", - " }\n", - "</style>\n", - "<table border=\"1\" class=\"dataframe\">\n", - " <thead>\n", - " <tr style=\"text-align: right;\">\n", - " <th></th>\n", - " <th></th>\n", - " <th></th>\n", - " <th></th>\n", - " <th>dataset</th>\n", - " <th>measure_type</th>\n", - " <th>measure</th>\n", - " <th>value</th>\n", - " </tr>\n", - " <tr>\n", - " <th>dataset</th>\n", - " <th>measure</th>\n", - " <th>measure_type</th>\n", - " <th></th>\n", - " <th></th>\n", - " <th></th>\n", - " <th></th>\n", - " <th></th>\n", - " </tr>\n", - " </thead>\n", - " <tbody>\n", - " <tr>\n", - " <th rowspan=\"9\" valign=\"top\">test</th>\n", - " <th rowspan=\"3\" valign=\"top\">F1</th>\n", - " <th>headed</th>\n", - " <th>7</th>\n", - " <td>test</td>\n", - " <td>headed</td>\n", - " <td>F1</td>\n", - " <td>0.959192</td>\n", - " </tr>\n", - " <tr>\n", - " <th>non-headed</th>\n", - " <th>8</th>\n", - " <td>test</td>\n", - " <td>non-headed</td>\n", - " <td>F1</td>\n", - " <td>0.965236</td>\n", - " </tr>\n", - " <tr>\n", - " <th>unlabeled</th>\n", - " <th>15</th>\n", - " <td>test</td>\n", - " <td>unlabeled</td>\n", - " <td>F1</td>\n", - " <td>0.964436</td>\n", - " </tr>\n", - " <tr>\n", - " <th rowspan=\"3\" valign=\"top\">P</th>\n", - " <th>headed</th>\n", - " <th>9</th>\n", - " <td>test</td>\n", - " <td>headed</td>\n", - " <td>P</td>\n", - " <td>0.959611</td>\n", - " </tr>\n", - " <tr>\n", - " <th>non-headed</th>\n", - " <th>6</th>\n", - " <td>test</td>\n", - " <td>non-headed</td>\n", - " <td>P</td>\n", - " <td>0.965658</td>\n", - " </tr>\n", - " <tr>\n", - " <th>unlabeled</th>\n", - " <th>13</th>\n", - " <td>test</td>\n", - " <td>unlabeled</td>\n", - " <td>P</td>\n", - " <td>0.964118</td>\n", - " </tr>\n", - " <tr>\n", - " <th rowspan=\"3\" valign=\"top\">R</th>\n", - " <th>headed</th>\n", - " <th>2</th>\n", - " <td>test</td>\n", - " <td>headed</td>\n", - " <td>R</td>\n", - " <td>0.958773</td>\n", - " </tr>\n", - " <tr>\n", - " <th>non-headed</th>\n", - " <th>5</th>\n", - " <td>test</td>\n", - " <td>non-headed</td>\n", - " <td>R</td>\n", - " <td>0.964815</td>\n", - " </tr>\n", - " <tr>\n", - " <th>unlabeled</th>\n", - " <th>0</th>\n", - " <td>test</td>\n", - " <td>unlabeled</td>\n", - " <td>R</td>\n", - " <td>0.964754</td>\n", - " </tr>\n", - " <tr>\n", - " <th rowspan=\"9\" valign=\"top\">val</th>\n", - " <th rowspan=\"3\" valign=\"top\">F1</th>\n", - " <th>headed</th>\n", - " <th>14</th>\n", - " <td>val</td>\n", - " <td>headed</td>\n", - " <td>F1</td>\n", - " <td>0.957423</td>\n", - " </tr>\n", - " <tr>\n", - " <th>non-headed</th>\n", - " <th>4</th>\n", - " <td>val</td>\n", - " <td>non-headed</td>\n", - " <td>F1</td>\n", - " <td>0.963231</td>\n", - " </tr>\n", - " <tr>\n", - " <th>unlabeled</th>\n", - " <th>1</th>\n", - " <td>val</td>\n", - " <td>unlabeled</td>\n", - " <td>F1</td>\n", - " <td>0.962553</td>\n", - " </tr>\n", - " <tr>\n", - " <th rowspan=\"3\" valign=\"top\">P</th>\n", - " <th>headed</th>\n", - " <th>10</th>\n", - " <td>val</td>\n", - " <td>headed</td>\n", - " <td>P</td>\n", - " <td>0.958145</td>\n", - " </tr>\n", - " <tr>\n", - " <th>non-headed</th>\n", - " <th>16</th>\n", - " <td>val</td>\n", - " <td>non-headed</td>\n", - " <td>P</td>\n", - " <td>0.963958</td>\n", - " </tr>\n", - " <tr>\n", - " <th>unlabeled</th>\n", - " <th>11</th>\n", - " <td>val</td>\n", - " <td>unlabeled</td>\n", - " <td>P</td>\n", - " <td>0.962762</td>\n", - " </tr>\n", - " <tr>\n", - " <th rowspan=\"3\" valign=\"top\">R</th>\n", - " <th>headed</th>\n", - " <th>17</th>\n", - " <td>val</td>\n", - " <td>headed</td>\n", - " <td>R</td>\n", - " <td>0.956702</td>\n", - " </tr>\n", - " <tr>\n", - " <th>non-headed</th>\n", - " <th>12</th>\n", - " <td>val</td>\n", - " <td>non-headed</td>\n", - " <td>R</td>\n", - " <td>0.962505</td>\n", - " </tr>\n", - " <tr>\n", - " <th>unlabeled</th>\n", - " <th>3</th>\n", - " <td>val</td>\n", - " <td>unlabeled</td>\n", - " <td>R</td>\n", - " <td>0.962343</td>\n", - " </tr>\n", - " </tbody>\n", - "</table>\n", - "</div>" - ], - "text/plain": [ - " dataset measure_type measure value\n", - "dataset measure measure_type \n", - "test F1 headed 7 test headed F1 0.959192\n", - " non-headed 8 test non-headed F1 0.965236\n", - " unlabeled 15 test unlabeled F1 0.964436\n", - " P headed 9 test headed P 0.959611\n", - " non-headed 6 test non-headed P 0.965658\n", - " unlabeled 13 test unlabeled P 0.964118\n", - " R headed 2 test headed R 0.958773\n", - " non-headed 5 test non-headed R 0.964815\n", - " unlabeled 0 test unlabeled R 0.964754\n", - "val F1 headed 14 val headed F1 0.957423\n", - " non-headed 4 val non-headed F1 0.963231\n", - " unlabeled 1 val unlabeled F1 0.962553\n", - " P headed 10 val headed P 0.958145\n", - " non-headed 16 val non-headed P 0.963958\n", - " unlabeled 11 val unlabeled P 0.962762\n", - " R headed 17 val headed R 0.956702\n", - " non-headed 12 val non-headed R 0.962505\n", - " unlabeled 3 val unlabeled R 0.962343" - ] - }, - "execution_count": 58, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "results.groupby(['dataset', 'measure', 'measure_type'], group_keys=True).apply(lambda x: x)" ] }, { "cell_type": "code", - "execution_count": 59, + "execution_count": null, "id": "0b5d3fe4", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\\toprule\n", - "& \\multicolumn{3}{c}{validation} & \\multicolumn{3}{c}{test} \\\\\n", - "& precision & recall & F1 & precision & recall & F1 \\\\\n", - "\\midrule\n", - "1unlabeled & 96.28\\% & 96.23\\% & 96.26\\% & 96.41\\% & 96.48\\% & 96.44\\% \\\\\n", - "2non-headed & 96.40\\% & 96.25\\% & 96.32\\% & 96.57\\% & 96.48\\% & 96.52\\% \\\\\n", - "3headed & 95.81\\% & 95.67\\% & 95.74\\% & 95.96\\% & 95.88\\% & 95.92\\% \\\\\n", - "\\bottomrule\n" - ] - } - ], + "outputs": [], "source": [ "for t in tex:\n", " print(t, end='')" @@ -1444,10 +669,6 @@ " precisions = precision_score(TRUE, PRED, average=None)\n", " recalls = recall_score(TRUE, PRED, average=None)\n", " f1s = f1_score(TRUE, PRED, average=None)\n", - " #for v, p, r, f in sorted(zip(values, precisions, recalls, f1s), key=lambda x: -x[3]):\n", - " # if v.endswith('formarzecz') or v.endswith('formaczas'):\n", - " # spine = ' $\\\\rightarrow$ '.join(f'\\\\nt{{{n}}}' for n in v.split('_'))\n", - " # print(f'{spine} & {100 * p:.2f}\\\\% & {100 * r:.2f}\\\\% & {100 * f:.2f}\\\\% \\\\\\\\')\n", " \n", " ct_pre, cp_pre = Counter(), Counter()\n", " for val in values:\n", @@ -1458,7 +679,6 @@ " rows = []\n", " \n", " for pre in ct_pre.keys():\n", - " # TODO\n", " if pre == 'ign':\n", " continue\n", " if not cp_pre[pre] * ct_pre[pre]:\n", @@ -1472,7 +692,6 @@ " spine = ' $\\\\rightarrow$ '.join(f'\\\\nt{{{n}}}' for n in v.split('_'))\n", " rws.append(f'{spine} & {100 * p:.2f}\\\\% & {100 * r:.2f}\\\\% & {100 * f:.2f}\\\\% & {ct[v]} \\\\\\\\')\n", " wp, wr = cp[v] / cp_pre[pre], ct[v] / ct_pre[pre]\n", - " #print(f' {v:36s} {100 * p:6.2f} {wp:7.3f} {100 * r:6.2f} {wr:7.3f}')\n", " P += p * wp\n", " R += r * wr\n", " F = 2 * P * R / (P + R)\n",