Commit 0dfb2dfb2c7d5b0308aa37015da22c0bd77c2767

Authored by Katarzyna Krasnowska
0 parents

initial commit

Too many changes to show.

To preserve performance only 4 of 14 files are displayed.

BeNePar/DataPreparation.ipynb 0 → 100644
  1 +++ a/BeNePar/DataPreparation.ipynb
  1 +{
  2 + "cells": [
  3 + {
  4 + "cell_type": "code",
  5 + "execution_count": 1,
  6 + "id": "5cd26f6f",
  7 + "metadata": {},
  8 + "outputs": [],
  9 + "source": [
  10 + "import os\n",
  11 + "\n",
  12 + "from datasets import load_dataset\n",
  13 + "\n",
  14 + "from IPython.display import display\n",
  15 + "\n",
  16 + "import sys\n",
  17 + "sys.path.append('../')\n",
  18 + "from neural_parser import hybrid_tree_utils"
  19 + ]
  20 + },
  21 + {
  22 + "cell_type": "code",
  23 + "execution_count": 2,
  24 + "id": "fecef4af",
  25 + "metadata": {},
  26 + "outputs": [
  27 + {
  28 + "name": "stderr",
  29 + "output_type": "stream",
  30 + "text": [
  31 + "Found cached dataset pdb_c_beta (/home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1)\n"
  32 + ]
  33 + },
  34 + {
  35 + "data": {
  36 + "application/vnd.jupyter.widget-view+json": {
  37 + "model_id": "d6fc0deda216433982f304d7451158b2",
  38 + "version_major": 2,
  39 + "version_minor": 0
  40 + },
  41 + "text/plain": [
  42 + " 0%| | 0/3 [00:00<?, ?it/s]"
  43 + ]
  44 + },
  45 + "metadata": {},
  46 + "output_type": "display_data"
  47 + }
  48 + ],
  49 + "source": [
  50 + "pdbc_dataset = load_dataset('../pdb_c_beta/')"
  51 + ]
  52 + },
  53 + {
  54 + "cell_type": "code",
  55 + "execution_count": 3,
  56 + "id": "23da801f",
  57 + "metadata": {},
  58 + "outputs": [],
  59 + "source": [
import shutil

# Directory that receives the bracketed-tree exports written by the next cell.
BRACKETS_DIR = 'brackets'

# Start every run from an empty directory. Unlike `rm -r` / `mkdir`, this does
# not complain when the directory is missing (fresh checkout) and does not
# depend on a POSIX shell being available.
shutil.rmtree(BRACKETS_DIR, ignore_errors=True)
os.makedirs(BRACKETS_DIR, exist_ok=True)
  63 + ]
  64 + },
  65 + {
  66 + "cell_type": "code",
  67 + "execution_count": 5,
  68 + "id": "c105feff",
  69 + "metadata": {},
  70 + "outputs": [
  71 + {
  72 + "name": "stdout",
  73 + "output_type": "stream",
  74 + "text": [
  75 + "train\n",
  76 + " brackets/pdbc-cont-train.dat\n",
  77 + " 15903\n",
  78 + "validation\n",
  79 + " brackets/pdbc-cont-validation.dat\n",
  80 + " 1980\n",
  81 + "test\n",
  82 + " brackets/pdbc-cont-test.dat\n",
  83 + " 1990\n"
  84 + ]
  85 + }
  86 + ],
  87 + "source": [
# Feature schema of the dataset; passed to the tree builder for every split.
features = pdbc_dataset['train'].features

# For each split, export the trees for which `is_continuous()` is true,
# one bracketed tree per line, each wrapped in an extra TOP node.
for part, dataset in pdbc_dataset.items():
    print(part)
    b_cont = []
    for sentence in dataset:
        tree = hybrid_tree_utils.tree_from_dataset_instance(sentence, features)
        if tree.is_continuous():
            b_cont.append(f'(TOP {tree.to_brackets(morph_tags=True)})')
    filepath = os.path.join(BRACKETS_DIR, f'pdbc-cont-{part}.dat')
    print(' ', filepath)
    print(' ', len(b_cont))
    # The rows contain Polish text: write UTF-8 explicitly instead of relying
    # on the platform's default encoding.
    with open(filepath, 'w', encoding='utf-8') as f:
        for row in b_cont:
            print(row, file=f)
  103 + ]
  104 + },
  105 + {
  106 + "cell_type": "code",
  107 + "execution_count": 6,
  108 + "id": "c849233c",
  109 + "metadata": {},
  110 + "outputs": [
  111 + {
  112 + "name": "stdout",
  113 + "output_type": "stream",
  114 + "text": [
  115 + " 1990 121784 1024525 brackets/pdbc-cont-test.dat\n",
  116 + " 15903 1022627 8620535 brackets/pdbc-cont-train.dat\n",
  117 + " 1980 126288 1065593 brackets/pdbc-cont-validation.dat\n",
  118 + " 19873 1270699 10710653 total\n"
  119 + ]
  120 + }
  121 + ],
  122 + "source": [
  123 + "! wc {BRACKETS_DIR}/*.dat"
  124 + ]
  125 + },
  126 + {
  127 + "cell_type": "code",
  128 + "execution_count": 8,
  129 + "id": "679b9f10",
  130 + "metadata": {},
  131 + "outputs": [
  132 + {
  133 + "name": "stdout",
  134 + "output_type": "stream",
  135 + "text": [
  136 + "(TOP (ROOT (*S (S (NP (AdjP (*Adj (adj:sg:nom:f:pos Skośnooka))) (*NP (*N (subst:sg:nom:f dziewczynka)))) (*VP (*V (fin:sg:ter:imperf trzyma))) (PrepNP (*Prep (prep:loc:nwok w)) (NP (*N (subst:pl:loc:f rękach)))) (NP (AdjP (*Adj (adj:pl:nom:f:pos drewniane))) (*NP (*N (subst:pl:nom:f pałeczki))))) (Punct (interp ,)) (*Conj (conj a)) (S (PrepNP (*Prep (prep:inst:nwok przed)) (NP (*N (ppron3:sg:inst:f:ter:akc:praep nią)))) (*VP (*V (fin:pl:ter:imperf znajdują))) (Part (part się)) (NP (*NP (*N (subst:pl:nom:n:ncol naczynia))) (AdjP (*Adj (adj:pl:nom:n:pos kuchenne)))))) (Punct (interp .))))\r\n",
  137 + "(TOP (ROOT (*S (NP (*NP (*N (subst:sg:nom:n:col Dziecko))) (PrepNP (*Prep (prep:loc:nwok w)) (NP (AdjP (*Adj (adj:sg:loc:f:pos różowej))) (*NP (*N (subst:sg:loc:f opasce)))))) (*VP (*V (fin:sg:ter:imperf unosi))) (PrepNP (*Prep (prep:loc:nwok w)) (NP (*N (subst:pl:loc:f rękach)))) (NP (AdjP (*Adj (adj:pl:acc:m3:pos drewniane))) (*NP (*N (subst:pl:acc:m3 patyczki)))) (PrepNP (*Prep (prep:inst:nwok nad)) (NP (AdjP (*AdjP (*Adj (ppas:pl:inst:n:perf:aff postawionymi))) (NP (PrepNP (*Prep (prep:gen do)) (NP (*N (subst:sg:gen:f góry)))) (*NP (*N (subst:sg:inst:n:ncol dnem))))) (*NP (NP (*N (subst:sg:inst:f miską))) (*Conj (conj i)) (NP (*N (subst:sg:inst:m3 garnkiem))))))) (Punct (interp .))))\r\n",
  138 + "(TOP (ROOT (*S (NP (*NP (*N (subst:pl:nom:m1 Zawodnicy))) (PrepNP (*Prep (prep:loc:nwok w)) (NP (*NP (*N (subst:sg:loc:n:ncol pobliżu))) (NP (*N (subst:sg:gen:f piłki)))))) (*VP (*V (fin:pl:ter:imperf przepychają))) (Part (part się)) (PrepNP (*Prep (prep:inst między)) (NP (*N (siebie:inst sobą)))) (PrepNP (*Prep (prep:loc na)) (NP (*N (subst:sg:loc:n:ncol boisku))))) (Punct (interp .))))\r\n",
  139 + "(TOP (ROOT (*S (S (NP (*NP (*N (subst:sg:nom:f Dziewczynka))) (PrepNP (*Prep (prep:loc:nwok w)) (NP (*N (subst:sg:loc:f sukience))))) (*VP (*V (fin:sg:ter:imperf puszcza))) (NP (*NP (*N (subst:pl:acc:f bańki))) (AdjP (*Adj (adj:pl:acc:f:pos mydlane)))) (PrepNP (*Prep (prep:loc na)) (NP (*N (subst:sg:loc:f trawie))))) (Punct (interp ,)) (*Conj (conj a)) (S (PrepNP (*Prep (prep:inst za)) (NP (*N (ppron3:sg:inst:f:ter:akc:praep nią)))) (*VP (*V (fin:sg:ter:imperf stoi))) (NP (AdjP (*Adj (adj:sg:nom:f:pos druga))) (*NP (*N (subst:sg:nom:f dziewczynka)))))) (Punct (interp .))))\r\n",
  140 + "(TOP (ROOT (*S (NP (*NP (*N (subst:pl:nom:f Dziewczynki))) (PrepNP (*Prep (prep:loc:nwok w)) (NP (AdjP (*Adj (adj:pl:loc:f:pos kolorowych))) (*NP (*N (subst:pl:loc:f sukienkach)))))) (*VP (*V (fin:pl:ter:imperf stoją))) (PrepNP (*Prep (prep:loc na)) (NP (*N (subst:sg:loc:f trawie)))) (VP (Punct (interp ,)) (*VP (*V (pcon:imperf puszczając))) (NP (*NP (*N (subst:pl:acc:f bańki))) (AdjP (*Adj (adj:pl:acc:f:pos mydlane)))))) (Punct (interp .))))\r\n",
  141 + "(TOP (ROOT (*S (NP (*NP (*N (subst:sg:nom:f Grupa))) (NP (*N (subst:pl:gen:n:col dzieci)))) (*VP (*V (fin:sg:ter:imperf moczy))) (Part (part się)) (PrepNP (*Prep (prep:loc:nwok w)) (NP (*NP (*N (subst:sg:loc:f wodzie))) (PrepNP (*Prep (prep:gen:nwok z)) (NP (*N (subst:sg:gen:f fontanny))))))) (Punct (interp .))))\r\n",
  142 + "(TOP (ROOT (*S (NP (*NumP (*Num (num:pl:nom:m1:rec:ncol Kilku))) (NP (*N (subst:pl:gen:m1 chłopców)))) (*VP (*V (fin:sg:ter:imperf kąpie))) (Part (part się)) (PrepNP (*Prep (prep:loc:nwok w)) (NP (*NP (*N (subst:sg:loc:f fontannie))) (PrepNP (*Prep (prep:gen obok)) (NP (*NP (*N (subst:pl:gen:m3 stolików))) (CP (Punct (interp ,)) (*S (PrepAdjP (*Prep (prep:loc przy)) (AdjP (*Adj (adj:pl:loc:m3:pos których)))) (*VP (*V (fin:pl:ter:imperf siedzą))) (NP (*N (subst:pl:nom:m1 ludzie)))))))))) (Punct (interp .))))\r\n",
  143 + "(TOP (ROOT (*S (NP (*NP (*N (subst:sg:nom:f Dwójka))) (NP (AdjP (*Adj (adj:pl:gen:n:pos nagich))) (*NP (*N (subst:pl:gen:n:col dzieci))) (AdjP (*AdjP (*Adj (ppas:pl:gen:n:perf:aff ubrudzonych))) (NP (*N (subst:pl:inst:f farbkami)))))) (*VP (*V (fin:sg:ter:imperf siedzi))) (PrepNP (*Prep (prep:loc na)) (NP (*N (subst:sg:loc:f podłodze)))) (PrepNP (*Prep (prep:gen obok)) (NP (AdjP (*Adj (adj:pl:gen:f:pos porozrzucanych))) (*NP (*N (subst:pl:gen:f kartek)))))) (Punct (interp .))))\r\n",
  144 + "(TOP (ROOT (*S (S (NP (*NumP (*Num (num:pl:nom:n:rec:col Dwoje))) (NP (AdjP (AdjP (*Adj (adj:pl:gen:n:pos nagich))) (*Conj (interp ,)) (AdjP (*Adj (adj:pl:gen:n:pos małych)))) (*NP (*N (subst:pl:gen:n:col dzieci))))) (*VP (*V (fin:sg:ter:imperf siedzi))) (PrepNP (*Prep (prep:gen naprzeciwko)) (NP (*N (siebie:gen siebie))))) (*Conj (conj i)) (S (NP (AdjP (*Adj (adj:sg:nom:n:com większe))) (*NP (*N (subst:sg:nom:n:col dziecko)))) (*VP (*V (fin:sg:ter:imperf smaruje))) (NP (*N (subst:sg:inst:f farbą))) (NP (AdjP (*Adj (adj:sg:acc:n:com mniejsze))) (*NP (*N (subst:sg:acc:n:col dziecko)))))) (Punct (interp .))))\r\n",
  145 + "(TOP (ROOT (*S (NP (*NP (*N (subst:sg:nom:f Dziewczynka))) (PrepNP (*Prep (prep:loc o)) (NP (AdjP (*Adj (adj:pl:loc:n:pos ciemnych))) (*NP (*N (subst:pl:loc:n:col oczach)))))) (*VP (*V (fin:sg:ter:imperf patrzy))) (PrepNP (*Prep (prep:acc na)) (NP (AdjP (*Adj (adj:sg:acc:m3:pos czarny))) (*NP (*N (subst:sg:acc:m3 przedmiot))) (CP (Punct (interp ,)) (*S (AdjP (*Adj (adj:sg:acc:m3:pos który))) (*VP (*V (fin:sg:ter:imperf trzyma))) (PrepNP (*Prep (prep:loc:nwok w)) (NP (*N (subst:sg:loc:f ręce))))))))) (Punct (interp .))))\r\n"
  146 + ]
  147 + }
  148 + ],
  149 + "source": [
  150 + "! head {BRACKETS_DIR}/pdbc-cont-train.dat"
  151 + ]
  152 + }
  153 + ],
  154 + "metadata": {
  155 + "kernelspec": {
  156 + "display_name": "TF_zajecia",
  157 + "language": "python",
  158 + "name": "tf_zajecia"
  159 + },
  160 + "language_info": {
  161 + "codemirror_mode": {
  162 + "name": "ipython",
  163 + "version": 3
  164 + },
  165 + "file_extension": ".py",
  166 + "mimetype": "text/x-python",
  167 + "name": "python",
  168 + "nbconvert_exporter": "python",
  169 + "pygments_lexer": "ipython3",
  170 + "version": "3.10.6"
  171 + }
  172 + },
  173 + "nbformat": 4,
  174 + "nbformat_minor": 5
  175 +}
... ...
BeNePar/TrainAndParse.ipynb 0 → 100644
  1 +++ a/BeNePar/TrainAndParse.ipynb
  1 +{
  2 + "cells": [
  3 + {
  4 + "cell_type": "code",
  5 + "execution_count": 5,
  6 + "id": "d8404675",
  7 + "metadata": {},
  8 + "outputs": [],
  9 + "source": [
  10 + "#BENEPAR = '/home/kkrasnowska/benepar_experiments/self-attentive-parser/src/main.py'"
  11 + ]
  12 + },
  13 + {
  14 + "cell_type": "code",
  15 + "execution_count": 6,
  16 + "id": "88603098",
  17 + "metadata": {},
  18 + "outputs": [],
  19 + "source": [
  20 + "#! mkdir models"
  21 + ]
  22 + },
  23 + {
  24 + "cell_type": "code",
  25 + "execution_count": 7,
  26 + "id": "d5aedb53",
  27 + "metadata": {},
  28 + "outputs": [],
  29 + "source": [
  30 + "#! python {BENEPAR} train \\\n",
  31 + "# --train-path brackets/pdbc-cont-train.dat \\\n",
  32 + "# --dev-path brackets/pdbc-cont-validation.dat \\\n",
  33 + "# --evalb-dir /home/kkrasnowska/benepar_experiments/self-attentive-parser/EVALB_SPMRL \\\n",
  34 + "# --use-pretrained --pretrained-model \"allegro/herbert-large-cased\" \\\n",
  35 + "# --use-encoder --num-layers 2 \\\n",
  36 + "# --predict-tags \\\n",
  37 + "# --model-path-base models"
  38 + ]
  39 + },
  40 + {
  41 + "cell_type": "code",
  42 + "execution_count": 8,
  43 + "id": "3f6aaf27",
  44 + "metadata": {},
  45 + "outputs": [],
  46 + "source": [
  47 + "from IPython.display import display, HTML"
  48 + ]
  49 + },
  50 + {
  51 + "cell_type": "code",
  52 + "execution_count": 9,
  53 + "id": "8d9d5103",
  54 + "metadata": {},
  55 + "outputs": [],
  56 + "source": [
  57 + "import benepar\n",
  58 + "import nltk\n",
  59 + "import spacy"
  60 + ]
  61 + },
  62 + {
  63 + "cell_type": "code",
  64 + "execution_count": 10,
  65 + "id": "c56eda57",
  66 + "metadata": {},
  67 + "outputs": [
  68 + {
  69 + "name": "stderr",
  70 + "output_type": "stream",
  71 + "text": [
  72 + "Some weights of the model checkpoint at allegro/herbert-large-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.sso.sso_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.sso.sso_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias']\n",
  73 + "- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
  74 + "- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
  75 + ]
  76 + }
  77 + ],
  78 + "source": [
  79 + "MODEL = 'models_dev=97.36.pt'\n",
  80 + "parser = benepar.Parser(MODEL)"
  81 + ]
  82 + },
  83 + {
  84 + "cell_type": "code",
  85 + "execution_count": null,
  86 + "id": "35ffd9af",
  87 + "metadata": {},
  88 + "outputs": [],
  89 + "source": []
  90 + },
  91 + {
  92 + "cell_type": "code",
  93 + "execution_count": 11,
  94 + "id": "06ae821c",
  95 + "metadata": {},
  96 + "outputs": [],
  97 + "source": [
def postprocess(tree):
    """Undo bracket escaping in a tree, modifying it in place.

    In the label of every subtree, ``LPAR`` / ``RPAR`` are replaced with
    ``(`` / ``)``. In every string child (leaf token), ``-LSB-`` / ``-RSB-``
    are replaced with ``[`` / ``]``.

    Args:
        tree: a tree whose ``subtrees()`` yields nodes supporting ``label()``,
            ``set_label()``, iteration and item assignment (the notebook
            passes ``nltk.Tree`` objects).

    Returns:
        The same ``tree`` object, after modification.
    """
    for node in tree.subtrees():
        label = node.label()
        node.set_label(label.replace('LPAR', '(').replace('RPAR', ')'))
        for i, child in enumerate(node):
            # isinstance (rather than an exact type comparison) also covers
            # leaves that are instances of a str subclass.
            if isinstance(child, str):
                node[i] = child.replace('-LSB-', '[').replace('-RSB-', ']')
    return tree
  106 + "\n",
def parse_tokenized_sentences(sentences, parser):
    """Parse already-tokenized sentences and post-process the resulting trees.

    Args:
        sentences: iterable of token sequences; each one is passed as
            ``words`` to ``benepar.InputSentence``.
        parser: object exposing ``parse_sents()`` (the notebook passes a
            ``benepar.Parser``).

    Returns:
        A list with one post-processed tree per parsed sentence.
    """
    # Build the parser inputs once; the original built an identical list a
    # first time and discarded it.
    inputs = [benepar.InputSentence(words=tokens) for tokens in sentences]
    return [postprocess(tree) for tree in parser.parse_sents(inputs)]
  112 + "\n",
def parse_sentence(sentence, parser):
    """Parse a single sentence given as a whitespace-separated token string.

    The string is split on whitespace and parsed as a one-sentence batch;
    the first (only) resulting tree is returned.
    """
    tokens = sentence.split()
    parsed = parse_tokenized_sentences([tokens], parser)
    return parsed[0]
  115 + ]
  116 + },
  117 + {
  118 + "cell_type": "code",
  119 + "execution_count": 12,
  120 + "id": "c96dc9d9",
  121 + "metadata": {},
  122 + "outputs": [
  123 + {
  124 + "name": "stderr",
  125 + "output_type": "stream",
  126 + "text": [
  127 + "You're using a HerbertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n",
  128 + "/home/kkrasnowska/venvs/torch_benepar/lib/python3.10/site-packages/torch/distributions/distribution.py:44: UserWarning: <class 'torch_struct.distributions.TreeCRF'> does not define `arg_constraints`. Please set `arg_constraints = {}` or initialize the distribution with `validate_args=False` to turn off validation.\n",
  129 + " warnings.warn(f'{self.__class__} does not define `arg_constraints`. ' +\n"
  130 + ]
  131 + },
  132 + {
  133 + "data": {
  134 + "image/svg+xml": [
  135 + "<svg baseProfile=\"full\" height=\"312px\" preserveAspectRatio=\"xMidYMid meet\" style=\"font-family: times, serif; font-weight:normal; font-style: normal; font-size: 16px;\" version=\"1.1\" viewBox=\"0,0,512.0,312.0\" width=\"512px\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:ev=\"http://www.w3.org/2001/xml-events\" xmlns:xlink=\"http://www.w3.org/1999/xlink\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">TOP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">ROOT</text></svg><svg width=\"12.5%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">Punct</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">interp</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">[</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"6.25%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"75%\" x=\"12.5%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*S</text></svg><svg width=\"35.4167%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">NP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*N</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" 
y=\"1em\">subst:pl:nom:m2</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">Koty</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"17.7083%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"12.5%\" x=\"35.4167%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*VP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*V</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">pred</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">to</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"41.6667%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"35.4167%\" x=\"47.9167%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">NP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*N</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">subst:pl:nom:m1</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs 
/><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">złodzieje</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"65.625%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"16.6667%\" x=\"83.3333%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">Punct</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">interp</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">.</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"91.6667%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"12.5%\" x=\"87.5%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">Punct</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">interp</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">]</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"93.75%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" 
y1=\"1.2em\" y2=\"3em\" /></svg>"
  136 + ],
  137 + "text/plain": [
  138 + "Tree('TOP', [Tree('ROOT', [Tree('Punct', [Tree('interp', ['['])]), Tree('*S', [Tree('NP', [Tree('*N', [Tree('subst:pl:nom:m2', ['Koty'])])]), Tree('*VP', [Tree('*V', [Tree('pred', ['to'])])]), Tree('NP', [Tree('*N', [Tree('subst:pl:nom:m1', ['złodzieje'])])]), Tree('Punct', [Tree('interp', ['.'])])]), Tree('Punct', [Tree('interp', [']'])])])])"
  139 + ]
  140 + },
  141 + "execution_count": 12,
  142 + "metadata": {},
  143 + "output_type": "execute_result"
  144 + }
  145 + ],
  146 + "source": [
  147 + "parse_sentence('[ Koty to złodzieje . ]', parser)"
  148 + ]
  149 + },
  150 + {
  151 + "cell_type": "code",
  152 + "execution_count": 13,
  153 + "id": "d62d1e31",
  154 + "metadata": {},
  155 + "outputs": [
  156 + {
  157 + "data": {
  158 + "image/svg+xml": [
  159 + "<svg baseProfile=\"full\" height=\"312px\" preserveAspectRatio=\"xMidYMid meet\" style=\"font-family: times, serif; font-weight:normal; font-style: normal; font-size: 16px;\" version=\"1.1\" viewBox=\"0,0,560.0,312.0\" width=\"560px\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:ev=\"http://www.w3.org/2001/xml-events\" xmlns:xlink=\"http://www.w3.org/1999/xlink\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">TOP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">ROOT</text></svg><svg width=\"88.5714%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*S</text></svg><svg width=\"72.5806%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*VP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*V</text></svg><svg width=\"44.4444%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">praet:sg:m1:imperf</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">Widział</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"22.2222%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"55.5556%\" x=\"44.4444%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">aglt:sg:pri:imperf:nwok</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">am</text></svg></svg><line 
stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"72.2222%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"36.2903%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"27.4194%\" x=\"72.5806%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">NP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*N</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">subst:sg:acc:m2</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">kotka</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"86.2903%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"44.2857%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"11.4286%\" x=\"88.5714%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">Punct</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">interp</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">.</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" 
/></svg><line stroke=\"black\" x1=\"50%\" x2=\"94.2857%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg>"
  160 + ],
  161 + "text/plain": [
  162 + "Tree('TOP', [Tree('ROOT', [Tree('*S', [Tree('*VP', [Tree('*V', [Tree('praet:sg:m1:imperf', ['Widział']), Tree('aglt:sg:pri:imperf:nwok', ['am'])])]), Tree('NP', [Tree('*N', [Tree('subst:sg:acc:m2', ['kotka'])])])]), Tree('Punct', [Tree('interp', ['.'])])])])"
  163 + ]
  164 + },
  165 + "execution_count": 13,
  166 + "metadata": {},
  167 + "output_type": "execute_result"
  168 + }
  169 + ],
  170 + "source": [
  171 + "parse_sentence('Widział am kotka .', parser)"
  172 + ]
  173 + },
  174 + {
  175 + "cell_type": "code",
  176 + "execution_count": 14,
  177 + "id": "418db531",
  178 + "metadata": {},
  179 + "outputs": [],
  180 + "source": [
def _read_bracketed_trees(path):
    """Read one bracketed tree per line from ``path`` and post-process each.

    The file is read as UTF-8 explicitly (it contains Polish text) rather
    than with the platform's default encoding.
    """
    with open(path, encoding='utf-8') as f:
        # Iterate the file lazily; only the trailing newline is stripped.
        return [postprocess(nltk.Tree.fromstring(line.strip('\n'))) for line in f]


val_trees = _read_bracketed_trees('brackets/pdbc-cont-validation.dat')
test_trees = _read_bracketed_trees('brackets/pdbc-cont-test.dat')
  185 + ]
  186 + },
  187 + {
  188 + "cell_type": "code",
  189 + "execution_count": 15,
  190 + "id": "2e5f4739",
  191 + "metadata": {},
  192 + "outputs": [],
  193 + "source": [
  194 + "val_sentences = [tree.leaves() for tree in val_trees]\n",
  195 + "test_sentences = [tree.leaves() for tree in test_trees]"
  196 + ]
  197 + },
  198 + {
  199 + "cell_type": "code",
  200 + "execution_count": 16,
  201 + "id": "6c52ef3f",
  202 + "metadata": {},
  203 + "outputs": [
  204 + {
  205 + "name": "stderr",
  206 + "output_type": "stream",
  207 + "text": [
  208 + "/home/kkrasnowska/venvs/torch_benepar/lib/python3.10/site-packages/torch/distributions/distribution.py:44: UserWarning: <class 'torch_struct.distributions.TreeCRF'> does not define `arg_constraints`. Please set `arg_constraints = {}` or initialize the distribution with `validate_args=False` to turn off validation.\n",
  209 + " warnings.warn(f'{self.__class__} does not define `arg_constraints`. ' +\n"
  210 + ]
  211 + }
  212 + ],
  213 + "source": [
  214 + "val_pred_trees = parse_tokenized_sentences(val_sentences, parser)\n",
  215 + "test_pred_trees = parse_tokenized_sentences(test_sentences, parser)"
  216 + ]
  217 + },
  218 + {
  219 + "cell_type": "code",
  220 + "execution_count": 17,
  221 + "id": "d6d45ba8",
  222 + "metadata": {},
  223 + "outputs": [],
  224 + "source": [
  225 + "assert(len(val_trees) == len(val_pred_trees))\n",
  226 + "assert(len(test_trees) == len(test_pred_trees))"
  227 + ]
  228 + },
  229 + {
  230 + "cell_type": "code",
  231 + "execution_count": 18,
  232 + "id": "399c3f08",
  233 + "metadata": {},
  234 + "outputs": [],
  235 + "source": [
def _drop_top(tree):
    """Return the first child of a tree rooted at ``TOP``, else ``tree`` unchanged."""
    return tree[0] if tree.label() == 'TOP' else tree


# drop the TOP
# The lists are rebound to the same names, so the unwrapping is guarded by
# the root label: re-running this cell must not strip a further level
# (e.g. ROOT) from trees that were already unwrapped.
val_trees = [_drop_top(t) for t in val_trees]
test_trees = [_drop_top(t) for t in test_trees]
val_pred_trees = [_drop_top(t) for t in val_pred_trees]
test_pred_trees = [_drop_top(t) for t in test_pred_trees]
  241 + ]
  242 + },
  243 + {
  244 + "cell_type": "code",
  245 + "execution_count": 19,
  246 + "id": "827be810",
  247 + "metadata": {},
  248 + "outputs": [
  249 + {
  250 + "data": {
  251 + "image/svg+xml": [
  252 + "<svg baseProfile=\"full\" height=\"504px\" preserveAspectRatio=\"xMidYMid meet\" style=\"font-family: times, serif; font-weight:normal; font-style: normal; font-size: 16px;\" version=\"1.1\" viewBox=\"0,0,2808.0,504.0\" width=\"2808px\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:ev=\"http://www.w3.org/2001/xml-events\" xmlns:xlink=\"http://www.w3.org/1999/xlink\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">ROOT</text></svg><svg width=\"97.7208%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*S</text></svg><svg width=\"41.691%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">S</text></svg><svg width=\"22.3776%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">PrepNP</text></svg><svg width=\"46.875%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*Prep</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">prep:loc:nwok</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">W</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"23.4375%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"53.125%\" x=\"46.875%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">NP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" 
y=\"1em\">*N</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">subst:sg:loc:m3</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">samolocie</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"73.4375%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"11.1888%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"13.986%\" x=\"22.3776%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*VP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*V</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">praet:sg:m1:imperf</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">czytał</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"29.3706%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"63.6364%\" x=\"36.3636%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">NP</text></svg><svg width=\"18.6813%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text 
text-anchor=\"middle\" x=\"50%\" y=\"1em\">*NP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*N</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">subst:pl:acc:m3</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">wycinki</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"9.34066%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"81.3187%\" x=\"18.6813%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">PrepNP</text></svg><svg width=\"20.2703%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*Prep</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">prep:gen:nwok</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">z</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"10.1351%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"79.7297%\" x=\"20.2703%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">NP</text></svg><svg width=\"27.1186%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" 
x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*NP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*N</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">subst:sg:gen:f</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">prasy</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"13.5593%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"72.8814%\" x=\"27.1186%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">AdjP</text></svg><svg width=\"41.8605%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">AdjP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*Adj</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">adj:sg:gen:f:pos</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">polskiej</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" 
x2=\"20.9302%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"16.2791%\" x=\"41.8605%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*Conj</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">conj</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">i</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"41.8605%\" x=\"58.1395%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">AdjP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*Adj</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">adj:sg:gen:f:pos</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">polonijnej</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"79.0698%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"63.5593%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"60.1351%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"59.3407%\" y1=\"1.2em\" y2=\"3em\" /></svg><line 
stroke=\"black\" x1=\"50%\" x2=\"68.1818%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"20.8455%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"2.33236%\" x=\"41.691%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*Conj</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">interp</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">-</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"42.8571%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"55.9767%\" x=\"44.0233%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">S</text></svg><svg width=\"17.7083%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">PrepAdjP</text></svg><svg width=\"44.1176%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*Prep</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">prep:loc:nwok</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">w</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"22.0588%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"55.8824%\" x=\"44.1176%\" y=\"3em\"><defs /><svg 
width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">AdjP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*Adj</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">adj:sg:loc:m3:pos</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">każdym</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"72.0588%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"8.85417%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"6.77083%\" x=\"17.7083%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*VP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*V</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">imps:imperf</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">piętnowano</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"21.0938%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"17.7083%\" 
x=\"24.4792%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">NP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*N</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">ppron3:sg:acc:m1:ter:nakc:npraep</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">go</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"33.3333%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"57.8125%\" x=\"42.1875%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">Compar</text></svg><svg width=\"6.30631%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*Comp</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">comp</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">jako</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"3.15315%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"93.6937%\" x=\"6.30631%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" 
y=\"1em\">AdjP</text></svg><svg width=\"7.69231%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">Punct</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">interp</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">\"</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"3.84615%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"30.7692%\" x=\"7.69231%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">PrepNP</text></svg><svg width=\"46.875%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*Prep</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">prep:gen:nwok</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">bez</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"23.4375%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"53.125%\" x=\"46.875%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">NP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*N</text></svg><svg width=\"100%\" x=\"0%\" 
y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">subst:sg:gen:m3</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">umiaru</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"73.4375%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"23.0769%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"24.0385%\" x=\"38.4615%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*AdjP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*Adj</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">ppas:sg:acc:m1:perf:aff</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">zapatrzonego</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50.4808%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"29.8077%\" x=\"62.5%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">PrepNP</text></svg><svg width=\"48.3871%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" 
y=\"1em\">*Prep</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">prep:acc:nwok</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">w</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"24.1935%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"51.6129%\" x=\"48.3871%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">NP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*N</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">subst:sg:acc:f</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">Moskwę</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"74.1935%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"77.4038%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"7.69231%\" x=\"92.3077%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">Punct</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">interp</text></svg><svg width=\"100%\" x=\"0%\" 
y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">\"</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"96.1538%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"53.1532%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"71.0938%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"72.0117%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"48.8604%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"2.2792%\" x=\"97.7208%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">Punct</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">interp</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">.</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"98.8604%\" y1=\"1.2em\" y2=\"3em\" /></svg>"
  253 + ],
  254 + "text/plain": [
  255 + "Tree('ROOT', [Tree('*S', [Tree('S', [Tree('PrepNP', [Tree('*Prep', [Tree('prep:loc:nwok', ['W'])]), Tree('NP', [Tree('*N', [Tree('subst:sg:loc:m3', ['samolocie'])])])]), Tree('*VP', [Tree('*V', [Tree('praet:sg:m1:imperf', ['czytał'])])]), Tree('NP', [Tree('*NP', [Tree('*N', [Tree('subst:pl:acc:m3', ['wycinki'])])]), Tree('PrepNP', [Tree('*Prep', [Tree('prep:gen:nwok', ['z'])]), Tree('NP', [Tree('*NP', [Tree('*N', [Tree('subst:sg:gen:f', ['prasy'])])]), Tree('AdjP', [Tree('AdjP', [Tree('*Adj', [Tree('adj:sg:gen:f:pos', ['polskiej'])])]), Tree('*Conj', [Tree('conj', ['i'])]), Tree('AdjP', [Tree('*Adj', [Tree('adj:sg:gen:f:pos', ['polonijnej'])])])])])])])]), Tree('*Conj', [Tree('interp', ['-'])]), Tree('S', [Tree('PrepAdjP', [Tree('*Prep', [Tree('prep:loc:nwok', ['w'])]), Tree('AdjP', [Tree('*Adj', [Tree('adj:sg:loc:m3:pos', ['każdym'])])])]), Tree('*VP', [Tree('*V', [Tree('imps:imperf', ['piętnowano'])])]), Tree('NP', [Tree('*N', [Tree('ppron3:sg:acc:m1:ter:nakc:npraep', ['go'])])]), Tree('Compar', [Tree('*Comp', [Tree('comp', ['jako'])]), Tree('AdjP', [Tree('Punct', [Tree('interp', ['\"'])]), Tree('PrepNP', [Tree('*Prep', [Tree('prep:gen:nwok', ['bez'])]), Tree('NP', [Tree('*N', [Tree('subst:sg:gen:m3', ['umiaru'])])])]), Tree('*AdjP', [Tree('*Adj', [Tree('ppas:sg:acc:m1:perf:aff', ['zapatrzonego'])])]), Tree('PrepNP', [Tree('*Prep', [Tree('prep:acc:nwok', ['w'])]), Tree('NP', [Tree('*N', [Tree('subst:sg:acc:f', ['Moskwę'])])])]), Tree('Punct', [Tree('interp', ['\"'])])])])])]), Tree('Punct', [Tree('interp', ['.'])])])"
  256 + ]
  257 + },
  258 + "execution_count": 19,
  259 + "metadata": {},
  260 + "output_type": "execute_result"
  261 + }
  262 + ],
  263 + "source": [
  264 + "val_trees[504]"
  265 + ]
  266 + },
  267 + {
  268 + "cell_type": "code",
  269 + "execution_count": 20,
  270 + "id": "1059e782",
  271 + "metadata": {},
  272 + "outputs": [
  273 + {
  274 + "data": {
  275 + "image/svg+xml": [
  276 + "<svg baseProfile=\"full\" height=\"504px\" preserveAspectRatio=\"xMidYMid meet\" style=\"font-family: times, serif; font-weight:normal; font-style: normal; font-size: 16px;\" version=\"1.1\" viewBox=\"0,0,2808.0,504.0\" width=\"2808px\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:ev=\"http://www.w3.org/2001/xml-events\" xmlns:xlink=\"http://www.w3.org/1999/xlink\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">ROOT</text></svg><svg width=\"97.7208%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*S</text></svg><svg width=\"41.691%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">S</text></svg><svg width=\"22.3776%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">PrepNP</text></svg><svg width=\"46.875%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*Prep</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">prep:loc:nwok</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">W</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"23.4375%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"53.125%\" x=\"46.875%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">NP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" 
y=\"1em\">*N</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">subst:sg:loc:m3</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">samolocie</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"73.4375%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"11.1888%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"13.986%\" x=\"22.3776%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*VP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*V</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">praet:sg:m1:imperf</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">czytał</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"29.3706%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"63.6364%\" x=\"36.3636%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">NP</text></svg><svg width=\"18.6813%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text 
text-anchor=\"middle\" x=\"50%\" y=\"1em\">*NP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*N</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">subst:pl:acc:m3</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">wycinki</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"9.34066%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"81.3187%\" x=\"18.6813%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">PrepNP</text></svg><svg width=\"20.2703%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*Prep</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">prep:gen:nwok</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">z</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"10.1351%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"79.7297%\" x=\"20.2703%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">NP</text></svg><svg width=\"27.1186%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" 
x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*NP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*N</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">subst:sg:gen:f</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">prasy</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"13.5593%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"72.8814%\" x=\"27.1186%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">AdjP</text></svg><svg width=\"41.8605%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">AdjP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*Adj</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">adj:sg:gen:f:pos</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">polskiej</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" 
x2=\"20.9302%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"16.2791%\" x=\"41.8605%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*Conj</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">conj</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">i</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"41.8605%\" x=\"58.1395%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">AdjP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*Adj</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">adj:sg:gen:f:pos</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">polonijnej</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"79.0698%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"63.5593%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"60.1351%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"59.3407%\" y1=\"1.2em\" y2=\"3em\" /></svg><line 
stroke=\"black\" x1=\"50%\" x2=\"68.1818%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"20.8455%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"2.33236%\" x=\"41.691%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*Conj</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">interp</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">-</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"42.8571%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"55.9767%\" x=\"44.0233%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">S</text></svg><svg width=\"17.7083%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">PrepAdjP</text></svg><svg width=\"44.1176%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*Prep</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">prep:loc:nwok</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">w</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"22.0588%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"55.8824%\" x=\"44.1176%\" y=\"3em\"><defs /><svg 
width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">AdjP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*Adj</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">adj:sg:loc:m3:pos</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">każdym</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"72.0588%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"8.85417%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"6.77083%\" x=\"17.7083%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*VP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*V</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">imps:imperf</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">piętnowano</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"21.0938%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"17.7083%\" 
x=\"24.4792%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">NP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*N</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">ppron3:sg:acc:m1:ter:nakc:npraep</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">go</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"33.3333%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"57.8125%\" x=\"42.1875%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">Compar</text></svg><svg width=\"6.30631%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*Comp</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">comp</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">jako</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"3.15315%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"93.6937%\" x=\"6.30631%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" 
y=\"1em\">AdjP</text></svg><svg width=\"7.69231%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">Punct</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">interp</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">\"</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"3.84615%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"30.7692%\" x=\"7.69231%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">PrepNP</text></svg><svg width=\"46.875%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*Prep</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">prep:gen:nwok</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">bez</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"23.4375%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"53.125%\" x=\"46.875%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">NP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*N</text></svg><svg width=\"100%\" x=\"0%\" 
y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">subst:sg:gen:m3</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">umiaru</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"73.4375%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"23.0769%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"24.0385%\" x=\"38.4615%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*AdjP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*Adj</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">ppas:sg:acc:m1:perf:aff</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">zapatrzonego</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50.4808%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"29.8077%\" x=\"62.5%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">PrepNP</text></svg><svg width=\"48.3871%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" 
y=\"1em\">*Prep</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">prep:acc:nwok</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">w</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"24.1935%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"51.6129%\" x=\"48.3871%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">NP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*N</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">subst:sg:acc:f</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">Moskwę</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"74.1935%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"77.4038%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"7.69231%\" x=\"92.3077%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">Punct</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">interp</text></svg><svg width=\"100%\" x=\"0%\" 
y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">\"</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"96.1538%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"53.1532%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"71.0938%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"72.0117%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"48.8604%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"2.2792%\" x=\"97.7208%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">Punct</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">interp</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">.</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"98.8604%\" y1=\"1.2em\" y2=\"3em\" /></svg>"
  277 + ],
  278 + "text/plain": [
  279 + "Tree('ROOT', [Tree('*S', [Tree('S', [Tree('PrepNP', [Tree('*Prep', [Tree('prep:loc:nwok', ['W'])]), Tree('NP', [Tree('*N', [Tree('subst:sg:loc:m3', ['samolocie'])])])]), Tree('*VP', [Tree('*V', [Tree('praet:sg:m1:imperf', ['czytał'])])]), Tree('NP', [Tree('*NP', [Tree('*N', [Tree('subst:pl:acc:m3', ['wycinki'])])]), Tree('PrepNP', [Tree('*Prep', [Tree('prep:gen:nwok', ['z'])]), Tree('NP', [Tree('*NP', [Tree('*N', [Tree('subst:sg:gen:f', ['prasy'])])]), Tree('AdjP', [Tree('AdjP', [Tree('*Adj', [Tree('adj:sg:gen:f:pos', ['polskiej'])])]), Tree('*Conj', [Tree('conj', ['i'])]), Tree('AdjP', [Tree('*Adj', [Tree('adj:sg:gen:f:pos', ['polonijnej'])])])])])])])]), Tree('*Conj', [Tree('interp', ['-'])]), Tree('S', [Tree('PrepAdjP', [Tree('*Prep', [Tree('prep:loc:nwok', ['w'])]), Tree('AdjP', [Tree('*Adj', [Tree('adj:sg:loc:m3:pos', ['każdym'])])])]), Tree('*VP', [Tree('*V', [Tree('imps:imperf', ['piętnowano'])])]), Tree('NP', [Tree('*N', [Tree('ppron3:sg:acc:m1:ter:nakc:npraep', ['go'])])]), Tree('Compar', [Tree('*Comp', [Tree('comp', ['jako'])]), Tree('AdjP', [Tree('Punct', [Tree('interp', ['\"'])]), Tree('PrepNP', [Tree('*Prep', [Tree('prep:gen:nwok', ['bez'])]), Tree('NP', [Tree('*N', [Tree('subst:sg:gen:m3', ['umiaru'])])])]), Tree('*AdjP', [Tree('*Adj', [Tree('ppas:sg:acc:m1:perf:aff', ['zapatrzonego'])])]), Tree('PrepNP', [Tree('*Prep', [Tree('prep:acc:nwok', ['w'])]), Tree('NP', [Tree('*N', [Tree('subst:sg:acc:f', ['Moskwę'])])])]), Tree('Punct', [Tree('interp', ['\"'])])])])])]), Tree('Punct', [Tree('interp', ['.'])])])"
  280 + ]
  281 + },
  282 + "execution_count": 20,
  283 + "metadata": {},
  284 + "output_type": "execute_result"
  285 + }
  286 + ],
  287 + "source": [
  288 + "val_pred_trees[504]"
  289 + ]
  290 + },
  291 + {
  292 + "cell_type": "code",
  293 + "execution_count": 25,
  294 + "id": "4d6c7096",
  295 + "metadata": {},
  296 + "outputs": [],
  297 + "source": [
  298 + "'''\n",
  299 + "def undummy(_tree):\n",
  300 + " tree = _tree.copy(deep=True)\n",
  301 + " for node in tree.subtrees():\n",
  302 + " for i, child in enumerate(node):\n",
  303 + " if type(child) != str and child.label() == 'DUMMY_PRE':\n",
  304 + " node[i] = child[0]\n",
  305 + " return tree\n",
  306 + "\n",
  307 + "'''\n",
def untag(_tree):
    """Return a deep copy of ``_tree`` with its lowest node layer removed.

    Every subtree whose children are all plain strings is replaced, inside
    its parent, by its single string child.  In the trees displayed in this
    notebook that lowest layer holds the morphosyntactic tags, so a fragment
    such as ``*N -> subst:sg:gen:f -> prasy`` becomes ``*N -> prasy``.

    Parameters
    ----------
    _tree : tree object providing ``copy(deep=True)`` and ``subtrees()``
        (the notebook passes the trees it displays; the input is not modified).

    Returns
    -------
    The modified deep copy.

    Raises
    ------
    AssertionError
        If a node whose children are all strings has more than one child.
    """
    tree = _tree.copy(deep=True)
    # ``subtrees()`` yields a parent before its children, and a node is only
    # collapsed while its parent is being processed, so exactly one layer is
    # removed: a parent that ends up with string children is never revisited.
    for node in tree.subtrees():
        for i, child in enumerate(node):
            if isinstance(child, str):
                continue
            # ``child and ...`` keeps empty subtrees untouched.
            if child and all(isinstance(leaf, str) for leaf in child):
                assert len(child) == 1, (
                    f'expected exactly one token under a tag node, '
                    f'got {len(child)}: {child!r}'
                )
                node[i] = child[0]
    return tree
  319 + "\n",
  320 + "'''\n",
  321 + "\n",
  322 + "ZDANIE_HEADS = {'*ff', '*spójnik', '*przec', '*zdanie', '*formawykrz'}\n",
  323 + "ZDANIE_HEADS2 = set(l.strip('*') for l in ZDANIE_HEADS)\n",
  324 + "ZDANIE_HEAD_HIERARCHY = ('ff', 'spójnik', 'przec', 'zdanie')\n",
  325 + "\n",
  326 + "\n",
  327 + "def correct(_tree):\n",
  328 + " tree = _tree.copy(deep=True)\n",
  329 + " for node in tree.subtrees():\n",
  330 + " if len(node) == 1 and type(node[0]) != str and node.label() == node[0].label():\n",
  331 + " new_children = [child for child in node[0]]\n",
  332 + " node.pop()\n",
  333 + " node += new_children\n",
  334 + " for node in tree.subtrees():\n",
  335 + " if 'zdanie' in node.label():\n",
  336 + " heads = []\n",
  337 + " non_heads = []\n",
  338 + " for child in node:\n",
  339 + " if child.label().startswith('*') and child.label() not in ZDANIE_HEADS:\n",
  340 + " child.set_label(child.label()[1:])\n",
  341 + " (heads if child.label().startswith('*') else non_heads).append(child)\n",
  342 + " \n",
  343 + " return tree\n",
  344 + "''';"
  345 + ]
  346 + },
  347 + {
  348 + "cell_type": "code",
  349 + "execution_count": 35,
  350 + "id": "4a26b2e1",
  351 + "metadata": {
  352 + "scrolled": false
  353 + },
  354 + "outputs": [],
  355 + "source": [
  356 + "from collections import Counter, defaultdict\n",
  357 + "\n",
  358 + "c = Counter()\n",
  359 + "\n",
def tree2spans(_tree):
    """Convert a tree into a set of ``(label, children, text)`` span tuples.

    Each leaf of a deep copy of ``_tree`` is first rewritten as
    ``'<index>##<token>'`` so that repeated word forms stay distinct.  One
    tuple is then produced per node:

    * ``label``    - the node's label,
    * ``children`` - tuple of the children's labels (or the rewritten token
      for string children),
    * ``text``     - the node's rewritten leaves joined with single spaces.

    The index is the token's left-to-right position in the sentence, so it
    does not depend on how the tree is bracketed.  Two trees over the same
    tokens therefore produce comparable ``text`` values.

    Parameters
    ----------
    _tree : tree object providing ``copy(deep=True)``, ``subtrees()``,
        ``label()`` and ``leaves()``; it is not modified.

    Returns
    -------
    set of tuple

    Raises
    ------
    AssertionError
        If two nodes yield the same tuple (e.g. a repeated identical unary
        chain), because the set would silently drop one of them.
    """
    tree = _tree.copy(deep=True)

    def number_leaves(node, next_idx):
        # Depth-first, left-to-right walk; returns the next free index.
        for i, child in enumerate(node):
            if isinstance(child, str):
                node[i] = f'{next_idx}##{child}'
                next_idx += 1
            else:
                next_idx = number_leaves(child, next_idx)
        return next_idx

    number_leaves(tree, 0)

    spans = [
        (
            node.label(),
            tuple(child if isinstance(child, str) else child.label() for child in node),
            ' '.join(node.leaves()),
        )
        for node in tree.subtrees()
    ]
    assert len(set(spans)) == len(spans), 'tree contains duplicate spans'
    return set(spans)
  378 + "\n",
def spans2dict(spans):
    """Group span labels by the text they cover.

    ``spans`` is an iterable of ``(label, children, text)`` tuples; the
    ``children`` component is ignored.  The result is a ``defaultdict(set)``
    mapping each ``text`` to the set of labels seen over it.

    When the same label occurs more than once over the same text, a marker
    line is printed and the whole ``spans`` collection is displayed, since
    the set keeps only one copy of the label.
    """
    labels_by_text = defaultdict(set)
    for label, _children, text in spans:
        seen_labels = labels_by_text[text]
        if label in seen_labels:
            print('!!!!!!!!!!!!!!!', label, text)
            display(spans)
        seen_labels.add(label)
    return labels_by_text
  387 + "\n",
def spans2errors(spans_gold, spans_pred):
    """Compare gold and predicted spans of one sentence.

    Both arguments are collections of ``(label, children, text)`` tuples.
    They are grouped per ``text`` with ``spans2dict``, and the label sets are
    compared for every text occurring on either side.

    Returns
    -------
    ((tp, fp, fn), errors)
        ``tp`` / ``fp`` / ``fn`` count labels present on both sides, only in
        the prediction, and only in the gold standard respectively.
        ``errors`` is a list of ``(changes, text)`` pairs, one per text with
        at least one disagreement; ``changes`` is a sorted tuple of strings,
        ``'-LABEL'`` for a missed gold label and ``'+LABEL'`` for a spurious
        predicted one.
    """
    gold_by_text = spans2dict(spans_gold)
    pred_by_text = spans2dict(spans_pred)
    errors = []
    tp = fp = fn = 0
    # Sorting makes the order of ``errors`` reproducible across kernel
    # restarts; plain set iteration depends on string hash randomisation.
    for text in sorted(set(gold_by_text) | set(pred_by_text)):
        gold_labels = gold_by_text[text]
        pred_labels = pred_by_text[text]
        missed = gold_labels - pred_labels
        spurious = pred_labels - gold_labels
        tp += len(gold_labels & pred_labels)
        fn += len(missed)
        fp += len(spurious)
        errs = [f'-{label}' for label in missed] + [f'+{label}' for label in spurious]
        if errs:
            errors.append((tuple(sorted(errs)), text))
    return (tp, fp, fn), errors
  415 + ]
  416 + },
  417 + {
  418 + "cell_type": "code",
  419 + "execution_count": 54,
  420 + "id": "397e3750",
  421 + "metadata": {},
  422 + "outputs": [],
  423 + "source": [
def eval_trees(trees_gold, trees_pred):
    """Evaluate predicted trees against gold trees, pair by pair.

    For each ``(gold, pred)`` pair the trees are passed through ``untag``,
    converted to span sets with ``tree2spans`` and compared with
    ``spans2errors``.

    Parameters
    ----------
    trees_gold, trees_pred : iterables of trees, aligned by position.
        They are paired with ``zip``, so extra items in the longer one are
        ignored.

    Returns
    -------
    (evaluation, errors)
        ``evaluation`` holds one ``(tp, fp, fn)`` tuple per pair.
        ``errors`` holds ``(error, gold_tree, pred_tree)`` triples, where
        ``error`` is an item of the error list returned by ``spans2errors``
        and the trees are the original, unmodified inputs.

    Raises
    ------
    AssertionError
        If the concatenated tokens of a gold and a predicted tree differ.
    """
    evaluation, errors = [], []
    for idx, (_tree_gold, _tree_pred) in enumerate(zip(trees_gold, trees_pred)):
        leaves_gold = _tree_gold.leaves()
        leaves_pred = _tree_pred.leaves()
        # Compared as joined text, so the check is about the characters
        # covered, not about how they are split into tokens.
        if ''.join(leaves_gold) != ''.join(leaves_pred):
            print(leaves_gold)
            print(leaves_pred)
            raise AssertionError(
                f'gold and predicted tokens differ for tree pair {idx}'
            )
        tree_gold = untag(_tree_gold)
        tree_pred = untag(_tree_pred)
        spans_gold = tree2spans(tree_gold)
        spans_pred = tree2spans(tree_pred)
        try:
            evl, errs = spans2errors(spans_gold, spans_pred)
        except Exception:
            # Report which pair failed before re-raising.  The index is
            # printed first so it is visible even if a display helper fails.
            print(idx)
            display(tree_pred)
            # NOTE(review): display2 is not defined in the visible part of
            # the notebook - confirm it exists in an earlier cell.
            display2(_tree_pred)
            raise
        evaluation.append(evl)
        errors += [(err, _tree_gold, _tree_pred) for err in errs]
    return evaluation, errors
  452 + ]
  453 + },
  454 + {
  455 + "cell_type": "code",
  456 + "execution_count": 55,
  457 + "id": "5dcd68fd",
  458 + "metadata": {},
  459 + "outputs": [
  460 + {
  461 + "data": {
  462 + "image/svg+xml": [
  463 + "<svg baseProfile=\"full\" height=\"264px\" preserveAspectRatio=\"xMidYMid meet\" style=\"font-family: times, serif; font-weight:normal; font-style: normal; font-size: 16px;\" version=\"1.1\" viewBox=\"0,0,296.0,264.0\" width=\"296px\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:ev=\"http://www.w3.org/2001/xml-events\" xmlns:xlink=\"http://www.w3.org/1999/xlink\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">ROOT</text></svg><svg width=\"81.0811%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*S</text></svg><svg width=\"43.3333%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*VP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*V</text></svg><svg width=\"69.2308%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">Poszedł</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"34.6154%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"30.7692%\" x=\"69.2308%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">em</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"84.6154%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"21.6667%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"56.6667%\" x=\"43.3333%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">PrepNP</text></svg><svg width=\"41.1765%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*Prep</text></svg><svg width=\"100%\" x=\"0%\" 
y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">do</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"20.5882%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"58.8235%\" x=\"41.1765%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">NP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*N</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">adwokata</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"70.5882%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"71.6667%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"40.5405%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"18.9189%\" x=\"81.0811%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">Punct</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">.</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"90.5405%\" y1=\"1.2em\" y2=\"3em\" /></svg>"
  464 + ],
  465 + "text/plain": [
  466 + "Tree('ROOT', [Tree('*S', [Tree('*VP', [Tree('*V', ['Poszedł', 'em'])]), Tree('PrepNP', [Tree('*Prep', ['do']), Tree('NP', [Tree('*N', ['adwokata'])])])]), Tree('Punct', ['.'])])"
  467 + ]
  468 + },
  469 + "metadata": {},
  470 + "output_type": "display_data"
  471 + },
  472 + {
  473 + "data": {
  474 + "image/svg+xml": [
  475 + "<svg baseProfile=\"full\" height=\"264px\" preserveAspectRatio=\"xMidYMid meet\" style=\"font-family: times, serif; font-weight:normal; font-style: normal; font-size: 16px;\" version=\"1.1\" viewBox=\"0,0,296.0,264.0\" width=\"296px\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:ev=\"http://www.w3.org/2001/xml-events\" xmlns:xlink=\"http://www.w3.org/1999/xlink\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">ROOT</text></svg><svg width=\"81.0811%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*S</text></svg><svg width=\"43.3333%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*VP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*V</text></svg><svg width=\"69.2308%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">Poszedł</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"34.6154%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"30.7692%\" x=\"69.2308%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">em</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"84.6154%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"21.6667%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"56.6667%\" x=\"43.3333%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">PrepNP</text></svg><svg width=\"41.1765%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*Prep</text></svg><svg width=\"100%\" x=\"0%\" 
y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">do</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"20.5882%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"58.8235%\" x=\"41.1765%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">NP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*N</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">adwokata</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"70.5882%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"71.6667%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"40.5405%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"18.9189%\" x=\"81.0811%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">Punct</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">.</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"90.5405%\" y1=\"1.2em\" y2=\"3em\" /></svg>"
  476 + ],
  477 + "text/plain": [
  478 + "Tree('ROOT', [Tree('*S', [Tree('*VP', [Tree('*V', ['Poszedł', 'em'])]), Tree('PrepNP', [Tree('*Prep', ['do']), Tree('NP', [Tree('*N', ['adwokata'])])])]), Tree('Punct', ['.'])])"
  479 + ]
  480 + },
  481 + "metadata": {},
  482 + "output_type": "display_data"
  483 + },
  484 + {
  485 + "name": "stdout",
  486 + "output_type": "stream",
  487 + "text": [
  488 + "{('Punct', ('4##.',), '4##.'), ('*Prep', ('2##do',), '2##do'), ('ROOT', ('*S', 'Punct'), '0##Poszedł 1##em 2##do 3##adwokata 4##.'), ('NP', ('*N',), '3##adwokata'), ('*V', ('0##Poszedł', '1##em'), '0##Poszedł 1##em'), ('PrepNP', ('*Prep', 'NP'), '2##do 3##adwokata'), ('*S', ('*VP', 'PrepNP'), '0##Poszedł 1##em 2##do 3##adwokata'), ('*N', ('3##adwokata',), '3##adwokata'), ('*VP', ('*V',), '0##Poszedł 1##em')}\n",
  489 + "{('Punct', ('4##.',), '4##.'), ('*Prep', ('2##do',), '2##do'), ('ROOT', ('*S', 'Punct'), '0##Poszedł 1##em 2##do 3##adwokata 4##.'), ('NP', ('*N',), '3##adwokata'), ('*V', ('0##Poszedł', '1##em'), '0##Poszedł 1##em'), ('PrepNP', ('*Prep', 'NP'), '2##do 3##adwokata'), ('*S', ('*VP', 'PrepNP'), '0##Poszedł 1##em 2##do 3##adwokata'), ('*N', ('3##adwokata',), '3##adwokata'), ('*VP', ('*V',), '0##Poszedł 1##em')}\n",
  490 + "!!!!!!!!!!!!!!! *AdvP 0##Trudno\n"
  491 + ]
  492 + },
  493 + {
  494 + "data": {
  495 + "text/plain": [
  496 + "{('*Adv', ('0##Trudno',), '0##Trudno'),\n",
  497 + " ('*AdvP', ('*Adv',), '0##Trudno'),\n",
  498 + " ('*AdvP', ('*AdvP',), '0##Trudno'),\n",
  499 + " ('Punct', ('1##.',), '1##.'),\n",
  500 + " ('ROOT', ('*AdvP', 'Punct'), '0##Trudno 1##.')}"
  501 + ]
  502 + },
  503 + "metadata": {},
  504 + "output_type": "display_data"
  505 + },
  506 + {
  507 + "name": "stdout",
  508 + "output_type": "stream",
  509 + "text": [
  510 + "!!!!!!!!!!!!!!! *AdvP 0##Trudno\n"
  511 + ]
  512 + },
  513 + {
  514 + "data": {
  515 + "text/plain": [
  516 + "{('*Adv', ('0##Trudno',), '0##Trudno'),\n",
  517 + " ('*AdvP', ('*Adv',), '0##Trudno'),\n",
  518 + " ('*AdvP', ('*AdvP',), '0##Trudno'),\n",
  519 + " ('Punct', ('1##.',), '1##.'),\n",
  520 + " ('ROOT', ('*AdvP', 'Punct'), '0##Trudno 1##.')}"
  521 + ]
  522 + },
  523 + "metadata": {},
  524 + "output_type": "display_data"
  525 + },
  526 + {
  527 + "name": "stdout",
  528 + "output_type": "stream",
  529 + "text": [
  530 + "!!!!!!!!!!!!!!! *PrepNP 0##O 1##tym 2##, 3##iż 4##jesteś 5##nikim 6##, 7##iż 8##mają 9##cię 10##za 11##nic\n"
  531 + ]
  532 + },
  533 + {
  534 + "data": {
  535 + "text/plain": [
  536 + "{('*Comp', ('3##iż',), '3##iż'),\n",
  537 + " ('*Comp', ('7##iż',), '7##iż'),\n",
  538 + " ('*Conj', ('6##,',), '6##,'),\n",
  539 + " ('*N', ('1##tym',), '1##tym'),\n",
  540 + " ('*N', ('11##nic',), '11##nic'),\n",
  541 + " ('*N', ('5##nikim',), '5##nikim'),\n",
  542 + " ('*N', ('9##cię',), '9##cię'),\n",
  543 + " ('*NP', ('*N',), '1##tym'),\n",
  544 + " ('*Prep', ('0##O',), '0##O'),\n",
  545 + " ('*Prep', ('10##za',), '10##za'),\n",
  546 + " ('*PrepNP',\n",
  547 + " ('*Prep', 'NP'),\n",
  548 + " '0##O 1##tym 2##, 3##iż 4##jesteś 5##nikim 6##, 7##iż 8##mają 9##cię 10##za 11##nic'),\n",
  549 + " ('*PrepNP',\n",
  550 + " ('*PrepNP',),\n",
  551 + " '0##O 1##tym 2##, 3##iż 4##jesteś 5##nikim 6##, 7##iż 8##mają 9##cię 10##za 11##nic'),\n",
  552 + " ('*V', ('4##jesteś',), '4##jesteś'),\n",
  553 + " ('*V', ('8##mają',), '8##mają'),\n",
  554 + " ('*VP', ('*V',), '4##jesteś'),\n",
  555 + " ('*VP', ('*V',), '8##mają'),\n",
  556 + " ('CP', ('*Comp', 'S'), '7##iż 8##mają 9##cię 10##za 11##nic'),\n",
  557 + " ('CP',\n",
  558 + " ('CP', '*Conj', 'CP'),\n",
  559 + " '2##, 3##iż 4##jesteś 5##nikim 6##, 7##iż 8##mają 9##cię 10##za 11##nic'),\n",
  560 + " ('CP', ('Punct', '*Comp', 'S'), '2##, 3##iż 4##jesteś 5##nikim'),\n",
  561 + " ('NP', ('*N',), '11##nic'),\n",
  562 + " ('NP', ('*N',), '5##nikim'),\n",
  563 + " ('NP', ('*N',), '9##cię'),\n",
  564 + " ('NP',\n",
  565 + " ('*NP', 'CP'),\n",
  566 + " '1##tym 2##, 3##iż 4##jesteś 5##nikim 6##, 7##iż 8##mają 9##cię 10##za 11##nic'),\n",
  567 + " ('PrepNP', ('*Prep', 'NP'), '10##za 11##nic'),\n",
  568 + " ('Punct', ('12##!',), '12##!'),\n",
  569 + " ('Punct', ('2##,',), '2##,'),\n",
  570 + " ('ROOT',\n",
  571 + " ('*PrepNP', 'Punct'),\n",
  572 + " '0##O 1##tym 2##, 3##iż 4##jesteś 5##nikim 6##, 7##iż 8##mają 9##cię 10##za 11##nic 12##!'),\n",
  573 + " ('S', ('*VP', 'NP'), '4##jesteś 5##nikim'),\n",
  574 + " ('S', ('*VP', 'NP', 'PrepNP'), '8##mają 9##cię 10##za 11##nic')}"
  575 + ]
  576 + },
  577 + "metadata": {},
  578 + "output_type": "display_data"
  579 + },
  580 + {
  581 + "name": "stdout",
  582 + "output_type": "stream",
  583 + "text": [
  584 + "!!!!!!!!!!!!!!! *PrepNP 0##O 1##tym 2##, 3##iż 4##jesteś 5##nikim 6##, 7##iż 8##mają 9##cię 10##za 11##nic\n"
  585 + ]
  586 + },
  587 + {
  588 + "data": {
  589 + "text/plain": [
  590 + "{('*Comp', ('3##iż',), '3##iż'),\n",
  591 + " ('*Comp', ('7##iż',), '7##iż'),\n",
  592 + " ('*Conj', ('6##,',), '6##,'),\n",
  593 + " ('*N', ('1##tym',), '1##tym'),\n",
  594 + " ('*N', ('11##nic',), '11##nic'),\n",
  595 + " ('*N', ('5##nikim',), '5##nikim'),\n",
  596 + " ('*N', ('9##cię',), '9##cię'),\n",
  597 + " ('*NP', ('*N',), '1##tym'),\n",
  598 + " ('*Prep', ('0##O',), '0##O'),\n",
  599 + " ('*Prep', ('10##za',), '10##za'),\n",
  600 + " ('*PrepNP',\n",
  601 + " ('*Prep', 'NP'),\n",
  602 + " '0##O 1##tym 2##, 3##iż 4##jesteś 5##nikim 6##, 7##iż 8##mają 9##cię 10##za 11##nic'),\n",
  603 + " ('*PrepNP',\n",
  604 + " ('*PrepNP',),\n",
  605 + " '0##O 1##tym 2##, 3##iż 4##jesteś 5##nikim 6##, 7##iż 8##mają 9##cię 10##za 11##nic'),\n",
  606 + " ('*V', ('4##jesteś',), '4##jesteś'),\n",
  607 + " ('*V', ('8##mają',), '8##mają'),\n",
  608 + " ('*VP', ('*V',), '4##jesteś'),\n",
  609 + " ('*VP', ('*V',), '8##mają'),\n",
  610 + " ('CP', ('*Comp', 'S'), '7##iż 8##mają 9##cię 10##za 11##nic'),\n",
  611 + " ('CP',\n",
  612 + " ('CP', '*Conj', 'CP'),\n",
  613 + " '2##, 3##iż 4##jesteś 5##nikim 6##, 7##iż 8##mają 9##cię 10##za 11##nic'),\n",
  614 + " ('CP', ('Punct', '*Comp', 'S'), '2##, 3##iż 4##jesteś 5##nikim'),\n",
  615 + " ('NP', ('*N',), '11##nic'),\n",
  616 + " ('NP', ('*N',), '5##nikim'),\n",
  617 + " ('NP', ('*N',), '9##cię'),\n",
  618 + " ('NP',\n",
  619 + " ('*NP', 'CP'),\n",
  620 + " '1##tym 2##, 3##iż 4##jesteś 5##nikim 6##, 7##iż 8##mają 9##cię 10##za 11##nic'),\n",
  621 + " ('PrepNP', ('*Prep', 'NP'), '10##za 11##nic'),\n",
  622 + " ('Punct', ('12##!',), '12##!'),\n",
  623 + " ('Punct', ('2##,',), '2##,'),\n",
  624 + " ('ROOT',\n",
  625 + " ('*PrepNP', 'Punct'),\n",
  626 + " '0##O 1##tym 2##, 3##iż 4##jesteś 5##nikim 6##, 7##iż 8##mają 9##cię 10##za 11##nic 12##!'),\n",
  627 + " ('S', ('*VP', 'NP'), '4##jesteś 5##nikim'),\n",
  628 + " ('S', ('*VP', 'NP', 'PrepNP'), '8##mają 9##cię 10##za 11##nic')}"
  629 + ]
  630 + },
  631 + "metadata": {},
  632 + "output_type": "display_data"
  633 + },
  634 + {
  635 + "name": "stdout",
  636 + "output_type": "stream",
  637 + "text": [
  638 + "!!!!!!!!!!!!!!! *NP 0##Cztery 1##małe 2##groszki 3##w 4##strąku 5##w 6##tunelu 7##miłości\n"
  639 + ]
  640 + },
  641 + {
  642 + "data": {
  643 + "text/plain": [
  644 + "{('*Adj', ('1##małe',), '1##małe'),\n",
  645 + " ('*N', ('2##groszki',), '2##groszki'),\n",
  646 + " ('*N', ('4##strąku',), '4##strąku'),\n",
  647 + " ('*N', ('6##tunelu',), '6##tunelu'),\n",
  648 + " ('*N', ('7##miłości',), '7##miłości'),\n",
  649 + " ('*NP', ('*N',), '2##groszki'),\n",
  650 + " ('*NP', ('*N',), '6##tunelu'),\n",
  651 + " ('*NP',\n",
  652 + " ('*NP',),\n",
  653 + " '0##Cztery 1##małe 2##groszki 3##w 4##strąku 5##w 6##tunelu 7##miłości'),\n",
  654 + " ('*NP',\n",
  655 + " ('*NumP', 'NP'),\n",
  656 + " '0##Cztery 1##małe 2##groszki 3##w 4##strąku 5##w 6##tunelu 7##miłości'),\n",
  657 + " ('*Num', ('0##Cztery',), '0##Cztery'),\n",
  658 + " ('*NumP', ('*Num',), '0##Cztery'),\n",
  659 + " ('*Prep', ('3##w',), '3##w'),\n",
  660 + " ('*Prep', ('5##w',), '5##w'),\n",
  661 + " ('AdjP', ('*Adj',), '1##małe'),\n",
  662 + " ('NP', ('*N',), '4##strąku'),\n",
  663 + " ('NP', ('*N',), '7##miłości'),\n",
  664 + " ('NP', ('*NP', 'NP'), '6##tunelu 7##miłości'),\n",
  665 + " ('NP',\n",
  666 + " ('AdjP', '*NP', 'PrepNP', 'PrepNP'),\n",
  667 + " '1##małe 2##groszki 3##w 4##strąku 5##w 6##tunelu 7##miłości'),\n",
  668 + " ('PrepNP', ('*Prep', 'NP'), '3##w 4##strąku'),\n",
  669 + " ('PrepNP', ('*Prep', 'NP'), '5##w 6##tunelu 7##miłości'),\n",
  670 + " ('Punct', ('8##.',), '8##.'),\n",
  671 + " ('ROOT',\n",
  672 + " ('*NP', 'Punct'),\n",
  673 + " '0##Cztery 1##małe 2##groszki 3##w 4##strąku 5##w 6##tunelu 7##miłości 8##.')}"
  674 + ]
  675 + },
  676 + "metadata": {},
  677 + "output_type": "display_data"
  678 + },
  679 + {
  680 + "name": "stdout",
  681 + "output_type": "stream",
  682 + "text": [
  683 + "!!!!!!!!!!!!!!! *PrepNP 0##W 1##chwili 2##, 3##gdy 4##przeprowadzał 5##em 6##przez 7##Alpy 8##słonie 9##Hannibala\n"
  684 + ]
  685 + },
  686 + {
  687 + "data": {
  688 + "text/plain": [
  689 + "{('*Adv', ('3##gdy',), '3##gdy'),\n",
  690 + " ('*N', ('1##chwili',), '1##chwili'),\n",
  691 + " ('*N', ('7##Alpy',), '7##Alpy'),\n",
  692 + " ('*N', ('8##słonie',), '8##słonie'),\n",
  693 + " ('*N', ('9##Hannibala',), '9##Hannibala'),\n",
  694 + " ('*NP', ('*N',), '1##chwili'),\n",
  695 + " ('*NP', ('*N',), '8##słonie'),\n",
  696 + " ('*Prep', ('0##W',), '0##W'),\n",
  697 + " ('*Prep', ('6##przez',), '6##przez'),\n",
  698 + " ('*PrepNP',\n",
  699 + " ('*Prep', 'NP'),\n",
  700 + " '0##W 1##chwili 2##, 3##gdy 4##przeprowadzał 5##em 6##przez 7##Alpy 8##słonie 9##Hannibala'),\n",
  701 + " ('*PrepNP',\n",
  702 + " ('*PrepNP',),\n",
  703 + " '0##W 1##chwili 2##, 3##gdy 4##przeprowadzał 5##em 6##przez 7##Alpy 8##słonie 9##Hannibala'),\n",
  704 + " ('*S',\n",
  705 + " ('AdvP', '*VP', 'PrepNP', 'NP'),\n",
  706 + " '3##gdy 4##przeprowadzał 5##em 6##przez 7##Alpy 8##słonie 9##Hannibala'),\n",
  707 + " ('*V', ('4##przeprowadzał', '5##em'), '4##przeprowadzał 5##em'),\n",
  708 + " ('*VP', ('*V',), '4##przeprowadzał 5##em'),\n",
  709 + " ('AdvP', ('*Adv',), '3##gdy'),\n",
  710 + " ('CP',\n",
  711 + " ('Punct', '*S'),\n",
  712 + " '2##, 3##gdy 4##przeprowadzał 5##em 6##przez 7##Alpy 8##słonie 9##Hannibala'),\n",
  713 + " ('NP', ('*N',), '7##Alpy'),\n",
  714 + " ('NP', ('*N',), '9##Hannibala'),\n",
  715 + " ('NP',\n",
  716 + " ('*NP', 'CP'),\n",
  717 + " '1##chwili 2##, 3##gdy 4##przeprowadzał 5##em 6##przez 7##Alpy 8##słonie 9##Hannibala'),\n",
  718 + " ('NP', ('*NP', 'NP'), '8##słonie 9##Hannibala'),\n",
  719 + " ('PrepNP', ('*Prep', 'NP'), '6##przez 7##Alpy'),\n",
  720 + " ('Punct', ('10##.',), '10##.'),\n",
  721 + " ('Punct', ('2##,',), '2##,'),\n",
  722 + " ('ROOT',\n",
  723 + " ('*PrepNP', 'Punct'),\n",
  724 + " '0##W 1##chwili 2##, 3##gdy 4##przeprowadzał 5##em 6##przez 7##Alpy 8##słonie 9##Hannibala 10##.')}"
  725 + ]
  726 + },
  727 + "metadata": {},
  728 + "output_type": "display_data"
  729 + },
  730 + {
  731 + "name": "stdout",
  732 + "output_type": "stream",
  733 + "text": [
  734 + "!!!!!!!!!!!!!!! *PrepNP 0##W 1##chwili 2##, 3##gdy 4##przeprowadzał 5##em 6##przez 7##Alpy 8##słonie 9##Hannibala\n"
  735 + ]
  736 + },
  737 + {
  738 + "data": {
  739 + "text/plain": [
  740 + "{('*Adv', ('3##gdy',), '3##gdy'),\n",
  741 + " ('*N', ('1##chwili',), '1##chwili'),\n",
  742 + " ('*N', ('7##Alpy',), '7##Alpy'),\n",
  743 + " ('*N', ('8##słonie',), '8##słonie'),\n",
  744 + " ('*N', ('9##Hannibala',), '9##Hannibala'),\n",
  745 + " ('*NP', ('*N',), '1##chwili'),\n",
  746 + " ('*NP', ('*N',), '8##słonie'),\n",
  747 + " ('*Prep', ('0##W',), '0##W'),\n",
  748 + " ('*Prep', ('6##przez',), '6##przez'),\n",
  749 + " ('*PrepNP',\n",
  750 + " ('*Prep', 'NP'),\n",
  751 + " '0##W 1##chwili 2##, 3##gdy 4##przeprowadzał 5##em 6##przez 7##Alpy 8##słonie 9##Hannibala'),\n",
  752 + " ('*PrepNP',\n",
  753 + " ('*PrepNP',),\n",
  754 + " '0##W 1##chwili 2##, 3##gdy 4##przeprowadzał 5##em 6##przez 7##Alpy 8##słonie 9##Hannibala'),\n",
  755 + " ('*S',\n",
  756 + " ('AdvP', '*VP', 'PrepNP', 'NP'),\n",
  757 + " '3##gdy 4##przeprowadzał 5##em 6##przez 7##Alpy 8##słonie 9##Hannibala'),\n",
  758 + " ('*V', ('4##przeprowadzał', '5##em'), '4##przeprowadzał 5##em'),\n",
  759 + " ('*VP', ('*V',), '4##przeprowadzał 5##em'),\n",
  760 + " ('AdvP', ('*Adv',), '3##gdy'),\n",
  761 + " ('CP',\n",
  762 + " ('Punct', '*S'),\n",
  763 + " '2##, 3##gdy 4##przeprowadzał 5##em 6##przez 7##Alpy 8##słonie 9##Hannibala'),\n",
  764 + " ('NP', ('*N',), '7##Alpy'),\n",
  765 + " ('NP', ('*N',), '9##Hannibala'),\n",
  766 + " ('NP',\n",
  767 + " ('*NP', 'CP'),\n",
  768 + " '1##chwili 2##, 3##gdy 4##przeprowadzał 5##em 6##przez 7##Alpy 8##słonie 9##Hannibala'),\n",
  769 + " ('NP', ('*NP', 'NP'), '8##słonie 9##Hannibala'),\n",
  770 + " ('PrepNP', ('*Prep', 'NP'), '6##przez 7##Alpy'),\n",
  771 + " ('Punct', ('10##.',), '10##.'),\n",
  772 + " ('Punct', ('2##,',), '2##,'),\n",
  773 + " ('ROOT',\n",
  774 + " ('*PrepNP', 'Punct'),\n",
  775 + " '0##W 1##chwili 2##, 3##gdy 4##przeprowadzał 5##em 6##przez 7##Alpy 8##słonie 9##Hannibala 10##.')}"
  776 + ]
  777 + },
  778 + "metadata": {},
  779 + "output_type": "display_data"
  780 + },
  781 + {
  782 + "name": "stdout",
  783 + "output_type": "stream",
  784 + "text": [
  785 + "!!!!!!!!!!!!!!! *PrepNP 1##na 2##roczną 3##misję 4##na 5##Tytana 6##, 7##czternasty 8##księżyc 9##Saturna\n"
  786 + ]
  787 + },
  788 + {
  789 + "data": {
  790 + "text/plain": [
  791 + "{('*Adj', ('2##roczną',), '2##roczną'),\n",
  792 + " ('*Adj', ('7##czternasty',), '7##czternasty'),\n",
  793 + " ('*N', ('3##misję',), '3##misję'),\n",
  794 + " ('*N', ('5##Tytana',), '5##Tytana'),\n",
  795 + " ('*N', ('8##księżyc',), '8##księżyc'),\n",
  796 + " ('*N', ('9##Saturna',), '9##Saturna'),\n",
  797 + " ('*NP', ('*N',), '3##misję'),\n",
  798 + " ('*NP', ('*N',), '5##Tytana'),\n",
  799 + " ('*NP', ('*N',), '8##księżyc'),\n",
  800 + " ('*Prep', ('1##na',), '1##na'),\n",
  801 + " ('*Prep', ('4##na',), '4##na'),\n",
  802 + " ('*PrepNP',\n",
  803 + " ('*Prep', 'NP'),\n",
  804 + " '1##na 2##roczną 3##misję 4##na 5##Tytana 6##, 7##czternasty 8##księżyc 9##Saturna'),\n",
  805 + " ('*PrepNP',\n",
  806 + " ('*PrepNP',),\n",
  807 + " '1##na 2##roczną 3##misję 4##na 5##Tytana 6##, 7##czternasty 8##księżyc 9##Saturna'),\n",
  808 + " ('AdjP', ('*Adj',), '2##roczną'),\n",
  809 + " ('AdjP', ('*Adj',), '7##czternasty'),\n",
  810 + " ('NP', ('*N',), '9##Saturna'),\n",
  811 + " ('NP', ('*NP', 'NP'), '5##Tytana 6##, 7##czternasty 8##księżyc 9##Saturna'),\n",
  812 + " ('NP',\n",
  813 + " ('AdjP', '*NP', 'PrepNP'),\n",
  814 + " '2##roczną 3##misję 4##na 5##Tytana 6##, 7##czternasty 8##księżyc 9##Saturna'),\n",
  815 + " ('NP',\n",
  816 + " ('Punct', 'AdjP', '*NP', 'NP'),\n",
  817 + " '6##, 7##czternasty 8##księżyc 9##Saturna'),\n",
  818 + " ('PrepNP',\n",
  819 + " ('*Prep', 'NP'),\n",
  820 + " '4##na 5##Tytana 6##, 7##czternasty 8##księżyc 9##Saturna'),\n",
  821 + " ('Punct', ('0##-',), '0##-'),\n",
  822 + " ('Punct', ('10##.',), '10##.'),\n",
  823 + " ('Punct', ('6##,',), '6##,'),\n",
  824 + " ('ROOT',\n",
  825 + " ('Punct', '*PrepNP', 'Punct'),\n",
  826 + " '0##- 1##na 2##roczną 3##misję 4##na 5##Tytana 6##, 7##czternasty 8##księżyc 9##Saturna 10##.')}"
  827 + ]
  828 + },
  829 + "metadata": {},
  830 + "output_type": "display_data"
  831 + },
  832 + {
  833 + "name": "stdout",
  834 + "output_type": "stream",
  835 + "text": [
  836 + "!!!!!!!!!!!!!!! *PrepNP 1##na 2##roczną 3##misję 4##na 5##Tytana 6##, 7##czternasty 8##księżyc 9##Saturna\n"
  837 + ]
  838 + },
  839 + {
  840 + "data": {
  841 + "text/plain": [
  842 + "{('*Adj', ('2##roczną',), '2##roczną'),\n",
  843 + " ('*Adj', ('7##czternasty',), '7##czternasty'),\n",
  844 + " ('*N', ('3##misję',), '3##misję'),\n",
  845 + " ('*N', ('5##Tytana',), '5##Tytana'),\n",
  846 + " ('*N', ('8##księżyc',), '8##księżyc'),\n",
  847 + " ('*N', ('9##Saturna',), '9##Saturna'),\n",
  848 + " ('*NP', ('*N',), '3##misję'),\n",
  849 + " ('*NP', ('*N',), '5##Tytana'),\n",
  850 + " ('*NP', ('*N',), '8##księżyc'),\n",
  851 + " ('*Prep', ('1##na',), '1##na'),\n",
  852 + " ('*Prep', ('4##na',), '4##na'),\n",
  853 + " ('*PrepNP',\n",
  854 + " ('*Prep', 'NP'),\n",
  855 + " '1##na 2##roczną 3##misję 4##na 5##Tytana 6##, 7##czternasty 8##księżyc 9##Saturna'),\n",
  856 + " ('*PrepNP',\n",
  857 + " ('*PrepNP',),\n",
  858 + " '1##na 2##roczną 3##misję 4##na 5##Tytana 6##, 7##czternasty 8##księżyc 9##Saturna'),\n",
  859 + " ('AdjP', ('*Adj',), '2##roczną'),\n",
  860 + " ('AdjP', ('*Adj',), '7##czternasty'),\n",
  861 + " ('NP', ('*N',), '9##Saturna'),\n",
  862 + " ('NP', ('*NP', 'NP'), '5##Tytana 6##, 7##czternasty 8##księżyc 9##Saturna'),\n",
  863 + " ('NP',\n",
  864 + " ('AdjP', '*NP', 'PrepNP'),\n",
  865 + " '2##roczną 3##misję 4##na 5##Tytana 6##, 7##czternasty 8##księżyc 9##Saturna'),\n",
  866 + " ('NP',\n",
  867 + " ('Punct', 'AdjP', '*NP', 'NP'),\n",
  868 + " '6##, 7##czternasty 8##księżyc 9##Saturna'),\n",
  869 + " ('PrepNP',\n",
  870 + " ('*Prep', 'NP'),\n",
  871 + " '4##na 5##Tytana 6##, 7##czternasty 8##księżyc 9##Saturna'),\n",
  872 + " ('Punct', ('0##-',), '0##-'),\n",
  873 + " ('Punct', ('10##.',), '10##.'),\n",
  874 + " ('Punct', ('6##,',), '6##,'),\n",
  875 + " ('ROOT',\n",
  876 + " ('Punct', '*PrepNP', 'Punct'),\n",
  877 + " '0##- 1##na 2##roczną 3##misję 4##na 5##Tytana 6##, 7##czternasty 8##księżyc 9##Saturna 10##.')}"
  878 + ]
  879 + },
  880 + "metadata": {},
  881 + "output_type": "display_data"
  882 + }
  883 + ],
  884 + "source": [
  885 + "evaluation_val, errors_val = eval_trees(val_trees, val_pred_trees)  # validation split; evaluation_val is summed into (tp, fp, fn) in the DEV cell\n",
  886 + "evaluation_test, errors_test = eval_trees(test_trees, test_pred_trees)  # test split; evaluation_test is summed into (tp, fp, fn) in the TEST cell"
  887 + ]
  888 + },
  889 + {
  890 + "cell_type": "code",
  891 + "execution_count": 56,
  892 + "id": "65af3522",
  893 + "metadata": {},
  894 + "outputs": [
  895 + {
  896 + "name": "stdout",
  897 + "output_type": "stream",
  898 + "text": [
  899 + "DEV:\n",
  900 + "precision: 0.974400637684714\n",
  901 + "recall: 0.9742960930674555\n",
  902 + "f1: 0.9743483625717548\n"
  903 + ]
  904 + }
  905 + ],
  906 + "source": [
  907 + "tp, fp, fn = list(map(sum, zip(*evaluation_val)))\n",
  908 + "p, r = tp / (tp + fp), tp / (tp + fn)\n",
  909 + "f1 = 2 * tp / (2 * tp + fp + fn)\n",
  910 + "print('DEV:')\n",
  911 + "print('precision: ', p)\n",
  912 + "print('recall: ', r)\n",
  913 + "print('f1: ', f1)"
  914 + ]
  915 + },
  916 + {
  917 + "cell_type": "code",
  918 + "execution_count": 57,
  919 + "id": "8e0f3f93",
  920 + "metadata": {},
  921 + "outputs": [
  922 + {
  923 + "name": "stdout",
  924 + "output_type": "stream",
  925 + "text": [
  926 + "TEST:\n",
  927 + "precision: 0.9774147274466051\n",
  928 + "recall: 0.9775082092645137\n",
  929 + "f1: 0.9774614661204711\n"
  930 + ]
  931 + }
  932 + ],
  933 + "source": [
  934 + "tp, fp, fn = list(map(sum, zip(*evaluation_test)))\n",
  935 + "p, r = tp / (tp + fp), tp / (tp + fn)\n",
  936 + "f1 = 2 * tp / (2 * tp + fp + fn)\n",
  937 + "print('TEST:')\n",
  938 + "print('precision: ', p)\n",
  939 + "print('recall: ', r)\n",
  940 + "print('f1: ', f1)"
  941 + ]
  942 + },
  943 + {
  944 + "cell_type": "code",
  945 + "execution_count": null,
  946 + "id": "302b2333",
  947 + "metadata": {},
  948 + "outputs": [],
  949 + "source": []
  950 + }
  951 + ],
  952 + "metadata": {
  953 + "kernelspec": {
  954 + "display_name": "torch_benepar",
  955 + "language": "python",
  956 + "name": "torch_benepar"
  957 + },
  958 + "language_info": {
  959 + "codemirror_mode": {
  960 + "name": "ipython",
  961 + "version": 3
  962 + },
  963 + "file_extension": ".py",
  964 + "mimetype": "text/x-python",
  965 + "name": "python",
  966 + "nbconvert_exporter": "python",
  967 + "pygments_lexer": "ipython3",
  968 + "version": "3.10.6"
  969 + }
  970 + },
  971 + "nbformat": 4,
  972 + "nbformat_minor": 5
  973 +}
... ...
COMBO/DataPreparation.ipynb 0 → 100644
  1 +++ a/COMBO/DataPreparation.ipynb
  1 +{
  2 + "cells": [
  3 + {
  4 + "cell_type": "code",
  5 + "execution_count": 46,
  6 + "id": "5cd26f6f",
  7 + "metadata": {},
  8 + "outputs": [],
  9 + "source": [
  10 + "import os\n",
  11 + "\n",
  12 + "from datasets import load_dataset\n",
  13 + "\n",
  14 + "from IPython.display import display"
  15 + ]
  16 + },
  17 + {
  18 + "cell_type": "code",
  19 + "execution_count": 47,
  20 + "id": "fecef4af",
  21 + "metadata": {},
  22 + "outputs": [
  23 + {
  24 + "name": "stderr",
  25 + "output_type": "stream",
  26 + "text": [
  27 + "Found cached dataset pdb_c_beta (/home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1)\n"
  28 + ]
  29 + },
  30 + {
  31 + "data": {
  32 + "application/vnd.jupyter.widget-view+json": {
  33 + "model_id": "1c89c7103bba4347a3fa7d23cac42cfe",
  34 + "version_major": 2,
  35 + "version_minor": 0
  36 + },
  37 + "text/plain": [
  38 + " 0%| | 0/3 [00:00<?, ?it/s]"
  39 + ]
  40 + },
  41 + "metadata": {},
  42 + "output_type": "display_data"
  43 + }
  44 + ],
  45 + "source": [
  46 + "pdbc_dataset = load_dataset('../pdb_c_beta')"
  47 + ]
  48 + },
  49 + {
  50 + "cell_type": "code",
  51 + "execution_count": 48,
  52 + "id": "23da801f",
  53 + "metadata": {},
  54 + "outputs": [],
  55 + "source": [
  56 + "CONLLU_DIR = 'connlu'\n",
  57 + "! rm -r {CONLLU_DIR}\n",
  58 + "! mkdir {CONLLU_DIR}"
  59 + ]
  60 + },
  61 + {
  62 + "cell_type": "code",
  63 + "execution_count": 50,
  64 + "id": "91fb3bf3",
  65 + "metadata": {},
  66 + "outputs": [],
  67 + "source": [
  68 + "import sys\n",
  69 + "sys.path.append('../')\n",
  70 + "from neural_parser.hybrid_tree_utils import tree_from_dataset_instance"
  71 + ]
  72 + },
  73 + {
  74 + "cell_type": "code",
  75 + "execution_count": 60,
  76 + "id": "c105feff",
  77 + "metadata": {},
  78 + "outputs": [
  79 + {
  80 + "name": "stdout",
  81 + "output_type": "stream",
  82 + "text": [
  83 + "train\n",
  84 + " connlu/pdbc-train.conllu\n",
  85 + " 17659\n",
  86 + " connlu/pdbc-cont-train.conllu\n",
  87 + " 15903\n",
  88 + "validation\n",
  89 + " connlu/pdbc-validation.conllu\n",
  90 + " 2211\n",
  91 + " connlu/pdbc-cont-validation.conllu\n",
  92 + " 1980\n",
  93 + "test\n",
  94 + " connlu/pdbc-test.conllu\n",
  95 + " 2205\n",
  96 + " connlu/pdbc-cont-test.conllu\n",
  97 + " 1990\n"
  98 + ]
  99 + }
  100 + ],
  101 + "source": [
  102 + "features = pdbc_dataset['train'].features\n",
  103 + "\n",
  104 + "for part, dataset in pdbc_dataset.items():\n",
  105 + " print(part)\n",
  106 + " s_cont, s_all = [], [] \n",
  107 + " for sentence in dataset:\n",
  108 + " # TODO! check if discont\n",
  109 + " tokens = sentence['tokens']\n",
  110 + " lemmas = sentence['lemmas']\n",
  111 + " heads = sentence['heads']\n",
  112 + " heads = [h + 1 if h is not None else 0 for i, h in enumerate(heads)]\n",
  113 + " deprels = [features['deprels'].feature.int2str(d) for d in sentence['deprels']]\n",
  114 + " deprels = ['root' if deprel == 'ROOT' else deprel for deprel in deprels]\n",
  115 + " rows = [f'# text = {\" \".join(tokens)}'] + [\n",
  116 + " f'{i + 1}\\t{t}\\t{l}\\t_\\t_\\t_\\t{h}\\t{d}\\t{h}:{d}\\t_'\n",
  117 + " for i, (t, l, h, d) in enumerate(zip(tokens, lemmas, heads, deprels))\n",
  118 + " ]\n",
  119 + " s_all.append(rows)\n",
  120 + " if tree_from_dataset_instance(sentence, features).is_continuous():\n",
  121 + " s_cont.append(rows)\n",
  122 + " f_all = os.path.join(CONLLU_DIR, f'pdbc-{part}.conllu')\n",
  123 + " f_cont = os.path.join(CONLLU_DIR, f'pdbc-cont-{part}.conllu')\n",
  124 + " with open(f_all, 'w') as f:\n",
  125 + " print(' ', f_all)\n",
  126 + " print(' ', len(s_all))\n",
  127 + " for rows in s_all:\n",
  128 + " print('\\n'.join(rows), end='\\n\\n', file=f)\n",
  129 + " with open(f_cont, 'w') as f:\n",
  130 + " print(' ', f_cont)\n",
  131 + " print(' ', len(s_cont))\n",
  132 + " for rows in s_cont:\n",
  133 + " print('\\n'.join(rows), end='\\n\\n', file=f)"
  134 + ]
  135 + },
  136 + {
  137 + "cell_type": "code",
  138 + "execution_count": 61,
  139 + "id": "c849233c",
  140 + "metadata": {},
  141 + "outputs": [
  142 + {
  143 + "name": "stdout",
  144 + "output_type": "stream",
  145 + "text": [
  146 + " 32509 319813 1398303 connlu/pdbc-cont-test.conllu\n",
  147 + " 32509 319813 1198902 connlu/pdbc-cont-test-pred.conllu\n",
  148 + " 271337 2682725 11781617 connlu/pdbc-cont-train.conllu\n",
  149 + " 33491 330792 1452373 connlu/pdbc-cont-validation.conllu\n",
  150 + " 33491 330792 1244192 connlu/pdbc-cont-validation-pred.conllu\n",
  151 + " 37754 373431 1639937 connlu/pdbc-test.conllu\n",
  152 + " 37754 373431 1406776 connlu/pdbc-test-pred.conllu\n",
  153 + " 315364 3133712 13808053 connlu/pdbc-train.conllu\n",
  154 + " 38987 386865 1704685 connlu/pdbc-validation.conllu\n",
  155 + " 38987 386865 1461922 connlu/pdbc-validation-pred.conllu\n",
  156 + " 872183 8638239 37096760 total\n"
  157 + ]
  158 + }
  159 + ],
  160 + "source": [
  161 + "! wc {CONLLU_DIR}/*.conllu"
  162 + ]
  163 + },
  164 + {
  165 + "cell_type": "code",
  166 + "execution_count": 62,
  167 + "id": "6b571716",
  168 + "metadata": {},
  169 + "outputs": [
  170 + {
  171 + "name": "stdout",
  172 + "output_type": "stream",
  173 + "text": [
  174 + "# text = Skośnooka dziewczynka trzyma w rękach drewniane pałeczki , a przed nią znajdują się naczynia kuchenne .\r\n",
  175 + "1\tSkośnooka\tskośnooki\t_\t_\t_\t2\tadjunct\t2:adjunct\t_\r\n",
  176 + "2\tdziewczynka\tdziewczynka\t_\t_\t_\t3\tsubj\t3:subj\t_\r\n",
  177 + "3\ttrzyma\ttrzymać\t_\t_\t_\t9\tconjunct\t9:conjunct\t_\r\n",
  178 + "4\tw\tw\t_\t_\t_\t3\tadjunct_locat\t3:adjunct_locat\t_\r\n",
  179 + "5\trękach\tręka\t_\t_\t_\t4\tcomp\t4:comp\t_\r\n",
  180 + "6\tdrewniane\tdrewniany\t_\t_\t_\t7\tadjunct\t7:adjunct\t_\r\n",
  181 + "7\tpałeczki\tpałeczka\t_\t_\t_\t3\tobj\t3:obj\t_\r\n",
  182 + "8\t,\t,\t_\t_\t_\t9\tpunct\t9:punct\t_\r\n",
  183 + "9\ta\ta\t_\t_\t_\t0\troot\t0:root\t_\r\n"
  184 + ]
  185 + }
  186 + ],
  187 + "source": [
  188 + "! head {CONLLU_DIR}/pdbc-train.conllu"
  189 + ]
  190 + }
  191 + ],
  192 + "metadata": {
  193 + "kernelspec": {
  194 + "display_name": "TF_zajecia",
  195 + "language": "python",
  196 + "name": "tf_zajecia"
  197 + },
  198 + "language_info": {
  199 + "codemirror_mode": {
  200 + "name": "ipython",
  201 + "version": 3
  202 + },
  203 + "file_extension": ".py",
  204 + "mimetype": "text/x-python",
  205 + "name": "python",
  206 + "nbconvert_exporter": "python",
  207 + "pygments_lexer": "ipython3",
  208 + "version": "3.10.6"
  209 + }
  210 + },
  211 + "nbformat": 4,
  212 + "nbformat_minor": 5
  213 +}
... ...
COMBO/ParseValAndTrain.ipynb 0 → 100644
  1 +++ a/COMBO/ParseValAndTrain.ipynb
  1 +{
  2 + "cells": [
  3 + {
  4 + "cell_type": "code",
  5 + "execution_count": 1,
  6 + "id": "aabfb24b",
  7 + "metadata": {},
  8 + "outputs": [],
  9 + "source": [
  10 + "COMBO = '/home/kkrasnowska/anaconda3/envs/combo_p39/bin/combo'  # absolute path to the combo executable in a local conda env; machine-specific, adjust before running"
  11 + ]
  12 + },
  13 + {
  14 + "cell_type": "markdown",
  15 + "id": "787fff78",
  16 + "metadata": {},
  17 + "source": [
  18 + "Main model"
  19 + ]
  20 + },
  21 + {
  22 + "cell_type": "code",
  23 + "execution_count": 2,
  24 + "id": "1d9daaa9",
  25 + "metadata": {},
  26 + "outputs": [
  27 + {
  28 + "name": "stdout",
  29 + "output_type": "stream",
  30 + "text": [
  31 + "I0407 10:49:31.448594 140072765682752 archival.py:184] loading archive file model-pdbc/model.tar.gz\n",
  32 + "I0407 10:49:31.449148 140072765682752 archival.py:263] extracting archive file model-pdbc/model.tar.gz to temp dir /tmp/tmp_htckuhc\n",
  33 + "I0407 10:49:48.075045 140072765682752 params.py:248] dataset_reader.type = conllu\n",
  34 + "I0407 10:49:48.075561 140072765682752 params.py:248] dataset_reader.lazy = False\n",
  35 + "I0407 10:49:48.075693 140072765682752 params.py:248] dataset_reader.cache_directory = None\n",
  36 + "I0407 10:49:48.075764 140072765682752 params.py:248] dataset_reader.max_instances = None\n",
  37 + "I0407 10:49:48.075832 140072765682752 params.py:248] dataset_reader.manual_distributed_sharding = False\n",
  38 + "I0407 10:49:48.075901 140072765682752 params.py:248] dataset_reader.manual_multi_process_sharding = False\n",
  39 + "I0407 10:49:48.076193 140072765682752 params.py:248] dataset_reader.token_indexers.char.type = characters_const_padding\n",
  40 + "I0407 10:49:48.076388 140072765682752 params.py:248] dataset_reader.token_indexers.char.namespace = token_characters\n",
  41 + "I0407 10:49:48.076621 140072765682752 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.byte_encoding = None\n",
  42 + "I0407 10:49:48.076697 140072765682752 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.lowercase_characters = False\n",
  43 + "I0407 10:49:48.076790 140072765682752 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.start_tokens = ['__START__']\n",
  44 + "I0407 10:49:48.076939 140072765682752 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.end_tokens = ['__END__']\n",
  45 + "I0407 10:49:48.077063 140072765682752 params.py:248] dataset_reader.token_indexers.char.start_tokens = None\n",
  46 + "I0407 10:49:48.077118 140072765682752 params.py:248] dataset_reader.token_indexers.char.end_tokens = None\n",
  47 + "I0407 10:49:48.077185 140072765682752 params.py:248] dataset_reader.token_indexers.char.min_padding_length = 32\n",
  48 + "I0407 10:49:48.077238 140072765682752 params.py:248] dataset_reader.token_indexers.char.token_min_padding_length = 0\n",
  49 + "I0407 10:49:48.077383 140072765682752 params.py:248] dataset_reader.token_indexers.feats.type = feats_indexer\n",
  50 + "I0407 10:49:48.077555 140072765682752 params.py:248] dataset_reader.token_indexers.feats.namespace = feats\n",
  51 + "I0407 10:49:48.077628 140072765682752 params.py:248] dataset_reader.token_indexers.feats.feature_name = feats_\n",
  52 + "I0407 10:49:48.077702 140072765682752 params.py:248] dataset_reader.token_indexers.feats.token_min_padding_length = 0\n",
  53 + "I0407 10:49:48.077838 140072765682752 params.py:248] dataset_reader.token_indexers.lemma.type = characters_const_padding\n",
  54 + "I0407 10:49:48.078031 140072765682752 params.py:248] dataset_reader.token_indexers.lemma.namespace = token_characters\n",
  55 + "I0407 10:49:48.078231 140072765682752 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.byte_encoding = None\n",
  56 + "I0407 10:49:48.078300 140072765682752 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.lowercase_characters = False\n",
  57 + "I0407 10:49:48.078378 140072765682752 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.start_tokens = ['__START__']\n",
  58 + "I0407 10:49:48.078666 140072765682752 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.end_tokens = ['__END__']\n",
  59 + "I0407 10:49:48.078786 140072765682752 params.py:248] dataset_reader.token_indexers.lemma.start_tokens = None\n",
  60 + "I0407 10:49:48.078862 140072765682752 params.py:248] dataset_reader.token_indexers.lemma.end_tokens = None\n",
  61 + "I0407 10:49:48.078916 140072765682752 params.py:248] dataset_reader.token_indexers.lemma.min_padding_length = 32\n",
  62 + "I0407 10:49:48.078969 140072765682752 params.py:248] dataset_reader.token_indexers.lemma.token_min_padding_length = 0\n",
  63 + "I0407 10:49:48.079103 140072765682752 params.py:248] dataset_reader.token_indexers.token.type = pretrained_transformer_mismatched_fixed\n",
  64 + "I0407 10:49:48.079328 140072765682752 params.py:248] dataset_reader.token_indexers.token.token_min_padding_length = 0\n",
  65 + "I0407 10:49:48.079406 140072765682752 params.py:248] dataset_reader.token_indexers.token.model_name = allegro/herbert-large-cased\n",
  66 + "I0407 10:49:48.079461 140072765682752 params.py:248] dataset_reader.token_indexers.token.namespace = tags\n",
  67 + "I0407 10:49:48.079525 140072765682752 params.py:248] dataset_reader.token_indexers.token.max_length = None\n",
  68 + "I0407 10:49:48.079628 140072765682752 params.py:384] dataset_reader.token_indexers.token.tokenizer_kwargs.use_fast = False\n",
  69 + "I0407 10:49:51.185825 140072765682752 params.py:248] dataset_reader.token_indexers.upostag.type = single_id\n",
  70 + "I0407 10:49:51.186234 140072765682752 params.py:248] dataset_reader.token_indexers.upostag.namespace = upostag\n",
  71 + "I0407 10:49:51.186336 140072765682752 params.py:248] dataset_reader.token_indexers.upostag.lowercase_tokens = False\n",
  72 + "I0407 10:49:51.186398 140072765682752 params.py:248] dataset_reader.token_indexers.upostag.start_tokens = None\n",
  73 + "I0407 10:49:51.186465 140072765682752 params.py:248] dataset_reader.token_indexers.upostag.end_tokens = None\n",
  74 + "I0407 10:49:51.186517 140072765682752 params.py:248] dataset_reader.token_indexers.upostag.feature_name = pos_\n",
  75 + "I0407 10:49:51.186579 140072765682752 params.py:248] dataset_reader.token_indexers.upostag.default_value = THIS IS A REALLY UNLIKELY VALUE THAT HAS TO BE A STRING\n",
  76 + "I0407 10:49:51.186631 140072765682752 params.py:248] dataset_reader.token_indexers.upostag.token_min_padding_length = 0\n",
  77 + "I0407 10:49:51.186791 140072765682752 params.py:248] dataset_reader.token_indexers.xpostag.type = single_id\n",
  78 + "I0407 10:49:51.186975 140072765682752 params.py:248] dataset_reader.token_indexers.xpostag.namespace = xpostag\n",
  79 + "I0407 10:49:51.187041 140072765682752 params.py:248] dataset_reader.token_indexers.xpostag.lowercase_tokens = False\n",
  80 + "I0407 10:49:51.187107 140072765682752 params.py:248] dataset_reader.token_indexers.xpostag.start_tokens = None\n",
  81 + "I0407 10:49:51.187170 140072765682752 params.py:248] dataset_reader.token_indexers.xpostag.end_tokens = None\n",
  82 + "I0407 10:49:51.187220 140072765682752 params.py:248] dataset_reader.token_indexers.xpostag.feature_name = tag_\n",
  83 + "I0407 10:49:51.187275 140072765682752 params.py:248] dataset_reader.token_indexers.xpostag.default_value = THIS IS A REALLY UNLIKELY VALUE THAT HAS TO BE A STRING\n",
  84 + "I0407 10:49:51.187334 140072765682752 params.py:248] dataset_reader.token_indexers.xpostag.token_min_padding_length = 0\n",
  85 + "I0407 10:49:51.187556 140072765682752 params.py:248] dataset_reader.lemma_indexers.char.type = characters_const_padding\n",
  86 + "I0407 10:49:51.187731 140072765682752 params.py:248] dataset_reader.lemma_indexers.char.namespace = lemma_characters\n",
  87 + "I0407 10:49:51.187935 140072765682752 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.byte_encoding = None\n",
  88 + "I0407 10:49:51.187995 140072765682752 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.lowercase_characters = False\n",
  89 + "I0407 10:49:51.188073 140072765682752 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.start_tokens = ['__START__']\n",
  90 + "I0407 10:49:51.188217 140072765682752 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.end_tokens = ['__END__']\n",
  91 + "I0407 10:49:51.188334 140072765682752 params.py:248] dataset_reader.lemma_indexers.char.start_tokens = None\n",
  92 + "I0407 10:49:51.188398 140072765682752 params.py:248] dataset_reader.lemma_indexers.char.end_tokens = None\n",
  93 + "I0407 10:49:51.188460 140072765682752 params.py:248] dataset_reader.lemma_indexers.char.min_padding_length = 32\n",
  94 + "I0407 10:49:51.188522 140072765682752 params.py:248] dataset_reader.lemma_indexers.char.token_min_padding_length = 0\n",
  95 + "I0407 10:49:51.188614 140072765682752 params.py:248] dataset_reader.features = ['token', 'char']\n",
  96 + "I0407 10:49:51.188712 140072765682752 params.py:248] dataset_reader.targets = ['head', 'deprel']\n",
  97 + "I0407 10:49:51.188802 140072765682752 params.py:248] dataset_reader.use_sem = False\n",
  98 + "I0407 10:49:51.188952 140072765682752 params.py:248] dataset_reader.type = conllu\n",
  99 + "I0407 10:49:51.189191 140072765682752 params.py:248] dataset_reader.lazy = False\n",
  100 + "I0407 10:49:51.189266 140072765682752 params.py:248] dataset_reader.cache_directory = None\n",
  101 + "I0407 10:49:51.189324 140072765682752 params.py:248] dataset_reader.max_instances = None\n",
  102 + "I0407 10:49:51.189382 140072765682752 params.py:248] dataset_reader.manual_distributed_sharding = False\n",
  103 + "I0407 10:49:51.189436 140072765682752 params.py:248] dataset_reader.manual_multi_process_sharding = False\n",
  104 + "I0407 10:49:51.189675 140072765682752 params.py:248] dataset_reader.token_indexers.char.type = characters_const_padding\n",
  105 + "I0407 10:49:51.189843 140072765682752 params.py:248] dataset_reader.token_indexers.char.namespace = token_characters\n",
  106 + "I0407 10:49:51.190060 140072765682752 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.byte_encoding = None\n",
  107 + "I0407 10:49:51.190128 140072765682752 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.lowercase_characters = False\n",
  108 + "I0407 10:49:51.190197 140072765682752 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.start_tokens = ['__START__']\n",
  109 + "I0407 10:49:51.190324 140072765682752 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.end_tokens = ['__END__']\n",
  110 + "I0407 10:49:51.190443 140072765682752 params.py:248] dataset_reader.token_indexers.char.start_tokens = None\n",
  111 + "I0407 10:49:51.190508 140072765682752 params.py:248] dataset_reader.token_indexers.char.end_tokens = None\n",
  112 + "I0407 10:49:51.190564 140072765682752 params.py:248] dataset_reader.token_indexers.char.min_padding_length = 32\n",
  113 + "I0407 10:49:51.190627 140072765682752 params.py:248] dataset_reader.token_indexers.char.token_min_padding_length = 0\n",
  114 + "I0407 10:49:51.190772 140072765682752 params.py:248] dataset_reader.token_indexers.feats.type = feats_indexer\n",
  115 + "I0407 10:49:51.190932 140072765682752 params.py:248] dataset_reader.token_indexers.feats.namespace = feats\n",
  116 + "I0407 10:49:51.191003 140072765682752 params.py:248] dataset_reader.token_indexers.feats.feature_name = feats_\n",
  117 + "I0407 10:49:51.191065 140072765682752 params.py:248] dataset_reader.token_indexers.feats.token_min_padding_length = 0\n",
  118 + "I0407 10:49:51.191206 140072765682752 params.py:248] dataset_reader.token_indexers.lemma.type = characters_const_padding\n",
  119 + "I0407 10:49:51.191369 140072765682752 params.py:248] dataset_reader.token_indexers.lemma.namespace = token_characters\n",
  120 + "I0407 10:49:51.191561 140072765682752 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.byte_encoding = None\n",
  121 + "I0407 10:49:51.191629 140072765682752 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.lowercase_characters = False\n",
  122 + "I0407 10:49:51.191706 140072765682752 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.start_tokens = ['__START__']\n",
  123 + "I0407 10:49:51.191827 140072765682752 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.end_tokens = ['__END__']\n",
  124 + "I0407 10:49:51.191938 140072765682752 params.py:248] dataset_reader.token_indexers.lemma.start_tokens = None\n",
  125 + "I0407 10:49:51.191999 140072765682752 params.py:248] dataset_reader.token_indexers.lemma.end_tokens = None\n",
  126 + "I0407 10:49:51.192067 140072765682752 params.py:248] dataset_reader.token_indexers.lemma.min_padding_length = 32\n",
  127 + "I0407 10:49:51.192142 140072765682752 params.py:248] dataset_reader.token_indexers.lemma.token_min_padding_length = 0\n",
  128 + "I0407 10:49:51.192281 140072765682752 params.py:248] dataset_reader.token_indexers.token.type = pretrained_transformer_mismatched_fixed\n",
  129 + "I0407 10:49:51.192501 140072765682752 params.py:248] dataset_reader.token_indexers.token.token_min_padding_length = 0\n",
  130 + "I0407 10:49:51.192575 140072765682752 params.py:248] dataset_reader.token_indexers.token.model_name = allegro/herbert-large-cased\n",
  131 + "I0407 10:49:51.192638 140072765682752 params.py:248] dataset_reader.token_indexers.token.namespace = tags\n",
  132 + "I0407 10:49:51.192698 140072765682752 params.py:248] dataset_reader.token_indexers.token.max_length = None\n",
  133 + "I0407 10:49:51.192795 140072765682752 params.py:384] dataset_reader.token_indexers.token.tokenizer_kwargs.use_fast = False\n",
  134 + "I0407 10:49:51.194080 140072765682752 params.py:248] dataset_reader.token_indexers.upostag.type = single_id\n"
  135 + ]
  136 + },
  137 + {
  138 + "name": "stdout",
  139 + "output_type": "stream",
  140 + "text": [
  141 + "I0407 10:49:51.194318 140072765682752 params.py:248] dataset_reader.token_indexers.upostag.namespace = upostag\n",
  142 + "I0407 10:49:51.194404 140072765682752 params.py:248] dataset_reader.token_indexers.upostag.lowercase_tokens = False\n",
  143 + "I0407 10:49:51.194471 140072765682752 params.py:248] dataset_reader.token_indexers.upostag.start_tokens = None\n",
  144 + "I0407 10:49:51.194532 140072765682752 params.py:248] dataset_reader.token_indexers.upostag.end_tokens = None\n",
  145 + "I0407 10:49:51.194586 140072765682752 params.py:248] dataset_reader.token_indexers.upostag.feature_name = pos_\n",
  146 + "I0407 10:49:51.194648 140072765682752 params.py:248] dataset_reader.token_indexers.upostag.default_value = THIS IS A REALLY UNLIKELY VALUE THAT HAS TO BE A STRING\n",
  147 + "I0407 10:49:51.194708 140072765682752 params.py:248] dataset_reader.token_indexers.upostag.token_min_padding_length = 0\n",
  148 + "I0407 10:49:51.194854 140072765682752 params.py:248] dataset_reader.token_indexers.xpostag.type = single_id\n",
  149 + "I0407 10:49:51.195033 140072765682752 params.py:248] dataset_reader.token_indexers.xpostag.namespace = xpostag\n",
  150 + "I0407 10:49:51.195105 140072765682752 params.py:248] dataset_reader.token_indexers.xpostag.lowercase_tokens = False\n",
  151 + "I0407 10:49:51.195167 140072765682752 params.py:248] dataset_reader.token_indexers.xpostag.start_tokens = None\n",
  152 + "I0407 10:49:51.195222 140072765682752 params.py:248] dataset_reader.token_indexers.xpostag.end_tokens = None\n",
  153 + "I0407 10:49:51.195280 140072765682752 params.py:248] dataset_reader.token_indexers.xpostag.feature_name = tag_\n",
  154 + "I0407 10:49:51.195338 140072765682752 params.py:248] dataset_reader.token_indexers.xpostag.default_value = THIS IS A REALLY UNLIKELY VALUE THAT HAS TO BE A STRING\n",
  155 + "I0407 10:49:51.195398 140072765682752 params.py:248] dataset_reader.token_indexers.xpostag.token_min_padding_length = 0\n",
  156 + "I0407 10:49:51.195601 140072765682752 params.py:248] dataset_reader.lemma_indexers.char.type = characters_const_padding\n",
  157 + "I0407 10:49:51.195774 140072765682752 params.py:248] dataset_reader.lemma_indexers.char.namespace = lemma_characters\n",
  158 + "I0407 10:49:51.195971 140072765682752 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.byte_encoding = None\n",
  159 + "I0407 10:49:51.196039 140072765682752 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.lowercase_characters = False\n",
  160 + "I0407 10:49:51.196113 140072765682752 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.start_tokens = ['__START__']\n",
  161 + "I0407 10:49:51.196244 140072765682752 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.end_tokens = ['__END__']\n",
  162 + "I0407 10:49:51.196364 140072765682752 params.py:248] dataset_reader.lemma_indexers.char.start_tokens = None\n",
  163 + "I0407 10:49:51.196430 140072765682752 params.py:248] dataset_reader.lemma_indexers.char.end_tokens = None\n",
  164 + "I0407 10:49:51.196492 140072765682752 params.py:248] dataset_reader.lemma_indexers.char.min_padding_length = 32\n",
  165 + "I0407 10:49:51.196552 140072765682752 params.py:248] dataset_reader.lemma_indexers.char.token_min_padding_length = 0\n",
  166 + "I0407 10:49:51.196640 140072765682752 params.py:248] dataset_reader.features = ['token', 'char']\n",
  167 + "I0407 10:49:51.196732 140072765682752 params.py:248] dataset_reader.targets = ['head', 'deprel']\n",
  168 + "I0407 10:49:51.196815 140072765682752 params.py:248] dataset_reader.use_sem = False\n",
  169 + "I0407 10:49:51.197346 140072765682752 params.py:248] vocabulary.type = from_instances_extended\n",
  170 + "I0407 10:49:51.197421 140072765682752 vocabulary.py:323] Loading token dictionary from /tmp/tmp_htckuhc/vocabulary.\n",
  171 + "I0407 10:49:51.197736 140072765682752 filelock.py:254] Lock 140069359832176 acquired on /tmp/tmp_htckuhc/vocabulary/.lock\n",
  172 + "I0407 10:49:51.198361 140072765682752 filelock.py:317] Lock 140069359832176 released on /tmp/tmp_htckuhc/vocabulary/.lock\n",
  173 + "I0407 10:49:51.198865 140072765682752 params.py:248] model.type = semantic_multitask\n",
  174 + "I0407 10:49:51.199399 140072765682752 params.py:248] model.text_field_embedder.type = basic\n",
  175 + "I0407 10:49:51.199762 140072765682752 params.py:248] model.text_field_embedder.token_embedders.char.type = char_embeddings_from_config\n",
  176 + "I0407 10:49:51.199955 140072765682752 params.py:248] model.text_field_embedder.token_embedders.char.embedding_dim = 64\n",
  177 + "I0407 10:49:51.200206 140072765682752 params.py:248] model.text_field_embedder.token_embedders.char.dilated_cnn_encoder.input_dim = 64\n",
  178 + "I0407 10:49:51.200286 140072765682752 params.py:248] model.text_field_embedder.token_embedders.char.dilated_cnn_encoder.filters = [512, 256, 64]\n",
  179 + "I0407 10:49:51.200380 140072765682752 params.py:248] model.text_field_embedder.token_embedders.char.dilated_cnn_encoder.kernel_size = [3, 3, 3]\n",
  180 + "I0407 10:49:51.200467 140072765682752 params.py:248] model.text_field_embedder.token_embedders.char.dilated_cnn_encoder.stride = [1, 1, 1]\n",
  181 + "I0407 10:49:51.200556 140072765682752 params.py:248] model.text_field_embedder.token_embedders.char.dilated_cnn_encoder.padding = [1, 2, 4]\n",
  182 + "I0407 10:49:51.200649 140072765682752 params.py:248] model.text_field_embedder.token_embedders.char.dilated_cnn_encoder.dilation = [1, 2, 4]\n",
  183 + "I0407 10:49:51.200745 140072765682752 params.py:248] model.text_field_embedder.token_embedders.char.dilated_cnn_encoder.activations = ['relu', 'relu', 'linear']\n",
  184 + "I0407 10:49:51.200886 140072765682752 params.py:248] type = relu\n",
  185 + "I0407 10:49:51.201073 140072765682752 params.py:248] type = relu\n",
  186 + "I0407 10:49:51.201222 140072765682752 params.py:248] type = linear\n",
  187 + "I0407 10:49:51.208180 140072765682752 params.py:248] model.text_field_embedder.token_embedders.char.vocab_namespace = token_characters\n",
  188 + "I0407 10:49:51.208718 140072765682752 params.py:248] model.text_field_embedder.token_embedders.token.type = transformers_word_embeddings\n",
  189 + "I0407 10:49:51.208946 140072765682752 params.py:248] model.text_field_embedder.token_embedders.token.model_name = allegro/herbert-large-cased\n",
  190 + "I0407 10:49:51.209028 140072765682752 params.py:248] model.text_field_embedder.token_embedders.token.projection_dim = 100\n",
  191 + "I0407 10:49:51.209110 140072765682752 params.py:248] model.text_field_embedder.token_embedders.token.projection_activation = <function TransformersWordEmbedder.<lambda> at 0x7f646dd85280>\n",
  192 + "I0407 10:49:51.209182 140072765682752 params.py:248] model.text_field_embedder.token_embedders.token.projection_dropout_rate = 0.0\n",
  193 + "I0407 10:49:51.209239 140072765682752 params.py:248] model.text_field_embedder.token_embedders.token.freeze_transformer = True\n",
  194 + "I0407 10:49:51.209295 140072765682752 params.py:248] model.text_field_embedder.token_embedders.token.last_layer_only = True\n",
  195 + "I0407 10:49:51.209401 140072765682752 params.py:384] model.text_field_embedder.token_embedders.token.tokenizer_kwargs.use_fast = False\n",
  196 + "I0407 10:49:51.209471 140072765682752 params.py:248] model.text_field_embedder.token_embedders.token.transformer_kwargs = None\n",
  197 + "I0407 10:49:58.747374 140072765682752 params.py:248] model.seq_encoder.type = combo_encoder\n",
  198 + "I0407 10:49:58.747746 140072765682752 params.py:248] model.seq_encoder.stacked_bilstm.input_size = 164\n",
  199 + "I0407 10:49:58.747819 140072765682752 params.py:248] model.seq_encoder.stacked_bilstm.hidden_size = 512\n",
  200 + "I0407 10:49:58.747869 140072765682752 params.py:248] model.seq_encoder.stacked_bilstm.num_layers = 2\n",
  201 + "I0407 10:49:58.747919 140072765682752 params.py:248] model.seq_encoder.stacked_bilstm.recurrent_dropout_probability = 0.33\n",
  202 + "I0407 10:49:58.747966 140072765682752 params.py:248] model.seq_encoder.stacked_bilstm.layer_dropout_probability = 0.33\n",
  203 + "I0407 10:49:58.748013 140072765682752 params.py:248] model.seq_encoder.stacked_bilstm.use_highway = False\n",
  204 + "I0407 10:49:59.084017 140072765682752 params.py:248] model.seq_encoder.layer_dropout_probability = 0.33\n",
  205 + "I0407 10:49:59.084280 140072765682752 params.py:248] model.use_sample_weight = True\n",
  206 + "I0407 10:49:59.084377 140072765682752 params.py:248] model.lemmatizer = None\n",
  207 + "I0407 10:49:59.084436 140072765682752 params.py:248] model.upos_tagger = None\n",
  208 + "I0407 10:49:59.084487 140072765682752 params.py:248] model.xpos_tagger = None\n",
  209 + "I0407 10:49:59.084537 140072765682752 params.py:248] model.semantic_relation = None\n",
  210 + "I0407 10:49:59.084585 140072765682752 params.py:248] model.morphological_feat = None\n",
  211 + "I0407 10:49:59.084832 140072765682752 params.py:248] model.dependency_relation.type = combo_dependency_parsing_from_vocab\n",
  212 + "I0407 10:49:59.085025 140072765682752 params.py:248] model.dependency_relation.vocab_namespace = deprel_labels\n",
  213 + "I0407 10:49:59.085301 140072765682752 params.py:248] model.dependency_relation.head_predictor.head_projection_layer.in_features = 1024\n",
  214 + "I0407 10:49:59.085365 140072765682752 params.py:248] model.dependency_relation.head_predictor.head_projection_layer.out_features = 512\n",
  215 + "I0407 10:49:59.085421 140072765682752 params.py:248] model.dependency_relation.head_predictor.head_projection_layer.activation = tanh\n",
  216 + "I0407 10:49:59.085520 140072765682752 params.py:248] type = tanh\n",
  217 + "I0407 10:49:59.085608 140072765682752 params.py:248] model.dependency_relation.head_predictor.head_projection_layer.dropout_rate = 0.0\n",
  218 + "I0407 10:49:59.089095 140072765682752 params.py:248] model.dependency_relation.head_predictor.dependency_projection_layer.in_features = 1024\n",
  219 + "I0407 10:49:59.089183 140072765682752 params.py:248] model.dependency_relation.head_predictor.dependency_projection_layer.out_features = 512\n",
  220 + "I0407 10:49:59.089244 140072765682752 params.py:248] model.dependency_relation.head_predictor.dependency_projection_layer.activation = tanh\n",
  221 + "I0407 10:49:59.089346 140072765682752 params.py:248] type = tanh\n",
  222 + "I0407 10:49:59.089423 140072765682752 params.py:248] model.dependency_relation.head_predictor.dependency_projection_layer.dropout_rate = 0.0\n",
  223 + "I0407 10:49:59.092701 140072765682752 params.py:248] model.dependency_relation.head_predictor.cycle_loss_n = 0\n",
  224 + "I0407 10:49:59.092917 140072765682752 params.py:248] model.dependency_relation.head_projection_layer.in_features = 1024\n",
  225 + "I0407 10:49:59.092972 140072765682752 params.py:248] model.dependency_relation.head_projection_layer.out_features = 128\n",
  226 + "I0407 10:49:59.093022 140072765682752 params.py:248] model.dependency_relation.head_projection_layer.activation = tanh\n",
  227 + "I0407 10:49:59.093108 140072765682752 params.py:248] type = tanh\n",
  228 + "I0407 10:49:59.093183 140072765682752 params.py:248] model.dependency_relation.head_projection_layer.dropout_rate = 0.25\n",
  229 + "I0407 10:49:59.094336 140072765682752 params.py:248] model.dependency_relation.dependency_projection_layer.in_features = 1024\n",
  230 + "I0407 10:49:59.094411 140072765682752 params.py:248] model.dependency_relation.dependency_projection_layer.out_features = 128\n",
  231 + "I0407 10:49:59.094463 140072765682752 params.py:248] model.dependency_relation.dependency_projection_layer.activation = tanh\n",
  232 + "I0407 10:49:59.094551 140072765682752 params.py:248] type = tanh\n",
  233 + "I0407 10:49:59.094618 140072765682752 params.py:248] model.dependency_relation.dependency_projection_layer.dropout_rate = 0.25\n",
  234 + "I0407 10:49:59.095806 140072765682752 params.py:248] model.enhanced_dependency_relation = None\n",
  235 + "I0407 10:49:59.096206 140072765682752 params.py:248] model.regularizer.regexes.0.1.type = l2\n",
  236 + "I0407 10:49:59.096345 140072765682752 params.py:248] model.regularizer.regexes.0.1.alpha = 1e-06\n",
  237 + "I0407 10:49:59.096471 140072765682752 params.py:248] model.regularizer.regexes.1.1.type = l2\n",
  238 + "I0407 10:49:59.096584 140072765682752 params.py:248] model.regularizer.regexes.1.1.alpha = 1e-06\n",
  239 + "I0407 10:49:59.096696 140072765682752 params.py:248] model.regularizer.regexes.2.1.type = l2\n",
  240 + "I0407 10:49:59.096809 140072765682752 params.py:248] model.regularizer.regexes.2.1.alpha = 1e-06\n",
  241 + "I0407 10:49:59.096917 140072765682752 params.py:248] model.regularizer.regexes.3.1.type = l2\n",
  242 + "I0407 10:49:59.097025 140072765682752 params.py:248] model.regularizer.regexes.3.1.alpha = 1e-05\n"
  243 + ]
  244 + },
  245 + {
  246 + "name": "stdout",
  247 + "output_type": "stream",
  248 + "text": [
  249 + "I0407 10:50:01.854557 140072765682752 archival.py:211] removing temporary unarchived model dir at /tmp/tmp_htckuhc\n",
  250 + "reading instances: 2211it [01:52, 19.69it/s]\n"
  251 + ]
  252 + }
  253 + ],
  254 + "source": [
  255 + "! {COMBO} --mode predict \\\n",
  256 + " --cuda_device 0 \\\n",
  257 + " --model_path model-pdbc/model.tar.gz \\\n",
  258 + " --input_file connlu/pdbc-validation.conllu \\\n",
  259 + " --output_file connlu/pdbc-validation-pred.conllu"
  260 + ]
  261 + },
  262 + {
  263 + "cell_type": "code",
  264 + "execution_count": 3,
  265 + "id": "11f1b7b1",
  266 + "metadata": {},
  267 + "outputs": [
  268 + {
  269 + "name": "stdout",
  270 + "output_type": "stream",
  271 + "text": [
  272 + "# text = Dwie dziewczynki opierają się o dach kapliczki , chłopiec wspina się na niego , a trzecia dziewczynka stoi obok .\r\n",
  273 + "1\tDwie\tdwa\t_\t_\t_\t3\tsubj\t3:subj\t_\r\n",
  274 + "2\tdziewczynki\tdziewczynka\t_\t_\t_\t1\tcomp\t1:comp\t_\r\n",
  275 + "3\topierają\topierać\t_\t_\t_\t15\tconjunct\t15:conjunct\t_\r\n",
  276 + "4\tsię\tsię\t_\t_\t_\t3\trefl\t3:refl\t_\r\n",
  277 + "5\to\to\t_\t_\t_\t3\tcomp\t3:comp\t_\r\n",
  278 + "6\tdach\tdach\t_\t_\t_\t5\tcomp\t5:comp\t_\r\n",
  279 + "7\tkapliczki\tkapliczka\t_\t_\t_\t6\tadjunct\t6:adjunct\t_\r\n",
  280 + "8\t,\t,\t_\t_\t_\t15\tpunct\t15:punct\t_\r\n",
  281 + "9\tchłopiec\tchłopiec\t_\t_\t_\t10\tsubj\t10:subj\t_\r\n"
  282 + ]
  283 + }
  284 + ],
  285 + "source": [
  286 + "! head connlu/pdbc-validation.conllu"
  287 + ]
  288 + },
  289 + {
  290 + "cell_type": "code",
  291 + "execution_count": 4,
  292 + "id": "8fa72124",
  293 + "metadata": {},
  294 + "outputs": [
  295 + {
  296 + "name": "stdout",
  297 + "output_type": "stream",
  298 + "text": [
  299 + "# text = Dwie dziewczynki opierają się o dach kapliczki , chłopiec wspina się na niego , a trzecia dziewczynka stoi obok .\r\n",
  300 + "1\tDwie\tdwa\t_\t_\t_\t3\tsubj\t3:subj\t_\r\n",
  301 + "2\tdziewczynki\tdziewczynka\t_\t_\t_\t1\tcomp\t1:comp\t_\r\n",
  302 + "3\topierają\topierać\t_\t_\t_\t15\tconjunct\t15:conjunct\t_\r\n",
  303 + "4\tsię\tsię\t_\t_\t_\t3\trefl\t3:refl\t_\r\n",
  304 + "5\to\to\t_\t_\t_\t3\tcomp\t3:comp\t_\r\n",
  305 + "6\tdach\tdach\t_\t_\t_\t5\tcomp\t5:comp\t_\r\n",
  306 + "7\tkapliczki\tkapliczka\t_\t_\t_\t6\tadjunct\t6:adjunct\t_\r\n",
  307 + "8\t,\t,\t_\t_\t_\t15\tpunct\t15:punct\t_\r\n",
  308 + "9\tchłopiec\tchłopiec\t_\t_\t_\t10\tsubj\t10:subj\t_\r\n"
  309 + ]
  310 + }
  311 + ],
  312 + "source": [
  313 + "! head connlu/pdbc-validation-pred.conllu"
  314 + ]
  315 + },
  316 + {
  317 + "cell_type": "code",
  318 + "execution_count": 5,
  319 + "id": "dde6dd31",
  320 + "metadata": {},
  321 + "outputs": [
  322 + {
  323 + "name": "stdout",
  324 + "output_type": "stream",
  325 + "text": [
  326 + "I0407 10:52:00.220404 139754138821696 archival.py:184] loading archive file model-pdbc/model.tar.gz\n",
  327 + "I0407 10:52:00.221079 139754138821696 archival.py:263] extracting archive file model-pdbc/model.tar.gz to temp dir /tmp/tmp2jhqu3i6\n",
  328 + "I0407 10:52:16.996590 139754138821696 params.py:248] dataset_reader.type = conllu\n",
  329 + "I0407 10:52:16.997079 139754138821696 params.py:248] dataset_reader.lazy = False\n",
  330 + "I0407 10:52:16.997236 139754138821696 params.py:248] dataset_reader.cache_directory = None\n",
  331 + "I0407 10:52:16.997326 139754138821696 params.py:248] dataset_reader.max_instances = None\n",
  332 + "I0407 10:52:16.997391 139754138821696 params.py:248] dataset_reader.manual_distributed_sharding = False\n",
  333 + "I0407 10:52:16.997456 139754138821696 params.py:248] dataset_reader.manual_multi_process_sharding = False\n",
  334 + "I0407 10:52:16.997756 139754138821696 params.py:248] dataset_reader.token_indexers.char.type = characters_const_padding\n",
  335 + "I0407 10:52:16.997950 139754138821696 params.py:248] dataset_reader.token_indexers.char.namespace = token_characters\n",
  336 + "I0407 10:52:16.998211 139754138821696 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.byte_encoding = None\n",
  337 + "I0407 10:52:16.998285 139754138821696 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.lowercase_characters = False\n",
  338 + "I0407 10:52:16.998367 139754138821696 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.start_tokens = ['__START__']\n",
  339 + "I0407 10:52:16.998522 139754138821696 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.end_tokens = ['__END__']\n",
  340 + "I0407 10:52:16.998643 139754138821696 params.py:248] dataset_reader.token_indexers.char.start_tokens = None\n",
  341 + "I0407 10:52:16.998707 139754138821696 params.py:248] dataset_reader.token_indexers.char.end_tokens = None\n",
  342 + "I0407 10:52:16.998770 139754138821696 params.py:248] dataset_reader.token_indexers.char.min_padding_length = 32\n",
  343 + "I0407 10:52:16.998831 139754138821696 params.py:248] dataset_reader.token_indexers.char.token_min_padding_length = 0\n",
  344 + "I0407 10:52:16.998980 139754138821696 params.py:248] dataset_reader.token_indexers.feats.type = feats_indexer\n",
  345 + "I0407 10:52:16.999143 139754138821696 params.py:248] dataset_reader.token_indexers.feats.namespace = feats\n",
  346 + "I0407 10:52:16.999213 139754138821696 params.py:248] dataset_reader.token_indexers.feats.feature_name = feats_\n",
  347 + "I0407 10:52:16.999269 139754138821696 params.py:248] dataset_reader.token_indexers.feats.token_min_padding_length = 0\n",
  348 + "I0407 10:52:16.999412 139754138821696 params.py:248] dataset_reader.token_indexers.lemma.type = characters_const_padding\n",
  349 + "I0407 10:52:16.999578 139754138821696 params.py:248] dataset_reader.token_indexers.lemma.namespace = token_characters\n",
  350 + "I0407 10:52:16.999774 139754138821696 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.byte_encoding = None\n",
  351 + "I0407 10:52:16.999842 139754138821696 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.lowercase_characters = False\n",
  352 + "I0407 10:52:16.999923 139754138821696 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.start_tokens = ['__START__']\n",
  353 + "I0407 10:52:17.000045 139754138821696 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.end_tokens = ['__END__']\n",
  354 + "I0407 10:52:17.000156 139754138821696 params.py:248] dataset_reader.token_indexers.lemma.start_tokens = None\n",
  355 + "I0407 10:52:17.000220 139754138821696 params.py:248] dataset_reader.token_indexers.lemma.end_tokens = None\n",
  356 + "I0407 10:52:17.000282 139754138821696 params.py:248] dataset_reader.token_indexers.lemma.min_padding_length = 32\n",
  357 + "I0407 10:52:17.000344 139754138821696 params.py:248] dataset_reader.token_indexers.lemma.token_min_padding_length = 0\n",
  358 + "I0407 10:52:17.000521 139754138821696 params.py:248] dataset_reader.token_indexers.token.type = pretrained_transformer_mismatched_fixed\n",
  359 + "I0407 10:52:17.000770 139754138821696 params.py:248] dataset_reader.token_indexers.token.token_min_padding_length = 0\n",
  360 + "I0407 10:52:17.000865 139754138821696 params.py:248] dataset_reader.token_indexers.token.model_name = allegro/herbert-large-cased\n",
  361 + "I0407 10:52:17.000947 139754138821696 params.py:248] dataset_reader.token_indexers.token.namespace = tags\n",
  362 + "I0407 10:52:17.001028 139754138821696 params.py:248] dataset_reader.token_indexers.token.max_length = None\n",
  363 + "I0407 10:52:17.001172 139754138821696 params.py:384] dataset_reader.token_indexers.token.tokenizer_kwargs.use_fast = False\n",
  364 + "I0407 10:52:20.459573 139754138821696 params.py:248] dataset_reader.token_indexers.upostag.type = single_id\n",
  365 + "I0407 10:52:20.459947 139754138821696 params.py:248] dataset_reader.token_indexers.upostag.namespace = upostag\n",
  366 + "I0407 10:52:20.460046 139754138821696 params.py:248] dataset_reader.token_indexers.upostag.lowercase_tokens = False\n",
  367 + "I0407 10:52:20.460119 139754138821696 params.py:248] dataset_reader.token_indexers.upostag.start_tokens = None\n",
  368 + "I0407 10:52:20.460172 139754138821696 params.py:248] dataset_reader.token_indexers.upostag.end_tokens = None\n",
  369 + "I0407 10:52:20.460235 139754138821696 params.py:248] dataset_reader.token_indexers.upostag.feature_name = pos_\n",
  370 + "I0407 10:52:20.460288 139754138821696 params.py:248] dataset_reader.token_indexers.upostag.default_value = THIS IS A REALLY UNLIKELY VALUE THAT HAS TO BE A STRING\n",
  371 + "I0407 10:52:20.460351 139754138821696 params.py:248] dataset_reader.token_indexers.upostag.token_min_padding_length = 0\n",
  372 + "I0407 10:52:20.460508 139754138821696 params.py:248] dataset_reader.token_indexers.xpostag.type = single_id\n",
  373 + "I0407 10:52:20.460695 139754138821696 params.py:248] dataset_reader.token_indexers.xpostag.namespace = xpostag\n",
  374 + "I0407 10:52:20.460773 139754138821696 params.py:248] dataset_reader.token_indexers.xpostag.lowercase_tokens = False\n",
  375 + "I0407 10:52:20.460840 139754138821696 params.py:248] dataset_reader.token_indexers.xpostag.start_tokens = None\n",
  376 + "I0407 10:52:20.460901 139754138821696 params.py:248] dataset_reader.token_indexers.xpostag.end_tokens = None\n",
  377 + "I0407 10:52:20.460962 139754138821696 params.py:248] dataset_reader.token_indexers.xpostag.feature_name = tag_\n",
  378 + "I0407 10:52:20.461021 139754138821696 params.py:248] dataset_reader.token_indexers.xpostag.default_value = THIS IS A REALLY UNLIKELY VALUE THAT HAS TO BE A STRING\n",
  379 + "I0407 10:52:20.461083 139754138821696 params.py:248] dataset_reader.token_indexers.xpostag.token_min_padding_length = 0\n",
  380 + "I0407 10:52:20.461313 139754138821696 params.py:248] dataset_reader.lemma_indexers.char.type = characters_const_padding\n",
  381 + "I0407 10:52:20.461496 139754138821696 params.py:248] dataset_reader.lemma_indexers.char.namespace = lemma_characters\n",
  382 + "I0407 10:52:20.461706 139754138821696 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.byte_encoding = None\n",
  383 + "I0407 10:52:20.461774 139754138821696 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.lowercase_characters = False\n",
  384 + "I0407 10:52:20.461853 139754138821696 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.start_tokens = ['__START__']\n",
  385 + "I0407 10:52:20.462028 139754138821696 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.end_tokens = ['__END__']\n",
  386 + "I0407 10:52:20.462157 139754138821696 params.py:248] dataset_reader.lemma_indexers.char.start_tokens = None\n",
  387 + "I0407 10:52:20.462226 139754138821696 params.py:248] dataset_reader.lemma_indexers.char.end_tokens = None\n",
  388 + "I0407 10:52:20.462283 139754138821696 params.py:248] dataset_reader.lemma_indexers.char.min_padding_length = 32\n",
  389 + "I0407 10:52:20.462336 139754138821696 params.py:248] dataset_reader.lemma_indexers.char.token_min_padding_length = 0\n",
  390 + "I0407 10:52:20.462417 139754138821696 params.py:248] dataset_reader.features = ['token', 'char']\n",
  391 + "I0407 10:52:20.462514 139754138821696 params.py:248] dataset_reader.targets = ['head', 'deprel']\n",
  392 + "I0407 10:52:20.462607 139754138821696 params.py:248] dataset_reader.use_sem = False\n",
  393 + "I0407 10:52:20.462767 139754138821696 params.py:248] dataset_reader.type = conllu\n",
  394 + "I0407 10:52:20.463083 139754138821696 params.py:248] dataset_reader.lazy = False\n",
  395 + "I0407 10:52:20.463172 139754138821696 params.py:248] dataset_reader.cache_directory = None\n",
  396 + "I0407 10:52:20.463237 139754138821696 params.py:248] dataset_reader.max_instances = None\n",
  397 + "I0407 10:52:20.463301 139754138821696 params.py:248] dataset_reader.manual_distributed_sharding = False\n",
  398 + "I0407 10:52:20.463361 139754138821696 params.py:248] dataset_reader.manual_multi_process_sharding = False\n",
  399 + "I0407 10:52:20.463605 139754138821696 params.py:248] dataset_reader.token_indexers.char.type = characters_const_padding\n",
  400 + "I0407 10:52:20.463779 139754138821696 params.py:248] dataset_reader.token_indexers.char.namespace = token_characters\n",
  401 + "I0407 10:52:20.463980 139754138821696 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.byte_encoding = None\n",
  402 + "I0407 10:52:20.464051 139754138821696 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.lowercase_characters = False\n",
  403 + "I0407 10:52:20.464129 139754138821696 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.start_tokens = ['__START__']\n",
  404 + "I0407 10:52:20.464254 139754138821696 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.end_tokens = ['__END__']\n",
  405 + "I0407 10:52:20.464366 139754138821696 params.py:248] dataset_reader.token_indexers.char.start_tokens = None\n",
  406 + "I0407 10:52:20.464429 139754138821696 params.py:248] dataset_reader.token_indexers.char.end_tokens = None\n",
  407 + "I0407 10:52:20.464490 139754138821696 params.py:248] dataset_reader.token_indexers.char.min_padding_length = 32\n",
  408 + "I0407 10:52:20.464552 139754138821696 params.py:248] dataset_reader.token_indexers.char.token_min_padding_length = 0\n",
  409 + "I0407 10:52:20.464691 139754138821696 params.py:248] dataset_reader.token_indexers.feats.type = feats_indexer\n",
  410 + "I0407 10:52:20.464847 139754138821696 params.py:248] dataset_reader.token_indexers.feats.namespace = feats\n",
  411 + "I0407 10:52:20.464918 139754138821696 params.py:248] dataset_reader.token_indexers.feats.feature_name = feats_\n",
  412 + "I0407 10:52:20.464980 139754138821696 params.py:248] dataset_reader.token_indexers.feats.token_min_padding_length = 0\n",
  413 + "I0407 10:52:20.465120 139754138821696 params.py:248] dataset_reader.token_indexers.lemma.type = characters_const_padding\n",
  414 + "I0407 10:52:20.465285 139754138821696 params.py:248] dataset_reader.token_indexers.lemma.namespace = token_characters\n",
  415 + "I0407 10:52:20.465479 139754138821696 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.byte_encoding = None\n",
  416 + "I0407 10:52:20.465544 139754138821696 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.lowercase_characters = False\n",
  417 + "I0407 10:52:20.465618 139754138821696 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.start_tokens = ['__START__']\n",
  418 + "I0407 10:52:20.465741 139754138821696 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.end_tokens = ['__END__']\n",
  419 + "I0407 10:52:20.465851 139754138821696 params.py:248] dataset_reader.token_indexers.lemma.start_tokens = None\n",
  420 + "I0407 10:52:20.465914 139754138821696 params.py:248] dataset_reader.token_indexers.lemma.end_tokens = None\n",
  421 + "I0407 10:52:20.466024 139754138821696 params.py:248] dataset_reader.token_indexers.lemma.min_padding_length = 32\n",
  422 + "I0407 10:52:20.466112 139754138821696 params.py:248] dataset_reader.token_indexers.lemma.token_min_padding_length = 0\n",
  423 + "I0407 10:52:20.466268 139754138821696 params.py:248] dataset_reader.token_indexers.token.type = pretrained_transformer_mismatched_fixed\n",
  424 + "I0407 10:52:20.466485 139754138821696 params.py:248] dataset_reader.token_indexers.token.token_min_padding_length = 0\n",
  425 + "I0407 10:52:20.466559 139754138821696 params.py:248] dataset_reader.token_indexers.token.model_name = allegro/herbert-large-cased\n",
  426 + "I0407 10:52:20.466621 139754138821696 params.py:248] dataset_reader.token_indexers.token.namespace = tags\n",
  427 + "I0407 10:52:20.466682 139754138821696 params.py:248] dataset_reader.token_indexers.token.max_length = None\n",
  428 + "I0407 10:52:20.466777 139754138821696 params.py:384] dataset_reader.token_indexers.token.tokenizer_kwargs.use_fast = False\n",
  429 + "I0407 10:52:20.468071 139754138821696 params.py:248] dataset_reader.token_indexers.upostag.type = single_id\n",
  430 + "I0407 10:52:20.468319 139754138821696 params.py:248] dataset_reader.token_indexers.upostag.namespace = upostag\n",
  431 + "I0407 10:52:20.468404 139754138821696 params.py:248] dataset_reader.token_indexers.upostag.lowercase_tokens = False\n",
  432 + "I0407 10:52:20.468464 139754138821696 params.py:248] dataset_reader.token_indexers.upostag.start_tokens = None\n",
  433 + "I0407 10:52:20.468523 139754138821696 params.py:248] dataset_reader.token_indexers.upostag.end_tokens = None\n",
  434 + "I0407 10:52:20.468573 139754138821696 params.py:248] dataset_reader.token_indexers.upostag.feature_name = pos_\n",
  435 + "I0407 10:52:20.468636 139754138821696 params.py:248] dataset_reader.token_indexers.upostag.default_value = THIS IS A REALLY UNLIKELY VALUE THAT HAS TO BE A STRING\n",
  436 + "I0407 10:52:20.468697 139754138821696 params.py:248] dataset_reader.token_indexers.upostag.token_min_padding_length = 0\n",
  437 + "I0407 10:52:20.468832 139754138821696 params.py:248] dataset_reader.token_indexers.xpostag.type = single_id\n",
  438 + "I0407 10:52:20.469012 139754138821696 params.py:248] dataset_reader.token_indexers.xpostag.namespace = xpostag\n",
  439 + "I0407 10:52:20.469086 139754138821696 params.py:248] dataset_reader.token_indexers.xpostag.lowercase_tokens = False\n",
  440 + "I0407 10:52:20.469144 139754138821696 params.py:248] dataset_reader.token_indexers.xpostag.start_tokens = None\n",
  441 + "I0407 10:52:20.469196 139754138821696 params.py:248] dataset_reader.token_indexers.xpostag.end_tokens = None\n",
  442 + "I0407 10:52:20.469256 139754138821696 params.py:248] dataset_reader.token_indexers.xpostag.feature_name = tag_\n",
  443 + "I0407 10:52:20.469320 139754138821696 params.py:248] dataset_reader.token_indexers.xpostag.default_value = THIS IS A REALLY UNLIKELY VALUE THAT HAS TO BE A STRING\n",
  444 + "I0407 10:52:20.469382 139754138821696 params.py:248] dataset_reader.token_indexers.xpostag.token_min_padding_length = 0\n",
  445 + "I0407 10:52:20.469586 139754138821696 params.py:248] dataset_reader.lemma_indexers.char.type = characters_const_padding\n",
  446 + "I0407 10:52:20.469758 139754138821696 params.py:248] dataset_reader.lemma_indexers.char.namespace = lemma_characters\n",
  447 + "I0407 10:52:20.469957 139754138821696 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.byte_encoding = None\n",
  448 + "I0407 10:52:20.470050 139754138821696 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.lowercase_characters = False\n",
  449 + "I0407 10:52:20.470128 139754138821696 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.start_tokens = ['__START__']\n",
  450 + "I0407 10:52:20.470261 139754138821696 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.end_tokens = ['__END__']\n",
  451 + "I0407 10:52:20.470381 139754138821696 params.py:248] dataset_reader.lemma_indexers.char.start_tokens = None\n",
  452 + "I0407 10:52:20.470448 139754138821696 params.py:248] dataset_reader.lemma_indexers.char.end_tokens = None\n",
  453 + "I0407 10:52:20.470509 139754138821696 params.py:248] dataset_reader.lemma_indexers.char.min_padding_length = 32\n",
  454 + "I0407 10:52:20.470579 139754138821696 params.py:248] dataset_reader.lemma_indexers.char.token_min_padding_length = 0\n",
  455 + "I0407 10:52:20.470668 139754138821696 params.py:248] dataset_reader.features = ['token', 'char']\n",
  456 + "I0407 10:52:20.470764 139754138821696 params.py:248] dataset_reader.targets = ['head', 'deprel']\n",
  457 + "I0407 10:52:20.470849 139754138821696 params.py:248] dataset_reader.use_sem = False\n",
  458 + "I0407 10:52:20.471387 139754138821696 params.py:248] vocabulary.type = from_instances_extended\n",
  459 + "I0407 10:52:20.471461 139754138821696 vocabulary.py:323] Loading token dictionary from /tmp/tmp2jhqu3i6/vocabulary.\n",
  460 + "I0407 10:52:20.471798 139754138821696 filelock.py:254] Lock 139750732975216 acquired on /tmp/tmp2jhqu3i6/vocabulary/.lock\n",
  461 + "I0407 10:52:20.472387 139754138821696 filelock.py:317] Lock 139750732975216 released on /tmp/tmp2jhqu3i6/vocabulary/.lock\n",
  462 + "I0407 10:52:20.472922 139754138821696 params.py:248] model.type = semantic_multitask\n",
  463 + "I0407 10:52:20.473455 139754138821696 params.py:248] model.text_field_embedder.type = basic\n",
  464 + "I0407 10:52:20.473808 139754138821696 params.py:248] model.text_field_embedder.token_embedders.char.type = char_embeddings_from_config\n",
  465 + "I0407 10:52:20.474030 139754138821696 params.py:248] model.text_field_embedder.token_embedders.char.embedding_dim = 64\n",
  466 + "I0407 10:52:20.474286 139754138821696 params.py:248] model.text_field_embedder.token_embedders.char.dilated_cnn_encoder.input_dim = 64\n",
  467 + "I0407 10:52:20.474377 139754138821696 params.py:248] model.text_field_embedder.token_embedders.char.dilated_cnn_encoder.filters = [512, 256, 64]\n",
  468 + "I0407 10:52:20.474480 139754138821696 params.py:248] model.text_field_embedder.token_embedders.char.dilated_cnn_encoder.kernel_size = [3, 3, 3]\n",
  469 + "I0407 10:52:20.474578 139754138821696 params.py:248] model.text_field_embedder.token_embedders.char.dilated_cnn_encoder.stride = [1, 1, 1]\n",
  470 + "I0407 10:52:20.474673 139754138821696 params.py:248] model.text_field_embedder.token_embedders.char.dilated_cnn_encoder.padding = [1, 2, 4]\n",
  471 + "I0407 10:52:20.474768 139754138821696 params.py:248] model.text_field_embedder.token_embedders.char.dilated_cnn_encoder.dilation = [1, 2, 4]\n",
  472 + "I0407 10:52:20.474864 139754138821696 params.py:248] model.text_field_embedder.token_embedders.char.dilated_cnn_encoder.activations = ['relu', 'relu', 'linear']\n",
  473 + "I0407 10:52:20.475005 139754138821696 params.py:248] type = relu\n",
  474 + "I0407 10:52:20.475197 139754138821696 params.py:248] type = relu\n",
  475 + "I0407 10:52:20.475347 139754138821696 params.py:248] type = linear\n",
  476 + "I0407 10:52:20.481609 139754138821696 params.py:248] model.text_field_embedder.token_embedders.char.vocab_namespace = token_characters\n",
  477 + "I0407 10:52:20.482178 139754138821696 params.py:248] model.text_field_embedder.token_embedders.token.type = transformers_word_embeddings\n",
  478 + "I0407 10:52:20.482446 139754138821696 params.py:248] model.text_field_embedder.token_embedders.token.model_name = allegro/herbert-large-cased\n",
  479 + "I0407 10:52:20.482533 139754138821696 params.py:248] model.text_field_embedder.token_embedders.token.projection_dim = 100\n",
  480 + "I0407 10:52:20.482632 139754138821696 params.py:248] model.text_field_embedder.token_embedders.token.projection_activation = <function TransformersWordEmbedder.<lambda> at 0x7f1a3e346280>\n",
  481 + "I0407 10:52:20.482703 139754138821696 params.py:248] model.text_field_embedder.token_embedders.token.projection_dropout_rate = 0.0\n",
  482 + "I0407 10:52:20.482769 139754138821696 params.py:248] model.text_field_embedder.token_embedders.token.freeze_transformer = True\n",
  483 + "I0407 10:52:20.482831 139754138821696 params.py:248] model.text_field_embedder.token_embedders.token.last_layer_only = True\n",
  484 + "I0407 10:52:20.482933 139754138821696 params.py:384] model.text_field_embedder.token_embedders.token.tokenizer_kwargs.use_fast = False\n",
  485 + "I0407 10:52:20.483003 139754138821696 params.py:248] model.text_field_embedder.token_embedders.token.transformer_kwargs = None\n"
  486 + ]
  487 + },
  488 + {
  489 + "name": "stdout",
  490 + "output_type": "stream",
  491 + "text": [
  492 + "I0407 10:52:28.699278 139754138821696 params.py:248] model.seq_encoder.type = combo_encoder\n",
  493 + "I0407 10:52:28.699747 139754138821696 params.py:248] model.seq_encoder.stacked_bilstm.input_size = 164\n",
  494 + "I0407 10:52:28.699841 139754138821696 params.py:248] model.seq_encoder.stacked_bilstm.hidden_size = 512\n",
  495 + "I0407 10:52:28.699910 139754138821696 params.py:248] model.seq_encoder.stacked_bilstm.num_layers = 2\n",
  496 + "I0407 10:52:28.699976 139754138821696 params.py:248] model.seq_encoder.stacked_bilstm.recurrent_dropout_probability = 0.33\n",
  497 + "I0407 10:52:28.700042 139754138821696 params.py:248] model.seq_encoder.stacked_bilstm.layer_dropout_probability = 0.33\n",
  498 + "I0407 10:52:28.700106 139754138821696 params.py:248] model.seq_encoder.stacked_bilstm.use_highway = False\n",
  499 + "I0407 10:52:29.089101 139754138821696 params.py:248] model.seq_encoder.layer_dropout_probability = 0.33\n",
  500 + "I0407 10:52:29.089426 139754138821696 params.py:248] model.use_sample_weight = True\n",
  501 + "I0407 10:52:29.089556 139754138821696 params.py:248] model.lemmatizer = None\n",
  502 + "I0407 10:52:29.089638 139754138821696 params.py:248] model.upos_tagger = None\n",
  503 + "I0407 10:52:29.089704 139754138821696 params.py:248] model.xpos_tagger = None\n",
  504 + "I0407 10:52:29.089766 139754138821696 params.py:248] model.semantic_relation = None\n",
  505 + "I0407 10:52:29.089827 139754138821696 params.py:248] model.morphological_feat = None\n",
  506 + "I0407 10:52:29.090160 139754138821696 params.py:248] model.dependency_relation.type = combo_dependency_parsing_from_vocab\n",
  507 + "I0407 10:52:29.090409 139754138821696 params.py:248] model.dependency_relation.vocab_namespace = deprel_labels\n",
  508 + "I0407 10:52:29.090762 139754138821696 params.py:248] model.dependency_relation.head_predictor.head_projection_layer.in_features = 1024\n",
  509 + "I0407 10:52:29.090843 139754138821696 params.py:248] model.dependency_relation.head_predictor.head_projection_layer.out_features = 512\n",
  510 + "I0407 10:52:29.090915 139754138821696 params.py:248] model.dependency_relation.head_predictor.head_projection_layer.activation = tanh\n",
  511 + "I0407 10:52:29.091041 139754138821696 params.py:248] type = tanh\n",
  512 + "I0407 10:52:29.091149 139754138821696 params.py:248] model.dependency_relation.head_predictor.head_projection_layer.dropout_rate = 0.0\n",
  513 + "I0407 10:52:29.096003 139754138821696 params.py:248] model.dependency_relation.head_predictor.dependency_projection_layer.in_features = 1024\n",
  514 + "I0407 10:52:29.096106 139754138821696 params.py:248] model.dependency_relation.head_predictor.dependency_projection_layer.out_features = 512\n",
  515 + "I0407 10:52:29.096185 139754138821696 params.py:248] model.dependency_relation.head_predictor.dependency_projection_layer.activation = tanh\n",
  516 + "I0407 10:52:29.096311 139754138821696 params.py:248] type = tanh\n",
  517 + "I0407 10:52:29.096407 139754138821696 params.py:248] model.dependency_relation.head_predictor.dependency_projection_layer.dropout_rate = 0.0\n",
  518 + "I0407 10:52:29.101276 139754138821696 params.py:248] model.dependency_relation.head_predictor.cycle_loss_n = 0\n",
  519 + "I0407 10:52:29.101581 139754138821696 params.py:248] model.dependency_relation.head_projection_layer.in_features = 1024\n",
  520 + "I0407 10:52:29.101692 139754138821696 params.py:248] model.dependency_relation.head_projection_layer.out_features = 128\n",
  521 + "I0407 10:52:29.101771 139754138821696 params.py:248] model.dependency_relation.head_projection_layer.activation = tanh\n",
  522 + "I0407 10:52:29.101904 139754138821696 params.py:248] type = tanh\n",
  523 + "I0407 10:52:29.102032 139754138821696 params.py:248] model.dependency_relation.head_projection_layer.dropout_rate = 0.25\n",
  524 + "I0407 10:52:29.103649 139754138821696 params.py:248] model.dependency_relation.dependency_projection_layer.in_features = 1024\n",
  525 + "I0407 10:52:29.103747 139754138821696 params.py:248] model.dependency_relation.dependency_projection_layer.out_features = 128\n",
  526 + "I0407 10:52:29.103819 139754138821696 params.py:248] model.dependency_relation.dependency_projection_layer.activation = tanh\n",
  527 + "I0407 10:52:29.103948 139754138821696 params.py:248] type = tanh\n",
  528 + "I0407 10:52:29.104044 139754138821696 params.py:248] model.dependency_relation.dependency_projection_layer.dropout_rate = 0.25\n",
  529 + "I0407 10:52:29.105780 139754138821696 params.py:248] model.enhanced_dependency_relation = None\n",
  530 + "I0407 10:52:29.106371 139754138821696 params.py:248] model.regularizer.regexes.0.1.type = l2\n",
  531 + "I0407 10:52:29.106555 139754138821696 params.py:248] model.regularizer.regexes.0.1.alpha = 1e-06\n",
  532 + "I0407 10:52:29.106724 139754138821696 params.py:248] model.regularizer.regexes.1.1.type = l2\n",
  533 + "I0407 10:52:29.106879 139754138821696 params.py:248] model.regularizer.regexes.1.1.alpha = 1e-06\n",
  534 + "I0407 10:52:29.107035 139754138821696 params.py:248] model.regularizer.regexes.2.1.type = l2\n",
  535 + "I0407 10:52:29.107207 139754138821696 params.py:248] model.regularizer.regexes.2.1.alpha = 1e-06\n",
  536 + "I0407 10:52:29.107368 139754138821696 params.py:248] model.regularizer.regexes.3.1.type = l2\n",
  537 + "I0407 10:52:29.107544 139754138821696 params.py:248] model.regularizer.regexes.3.1.alpha = 1e-05\n",
  538 + "I0407 10:52:32.063793 139754138821696 archival.py:211] removing temporary unarchived model dir at /tmp/tmp2jhqu3i6\n",
  539 + "reading instances: 2205it [01:49, 20.15it/s]\n"
  540 + ]
  541 + }
  542 + ],
  543 + "source": [
  544 + "! {COMBO} --mode predict \\\n",
  545 + " --cuda_device 0 \\\n",
  546 + " --model_path model-pdbc/model.tar.gz \\\n",
  547 + " --input_file connlu/pdbc-test.conllu \\\n",
  548 + " --output_file connlu/pdbc-test-pred.conllu"
  549 + ]
  550 + },
  551 + {
  552 + "cell_type": "code",
  553 + "execution_count": 6,
  554 + "id": "13748ca1",
  555 + "metadata": {},
  556 + "outputs": [
  557 + {
  558 + "name": "stdout",
  559 + "output_type": "stream",
  560 + "text": [
  561 + "# text = Mały chłopiec patrzy w bok po ściągnięciu okularów .\r\n",
  562 + "1\tMały\tmały\t_\t_\t_\t2\tadjunct\t2:adjunct\t_\r\n",
  563 + "2\tchłopiec\tchłopiec\t_\t_\t_\t3\tsubj\t3:subj\t_\r\n",
  564 + "3\tpatrzy\tpatrzeć\t_\t_\t_\t0\troot\t0:root\t_\r\n",
  565 + "4\tw\tw\t_\t_\t_\t3\tadjunct_adl\t3:adjunct_adl\t_\r\n",
  566 + "5\tbok\tbok\t_\t_\t_\t4\tcomp\t4:comp\t_\r\n",
  567 + "6\tpo\tpo\t_\t_\t_\t3\tadjunct_temp\t3:adjunct_temp\t_\r\n",
  568 + "7\tściągnięciu\tściągnąć\t_\t_\t_\t6\tcomp\t6:comp\t_\r\n",
  569 + "8\tokularów\tokulary\t_\t_\t_\t7\tobj\t7:obj\t_\r\n",
  570 + "9\t.\t.\t_\t_\t_\t3\tpunct\t3:punct\t_\r\n"
  571 + ]
  572 + }
  573 + ],
  574 + "source": [
  575 + "! head connlu/pdbc-test.conllu"
  576 + ]
  577 + },
  578 + {
  579 + "cell_type": "code",
  580 + "execution_count": 7,
  581 + "id": "30021124",
  582 + "metadata": {},
  583 + "outputs": [
  584 + {
  585 + "name": "stdout",
  586 + "output_type": "stream",
  587 + "text": [
  588 + "# text = Mały chłopiec patrzy w bok po ściągnięciu okularów .\r\n",
  589 + "1\tMały\tmały\t_\t_\t_\t2\tadjunct\t2:adjunct\t_\r\n",
  590 + "2\tchłopiec\tchłopiec\t_\t_\t_\t3\tsubj\t3:subj\t_\r\n",
  591 + "3\tpatrzy\tpatrzeć\t_\t_\t_\t0\troot\t0:root\t_\r\n",
  592 + "4\tw\tw\t_\t_\t_\t3\tcomp\t3:adjunct_adl\t_\r\n",
  593 + "5\tbok\tbok\t_\t_\t_\t4\tcomp\t4:comp\t_\r\n",
  594 + "6\tpo\tpo\t_\t_\t_\t3\tadjunct_temp\t3:adjunct_temp\t_\r\n",
  595 + "7\tściągnięciu\tściągnąć\t_\t_\t_\t6\tcomp\t6:comp\t_\r\n",
  596 + "8\tokularów\tokulary\t_\t_\t_\t7\tobj\t7:obj\t_\r\n",
  597 + "9\t.\t.\t_\t_\t_\t3\tpunct\t3:punct\t_\r\n"
  598 + ]
  599 + }
  600 + ],
  601 + "source": [
  602 + "! head connlu/pdbc-test-pred.conllu"
  603 + ]
  604 + },
  605 + {
  606 + "cell_type": "markdown",
  607 + "id": "99359d8c",
  608 + "metadata": {},
  609 + "source": [
  610 + "Continuous-only model"
  611 + ]
  612 + },
  613 + {
  614 + "cell_type": "code",
  615 + "execution_count": 8,
  616 + "id": "30a66da6",
  617 + "metadata": {},
  618 + "outputs": [
  619 + {
  620 + "name": "stdout",
  621 + "output_type": "stream",
  622 + "text": [
  623 + "I0407 10:54:27.401382 140321380496448 archival.py:184] loading archive file model-pdbc-cont/model.tar.gz\n",
  624 + "I0407 10:54:27.402150 140321380496448 archival.py:263] extracting archive file model-pdbc-cont/model.tar.gz to temp dir /tmp/tmpuvesoi4q\n",
  625 + "I0407 10:54:43.091615 140321380496448 params.py:248] dataset_reader.type = conllu\n",
  626 + "I0407 10:54:43.092000 140321380496448 params.py:248] dataset_reader.lazy = False\n",
  627 + "I0407 10:54:43.092082 140321380496448 params.py:248] dataset_reader.cache_directory = None\n",
  628 + "I0407 10:54:43.092129 140321380496448 params.py:248] dataset_reader.max_instances = None\n",
  629 + "I0407 10:54:43.092173 140321380496448 params.py:248] dataset_reader.manual_distributed_sharding = False\n",
  630 + "I0407 10:54:43.092208 140321380496448 params.py:248] dataset_reader.manual_multi_process_sharding = False\n",
  631 + "I0407 10:54:43.092409 140321380496448 params.py:248] dataset_reader.token_indexers.char.type = characters_const_padding\n",
  632 + "I0407 10:54:43.092535 140321380496448 params.py:248] dataset_reader.token_indexers.char.namespace = token_characters\n",
  633 + "I0407 10:54:43.092682 140321380496448 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.byte_encoding = None\n",
  634 + "I0407 10:54:43.092730 140321380496448 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.lowercase_characters = False\n",
  635 + "I0407 10:54:43.092786 140321380496448 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.start_tokens = ['__START__']\n",
  636 + "I0407 10:54:43.092888 140321380496448 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.end_tokens = ['__END__']\n",
  637 + "I0407 10:54:43.092970 140321380496448 params.py:248] dataset_reader.token_indexers.char.start_tokens = None\n",
  638 + "I0407 10:54:43.093014 140321380496448 params.py:248] dataset_reader.token_indexers.char.end_tokens = None\n",
  639 + "I0407 10:54:43.093051 140321380496448 params.py:248] dataset_reader.token_indexers.char.min_padding_length = 32\n",
  640 + "I0407 10:54:43.093093 140321380496448 params.py:248] dataset_reader.token_indexers.char.token_min_padding_length = 0\n",
  641 + "I0407 10:54:43.093198 140321380496448 params.py:248] dataset_reader.token_indexers.feats.type = feats_indexer\n",
  642 + "I0407 10:54:43.093306 140321380496448 params.py:248] dataset_reader.token_indexers.feats.namespace = feats\n",
  643 + "I0407 10:54:43.093353 140321380496448 params.py:248] dataset_reader.token_indexers.feats.feature_name = feats_\n",
  644 + "I0407 10:54:43.093388 140321380496448 params.py:248] dataset_reader.token_indexers.feats.token_min_padding_length = 0\n",
  645 + "I0407 10:54:43.093482 140321380496448 params.py:248] dataset_reader.token_indexers.lemma.type = characters_const_padding\n",
  646 + "I0407 10:54:43.093593 140321380496448 params.py:248] dataset_reader.token_indexers.lemma.namespace = token_characters\n",
  647 + "I0407 10:54:43.093723 140321380496448 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.byte_encoding = None\n",
  648 + "I0407 10:54:43.093769 140321380496448 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.lowercase_characters = False\n",
  649 + "I0407 10:54:43.093816 140321380496448 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.start_tokens = ['__START__']\n",
  650 + "I0407 10:54:43.093899 140321380496448 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.end_tokens = ['__END__']\n",
  651 + "I0407 10:54:43.093993 140321380496448 params.py:248] dataset_reader.token_indexers.lemma.start_tokens = None\n",
  652 + "I0407 10:54:43.094043 140321380496448 params.py:248] dataset_reader.token_indexers.lemma.end_tokens = None\n",
  653 + "I0407 10:54:43.094079 140321380496448 params.py:248] dataset_reader.token_indexers.lemma.min_padding_length = 32\n",
  654 + "I0407 10:54:43.094121 140321380496448 params.py:248] dataset_reader.token_indexers.lemma.token_min_padding_length = 0\n",
  655 + "I0407 10:54:43.094226 140321380496448 params.py:248] dataset_reader.token_indexers.token.type = pretrained_transformer_mismatched_fixed\n",
  656 + "I0407 10:54:43.094377 140321380496448 params.py:248] dataset_reader.token_indexers.token.token_min_padding_length = 0\n",
  657 + "I0407 10:54:43.094430 140321380496448 params.py:248] dataset_reader.token_indexers.token.model_name = allegro/herbert-large-cased\n",
  658 + "I0407 10:54:43.094474 140321380496448 params.py:248] dataset_reader.token_indexers.token.namespace = tags\n",
  659 + "I0407 10:54:43.094522 140321380496448 params.py:248] dataset_reader.token_indexers.token.max_length = None\n",
  660 + "I0407 10:54:43.094592 140321380496448 params.py:384] dataset_reader.token_indexers.token.tokenizer_kwargs.use_fast = False\n",
  661 + "I0407 10:54:45.858621 140321380496448 params.py:248] dataset_reader.token_indexers.upostag.type = single_id\n",
  662 + "I0407 10:54:45.858990 140321380496448 params.py:248] dataset_reader.token_indexers.upostag.namespace = upostag\n",
  663 + "I0407 10:54:45.859087 140321380496448 params.py:248] dataset_reader.token_indexers.upostag.lowercase_tokens = False\n",
  664 + "I0407 10:54:45.859157 140321380496448 params.py:248] dataset_reader.token_indexers.upostag.start_tokens = None\n",
  665 + "I0407 10:54:45.859210 140321380496448 params.py:248] dataset_reader.token_indexers.upostag.end_tokens = None\n",
  666 + "I0407 10:54:45.859268 140321380496448 params.py:248] dataset_reader.token_indexers.upostag.feature_name = pos_\n",
  667 + "I0407 10:54:45.859321 140321380496448 params.py:248] dataset_reader.token_indexers.upostag.default_value = THIS IS A REALLY UNLIKELY VALUE THAT HAS TO BE A STRING\n",
  668 + "I0407 10:54:45.859382 140321380496448 params.py:248] dataset_reader.token_indexers.upostag.token_min_padding_length = 0\n",
  669 + "I0407 10:54:45.859541 140321380496448 params.py:248] dataset_reader.token_indexers.xpostag.type = single_id\n",
  670 + "I0407 10:54:45.859729 140321380496448 params.py:248] dataset_reader.token_indexers.xpostag.namespace = xpostag\n",
  671 + "I0407 10:54:45.859802 140321380496448 params.py:248] dataset_reader.token_indexers.xpostag.lowercase_tokens = False\n",
  672 + "I0407 10:54:45.859875 140321380496448 params.py:248] dataset_reader.token_indexers.xpostag.start_tokens = None\n",
  673 + "I0407 10:54:45.859931 140321380496448 params.py:248] dataset_reader.token_indexers.xpostag.end_tokens = None\n",
  674 + "I0407 10:54:45.859991 140321380496448 params.py:248] dataset_reader.token_indexers.xpostag.feature_name = tag_\n",
  675 + "I0407 10:54:45.860045 140321380496448 params.py:248] dataset_reader.token_indexers.xpostag.default_value = THIS IS A REALLY UNLIKELY VALUE THAT HAS TO BE A STRING\n",
  676 + "I0407 10:54:45.860103 140321380496448 params.py:248] dataset_reader.token_indexers.xpostag.token_min_padding_length = 0\n",
  677 + "I0407 10:54:45.860332 140321380496448 params.py:248] dataset_reader.lemma_indexers.char.type = characters_const_padding\n",
  678 + "I0407 10:54:45.860523 140321380496448 params.py:248] dataset_reader.lemma_indexers.char.namespace = lemma_characters\n",
  679 + "I0407 10:54:45.860739 140321380496448 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.byte_encoding = None\n",
  680 + "I0407 10:54:45.860809 140321380496448 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.lowercase_characters = False\n",
  681 + "I0407 10:54:45.860888 140321380496448 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.start_tokens = ['__START__']\n",
  682 + "I0407 10:54:45.861032 140321380496448 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.end_tokens = ['__END__']\n",
  683 + "I0407 10:54:45.861149 140321380496448 params.py:248] dataset_reader.lemma_indexers.char.start_tokens = None\n",
  684 + "I0407 10:54:45.861213 140321380496448 params.py:248] dataset_reader.lemma_indexers.char.end_tokens = None\n",
  685 + "I0407 10:54:45.861277 140321380496448 params.py:248] dataset_reader.lemma_indexers.char.min_padding_length = 32\n",
  686 + "I0407 10:54:45.861337 140321380496448 params.py:248] dataset_reader.lemma_indexers.char.token_min_padding_length = 0\n",
  687 + "I0407 10:54:45.861427 140321380496448 params.py:248] dataset_reader.features = ['token', 'char']\n",
  688 + "I0407 10:54:45.861522 140321380496448 params.py:248] dataset_reader.targets = ['head', 'deprel']\n",
  689 + "I0407 10:54:45.861611 140321380496448 params.py:248] dataset_reader.use_sem = False\n",
  690 + "I0407 10:54:45.861762 140321380496448 params.py:248] dataset_reader.type = conllu\n",
  691 + "I0407 10:54:45.862029 140321380496448 params.py:248] dataset_reader.lazy = False\n",
  692 + "I0407 10:54:45.862116 140321380496448 params.py:248] dataset_reader.cache_directory = None\n",
  693 + "I0407 10:54:45.862177 140321380496448 params.py:248] dataset_reader.max_instances = None\n",
  694 + "I0407 10:54:45.862234 140321380496448 params.py:248] dataset_reader.manual_distributed_sharding = False\n",
  695 + "I0407 10:54:45.862295 140321380496448 params.py:248] dataset_reader.manual_multi_process_sharding = False\n",
  696 + "I0407 10:54:45.862535 140321380496448 params.py:248] dataset_reader.token_indexers.char.type = characters_const_padding\n",
  697 + "I0407 10:54:45.862701 140321380496448 params.py:248] dataset_reader.token_indexers.char.namespace = token_characters\n",
  698 + "I0407 10:54:45.862900 140321380496448 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.byte_encoding = None\n",
  699 + "I0407 10:54:45.862966 140321380496448 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.lowercase_characters = False\n",
  700 + "I0407 10:54:45.863043 140321380496448 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.start_tokens = ['__START__']\n",
  701 + "I0407 10:54:45.863168 140321380496448 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.end_tokens = ['__END__']\n",
  702 + "I0407 10:54:45.863281 140321380496448 params.py:248] dataset_reader.token_indexers.char.start_tokens = None\n",
  703 + "I0407 10:54:45.863344 140321380496448 params.py:248] dataset_reader.token_indexers.char.end_tokens = None\n",
  704 + "I0407 10:54:45.863406 140321380496448 params.py:248] dataset_reader.token_indexers.char.min_padding_length = 32\n",
  705 + "I0407 10:54:45.863469 140321380496448 params.py:248] dataset_reader.token_indexers.char.token_min_padding_length = 0\n",
  706 + "I0407 10:54:45.863596 140321380496448 params.py:248] dataset_reader.token_indexers.feats.type = feats_indexer\n",
  707 + "I0407 10:54:45.863752 140321380496448 params.py:248] dataset_reader.token_indexers.feats.namespace = feats\n",
  708 + "I0407 10:54:45.863821 140321380496448 params.py:248] dataset_reader.token_indexers.feats.feature_name = feats_\n",
  709 + "I0407 10:54:45.863883 140321380496448 params.py:248] dataset_reader.token_indexers.feats.token_min_padding_length = 0\n",
  710 + "I0407 10:54:45.864030 140321380496448 params.py:248] dataset_reader.token_indexers.lemma.type = characters_const_padding\n",
  711 + "I0407 10:54:45.864196 140321380496448 params.py:248] dataset_reader.token_indexers.lemma.namespace = token_characters\n",
  712 + "I0407 10:54:45.864392 140321380496448 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.byte_encoding = None\n",
  713 + "I0407 10:54:45.864460 140321380496448 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.lowercase_characters = False\n",
  714 + "I0407 10:54:45.864540 140321380496448 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.start_tokens = ['__START__']\n",
  715 + "I0407 10:54:45.864660 140321380496448 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.end_tokens = ['__END__']\n",
  716 + "I0407 10:54:45.864772 140321380496448 params.py:248] dataset_reader.token_indexers.lemma.start_tokens = None\n",
  717 + "I0407 10:54:45.864835 140321380496448 params.py:248] dataset_reader.token_indexers.lemma.end_tokens = None\n",
  718 + "I0407 10:54:45.864896 140321380496448 params.py:248] dataset_reader.token_indexers.lemma.min_padding_length = 32\n",
  719 + "I0407 10:54:45.864965 140321380496448 params.py:248] dataset_reader.token_indexers.lemma.token_min_padding_length = 0\n",
  720 + "I0407 10:54:45.865104 140321380496448 params.py:248] dataset_reader.token_indexers.token.type = pretrained_transformer_mismatched_fixed\n",
  721 + "I0407 10:54:45.865323 140321380496448 params.py:248] dataset_reader.token_indexers.token.token_min_padding_length = 0\n",
  722 + "I0407 10:54:45.865396 140321380496448 params.py:248] dataset_reader.token_indexers.token.model_name = allegro/herbert-large-cased\n",
  723 + "I0407 10:54:45.865460 140321380496448 params.py:248] dataset_reader.token_indexers.token.namespace = tags\n",
  724 + "I0407 10:54:45.865518 140321380496448 params.py:248] dataset_reader.token_indexers.token.max_length = None\n",
  725 + "I0407 10:54:45.865614 140321380496448 params.py:384] dataset_reader.token_indexers.token.tokenizer_kwargs.use_fast = False\n",
  726 + "I0407 10:54:45.866884 140321380496448 params.py:248] dataset_reader.token_indexers.upostag.type = single_id\n",
  727 + "I0407 10:54:45.867116 140321380496448 params.py:248] dataset_reader.token_indexers.upostag.namespace = upostag\n",
  728 + "I0407 10:54:45.867190 140321380496448 params.py:248] dataset_reader.token_indexers.upostag.lowercase_tokens = False\n",
  729 + "I0407 10:54:45.867258 140321380496448 params.py:248] dataset_reader.token_indexers.upostag.start_tokens = None\n",
  730 + "I0407 10:54:45.867316 140321380496448 params.py:248] dataset_reader.token_indexers.upostag.end_tokens = None\n",
  731 + "I0407 10:54:45.867376 140321380496448 params.py:248] dataset_reader.token_indexers.upostag.feature_name = pos_\n",
  732 + "I0407 10:54:45.867437 140321380496448 params.py:248] dataset_reader.token_indexers.upostag.default_value = THIS IS A REALLY UNLIKELY VALUE THAT HAS TO BE A STRING\n",
  733 + "I0407 10:54:45.867497 140321380496448 params.py:248] dataset_reader.token_indexers.upostag.token_min_padding_length = 0\n",
  734 + "I0407 10:54:45.867640 140321380496448 params.py:248] dataset_reader.token_indexers.xpostag.type = single_id\n",
  735 + "I0407 10:54:45.867815 140321380496448 params.py:248] dataset_reader.token_indexers.xpostag.namespace = xpostag\n",
  736 + "I0407 10:54:45.867887 140321380496448 params.py:248] dataset_reader.token_indexers.xpostag.lowercase_tokens = False\n",
  737 + "I0407 10:54:45.867951 140321380496448 params.py:248] dataset_reader.token_indexers.xpostag.start_tokens = None\n",
  738 + "I0407 10:54:45.868006 140321380496448 params.py:248] dataset_reader.token_indexers.xpostag.end_tokens = None\n",
  739 + "I0407 10:54:45.868063 140321380496448 params.py:248] dataset_reader.token_indexers.xpostag.feature_name = tag_\n",
  740 + "I0407 10:54:45.868122 140321380496448 params.py:248] dataset_reader.token_indexers.xpostag.default_value = THIS IS A REALLY UNLIKELY VALUE THAT HAS TO BE A STRING\n",
  741 + "I0407 10:54:45.868181 140321380496448 params.py:248] dataset_reader.token_indexers.xpostag.token_min_padding_length = 0\n",
  742 + "I0407 10:54:45.868388 140321380496448 params.py:248] dataset_reader.lemma_indexers.char.type = characters_const_padding\n",
  743 + "I0407 10:54:45.868559 140321380496448 params.py:248] dataset_reader.lemma_indexers.char.namespace = lemma_characters\n",
  744 + "I0407 10:54:45.868757 140321380496448 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.byte_encoding = None\n",
  745 + "I0407 10:54:45.868824 140321380496448 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.lowercase_characters = False\n",
  746 + "I0407 10:54:45.868897 140321380496448 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.start_tokens = ['__START__']\n",
  747 + "I0407 10:54:45.869028 140321380496448 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.end_tokens = ['__END__']\n",
  748 + "I0407 10:54:45.869139 140321380496448 params.py:248] dataset_reader.lemma_indexers.char.start_tokens = None\n",
  749 + "I0407 10:54:45.869202 140321380496448 params.py:248] dataset_reader.lemma_indexers.char.end_tokens = None\n",
  750 + "I0407 10:54:45.869256 140321380496448 params.py:248] dataset_reader.lemma_indexers.char.min_padding_length = 32\n",
  751 + "I0407 10:54:45.869315 140321380496448 params.py:248] dataset_reader.lemma_indexers.char.token_min_padding_length = 0\n",
  752 + "I0407 10:54:45.869398 140321380496448 params.py:248] dataset_reader.features = ['token', 'char']\n",
  753 + "I0407 10:54:45.869489 140321380496448 params.py:248] dataset_reader.targets = ['head', 'deprel']\n",
  754 + "I0407 10:54:45.869572 140321380496448 params.py:248] dataset_reader.use_sem = False\n",
  755 + "I0407 10:54:45.870136 140321380496448 params.py:248] vocabulary.type = from_instances_extended\n",
  756 + "I0407 10:54:45.870218 140321380496448 vocabulary.py:323] Loading token dictionary from /tmp/tmpuvesoi4q/vocabulary.\n",
  757 + "I0407 10:54:45.870543 140321380496448 filelock.py:254] Lock 140317974842768 acquired on /tmp/tmpuvesoi4q/vocabulary/.lock\n",
  758 + "I0407 10:54:45.871132 140321380496448 filelock.py:317] Lock 140317974842768 released on /tmp/tmpuvesoi4q/vocabulary/.lock\n",
  759 + "I0407 10:54:45.871641 140321380496448 params.py:248] model.type = semantic_multitask\n",
  760 + "I0407 10:54:45.872183 140321380496448 params.py:248] model.text_field_embedder.type = basic\n",
  761 + "I0407 10:54:45.872548 140321380496448 params.py:248] model.text_field_embedder.token_embedders.char.type = char_embeddings_from_config\n",
  762 + "I0407 10:54:45.872749 140321380496448 params.py:248] model.text_field_embedder.token_embedders.char.embedding_dim = 64\n",
  763 + "I0407 10:54:45.873004 140321380496448 params.py:248] model.text_field_embedder.token_embedders.char.dilated_cnn_encoder.input_dim = 64\n",
  764 + "I0407 10:54:45.873091 140321380496448 params.py:248] model.text_field_embedder.token_embedders.char.dilated_cnn_encoder.filters = [512, 256, 64]\n",
  765 + "I0407 10:54:45.873195 140321380496448 params.py:248] model.text_field_embedder.token_embedders.char.dilated_cnn_encoder.kernel_size = [3, 3, 3]\n",
  766 + "I0407 10:54:45.873291 140321380496448 params.py:248] model.text_field_embedder.token_embedders.char.dilated_cnn_encoder.stride = [1, 1, 1]\n",
  767 + "I0407 10:54:45.873384 140321380496448 params.py:248] model.text_field_embedder.token_embedders.char.dilated_cnn_encoder.padding = [1, 2, 4]\n",
  768 + "I0407 10:54:45.873478 140321380496448 params.py:248] model.text_field_embedder.token_embedders.char.dilated_cnn_encoder.dilation = [1, 2, 4]\n",
  769 + "I0407 10:54:45.873572 140321380496448 params.py:248] model.text_field_embedder.token_embedders.char.dilated_cnn_encoder.activations = ['relu', 'relu', 'linear']\n",
  770 + "I0407 10:54:45.873714 140321380496448 params.py:248] type = relu\n",
  771 + "I0407 10:54:45.873904 140321380496448 params.py:248] type = relu\n",
  772 + "I0407 10:54:45.874098 140321380496448 params.py:248] type = linear\n",
  773 + "I0407 10:54:45.880232 140321380496448 params.py:248] model.text_field_embedder.token_embedders.char.vocab_namespace = token_characters\n",
  774 + "I0407 10:54:45.880783 140321380496448 params.py:248] model.text_field_embedder.token_embedders.token.type = transformers_word_embeddings\n",
  775 + "I0407 10:54:45.881011 140321380496448 params.py:248] model.text_field_embedder.token_embedders.token.model_name = allegro/herbert-large-cased\n",
  776 + "I0407 10:54:45.881093 140321380496448 params.py:248] model.text_field_embedder.token_embedders.token.projection_dim = 100\n",
  777 + "I0407 10:54:45.881184 140321380496448 params.py:248] model.text_field_embedder.token_embedders.token.projection_activation = <function TransformersWordEmbedder.<lambda> at 0x7f9e50745280>\n",
  778 + "I0407 10:54:45.881261 140321380496448 params.py:248] model.text_field_embedder.token_embedders.token.projection_dropout_rate = 0.0\n",
  779 + "I0407 10:54:45.881328 140321380496448 params.py:248] model.text_field_embedder.token_embedders.token.freeze_transformer = True\n",
  780 + "I0407 10:54:45.881389 140321380496448 params.py:248] model.text_field_embedder.token_embedders.token.last_layer_only = True\n",
  781 + "I0407 10:54:45.881492 140321380496448 params.py:384] model.text_field_embedder.token_embedders.token.tokenizer_kwargs.use_fast = False\n",
  782 + "I0407 10:54:45.881562 140321380496448 params.py:248] model.text_field_embedder.token_embedders.token.transformer_kwargs = None\n"
  783 + ]
  784 + },
  785 + {
  786 + "name": "stdout",
  787 + "output_type": "stream",
  788 + "text": [
  789 + "I0407 10:54:52.911276 140321380496448 params.py:248] model.seq_encoder.type = combo_encoder\n",
  790 + "I0407 10:54:52.911743 140321380496448 params.py:248] model.seq_encoder.stacked_bilstm.input_size = 164\n",
  791 + "I0407 10:54:52.911836 140321380496448 params.py:248] model.seq_encoder.stacked_bilstm.hidden_size = 512\n",
  792 + "I0407 10:54:52.911902 140321380496448 params.py:248] model.seq_encoder.stacked_bilstm.num_layers = 2\n",
  793 + "I0407 10:54:52.911965 140321380496448 params.py:248] model.seq_encoder.stacked_bilstm.recurrent_dropout_probability = 0.33\n",
  794 + "I0407 10:54:52.912029 140321380496448 params.py:248] model.seq_encoder.stacked_bilstm.layer_dropout_probability = 0.33\n",
  795 + "I0407 10:54:52.912090 140321380496448 params.py:248] model.seq_encoder.stacked_bilstm.use_highway = False\n",
  796 + "I0407 10:54:53.279199 140321380496448 params.py:248] model.seq_encoder.layer_dropout_probability = 0.33\n",
  797 + "I0407 10:54:53.279505 140321380496448 params.py:248] model.use_sample_weight = True\n",
  798 + "I0407 10:54:53.279624 140321380496448 params.py:248] model.lemmatizer = None\n",
  799 + "I0407 10:54:53.279695 140321380496448 params.py:248] model.upos_tagger = None\n",
  800 + "I0407 10:54:53.279757 140321380496448 params.py:248] model.xpos_tagger = None\n",
  801 + "I0407 10:54:53.279815 140321380496448 params.py:248] model.semantic_relation = None\n",
  802 + "I0407 10:54:53.279873 140321380496448 params.py:248] model.morphological_feat = None\n",
  803 + "I0407 10:54:53.280155 140321380496448 params.py:248] model.dependency_relation.type = combo_dependency_parsing_from_vocab\n",
  804 + "I0407 10:54:53.280393 140321380496448 params.py:248] model.dependency_relation.vocab_namespace = deprel_labels\n",
  805 + "I0407 10:54:53.280741 140321380496448 params.py:248] model.dependency_relation.head_predictor.head_projection_layer.in_features = 1024\n",
  806 + "I0407 10:54:53.280819 140321380496448 params.py:248] model.dependency_relation.head_predictor.head_projection_layer.out_features = 512\n",
  807 + "I0407 10:54:53.280887 140321380496448 params.py:248] model.dependency_relation.head_predictor.head_projection_layer.activation = tanh\n",
  808 + "I0407 10:54:53.281012 140321380496448 params.py:248] type = tanh\n",
  809 + "I0407 10:54:53.281121 140321380496448 params.py:248] model.dependency_relation.head_predictor.head_projection_layer.dropout_rate = 0.0\n",
  810 + "I0407 10:54:53.285843 140321380496448 params.py:248] model.dependency_relation.head_predictor.dependency_projection_layer.in_features = 1024\n",
  811 + "I0407 10:54:53.286010 140321380496448 params.py:248] model.dependency_relation.head_predictor.dependency_projection_layer.out_features = 512\n",
  812 + "I0407 10:54:53.286088 140321380496448 params.py:248] model.dependency_relation.head_predictor.dependency_projection_layer.activation = tanh\n",
  813 + "I0407 10:54:53.286234 140321380496448 params.py:248] type = tanh\n",
  814 + "I0407 10:54:53.286334 140321380496448 params.py:248] model.dependency_relation.head_predictor.dependency_projection_layer.dropout_rate = 0.0\n",
  815 + "I0407 10:54:53.290788 140321380496448 params.py:248] model.dependency_relation.head_predictor.cycle_loss_n = 0\n",
  816 + "I0407 10:54:53.291093 140321380496448 params.py:248] model.dependency_relation.head_projection_layer.in_features = 1024\n",
  817 + "I0407 10:54:53.291184 140321380496448 params.py:248] model.dependency_relation.head_projection_layer.out_features = 128\n",
  818 + "I0407 10:54:53.291281 140321380496448 params.py:248] model.dependency_relation.head_projection_layer.activation = tanh\n",
  819 + "I0407 10:54:53.291444 140321380496448 params.py:248] type = tanh\n",
  820 + "I0407 10:54:53.291567 140321380496448 params.py:248] model.dependency_relation.head_projection_layer.dropout_rate = 0.25\n",
  821 + "I0407 10:54:53.293048 140321380496448 params.py:248] model.dependency_relation.dependency_projection_layer.in_features = 1024\n",
  822 + "I0407 10:54:53.293147 140321380496448 params.py:248] model.dependency_relation.dependency_projection_layer.out_features = 128\n",
  823 + "I0407 10:54:53.293218 140321380496448 params.py:248] model.dependency_relation.dependency_projection_layer.activation = tanh\n",
  824 + "I0407 10:54:53.293342 140321380496448 params.py:248] type = tanh\n",
  825 + "I0407 10:54:53.293437 140321380496448 params.py:248] model.dependency_relation.dependency_projection_layer.dropout_rate = 0.25\n",
  826 + "I0407 10:54:53.295091 140321380496448 params.py:248] model.enhanced_dependency_relation = None\n",
  827 + "I0407 10:54:53.295609 140321380496448 params.py:248] model.regularizer.regexes.0.1.type = l2\n",
  828 + "I0407 10:54:53.295784 140321380496448 params.py:248] model.regularizer.regexes.0.1.alpha = 1e-06\n",
  829 + "I0407 10:54:53.295953 140321380496448 params.py:248] model.regularizer.regexes.1.1.type = l2\n",
  830 + "I0407 10:54:53.296107 140321380496448 params.py:248] model.regularizer.regexes.1.1.alpha = 1e-06\n",
  831 + "I0407 10:54:53.296261 140321380496448 params.py:248] model.regularizer.regexes.2.1.type = l2\n",
  832 + "I0407 10:54:53.296412 140321380496448 params.py:248] model.regularizer.regexes.2.1.alpha = 1e-06\n",
  833 + "I0407 10:54:53.296564 140321380496448 params.py:248] model.regularizer.regexes.3.1.type = l2\n",
  834 + "I0407 10:54:53.296715 140321380496448 params.py:248] model.regularizer.regexes.3.1.alpha = 1e-05\n",
  835 + "I0407 10:54:56.194218 140321380496448 archival.py:211] removing temporary unarchived model dir at /tmp/tmpuvesoi4q\n",
  836 + "reading instances: 1980it [01:33, 21.15it/s]\n"
  837 + ]
  838 + }
  839 + ],
  840 + "source": [
  841 + "! {COMBO} --mode predict \\\n",
  842 + " --cuda_device 0 \\\n",
  843 + " --model_path model-pdbc-cont/model.tar.gz \\\n",
  844 + " --input_file connlu/pdbc-cont-validation.conllu \\\n",
  845 + " --output_file connlu/pdbc-cont-validation-pred.conllu"
  846 + ]
  847 + },
  848 + {
  849 + "cell_type": "code",
  850 + "execution_count": 9,
  851 + "id": "cfe7a3c4",
  852 + "metadata": {},
  853 + "outputs": [
  854 + {
  855 + "name": "stdout",
  856 + "output_type": "stream",
  857 + "text": [
  858 + "# text = Dwie dziewczynki opierają się o dach kapliczki , chłopiec wspina się na niego , a trzecia dziewczynka stoi obok .\r\n",
  859 + "1\tDwie\tdwa\t_\t_\t_\t3\tsubj\t3:subj\t_\r\n",
  860 + "2\tdziewczynki\tdziewczynka\t_\t_\t_\t1\tcomp\t1:comp\t_\r\n",
  861 + "3\topierają\topierać\t_\t_\t_\t15\tconjunct\t15:conjunct\t_\r\n",
  862 + "4\tsię\tsię\t_\t_\t_\t3\trefl\t3:refl\t_\r\n",
  863 + "5\to\to\t_\t_\t_\t3\tcomp\t3:comp\t_\r\n",
  864 + "6\tdach\tdach\t_\t_\t_\t5\tcomp\t5:comp\t_\r\n",
  865 + "7\tkapliczki\tkapliczka\t_\t_\t_\t6\tadjunct\t6:adjunct\t_\r\n",
  866 + "8\t,\t,\t_\t_\t_\t15\tpunct\t15:punct\t_\r\n",
  867 + "9\tchłopiec\tchłopiec\t_\t_\t_\t10\tsubj\t10:subj\t_\r\n"
  868 + ]
  869 + }
  870 + ],
  871 + "source": [
  872 + "! head connlu/pdbc-cont-validation.conllu"
  873 + ]
  874 + },
  875 + {
  876 + "cell_type": "code",
  877 + "execution_count": 10,
  878 + "id": "7dba9571",
  879 + "metadata": {},
  880 + "outputs": [
  881 + {
  882 + "name": "stdout",
  883 + "output_type": "stream",
  884 + "text": [
  885 + "# text = Dwie dziewczynki opierają się o dach kapliczki , chłopiec wspina się na niego , a trzecia dziewczynka stoi obok .\r\n",
  886 + "1\tDwie\tdwa\t_\t_\t_\t3\tsubj\t3:subj\t_\r\n",
  887 + "2\tdziewczynki\tdziewczynka\t_\t_\t_\t1\tcomp\t1:comp\t_\r\n",
  888 + "3\topierają\topierać\t_\t_\t_\t15\tconjunct\t15:conjunct\t_\r\n",
  889 + "4\tsię\tsię\t_\t_\t_\t3\trefl\t3:refl\t_\r\n",
  890 + "5\to\to\t_\t_\t_\t3\tcomp\t3:comp\t_\r\n",
  891 + "6\tdach\tdach\t_\t_\t_\t5\tcomp\t5:comp\t_\r\n",
  892 + "7\tkapliczki\tkapliczka\t_\t_\t_\t6\tadjunct\t6:adjunct\t_\r\n",
  893 + "8\t,\t,\t_\t_\t_\t15\tpunct\t15:punct\t_\r\n",
  894 + "9\tchłopiec\tchłopiec\t_\t_\t_\t10\tsubj\t10:subj\t_\r\n"
  895 + ]
  896 + }
  897 + ],
  898 + "source": [
  899 + "! head connlu/pdbc-cont-validation-pred.conllu"
  900 + ]
  901 + },
  902 + {
  903 + "cell_type": "code",
  904 + "execution_count": 11,
  905 + "id": "679601c2",
  906 + "metadata": {},
  907 + "outputs": [
  908 + {
  909 + "name": "stdout",
  910 + "output_type": "stream",
  911 + "text": [
  912 + "I0407 10:56:35.295660 140254825452608 archival.py:184] loading archive file model-pdbc-cont/model.tar.gz\n",
  913 + "I0407 10:56:35.296370 140254825452608 archival.py:263] extracting archive file model-pdbc-cont/model.tar.gz to temp dir /tmp/tmpdhtf4et1\n",
  914 + "I0407 10:56:52.876630 140254825452608 params.py:248] dataset_reader.type = conllu\n",
  915 + "I0407 10:56:52.877122 140254825452608 params.py:248] dataset_reader.lazy = False\n",
  916 + "I0407 10:56:52.877243 140254825452608 params.py:248] dataset_reader.cache_directory = None\n",
  917 + "I0407 10:56:52.877313 140254825452608 params.py:248] dataset_reader.max_instances = None\n",
  918 + "I0407 10:56:52.877380 140254825452608 params.py:248] dataset_reader.manual_distributed_sharding = False\n",
  919 + "I0407 10:56:52.877446 140254825452608 params.py:248] dataset_reader.manual_multi_process_sharding = False\n",
  920 + "I0407 10:56:52.877737 140254825452608 params.py:248] dataset_reader.token_indexers.char.type = characters_const_padding\n",
  921 + "I0407 10:56:52.877938 140254825452608 params.py:248] dataset_reader.token_indexers.char.namespace = token_characters\n",
  922 + "I0407 10:56:52.878201 140254825452608 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.byte_encoding = None\n",
  923 + "I0407 10:56:52.878276 140254825452608 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.lowercase_characters = False\n",
  924 + "I0407 10:56:52.878360 140254825452608 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.start_tokens = ['__START__']\n",
  925 + "I0407 10:56:52.878507 140254825452608 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.end_tokens = ['__END__']\n",
  926 + "I0407 10:56:52.878633 140254825452608 params.py:248] dataset_reader.token_indexers.char.start_tokens = None\n",
  927 + "I0407 10:56:52.878702 140254825452608 params.py:248] dataset_reader.token_indexers.char.end_tokens = None\n",
  928 + "I0407 10:56:52.878761 140254825452608 params.py:248] dataset_reader.token_indexers.char.min_padding_length = 32\n",
  929 + "I0407 10:56:52.878825 140254825452608 params.py:248] dataset_reader.token_indexers.char.token_min_padding_length = 0\n",
  930 + "I0407 10:56:52.878969 140254825452608 params.py:248] dataset_reader.token_indexers.feats.type = feats_indexer\n",
  931 + "I0407 10:56:52.879144 140254825452608 params.py:248] dataset_reader.token_indexers.feats.namespace = feats\n",
  932 + "I0407 10:56:52.879218 140254825452608 params.py:248] dataset_reader.token_indexers.feats.feature_name = feats_\n",
  933 + "I0407 10:56:52.879282 140254825452608 params.py:248] dataset_reader.token_indexers.feats.token_min_padding_length = 0\n",
  934 + "I0407 10:56:52.879426 140254825452608 params.py:248] dataset_reader.token_indexers.lemma.type = characters_const_padding\n",
  935 + "I0407 10:56:52.879594 140254825452608 params.py:248] dataset_reader.token_indexers.lemma.namespace = token_characters\n",
  936 + "I0407 10:56:52.879792 140254825452608 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.byte_encoding = None\n",
  937 + "I0407 10:56:52.879862 140254825452608 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.lowercase_characters = False\n",
  938 + "I0407 10:56:52.879944 140254825452608 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.start_tokens = ['__START__']\n",
  939 + "I0407 10:56:52.880068 140254825452608 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.end_tokens = ['__END__']\n",
  940 + "I0407 10:56:52.880184 140254825452608 params.py:248] dataset_reader.token_indexers.lemma.start_tokens = None\n",
  941 + "I0407 10:56:52.880254 140254825452608 params.py:248] dataset_reader.token_indexers.lemma.end_tokens = None\n",
  942 + "I0407 10:56:52.880316 140254825452608 params.py:248] dataset_reader.token_indexers.lemma.min_padding_length = 32\n",
  943 + "I0407 10:56:52.880378 140254825452608 params.py:248] dataset_reader.token_indexers.lemma.token_min_padding_length = 0\n",
  944 + "I0407 10:56:52.880523 140254825452608 params.py:248] dataset_reader.token_indexers.token.type = pretrained_transformer_mismatched_fixed\n",
  945 + "I0407 10:56:52.880748 140254825452608 params.py:248] dataset_reader.token_indexers.token.token_min_padding_length = 0\n",
  946 + "I0407 10:56:52.880829 140254825452608 params.py:248] dataset_reader.token_indexers.token.model_name = allegro/herbert-large-cased\n",
  947 + "I0407 10:56:52.880893 140254825452608 params.py:248] dataset_reader.token_indexers.token.namespace = tags\n",
  948 + "I0407 10:56:52.880957 140254825452608 params.py:248] dataset_reader.token_indexers.token.max_length = None\n",
  949 + "I0407 10:56:52.881069 140254825452608 params.py:384] dataset_reader.token_indexers.token.tokenizer_kwargs.use_fast = False\n",
  950 + "I0407 10:56:55.893562 140254825452608 params.py:248] dataset_reader.token_indexers.upostag.type = single_id\n",
  951 + "I0407 10:56:55.894115 140254825452608 params.py:248] dataset_reader.token_indexers.upostag.namespace = upostag\n",
  952 + "I0407 10:56:55.894256 140254825452608 params.py:248] dataset_reader.token_indexers.upostag.lowercase_tokens = False\n",
  953 + "I0407 10:56:55.894343 140254825452608 params.py:248] dataset_reader.token_indexers.upostag.start_tokens = None\n",
  954 + "I0407 10:56:55.894395 140254825452608 params.py:248] dataset_reader.token_indexers.upostag.end_tokens = None\n",
  955 + "I0407 10:56:55.894465 140254825452608 params.py:248] dataset_reader.token_indexers.upostag.feature_name = pos_\n",
  956 + "I0407 10:56:55.894520 140254825452608 params.py:248] dataset_reader.token_indexers.upostag.default_value = THIS IS A REALLY UNLIKELY VALUE THAT HAS TO BE A STRING\n",
  957 + "I0407 10:56:55.894590 140254825452608 params.py:248] dataset_reader.token_indexers.upostag.token_min_padding_length = 0\n",
  958 + "I0407 10:56:55.894762 140254825452608 params.py:248] dataset_reader.token_indexers.xpostag.type = single_id\n",
  959 + "I0407 10:56:55.894958 140254825452608 params.py:248] dataset_reader.token_indexers.xpostag.namespace = xpostag\n",
  960 + "I0407 10:56:55.895048 140254825452608 params.py:248] dataset_reader.token_indexers.xpostag.lowercase_tokens = False\n",
  961 + "I0407 10:56:55.895111 140254825452608 params.py:248] dataset_reader.token_indexers.xpostag.start_tokens = None\n",
  962 + "I0407 10:56:55.895176 140254825452608 params.py:248] dataset_reader.token_indexers.xpostag.end_tokens = None\n",
  963 + "I0407 10:56:55.895228 140254825452608 params.py:248] dataset_reader.token_indexers.xpostag.feature_name = tag_\n",
  964 + "I0407 10:56:55.895297 140254825452608 params.py:248] dataset_reader.token_indexers.xpostag.default_value = THIS IS A REALLY UNLIKELY VALUE THAT HAS TO BE A STRING\n",
  965 + "I0407 10:56:55.895349 140254825452608 params.py:248] dataset_reader.token_indexers.xpostag.token_min_padding_length = 0\n",
  966 + "I0407 10:56:55.895593 140254825452608 params.py:248] dataset_reader.lemma_indexers.char.type = characters_const_padding\n",
  967 + "I0407 10:56:55.895786 140254825452608 params.py:248] dataset_reader.lemma_indexers.char.namespace = lemma_characters\n",
  968 + "I0407 10:56:55.896016 140254825452608 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.byte_encoding = None\n",
  969 + "I0407 10:56:55.896095 140254825452608 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.lowercase_characters = False\n",
  970 + "I0407 10:56:55.896188 140254825452608 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.start_tokens = ['__START__']\n",
  971 + "I0407 10:56:55.896353 140254825452608 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.end_tokens = ['__END__']\n",
  972 + "I0407 10:56:55.896480 140254825452608 params.py:248] dataset_reader.lemma_indexers.char.start_tokens = None\n",
  973 + "I0407 10:56:55.896552 140254825452608 params.py:248] dataset_reader.lemma_indexers.char.end_tokens = None\n",
  974 + "I0407 10:56:55.896607 140254825452608 params.py:248] dataset_reader.lemma_indexers.char.min_padding_length = 32\n",
  975 + "I0407 10:56:55.896675 140254825452608 params.py:248] dataset_reader.lemma_indexers.char.token_min_padding_length = 0\n",
  976 + "I0407 10:56:55.896760 140254825452608 params.py:248] dataset_reader.features = ['token', 'char']\n",
  977 + "I0407 10:56:55.896864 140254825452608 params.py:248] dataset_reader.targets = ['head', 'deprel']\n",
  978 + "I0407 10:56:55.896962 140254825452608 params.py:248] dataset_reader.use_sem = False\n",
  979 + "I0407 10:56:55.897153 140254825452608 params.py:248] dataset_reader.type = conllu\n",
  980 + "I0407 10:56:55.897414 140254825452608 params.py:248] dataset_reader.lazy = False\n",
  981 + "I0407 10:56:55.897499 140254825452608 params.py:248] dataset_reader.cache_directory = None\n",
  982 + "I0407 10:56:55.897570 140254825452608 params.py:248] dataset_reader.max_instances = None\n",
  983 + "I0407 10:56:55.897637 140254825452608 params.py:248] dataset_reader.manual_distributed_sharding = False\n",
  984 + "I0407 10:56:55.897707 140254825452608 params.py:248] dataset_reader.manual_multi_process_sharding = False\n",
  985 + "I0407 10:56:55.897995 140254825452608 params.py:248] dataset_reader.token_indexers.char.type = characters_const_padding\n",
  986 + "I0407 10:56:55.898183 140254825452608 params.py:248] dataset_reader.token_indexers.char.namespace = token_characters\n",
  987 + "I0407 10:56:55.898398 140254825452608 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.byte_encoding = None\n",
  988 + "I0407 10:56:55.898473 140254825452608 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.lowercase_characters = False\n",
  989 + "I0407 10:56:55.898542 140254825452608 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.start_tokens = ['__START__']\n",
  990 + "I0407 10:56:55.898677 140254825452608 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.end_tokens = ['__END__']\n",
  991 + "I0407 10:56:55.898799 140254825452608 params.py:248] dataset_reader.token_indexers.char.start_tokens = None\n",
  992 + "I0407 10:56:55.898869 140254825452608 params.py:248] dataset_reader.token_indexers.char.end_tokens = None\n",
  993 + "I0407 10:56:55.898936 140254825452608 params.py:248] dataset_reader.token_indexers.char.min_padding_length = 32\n",
  994 + "I0407 10:56:55.898998 140254825452608 params.py:248] dataset_reader.token_indexers.char.token_min_padding_length = 0\n",
  995 + "I0407 10:56:55.899158 140254825452608 params.py:248] dataset_reader.token_indexers.feats.type = feats_indexer\n",
  996 + "I0407 10:56:55.899337 140254825452608 params.py:248] dataset_reader.token_indexers.feats.namespace = feats\n",
  997 + "I0407 10:56:55.899414 140254825452608 params.py:248] dataset_reader.token_indexers.feats.feature_name = feats_\n",
  998 + "I0407 10:56:55.899485 140254825452608 params.py:248] dataset_reader.token_indexers.feats.token_min_padding_length = 0\n",
  999 + "I0407 10:56:55.899629 140254825452608 params.py:248] dataset_reader.token_indexers.lemma.type = characters_const_padding\n",
  1000 + "I0407 10:56:55.899797 140254825452608 params.py:248] dataset_reader.token_indexers.lemma.namespace = token_characters\n",
  1001 + "I0407 10:56:55.899995 140254825452608 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.byte_encoding = None\n",
  1002 + "I0407 10:56:55.900055 140254825452608 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.lowercase_characters = False\n",
  1003 + "I0407 10:56:55.900130 140254825452608 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.start_tokens = ['__START__']\n",
  1004 + "I0407 10:56:55.900250 140254825452608 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.end_tokens = ['__END__']\n",
  1005 + "I0407 10:56:55.900363 140254825452608 params.py:248] dataset_reader.token_indexers.lemma.start_tokens = None\n",
  1006 + "I0407 10:56:55.900426 140254825452608 params.py:248] dataset_reader.token_indexers.lemma.end_tokens = None\n",
  1007 + "I0407 10:56:55.900486 140254825452608 params.py:248] dataset_reader.token_indexers.lemma.min_padding_length = 32\n",
  1008 + "I0407 10:56:55.900547 140254825452608 params.py:248] dataset_reader.token_indexers.lemma.token_min_padding_length = 0\n",
  1009 + "I0407 10:56:55.900689 140254825452608 params.py:248] dataset_reader.token_indexers.token.type = pretrained_transformer_mismatched_fixed\n",
  1010 + "I0407 10:56:55.900916 140254825452608 params.py:248] dataset_reader.token_indexers.token.token_min_padding_length = 0\n",
  1011 + "I0407 10:56:55.900995 140254825452608 params.py:248] dataset_reader.token_indexers.token.model_name = allegro/herbert-large-cased\n",
  1012 + "I0407 10:56:55.901061 140254825452608 params.py:248] dataset_reader.token_indexers.token.namespace = tags\n",
  1013 + "I0407 10:56:55.901125 140254825452608 params.py:248] dataset_reader.token_indexers.token.max_length = None\n",
  1014 + "I0407 10:56:55.901226 140254825452608 params.py:384] dataset_reader.token_indexers.token.tokenizer_kwargs.use_fast = False\n",
  1015 + "I0407 10:56:55.902561 140254825452608 params.py:248] dataset_reader.token_indexers.upostag.type = single_id\n",
  1016 + "I0407 10:56:55.902824 140254825452608 params.py:248] dataset_reader.token_indexers.upostag.namespace = upostag\n",
  1017 + "I0407 10:56:55.902909 140254825452608 params.py:248] dataset_reader.token_indexers.upostag.lowercase_tokens = False\n",
  1018 + "I0407 10:56:55.902969 140254825452608 params.py:248] dataset_reader.token_indexers.upostag.start_tokens = None\n",
  1019 + "I0407 10:56:55.903034 140254825452608 params.py:248] dataset_reader.token_indexers.upostag.end_tokens = None\n",
  1020 + "I0407 10:56:55.903095 140254825452608 params.py:248] dataset_reader.token_indexers.upostag.feature_name = pos_\n",
  1021 + "I0407 10:56:55.903159 140254825452608 params.py:248] dataset_reader.token_indexers.upostag.default_value = THIS IS A REALLY UNLIKELY VALUE THAT HAS TO BE A STRING\n",
  1022 + "I0407 10:56:55.903219 140254825452608 params.py:248] dataset_reader.token_indexers.upostag.token_min_padding_length = 0\n",
  1023 + "I0407 10:56:55.903364 140254825452608 params.py:248] dataset_reader.token_indexers.xpostag.type = single_id\n",
  1024 + "I0407 10:56:55.903547 140254825452608 params.py:248] dataset_reader.token_indexers.xpostag.namespace = xpostag\n",
  1025 + "I0407 10:56:55.903621 140254825452608 params.py:248] dataset_reader.token_indexers.xpostag.lowercase_tokens = False\n",
  1026 + "I0407 10:56:55.903687 140254825452608 params.py:248] dataset_reader.token_indexers.xpostag.start_tokens = None\n",
  1027 + "I0407 10:56:55.903748 140254825452608 params.py:248] dataset_reader.token_indexers.xpostag.end_tokens = None\n",
  1028 + "I0407 10:56:55.903811 140254825452608 params.py:248] dataset_reader.token_indexers.xpostag.feature_name = tag_\n",
  1029 + "I0407 10:56:55.903868 140254825452608 params.py:248] dataset_reader.token_indexers.xpostag.default_value = THIS IS A REALLY UNLIKELY VALUE THAT HAS TO BE A STRING\n",
  1030 + "I0407 10:56:55.903931 140254825452608 params.py:248] dataset_reader.token_indexers.xpostag.token_min_padding_length = 0\n",
  1031 + "I0407 10:56:55.904146 140254825452608 params.py:248] dataset_reader.lemma_indexers.char.type = characters_const_padding\n",
  1032 + "I0407 10:56:55.904325 140254825452608 params.py:248] dataset_reader.lemma_indexers.char.namespace = lemma_characters\n",
  1033 + "I0407 10:56:55.904539 140254825452608 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.byte_encoding = None\n",
  1034 + "I0407 10:56:55.904611 140254825452608 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.lowercase_characters = False\n",
  1035 + "I0407 10:56:55.904691 140254825452608 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.start_tokens = ['__START__']\n",
  1036 + "I0407 10:56:55.904827 140254825452608 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.end_tokens = ['__END__']\n",
  1037 + "I0407 10:56:55.904946 140254825452608 params.py:248] dataset_reader.lemma_indexers.char.start_tokens = None\n",
  1038 + "I0407 10:56:55.905013 140254825452608 params.py:248] dataset_reader.lemma_indexers.char.end_tokens = None\n",
  1039 + "I0407 10:56:55.905084 140254825452608 params.py:248] dataset_reader.lemma_indexers.char.min_padding_length = 32\n",
  1040 + "I0407 10:56:55.905149 140254825452608 params.py:248] dataset_reader.lemma_indexers.char.token_min_padding_length = 0\n",
  1041 + "I0407 10:56:55.905237 140254825452608 params.py:248] dataset_reader.features = ['token', 'char']\n",
  1042 + "I0407 10:56:55.905334 140254825452608 params.py:248] dataset_reader.targets = ['head', 'deprel']\n",
  1043 + "I0407 10:56:55.905422 140254825452608 params.py:248] dataset_reader.use_sem = False\n",
  1044 + "I0407 10:56:55.906047 140254825452608 params.py:248] vocabulary.type = from_instances_extended\n",
  1045 + "I0407 10:56:55.906157 140254825452608 vocabulary.py:323] Loading token dictionary from /tmp/tmpdhtf4et1/vocabulary.\n",
  1046 + "I0407 10:56:55.906635 140254825452608 filelock.py:254] Lock 140251419626896 acquired on /tmp/tmpdhtf4et1/vocabulary/.lock\n",
  1047 + "I0407 10:56:55.907354 140254825452608 filelock.py:317] Lock 140251419626896 released on /tmp/tmpdhtf4et1/vocabulary/.lock\n",
  1048 + "I0407 10:56:55.907914 140254825452608 params.py:248] model.type = semantic_multitask\n",
  1049 + "I0407 10:56:55.908506 140254825452608 params.py:248] model.text_field_embedder.type = basic\n",
  1050 + "I0407 10:56:55.908878 140254825452608 params.py:248] model.text_field_embedder.token_embedders.char.type = char_embeddings_from_config\n",
  1051 + "I0407 10:56:55.909080 140254825452608 params.py:248] model.text_field_embedder.token_embedders.char.embedding_dim = 64\n",
  1052 + "I0407 10:56:55.909353 140254825452608 params.py:248] model.text_field_embedder.token_embedders.char.dilated_cnn_encoder.input_dim = 64\n",
  1053 + "I0407 10:56:55.909446 140254825452608 params.py:248] model.text_field_embedder.token_embedders.char.dilated_cnn_encoder.filters = [512, 256, 64]\n",
  1054 + "I0407 10:56:55.909554 140254825452608 params.py:248] model.text_field_embedder.token_embedders.char.dilated_cnn_encoder.kernel_size = [3, 3, 3]\n"
  1055 + ]
  1056 + },
  1057 + {
  1058 + "name": "stdout",
  1059 + "output_type": "stream",
  1060 + "text": [
  1061 + "I0407 10:56:55.909654 140254825452608 params.py:248] model.text_field_embedder.token_embedders.char.dilated_cnn_encoder.stride = [1, 1, 1]\n",
  1062 + "I0407 10:56:55.909750 140254825452608 params.py:248] model.text_field_embedder.token_embedders.char.dilated_cnn_encoder.padding = [1, 2, 4]\n",
  1063 + "I0407 10:56:55.909847 140254825452608 params.py:248] model.text_field_embedder.token_embedders.char.dilated_cnn_encoder.dilation = [1, 2, 4]\n",
  1064 + "I0407 10:56:55.909946 140254825452608 params.py:248] model.text_field_embedder.token_embedders.char.dilated_cnn_encoder.activations = ['relu', 'relu', 'linear']\n",
  1065 + "I0407 10:56:55.910176 140254825452608 params.py:248] type = relu\n",
  1066 + "I0407 10:56:55.910410 140254825452608 params.py:248] type = relu\n",
  1067 + "I0407 10:56:55.910567 140254825452608 params.py:248] type = linear\n",
  1068 + "I0407 10:56:55.917278 140254825452608 params.py:248] model.text_field_embedder.token_embedders.char.vocab_namespace = token_characters\n",
  1069 + "I0407 10:56:55.917941 140254825452608 params.py:248] model.text_field_embedder.token_embedders.token.type = transformers_word_embeddings\n",
  1070 + "I0407 10:56:55.918267 140254825452608 params.py:248] model.text_field_embedder.token_embedders.token.model_name = allegro/herbert-large-cased\n",
  1071 + "I0407 10:56:55.918358 140254825452608 params.py:248] model.text_field_embedder.token_embedders.token.projection_dim = 100\n",
  1072 + "I0407 10:56:55.918458 140254825452608 params.py:248] model.text_field_embedder.token_embedders.token.projection_activation = <function TransformersWordEmbedder.<lambda> at 0x7f8ed1745280>\n",
  1073 + "I0407 10:56:55.918541 140254825452608 params.py:248] model.text_field_embedder.token_embedders.token.projection_dropout_rate = 0.0\n",
  1074 + "I0407 10:56:55.918609 140254825452608 params.py:248] model.text_field_embedder.token_embedders.token.freeze_transformer = True\n",
  1075 + "I0407 10:56:55.918674 140254825452608 params.py:248] model.text_field_embedder.token_embedders.token.last_layer_only = True\n",
  1076 + "I0407 10:56:55.918785 140254825452608 params.py:384] model.text_field_embedder.token_embedders.token.tokenizer_kwargs.use_fast = False\n",
  1077 + "I0407 10:56:55.918858 140254825452608 params.py:248] model.text_field_embedder.token_embedders.token.transformer_kwargs = None\n",
  1078 + "I0407 10:57:03.624983 140254825452608 params.py:248] model.seq_encoder.type = combo_encoder\n",
  1079 + "I0407 10:57:03.625626 140254825452608 params.py:248] model.seq_encoder.stacked_bilstm.input_size = 164\n",
  1080 + "I0407 10:57:03.625742 140254825452608 params.py:248] model.seq_encoder.stacked_bilstm.hidden_size = 512\n",
  1081 + "I0407 10:57:03.625796 140254825452608 params.py:248] model.seq_encoder.stacked_bilstm.num_layers = 2\n",
  1082 + "I0407 10:57:03.625844 140254825452608 params.py:248] model.seq_encoder.stacked_bilstm.recurrent_dropout_probability = 0.33\n",
  1083 + "I0407 10:57:03.625942 140254825452608 params.py:248] model.seq_encoder.stacked_bilstm.layer_dropout_probability = 0.33\n",
  1084 + "I0407 10:57:03.626068 140254825452608 params.py:248] model.seq_encoder.stacked_bilstm.use_highway = False\n",
  1085 + "I0407 10:57:03.933019 140254825452608 params.py:248] model.seq_encoder.layer_dropout_probability = 0.33\n",
  1086 + "I0407 10:57:03.933302 140254825452608 params.py:248] model.use_sample_weight = True\n",
  1087 + "I0407 10:57:03.933391 140254825452608 params.py:248] model.lemmatizer = None\n",
  1088 + "I0407 10:57:03.933440 140254825452608 params.py:248] model.upos_tagger = None\n",
  1089 + "I0407 10:57:03.933486 140254825452608 params.py:248] model.xpos_tagger = None\n",
  1090 + "I0407 10:57:03.933528 140254825452608 params.py:248] model.semantic_relation = None\n",
  1091 + "I0407 10:57:03.933570 140254825452608 params.py:248] model.morphological_feat = None\n",
  1092 + "I0407 10:57:03.933835 140254825452608 params.py:248] model.dependency_relation.type = combo_dependency_parsing_from_vocab\n",
  1093 + "I0407 10:57:03.934096 140254825452608 params.py:248] model.dependency_relation.vocab_namespace = deprel_labels\n",
  1094 + "I0407 10:57:03.934389 140254825452608 params.py:248] model.dependency_relation.head_predictor.head_projection_layer.in_features = 1024\n",
  1095 + "I0407 10:57:03.934459 140254825452608 params.py:248] model.dependency_relation.head_predictor.head_projection_layer.out_features = 512\n",
  1096 + "I0407 10:57:03.934515 140254825452608 params.py:248] model.dependency_relation.head_predictor.head_projection_layer.activation = tanh\n",
  1097 + "I0407 10:57:03.934614 140254825452608 params.py:248] type = tanh\n",
  1098 + "I0407 10:57:03.934703 140254825452608 params.py:248] model.dependency_relation.head_predictor.head_projection_layer.dropout_rate = 0.0\n",
  1099 + "I0407 10:57:03.938141 140254825452608 params.py:248] model.dependency_relation.head_predictor.dependency_projection_layer.in_features = 1024\n",
  1100 + "I0407 10:57:03.938247 140254825452608 params.py:248] model.dependency_relation.head_predictor.dependency_projection_layer.out_features = 512\n",
  1101 + "I0407 10:57:03.938306 140254825452608 params.py:248] model.dependency_relation.head_predictor.dependency_projection_layer.activation = tanh\n",
  1102 + "I0407 10:57:03.938404 140254825452608 params.py:248] type = tanh\n",
  1103 + "I0407 10:57:03.938489 140254825452608 params.py:248] model.dependency_relation.head_predictor.dependency_projection_layer.dropout_rate = 0.0\n",
  1104 + "I0407 10:57:03.941669 140254825452608 params.py:248] model.dependency_relation.head_predictor.cycle_loss_n = 0\n",
  1105 + "I0407 10:57:03.941908 140254825452608 params.py:248] model.dependency_relation.head_projection_layer.in_features = 1024\n",
  1106 + "I0407 10:57:03.941985 140254825452608 params.py:248] model.dependency_relation.head_projection_layer.out_features = 128\n",
  1107 + "I0407 10:57:03.942037 140254825452608 params.py:248] model.dependency_relation.head_projection_layer.activation = tanh\n",
  1108 + "I0407 10:57:03.942123 140254825452608 params.py:248] type = tanh\n",
  1109 + "I0407 10:57:03.942194 140254825452608 params.py:248] model.dependency_relation.head_projection_layer.dropout_rate = 0.25\n",
  1110 + "I0407 10:57:03.943288 140254825452608 params.py:248] model.dependency_relation.dependency_projection_layer.in_features = 1024\n",
  1111 + "I0407 10:57:03.943376 140254825452608 params.py:248] model.dependency_relation.dependency_projection_layer.out_features = 128\n",
  1112 + "I0407 10:57:03.943423 140254825452608 params.py:248] model.dependency_relation.dependency_projection_layer.activation = tanh\n",
  1113 + "I0407 10:57:03.943510 140254825452608 params.py:248] type = tanh\n",
  1114 + "I0407 10:57:03.943577 140254825452608 params.py:248] model.dependency_relation.dependency_projection_layer.dropout_rate = 0.25\n",
  1115 + "I0407 10:57:03.944838 140254825452608 params.py:248] model.enhanced_dependency_relation = None\n",
  1116 + "I0407 10:57:03.945286 140254825452608 params.py:248] model.regularizer.regexes.0.1.type = l2\n",
  1117 + "I0407 10:57:03.945443 140254825452608 params.py:248] model.regularizer.regexes.0.1.alpha = 1e-06\n",
  1118 + "I0407 10:57:03.945568 140254825452608 params.py:248] model.regularizer.regexes.1.1.type = l2\n",
  1119 + "I0407 10:57:03.945679 140254825452608 params.py:248] model.regularizer.regexes.1.1.alpha = 1e-06\n",
  1120 + "I0407 10:57:03.945787 140254825452608 params.py:248] model.regularizer.regexes.2.1.type = l2\n",
  1121 + "I0407 10:57:03.945892 140254825452608 params.py:248] model.regularizer.regexes.2.1.alpha = 1e-06\n",
  1122 + "I0407 10:57:03.946047 140254825452608 params.py:248] model.regularizer.regexes.3.1.type = l2\n",
  1123 + "I0407 10:57:03.946158 140254825452608 params.py:248] model.regularizer.regexes.3.1.alpha = 1e-05\n",
  1124 + "I0407 10:57:06.549506 140254825452608 archival.py:211] removing temporary unarchived model dir at /tmp/tmpdhtf4et1\n",
  1125 + "reading instances: 1990it [01:39, 20.00it/s]\n"
  1126 + ]
  1127 + }
  1128 + ],
  1129 + "source": [
  1130 + "! {COMBO} --mode predict \\\n",
  1131 + " --cuda_device 0 \\\n",
  1132 + " --model_path model-pdbc-cont/model.tar.gz \\\n",
  1133 + " --input_file connlu/pdbc-cont-test.conllu \\\n",
  1134 + " --output_file connlu/pdbc-cont-test-pred.conllu"
  1135 + ]
  1136 + },
  1137 + {
  1138 + "cell_type": "code",
  1139 + "execution_count": 12,
  1140 + "id": "ddc3986b",
  1141 + "metadata": {},
  1142 + "outputs": [
  1143 + {
  1144 + "name": "stdout",
  1145 + "output_type": "stream",
  1146 + "text": [
  1147 + "# text = Mały chłopiec patrzy w bok po ściągnięciu okularów .\r\n",
  1148 + "1\tMały\tmały\t_\t_\t_\t2\tadjunct\t2:adjunct\t_\r\n",
  1149 + "2\tchłopiec\tchłopiec\t_\t_\t_\t3\tsubj\t3:subj\t_\r\n",
  1150 + "3\tpatrzy\tpatrzeć\t_\t_\t_\t0\troot\t0:root\t_\r\n",
  1151 + "4\tw\tw\t_\t_\t_\t3\tadjunct_adl\t3:adjunct_adl\t_\r\n",
  1152 + "5\tbok\tbok\t_\t_\t_\t4\tcomp\t4:comp\t_\r\n",
  1153 + "6\tpo\tpo\t_\t_\t_\t3\tadjunct_temp\t3:adjunct_temp\t_\r\n",
  1154 + "7\tściągnięciu\tściągnąć\t_\t_\t_\t6\tcomp\t6:comp\t_\r\n",
  1155 + "8\tokularów\tokulary\t_\t_\t_\t7\tobj\t7:obj\t_\r\n",
  1156 + "9\t.\t.\t_\t_\t_\t3\tpunct\t3:punct\t_\r\n"
  1157 + ]
  1158 + }
  1159 + ],
  1160 + "source": [
  1161 + "! head connlu/pdbc-cont-test.conllu"
  1162 + ]
  1163 + },
  1164 + {
  1165 + "cell_type": "code",
  1166 + "execution_count": 13,
  1167 + "id": "34aa16d9",
  1168 + "metadata": {},
  1169 + "outputs": [
  1170 + {
  1171 + "name": "stdout",
  1172 + "output_type": "stream",
  1173 + "text": [
  1174 + "# text = Mały chłopiec patrzy w bok po ściągnięciu okularów .\r\n",
  1175 + "1\tMały\tmały\t_\t_\t_\t2\tadjunct\t2:adjunct\t_\r\n",
  1176 + "2\tchłopiec\tchłopiec\t_\t_\t_\t3\tsubj\t3:subj\t_\r\n",
  1177 + "3\tpatrzy\tpatrzeć\t_\t_\t_\t0\troot\t0:root\t_\r\n",
  1178 + "4\tw\tw\t_\t_\t_\t3\tcomp\t3:adjunct_adl\t_\r\n",
  1179 + "5\tbok\tbok\t_\t_\t_\t4\tcomp\t4:comp\t_\r\n",
  1180 + "6\tpo\tpo\t_\t_\t_\t3\tadjunct_temp\t3:adjunct_temp\t_\r\n",
  1181 + "7\tściągnięciu\tściągnąć\t_\t_\t_\t6\tcomp\t6:comp\t_\r\n",
  1182 + "8\tokularów\tokulary\t_\t_\t_\t7\tobj\t7:obj\t_\r\n",
  1183 + "9\t.\t.\t_\t_\t_\t3\tpunct\t3:punct\t_\r\n"
  1184 + ]
  1185 + }
  1186 + ],
  1187 + "source": [
  1188 + "! head connlu/pdbc-cont-test-pred.conllu"
  1189 + ]
  1190 + }
  1191 + ],
  1192 + "metadata": {
  1193 + "kernelspec": {
  1194 + "display_name": "combo_python39",
  1195 + "language": "python",
  1196 + "name": "combo_python39"
  1197 + },
  1198 + "language_info": {
  1199 + "codemirror_mode": {
  1200 + "name": "ipython",
  1201 + "version": 3
  1202 + },
  1203 + "file_extension": ".py",
  1204 + "mimetype": "text/x-python",
  1205 + "name": "python",
  1206 + "nbconvert_exporter": "python",
  1207 + "pygments_lexer": "ipython3",
  1208 + "version": "3.8.16"
  1209 + }
  1210 + },
  1211 + "nbformat": 4,
  1212 + "nbformat_minor": 5
  1213 +}
... ...