Commit 0dfb2dfb2c7d5b0308aa37015da22c0bd77c2767
0 parents
initial commit
Showing
14 changed files
with
30572 additions
and
0 deletions
Too many changes to show.
To preserve performance only 4 of 14 files are displayed.
BeNePar/DataPreparation.ipynb
0 → 100644
1 | +++ a/BeNePar/DataPreparation.ipynb | |
1 | +{ | |
2 | + "cells": [ | |
3 | + { | |
4 | + "cell_type": "code", | |
5 | + "execution_count": 1, | |
6 | + "id": "5cd26f6f", | |
7 | + "metadata": {}, | |
8 | + "outputs": [], | |
9 | + "source": [ | |
10 | + "import os\n", | |
11 | + "\n", | |
12 | + "from datasets import load_dataset\n", | |
13 | + "\n", | |
14 | + "from IPython.display import display\n", | |
15 | + "\n", | |
16 | + "import sys\n", | |
17 | + "sys.path.append('../')\n", | |
18 | + "from neural_parser import hybrid_tree_utils" | |
19 | + ] | |
20 | + }, | |
21 | + { | |
22 | + "cell_type": "code", | |
23 | + "execution_count": 2, | |
24 | + "id": "fecef4af", | |
25 | + "metadata": {}, | |
26 | + "outputs": [ | |
27 | + { | |
28 | + "name": "stderr", | |
29 | + "output_type": "stream", | |
30 | + "text": [ | |
31 | + "Found cached dataset pdb_c_beta (/home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1)\n" | |
32 | + ] | |
33 | + }, | |
34 | + { | |
35 | + "data": { | |
36 | + "application/vnd.jupyter.widget-view+json": { | |
37 | + "model_id": "d6fc0deda216433982f304d7451158b2", | |
38 | + "version_major": 2, | |
39 | + "version_minor": 0 | |
40 | + }, | |
41 | + "text/plain": [ | |
42 | + " 0%| | 0/3 [00:00<?, ?it/s]" | |
43 | + ] | |
44 | + }, | |
45 | + "metadata": {}, | |
46 | + "output_type": "display_data" | |
47 | + } | |
48 | + ], | |
49 | + "source": [ | |
50 | + "pdbc_dataset = load_dataset('../pdb_c_beta/')" | |
51 | + ] | |
52 | + }, | |
53 | + { | |
54 | + "cell_type": "code", | |
55 | + "execution_count": 3, | |
56 | + "id": "23da801f", | |
57 | + "metadata": {}, | |
58 | + "outputs": [], | |
59 | + "source": [ | |
60 | + "BRACKETS_DIR = 'brackets'\n", | |
61 | + "! rm -r {BRACKETS_DIR}\n", | |
62 | + "! mkdir {BRACKETS_DIR}" | |
63 | + ] | |
64 | + }, | |
65 | + { | |
66 | + "cell_type": "code", | |
67 | + "execution_count": 5, | |
68 | + "id": "c105feff", | |
69 | + "metadata": {}, | |
70 | + "outputs": [ | |
71 | + { | |
72 | + "name": "stdout", | |
73 | + "output_type": "stream", | |
74 | + "text": [ | |
75 | + "train\n", | |
76 | + " brackets/pdbc-cont-train.dat\n", | |
77 | + " 15903\n", | |
78 | + "validation\n", | |
79 | + " brackets/pdbc-cont-validation.dat\n", | |
80 | + " 1980\n", | |
81 | + "test\n", | |
82 | + " brackets/pdbc-cont-test.dat\n", | |
83 | + " 1990\n" | |
84 | + ] | |
85 | + } | |
86 | + ], | |
87 | + "source": [ | |
88 | + "features = pdbc_dataset['train'].features\n", | |
89 | + "\n", | |
90 | + "for part, dataset in pdbc_dataset.items():\n", | |
91 | + " print(part)\n", | |
92 | + " b_cont = []\n", | |
93 | + " for sentence in dataset:\n", | |
94 | + " tree = hybrid_tree_utils.tree_from_dataset_instance(sentence, features)\n", | |
95 | + " if tree.is_continuous():\n", | |
96 | + " b_cont.append(f'(TOP {tree.to_brackets(morph_tags=True)})')\n", | |
97 | + " filepath = os.path.join(BRACKETS_DIR, f'pdbc-cont-{part}.dat')\n", | |
98 | + " with open(filepath, 'w') as f:\n", | |
99 | + " print(' ', filepath)\n", | |
100 | + " print(' ', len(b_cont))\n", | |
101 | + " for row in b_cont:\n", | |
102 | + " print(row, file=f)" | |
103 | + ] | |
104 | + }, | |
105 | + { | |
106 | + "cell_type": "code", | |
107 | + "execution_count": 6, | |
108 | + "id": "c849233c", | |
109 | + "metadata": {}, | |
110 | + "outputs": [ | |
111 | + { | |
112 | + "name": "stdout", | |
113 | + "output_type": "stream", | |
114 | + "text": [ | |
115 | + " 1990 121784 1024525 brackets/pdbc-cont-test.dat\n", | |
116 | + " 15903 1022627 8620535 brackets/pdbc-cont-train.dat\n", | |
117 | + " 1980 126288 1065593 brackets/pdbc-cont-validation.dat\n", | |
118 | + " 19873 1270699 10710653 total\n" | |
119 | + ] | |
120 | + } | |
121 | + ], | |
122 | + "source": [ | |
123 | + "! wc {BRACKETS_DIR}/*.dat" | |
124 | + ] | |
125 | + }, | |
126 | + { | |
127 | + "cell_type": "code", | |
128 | + "execution_count": 8, | |
129 | + "id": "679b9f10", | |
130 | + "metadata": {}, | |
131 | + "outputs": [ | |
132 | + { | |
133 | + "name": "stdout", | |
134 | + "output_type": "stream", | |
135 | + "text": [ | |
136 | + "(TOP (ROOT (*S (S (NP (AdjP (*Adj (adj:sg:nom:f:pos Skośnooka))) (*NP (*N (subst:sg:nom:f dziewczynka)))) (*VP (*V (fin:sg:ter:imperf trzyma))) (PrepNP (*Prep (prep:loc:nwok w)) (NP (*N (subst:pl:loc:f rękach)))) (NP (AdjP (*Adj (adj:pl:nom:f:pos drewniane))) (*NP (*N (subst:pl:nom:f pałeczki))))) (Punct (interp ,)) (*Conj (conj a)) (S (PrepNP (*Prep (prep:inst:nwok przed)) (NP (*N (ppron3:sg:inst:f:ter:akc:praep nią)))) (*VP (*V (fin:pl:ter:imperf znajdują))) (Part (part się)) (NP (*NP (*N (subst:pl:nom:n:ncol naczynia))) (AdjP (*Adj (adj:pl:nom:n:pos kuchenne)))))) (Punct (interp .))))\r\n", | |
137 | + "(TOP (ROOT (*S (NP (*NP (*N (subst:sg:nom:n:col Dziecko))) (PrepNP (*Prep (prep:loc:nwok w)) (NP (AdjP (*Adj (adj:sg:loc:f:pos różowej))) (*NP (*N (subst:sg:loc:f opasce)))))) (*VP (*V (fin:sg:ter:imperf unosi))) (PrepNP (*Prep (prep:loc:nwok w)) (NP (*N (subst:pl:loc:f rękach)))) (NP (AdjP (*Adj (adj:pl:acc:m3:pos drewniane))) (*NP (*N (subst:pl:acc:m3 patyczki)))) (PrepNP (*Prep (prep:inst:nwok nad)) (NP (AdjP (*AdjP (*Adj (ppas:pl:inst:n:perf:aff postawionymi))) (NP (PrepNP (*Prep (prep:gen do)) (NP (*N (subst:sg:gen:f góry)))) (*NP (*N (subst:sg:inst:n:ncol dnem))))) (*NP (NP (*N (subst:sg:inst:f miską))) (*Conj (conj i)) (NP (*N (subst:sg:inst:m3 garnkiem))))))) (Punct (interp .))))\r\n", | |
138 | + "(TOP (ROOT (*S (NP (*NP (*N (subst:pl:nom:m1 Zawodnicy))) (PrepNP (*Prep (prep:loc:nwok w)) (NP (*NP (*N (subst:sg:loc:n:ncol pobliżu))) (NP (*N (subst:sg:gen:f piłki)))))) (*VP (*V (fin:pl:ter:imperf przepychają))) (Part (part się)) (PrepNP (*Prep (prep:inst między)) (NP (*N (siebie:inst sobą)))) (PrepNP (*Prep (prep:loc na)) (NP (*N (subst:sg:loc:n:ncol boisku))))) (Punct (interp .))))\r\n", | |
139 | + "(TOP (ROOT (*S (S (NP (*NP (*N (subst:sg:nom:f Dziewczynka))) (PrepNP (*Prep (prep:loc:nwok w)) (NP (*N (subst:sg:loc:f sukience))))) (*VP (*V (fin:sg:ter:imperf puszcza))) (NP (*NP (*N (subst:pl:acc:f bańki))) (AdjP (*Adj (adj:pl:acc:f:pos mydlane)))) (PrepNP (*Prep (prep:loc na)) (NP (*N (subst:sg:loc:f trawie))))) (Punct (interp ,)) (*Conj (conj a)) (S (PrepNP (*Prep (prep:inst za)) (NP (*N (ppron3:sg:inst:f:ter:akc:praep nią)))) (*VP (*V (fin:sg:ter:imperf stoi))) (NP (AdjP (*Adj (adj:sg:nom:f:pos druga))) (*NP (*N (subst:sg:nom:f dziewczynka)))))) (Punct (interp .))))\r\n", | |
140 | + "(TOP (ROOT (*S (NP (*NP (*N (subst:pl:nom:f Dziewczynki))) (PrepNP (*Prep (prep:loc:nwok w)) (NP (AdjP (*Adj (adj:pl:loc:f:pos kolorowych))) (*NP (*N (subst:pl:loc:f sukienkach)))))) (*VP (*V (fin:pl:ter:imperf stoją))) (PrepNP (*Prep (prep:loc na)) (NP (*N (subst:sg:loc:f trawie)))) (VP (Punct (interp ,)) (*VP (*V (pcon:imperf puszczając))) (NP (*NP (*N (subst:pl:acc:f bańki))) (AdjP (*Adj (adj:pl:acc:f:pos mydlane)))))) (Punct (interp .))))\r\n", | |
141 | + "(TOP (ROOT (*S (NP (*NP (*N (subst:sg:nom:f Grupa))) (NP (*N (subst:pl:gen:n:col dzieci)))) (*VP (*V (fin:sg:ter:imperf moczy))) (Part (part się)) (PrepNP (*Prep (prep:loc:nwok w)) (NP (*NP (*N (subst:sg:loc:f wodzie))) (PrepNP (*Prep (prep:gen:nwok z)) (NP (*N (subst:sg:gen:f fontanny))))))) (Punct (interp .))))\r\n", | |
142 | + "(TOP (ROOT (*S (NP (*NumP (*Num (num:pl:nom:m1:rec:ncol Kilku))) (NP (*N (subst:pl:gen:m1 chłopców)))) (*VP (*V (fin:sg:ter:imperf kąpie))) (Part (part się)) (PrepNP (*Prep (prep:loc:nwok w)) (NP (*NP (*N (subst:sg:loc:f fontannie))) (PrepNP (*Prep (prep:gen obok)) (NP (*NP (*N (subst:pl:gen:m3 stolików))) (CP (Punct (interp ,)) (*S (PrepAdjP (*Prep (prep:loc przy)) (AdjP (*Adj (adj:pl:loc:m3:pos których)))) (*VP (*V (fin:pl:ter:imperf siedzą))) (NP (*N (subst:pl:nom:m1 ludzie)))))))))) (Punct (interp .))))\r\n", | |
143 | + "(TOP (ROOT (*S (NP (*NP (*N (subst:sg:nom:f Dwójka))) (NP (AdjP (*Adj (adj:pl:gen:n:pos nagich))) (*NP (*N (subst:pl:gen:n:col dzieci))) (AdjP (*AdjP (*Adj (ppas:pl:gen:n:perf:aff ubrudzonych))) (NP (*N (subst:pl:inst:f farbkami)))))) (*VP (*V (fin:sg:ter:imperf siedzi))) (PrepNP (*Prep (prep:loc na)) (NP (*N (subst:sg:loc:f podłodze)))) (PrepNP (*Prep (prep:gen obok)) (NP (AdjP (*Adj (adj:pl:gen:f:pos porozrzucanych))) (*NP (*N (subst:pl:gen:f kartek)))))) (Punct (interp .))))\r\n", | |
144 | + "(TOP (ROOT (*S (S (NP (*NumP (*Num (num:pl:nom:n:rec:col Dwoje))) (NP (AdjP (AdjP (*Adj (adj:pl:gen:n:pos nagich))) (*Conj (interp ,)) (AdjP (*Adj (adj:pl:gen:n:pos małych)))) (*NP (*N (subst:pl:gen:n:col dzieci))))) (*VP (*V (fin:sg:ter:imperf siedzi))) (PrepNP (*Prep (prep:gen naprzeciwko)) (NP (*N (siebie:gen siebie))))) (*Conj (conj i)) (S (NP (AdjP (*Adj (adj:sg:nom:n:com większe))) (*NP (*N (subst:sg:nom:n:col dziecko)))) (*VP (*V (fin:sg:ter:imperf smaruje))) (NP (*N (subst:sg:inst:f farbą))) (NP (AdjP (*Adj (adj:sg:acc:n:com mniejsze))) (*NP (*N (subst:sg:acc:n:col dziecko)))))) (Punct (interp .))))\r\n", | |
145 | + "(TOP (ROOT (*S (NP (*NP (*N (subst:sg:nom:f Dziewczynka))) (PrepNP (*Prep (prep:loc o)) (NP (AdjP (*Adj (adj:pl:loc:n:pos ciemnych))) (*NP (*N (subst:pl:loc:n:col oczach)))))) (*VP (*V (fin:sg:ter:imperf patrzy))) (PrepNP (*Prep (prep:acc na)) (NP (AdjP (*Adj (adj:sg:acc:m3:pos czarny))) (*NP (*N (subst:sg:acc:m3 przedmiot))) (CP (Punct (interp ,)) (*S (AdjP (*Adj (adj:sg:acc:m3:pos który))) (*VP (*V (fin:sg:ter:imperf trzyma))) (PrepNP (*Prep (prep:loc:nwok w)) (NP (*N (subst:sg:loc:f ręce))))))))) (Punct (interp .))))\r\n" | |
146 | + ] | |
147 | + } | |
148 | + ], | |
149 | + "source": [ | |
150 | + "! head {BRACKETS_DIR}/pdbc-cont-train.dat" | |
151 | + ] | |
152 | + } | |
153 | + ], | |
154 | + "metadata": { | |
155 | + "kernelspec": { | |
156 | + "display_name": "TF_zajecia", | |
157 | + "language": "python", | |
158 | + "name": "tf_zajecia" | |
159 | + }, | |
160 | + "language_info": { | |
161 | + "codemirror_mode": { | |
162 | + "name": "ipython", | |
163 | + "version": 3 | |
164 | + }, | |
165 | + "file_extension": ".py", | |
166 | + "mimetype": "text/x-python", | |
167 | + "name": "python", | |
168 | + "nbconvert_exporter": "python", | |
169 | + "pygments_lexer": "ipython3", | |
170 | + "version": "3.10.6" | |
171 | + } | |
172 | + }, | |
173 | + "nbformat": 4, | |
174 | + "nbformat_minor": 5 | |
175 | +} | |
... | ... |
BeNePar/TrainAndParse.ipynb
0 → 100644
1 | +++ a/BeNePar/TrainAndParse.ipynb | |
1 | +{ | |
2 | + "cells": [ | |
3 | + { | |
4 | + "cell_type": "code", | |
5 | + "execution_count": 5, | |
6 | + "id": "d8404675", | |
7 | + "metadata": {}, | |
8 | + "outputs": [], | |
9 | + "source": [ | |
10 | + "#BENEPAR = '/home/kkrasnowska/benepar_experiments/self-attentive-parser/src/main.py'" | |
11 | + ] | |
12 | + }, | |
13 | + { | |
14 | + "cell_type": "code", | |
15 | + "execution_count": 6, | |
16 | + "id": "88603098", | |
17 | + "metadata": {}, | |
18 | + "outputs": [], | |
19 | + "source": [ | |
20 | + "#! mkdir models" | |
21 | + ] | |
22 | + }, | |
23 | + { | |
24 | + "cell_type": "code", | |
25 | + "execution_count": 7, | |
26 | + "id": "d5aedb53", | |
27 | + "metadata": {}, | |
28 | + "outputs": [], | |
29 | + "source": [ | |
30 | + "#! python {BENEPAR} train \\\n", | |
31 | + "# --train-path brackets/pdbc-cont-train.dat \\\n", | |
32 | + "# --dev-path brackets/pdbc-cont-validation.dat \\\n", | |
33 | + "# --evalb-dir /home/kkrasnowska/benepar_experiments/self-attentive-parser/EVALB_SPMRL \\\n", | |
34 | + "# --use-pretrained --pretrained-model \"allegro/herbert-large-cased\" \\\n", | |
35 | + "# --use-encoder --num-layers 2 \\\n", | |
36 | + "# --predict-tags \\\n", | |
37 | + "# --model-path-base models" | |
38 | + ] | |
39 | + }, | |
40 | + { | |
41 | + "cell_type": "code", | |
42 | + "execution_count": 8, | |
43 | + "id": "3f6aaf27", | |
44 | + "metadata": {}, | |
45 | + "outputs": [], | |
46 | + "source": [ | |
47 | + "from IPython.display import display, HTML" | |
48 | + ] | |
49 | + }, | |
50 | + { | |
51 | + "cell_type": "code", | |
52 | + "execution_count": 9, | |
53 | + "id": "8d9d5103", | |
54 | + "metadata": {}, | |
55 | + "outputs": [], | |
56 | + "source": [ | |
57 | + "import benepar\n", | |
58 | + "import nltk\n", | |
59 | + "import spacy" | |
60 | + ] | |
61 | + }, | |
62 | + { | |
63 | + "cell_type": "code", | |
64 | + "execution_count": 10, | |
65 | + "id": "c56eda57", | |
66 | + "metadata": {}, | |
67 | + "outputs": [ | |
68 | + { | |
69 | + "name": "stderr", | |
70 | + "output_type": "stream", | |
71 | + "text": [ | |
72 | + "Some weights of the model checkpoint at allegro/herbert-large-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.sso.sso_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.sso.sso_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias']\n", | |
73 | + "- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", | |
74 | + "- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n" | |
75 | + ] | |
76 | + } | |
77 | + ], | |
78 | + "source": [ | |
79 | + "MODEL = 'models_dev=97.36.pt'\n", | |
80 | + "parser = benepar.Parser(MODEL)" | |
81 | + ] | |
82 | + }, | |
83 | + { | |
84 | + "cell_type": "code", | |
85 | + "execution_count": null, | |
86 | + "id": "35ffd9af", | |
87 | + "metadata": {}, | |
88 | + "outputs": [], | |
89 | + "source": [] | |
90 | + }, | |
91 | + { | |
92 | + "cell_type": "code", | |
93 | + "execution_count": 11, | |
94 | + "id": "06ae821c", | |
95 | + "metadata": {}, | |
96 | + "outputs": [], | |
97 | + "source": [ | |
98 | + "def postprocess(tree):\n", | |
99 | + " for node in tree.subtrees():\n", | |
100 | + " l = node.label()\n", | |
101 | + " node.set_label(l.replace('LPAR', '(').replace('RPAR', ')'))\n", | |
102 | + " for i, child in enumerate(node):\n", | |
103 | + " if type(child) == str:\n", | |
104 | + " node[i] = child.replace('-LSB-', '[').replace('-RSB-', ']')\n", | |
105 | + " return tree\n", | |
106 | + "\n", | |
107 | + "def parse_tokenized_sentences(sentences, parser):\n", | |
108 | + " [benepar.InputSentence(words=tokens) for tokens in sentences]\n", | |
109 | + " return list(map(postprocess, parser.parse_sents(\n", | |
110 | + " [benepar.InputSentence(words=tokens) for tokens in sentences]\n", | |
111 | + " )))\n", | |
112 | + "\n", | |
113 | + "def parse_sentence(sentence, parser):\n", | |
114 | + " return parse_tokenized_sentences([sentence.split()], parser)[0]" | |
115 | + ] | |
116 | + }, | |
117 | + { | |
118 | + "cell_type": "code", | |
119 | + "execution_count": 12, | |
120 | + "id": "c96dc9d9", | |
121 | + "metadata": {}, | |
122 | + "outputs": [ | |
123 | + { | |
124 | + "name": "stderr", | |
125 | + "output_type": "stream", | |
126 | + "text": [ | |
127 | + "You're using a HerbertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n", | |
128 | + "/home/kkrasnowska/venvs/torch_benepar/lib/python3.10/site-packages/torch/distributions/distribution.py:44: UserWarning: <class 'torch_struct.distributions.TreeCRF'> does not define `arg_constraints`. Please set `arg_constraints = {}` or initialize the distribution with `validate_args=False` to turn off validation.\n", | |
129 | + " warnings.warn(f'{self.__class__} does not define `arg_constraints`. ' +\n" | |
130 | + ] | |
131 | + }, | |
132 | + { | |
133 | + "data": { | |
134 | + "image/svg+xml": [ | |
135 | + "<svg baseProfile=\"full\" height=\"312px\" preserveAspectRatio=\"xMidYMid meet\" style=\"font-family: times, serif; font-weight:normal; font-style: normal; font-size: 16px;\" version=\"1.1\" viewBox=\"0,0,512.0,312.0\" width=\"512px\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:ev=\"http://www.w3.org/2001/xml-events\" xmlns:xlink=\"http://www.w3.org/1999/xlink\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">TOP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">ROOT</text></svg><svg width=\"12.5%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">Punct</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">interp</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">[</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"6.25%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"75%\" x=\"12.5%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*S</text></svg><svg width=\"35.4167%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">NP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*N</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" 
y=\"1em\">subst:pl:nom:m2</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">Koty</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"17.7083%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"12.5%\" x=\"35.4167%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*VP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*V</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">pred</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">to</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"41.6667%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"35.4167%\" x=\"47.9167%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">NP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*N</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">subst:pl:nom:m1</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs 
/><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">złodzieje</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"65.625%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"16.6667%\" x=\"83.3333%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">Punct</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">interp</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">.</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"91.6667%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"12.5%\" x=\"87.5%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">Punct</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">interp</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">]</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"93.75%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" 
y1=\"1.2em\" y2=\"3em\" /></svg>" | |
136 | + ], | |
137 | + "text/plain": [ | |
138 | + "Tree('TOP', [Tree('ROOT', [Tree('Punct', [Tree('interp', ['['])]), Tree('*S', [Tree('NP', [Tree('*N', [Tree('subst:pl:nom:m2', ['Koty'])])]), Tree('*VP', [Tree('*V', [Tree('pred', ['to'])])]), Tree('NP', [Tree('*N', [Tree('subst:pl:nom:m1', ['złodzieje'])])]), Tree('Punct', [Tree('interp', ['.'])])]), Tree('Punct', [Tree('interp', [']'])])])])" | |
139 | + ] | |
140 | + }, | |
141 | + "execution_count": 12, | |
142 | + "metadata": {}, | |
143 | + "output_type": "execute_result" | |
144 | + } | |
145 | + ], | |
146 | + "source": [ | |
147 | + "parse_sentence('[ Koty to złodzieje . ]', parser)" | |
148 | + ] | |
149 | + }, | |
150 | + { | |
151 | + "cell_type": "code", | |
152 | + "execution_count": 13, | |
153 | + "id": "d62d1e31", | |
154 | + "metadata": {}, | |
155 | + "outputs": [ | |
156 | + { | |
157 | + "data": { | |
158 | + "image/svg+xml": [ | |
159 | + "<svg baseProfile=\"full\" height=\"312px\" preserveAspectRatio=\"xMidYMid meet\" style=\"font-family: times, serif; font-weight:normal; font-style: normal; font-size: 16px;\" version=\"1.1\" viewBox=\"0,0,560.0,312.0\" width=\"560px\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:ev=\"http://www.w3.org/2001/xml-events\" xmlns:xlink=\"http://www.w3.org/1999/xlink\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">TOP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">ROOT</text></svg><svg width=\"88.5714%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*S</text></svg><svg width=\"72.5806%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*VP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*V</text></svg><svg width=\"44.4444%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">praet:sg:m1:imperf</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">Widział</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"22.2222%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"55.5556%\" x=\"44.4444%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">aglt:sg:pri:imperf:nwok</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">am</text></svg></svg><line 
stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"72.2222%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"36.2903%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"27.4194%\" x=\"72.5806%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">NP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*N</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">subst:sg:acc:m2</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">kotka</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"86.2903%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"44.2857%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"11.4286%\" x=\"88.5714%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">Punct</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">interp</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">.</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" 
/></svg><line stroke=\"black\" x1=\"50%\" x2=\"94.2857%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg>" | |
160 | + ], | |
161 | + "text/plain": [ | |
162 | + "Tree('TOP', [Tree('ROOT', [Tree('*S', [Tree('*VP', [Tree('*V', [Tree('praet:sg:m1:imperf', ['Widział']), Tree('aglt:sg:pri:imperf:nwok', ['am'])])]), Tree('NP', [Tree('*N', [Tree('subst:sg:acc:m2', ['kotka'])])])]), Tree('Punct', [Tree('interp', ['.'])])])])" | |
163 | + ] | |
164 | + }, | |
165 | + "execution_count": 13, | |
166 | + "metadata": {}, | |
167 | + "output_type": "execute_result" | |
168 | + } | |
169 | + ], | |
170 | + "source": [ | |
171 | + "parse_sentence('Widział am kotka .', parser)" | |
172 | + ] | |
173 | + }, | |
174 | + { | |
175 | + "cell_type": "code", | |
176 | + "execution_count": 14, | |
177 | + "id": "418db531", | |
178 | + "metadata": {}, | |
179 | + "outputs": [], | |
180 | + "source": [ | |
181 | + "with open('brackets/pdbc-cont-validation.dat') as f:\n", | |
182 | + " val_trees = [postprocess(nltk.Tree.fromstring(l.strip('\\n'))) for l in f.readlines()]\n", | |
183 | + "with open('brackets/pdbc-cont-test.dat') as f:\n", | |
184 | + " test_trees = [postprocess(nltk.Tree.fromstring(l.strip('\\n'))) for l in f.readlines()]" | |
185 | + ] | |
186 | + }, | |
187 | + { | |
188 | + "cell_type": "code", | |
189 | + "execution_count": 15, | |
190 | + "id": "2e5f4739", | |
191 | + "metadata": {}, | |
192 | + "outputs": [], | |
193 | + "source": [ | |
194 | + "val_sentences = [tree.leaves() for tree in val_trees]\n", | |
195 | + "test_sentences = [tree.leaves() for tree in test_trees]" | |
196 | + ] | |
197 | + }, | |
198 | + { | |
199 | + "cell_type": "code", | |
200 | + "execution_count": 16, | |
201 | + "id": "6c52ef3f", | |
202 | + "metadata": {}, | |
203 | + "outputs": [ | |
204 | + { | |
205 | + "name": "stderr", | |
206 | + "output_type": "stream", | |
207 | + "text": [ | |
208 | + "/home/kkrasnowska/venvs/torch_benepar/lib/python3.10/site-packages/torch/distributions/distribution.py:44: UserWarning: <class 'torch_struct.distributions.TreeCRF'> does not define `arg_constraints`. Please set `arg_constraints = {}` or initialize the distribution with `validate_args=False` to turn off validation.\n", | |
209 | + " warnings.warn(f'{self.__class__} does not define `arg_constraints`. ' +\n" | |
210 | + ] | |
211 | + } | |
212 | + ], | |
213 | + "source": [ | |
214 | + "val_pred_trees = parse_tokenized_sentences(val_sentences, parser)\n", | |
215 | + "test_pred_trees = parse_tokenized_sentences(test_sentences, parser)" | |
216 | + ] | |
217 | + }, | |
218 | + { | |
219 | + "cell_type": "code", | |
220 | + "execution_count": 17, | |
221 | + "id": "d6d45ba8", | |
222 | + "metadata": {}, | |
223 | + "outputs": [], | |
224 | + "source": [ | |
225 | + "assert(len(val_trees) == len(val_pred_trees))\n", | |
226 | + "assert(len(test_trees) == len(test_pred_trees))" | |
227 | + ] | |
228 | + }, | |
229 | + { | |
230 | + "cell_type": "code", | |
231 | + "execution_count": 18, | |
232 | + "id": "399c3f08", | |
233 | + "metadata": {}, | |
234 | + "outputs": [], | |
235 | + "source": [ | |
236 | + "# drop the artificial TOP wrapper: keep only its first child (the ROOT subtree) of every tree\n", | |
237 | + "val_trees = [t[0] for t in val_trees]\n", | |
238 | + "test_trees = [t[0] for t in test_trees]\n", | |
239 | + "val_pred_trees = [t[0] for t in val_pred_trees]\n", | |
240 | + "test_pred_trees = [t[0] for t in test_pred_trees]" | |
241 | + ] | |
242 | + }, | |
243 | + { | |
244 | + "cell_type": "code", | |
245 | + "execution_count": 19, | |
246 | + "id": "827be810", | |
247 | + "metadata": {}, | |
248 | + "outputs": [ | |
249 | + { | |
250 | + "data": { | |
251 | + "image/svg+xml": [ | |
252 | + "<svg baseProfile=\"full\" height=\"504px\" preserveAspectRatio=\"xMidYMid meet\" style=\"font-family: times, serif; font-weight:normal; font-style: normal; font-size: 16px;\" version=\"1.1\" viewBox=\"0,0,2808.0,504.0\" width=\"2808px\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:ev=\"http://www.w3.org/2001/xml-events\" xmlns:xlink=\"http://www.w3.org/1999/xlink\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">ROOT</text></svg><svg width=\"97.7208%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*S</text></svg><svg width=\"41.691%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">S</text></svg><svg width=\"22.3776%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">PrepNP</text></svg><svg width=\"46.875%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*Prep</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">prep:loc:nwok</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">W</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"23.4375%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"53.125%\" x=\"46.875%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">NP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" 
y=\"1em\">*N</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">subst:sg:loc:m3</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">samolocie</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"73.4375%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"11.1888%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"13.986%\" x=\"22.3776%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*VP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*V</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">praet:sg:m1:imperf</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">czytał</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"29.3706%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"63.6364%\" x=\"36.3636%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">NP</text></svg><svg width=\"18.6813%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text 
text-anchor=\"middle\" x=\"50%\" y=\"1em\">*NP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*N</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">subst:pl:acc:m3</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">wycinki</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"9.34066%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"81.3187%\" x=\"18.6813%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">PrepNP</text></svg><svg width=\"20.2703%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*Prep</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">prep:gen:nwok</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">z</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"10.1351%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"79.7297%\" x=\"20.2703%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">NP</text></svg><svg width=\"27.1186%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" 
x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*NP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*N</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">subst:sg:gen:f</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">prasy</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"13.5593%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"72.8814%\" x=\"27.1186%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">AdjP</text></svg><svg width=\"41.8605%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">AdjP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*Adj</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">adj:sg:gen:f:pos</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">polskiej</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" 
x2=\"20.9302%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"16.2791%\" x=\"41.8605%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*Conj</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">conj</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">i</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"41.8605%\" x=\"58.1395%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">AdjP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*Adj</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">adj:sg:gen:f:pos</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">polonijnej</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"79.0698%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"63.5593%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"60.1351%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"59.3407%\" y1=\"1.2em\" y2=\"3em\" /></svg><line 
stroke=\"black\" x1=\"50%\" x2=\"68.1818%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"20.8455%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"2.33236%\" x=\"41.691%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*Conj</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">interp</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">-</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"42.8571%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"55.9767%\" x=\"44.0233%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">S</text></svg><svg width=\"17.7083%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">PrepAdjP</text></svg><svg width=\"44.1176%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*Prep</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">prep:loc:nwok</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">w</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"22.0588%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"55.8824%\" x=\"44.1176%\" y=\"3em\"><defs /><svg 
width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">AdjP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*Adj</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">adj:sg:loc:m3:pos</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">każdym</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"72.0588%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"8.85417%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"6.77083%\" x=\"17.7083%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*VP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*V</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">imps:imperf</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">piętnowano</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"21.0938%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"17.7083%\" 
x=\"24.4792%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">NP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*N</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">ppron3:sg:acc:m1:ter:nakc:npraep</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">go</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"33.3333%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"57.8125%\" x=\"42.1875%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">Compar</text></svg><svg width=\"6.30631%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*Comp</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">comp</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">jako</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"3.15315%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"93.6937%\" x=\"6.30631%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" 
y=\"1em\">AdjP</text></svg><svg width=\"7.69231%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">Punct</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">interp</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">\"</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"3.84615%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"30.7692%\" x=\"7.69231%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">PrepNP</text></svg><svg width=\"46.875%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*Prep</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">prep:gen:nwok</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">bez</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"23.4375%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"53.125%\" x=\"46.875%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">NP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*N</text></svg><svg width=\"100%\" x=\"0%\" 
y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">subst:sg:gen:m3</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">umiaru</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"73.4375%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"23.0769%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"24.0385%\" x=\"38.4615%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*AdjP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*Adj</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">ppas:sg:acc:m1:perf:aff</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">zapatrzonego</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50.4808%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"29.8077%\" x=\"62.5%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">PrepNP</text></svg><svg width=\"48.3871%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" 
y=\"1em\">*Prep</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">prep:acc:nwok</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">w</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"24.1935%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"51.6129%\" x=\"48.3871%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">NP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*N</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">subst:sg:acc:f</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">Moskwę</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"74.1935%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"77.4038%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"7.69231%\" x=\"92.3077%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">Punct</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">interp</text></svg><svg width=\"100%\" x=\"0%\" 
y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">\"</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"96.1538%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"53.1532%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"71.0938%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"72.0117%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"48.8604%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"2.2792%\" x=\"97.7208%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">Punct</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">interp</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">.</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"98.8604%\" y1=\"1.2em\" y2=\"3em\" /></svg>" | |
253 | + ], | |
254 | + "text/plain": [ | |
255 | + "Tree('ROOT', [Tree('*S', [Tree('S', [Tree('PrepNP', [Tree('*Prep', [Tree('prep:loc:nwok', ['W'])]), Tree('NP', [Tree('*N', [Tree('subst:sg:loc:m3', ['samolocie'])])])]), Tree('*VP', [Tree('*V', [Tree('praet:sg:m1:imperf', ['czytał'])])]), Tree('NP', [Tree('*NP', [Tree('*N', [Tree('subst:pl:acc:m3', ['wycinki'])])]), Tree('PrepNP', [Tree('*Prep', [Tree('prep:gen:nwok', ['z'])]), Tree('NP', [Tree('*NP', [Tree('*N', [Tree('subst:sg:gen:f', ['prasy'])])]), Tree('AdjP', [Tree('AdjP', [Tree('*Adj', [Tree('adj:sg:gen:f:pos', ['polskiej'])])]), Tree('*Conj', [Tree('conj', ['i'])]), Tree('AdjP', [Tree('*Adj', [Tree('adj:sg:gen:f:pos', ['polonijnej'])])])])])])])]), Tree('*Conj', [Tree('interp', ['-'])]), Tree('S', [Tree('PrepAdjP', [Tree('*Prep', [Tree('prep:loc:nwok', ['w'])]), Tree('AdjP', [Tree('*Adj', [Tree('adj:sg:loc:m3:pos', ['każdym'])])])]), Tree('*VP', [Tree('*V', [Tree('imps:imperf', ['piętnowano'])])]), Tree('NP', [Tree('*N', [Tree('ppron3:sg:acc:m1:ter:nakc:npraep', ['go'])])]), Tree('Compar', [Tree('*Comp', [Tree('comp', ['jako'])]), Tree('AdjP', [Tree('Punct', [Tree('interp', ['\"'])]), Tree('PrepNP', [Tree('*Prep', [Tree('prep:gen:nwok', ['bez'])]), Tree('NP', [Tree('*N', [Tree('subst:sg:gen:m3', ['umiaru'])])])]), Tree('*AdjP', [Tree('*Adj', [Tree('ppas:sg:acc:m1:perf:aff', ['zapatrzonego'])])]), Tree('PrepNP', [Tree('*Prep', [Tree('prep:acc:nwok', ['w'])]), Tree('NP', [Tree('*N', [Tree('subst:sg:acc:f', ['Moskwę'])])])]), Tree('Punct', [Tree('interp', ['\"'])])])])])]), Tree('Punct', [Tree('interp', ['.'])])])" | |
256 | + ] | |
257 | + }, | |
258 | + "execution_count": 19, | |
259 | + "metadata": {}, | |
260 | + "output_type": "execute_result" | |
261 | + } | |
262 | + ], | |
263 | + "source": [ | |
264 | + "val_trees[504]" | |
265 | + ] | |
266 | + }, | |
267 | + { | |
268 | + "cell_type": "code", | |
269 | + "execution_count": 20, | |
270 | + "id": "1059e782", | |
271 | + "metadata": {}, | |
272 | + "outputs": [ | |
273 | + { | |
274 | + "data": { | |
275 | + "image/svg+xml": [ | |
276 | + "<svg baseProfile=\"full\" height=\"504px\" preserveAspectRatio=\"xMidYMid meet\" style=\"font-family: times, serif; font-weight:normal; font-style: normal; font-size: 16px;\" version=\"1.1\" viewBox=\"0,0,2808.0,504.0\" width=\"2808px\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:ev=\"http://www.w3.org/2001/xml-events\" xmlns:xlink=\"http://www.w3.org/1999/xlink\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">ROOT</text></svg><svg width=\"97.7208%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*S</text></svg><svg width=\"41.691%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">S</text></svg><svg width=\"22.3776%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">PrepNP</text></svg><svg width=\"46.875%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*Prep</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">prep:loc:nwok</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">W</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"23.4375%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"53.125%\" x=\"46.875%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">NP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" 
y=\"1em\">*N</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">subst:sg:loc:m3</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">samolocie</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"73.4375%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"11.1888%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"13.986%\" x=\"22.3776%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*VP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*V</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">praet:sg:m1:imperf</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">czytał</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"29.3706%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"63.6364%\" x=\"36.3636%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">NP</text></svg><svg width=\"18.6813%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text 
text-anchor=\"middle\" x=\"50%\" y=\"1em\">*NP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*N</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">subst:pl:acc:m3</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">wycinki</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"9.34066%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"81.3187%\" x=\"18.6813%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">PrepNP</text></svg><svg width=\"20.2703%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*Prep</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">prep:gen:nwok</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">z</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"10.1351%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"79.7297%\" x=\"20.2703%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">NP</text></svg><svg width=\"27.1186%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" 
x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*NP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*N</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">subst:sg:gen:f</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">prasy</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"13.5593%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"72.8814%\" x=\"27.1186%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">AdjP</text></svg><svg width=\"41.8605%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">AdjP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*Adj</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">adj:sg:gen:f:pos</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">polskiej</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" 
x2=\"20.9302%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"16.2791%\" x=\"41.8605%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*Conj</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">conj</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">i</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"41.8605%\" x=\"58.1395%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">AdjP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*Adj</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">adj:sg:gen:f:pos</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">polonijnej</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"79.0698%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"63.5593%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"60.1351%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"59.3407%\" y1=\"1.2em\" y2=\"3em\" /></svg><line 
stroke=\"black\" x1=\"50%\" x2=\"68.1818%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"20.8455%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"2.33236%\" x=\"41.691%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*Conj</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">interp</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">-</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"42.8571%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"55.9767%\" x=\"44.0233%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">S</text></svg><svg width=\"17.7083%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">PrepAdjP</text></svg><svg width=\"44.1176%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*Prep</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">prep:loc:nwok</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">w</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"22.0588%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"55.8824%\" x=\"44.1176%\" y=\"3em\"><defs /><svg 
width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">AdjP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*Adj</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">adj:sg:loc:m3:pos</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">każdym</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"72.0588%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"8.85417%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"6.77083%\" x=\"17.7083%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*VP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*V</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">imps:imperf</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">piętnowano</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"21.0938%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"17.7083%\" 
x=\"24.4792%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">NP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*N</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">ppron3:sg:acc:m1:ter:nakc:npraep</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">go</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"33.3333%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"57.8125%\" x=\"42.1875%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">Compar</text></svg><svg width=\"6.30631%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*Comp</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">comp</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">jako</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"3.15315%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"93.6937%\" x=\"6.30631%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" 
y=\"1em\">AdjP</text></svg><svg width=\"7.69231%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">Punct</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">interp</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">\"</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"3.84615%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"30.7692%\" x=\"7.69231%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">PrepNP</text></svg><svg width=\"46.875%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*Prep</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">prep:gen:nwok</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">bez</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"23.4375%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"53.125%\" x=\"46.875%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">NP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*N</text></svg><svg width=\"100%\" x=\"0%\" 
y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">subst:sg:gen:m3</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">umiaru</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"73.4375%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"23.0769%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"24.0385%\" x=\"38.4615%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*AdjP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*Adj</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">ppas:sg:acc:m1:perf:aff</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">zapatrzonego</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50.4808%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"29.8077%\" x=\"62.5%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">PrepNP</text></svg><svg width=\"48.3871%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" 
y=\"1em\">*Prep</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">prep:acc:nwok</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">w</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"24.1935%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"51.6129%\" x=\"48.3871%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">NP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*N</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">subst:sg:acc:f</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">Moskwę</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"74.1935%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"77.4038%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"7.69231%\" x=\"92.3077%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">Punct</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">interp</text></svg><svg width=\"100%\" x=\"0%\" 
y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">\"</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"96.1538%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"53.1532%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"71.0938%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"72.0117%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"48.8604%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"2.2792%\" x=\"97.7208%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">Punct</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">interp</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">.</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"98.8604%\" y1=\"1.2em\" y2=\"3em\" /></svg>" | |
277 | + ], | |
278 | + "text/plain": [ | |
279 | + "Tree('ROOT', [Tree('*S', [Tree('S', [Tree('PrepNP', [Tree('*Prep', [Tree('prep:loc:nwok', ['W'])]), Tree('NP', [Tree('*N', [Tree('subst:sg:loc:m3', ['samolocie'])])])]), Tree('*VP', [Tree('*V', [Tree('praet:sg:m1:imperf', ['czytał'])])]), Tree('NP', [Tree('*NP', [Tree('*N', [Tree('subst:pl:acc:m3', ['wycinki'])])]), Tree('PrepNP', [Tree('*Prep', [Tree('prep:gen:nwok', ['z'])]), Tree('NP', [Tree('*NP', [Tree('*N', [Tree('subst:sg:gen:f', ['prasy'])])]), Tree('AdjP', [Tree('AdjP', [Tree('*Adj', [Tree('adj:sg:gen:f:pos', ['polskiej'])])]), Tree('*Conj', [Tree('conj', ['i'])]), Tree('AdjP', [Tree('*Adj', [Tree('adj:sg:gen:f:pos', ['polonijnej'])])])])])])])]), Tree('*Conj', [Tree('interp', ['-'])]), Tree('S', [Tree('PrepAdjP', [Tree('*Prep', [Tree('prep:loc:nwok', ['w'])]), Tree('AdjP', [Tree('*Adj', [Tree('adj:sg:loc:m3:pos', ['każdym'])])])]), Tree('*VP', [Tree('*V', [Tree('imps:imperf', ['piętnowano'])])]), Tree('NP', [Tree('*N', [Tree('ppron3:sg:acc:m1:ter:nakc:npraep', ['go'])])]), Tree('Compar', [Tree('*Comp', [Tree('comp', ['jako'])]), Tree('AdjP', [Tree('Punct', [Tree('interp', ['\"'])]), Tree('PrepNP', [Tree('*Prep', [Tree('prep:gen:nwok', ['bez'])]), Tree('NP', [Tree('*N', [Tree('subst:sg:gen:m3', ['umiaru'])])])]), Tree('*AdjP', [Tree('*Adj', [Tree('ppas:sg:acc:m1:perf:aff', ['zapatrzonego'])])]), Tree('PrepNP', [Tree('*Prep', [Tree('prep:acc:nwok', ['w'])]), Tree('NP', [Tree('*N', [Tree('subst:sg:acc:f', ['Moskwę'])])])]), Tree('Punct', [Tree('interp', ['\"'])])])])])]), Tree('Punct', [Tree('interp', ['.'])])])" | |
280 | + ] | |
281 | + }, | |
282 | + "execution_count": 20, | |
283 | + "metadata": {}, | |
284 | + "output_type": "execute_result" | |
285 | + } | |
286 | + ], | |
287 | + "source": [ | |
288 | + "val_pred_trees[504]" | |
289 | + ] | |
290 | + }, | |
291 | + { | |
292 | + "cell_type": "code", | |
293 | + "execution_count": 25, | |
294 | + "id": "4d6c7096", | |
295 | + "metadata": {}, | |
296 | + "outputs": [], | |
297 | + "source": [ | |
298 | + "'''\n", | |
299 | + "def undummy(_tree):\n", | |
300 | + " tree = _tree.copy(deep=True)\n", | |
301 | + " for node in tree.subtrees():\n", | |
302 | + " for i, child in enumerate(node):\n", | |
303 | + " if type(child) != str and child.label() == 'DUMMY_PRE':\n", | |
304 | + " node[i] = child[0]\n", | |
305 | + " return tree\n", | |
306 | + "\n", | |
307 | + "'''\n", | |
def untag(_tree):
    """Return a copy of ``_tree`` with single-token tag nodes collapsed.

    Works on ``_tree.copy(deep=True)``. Every child whose own children are
    all strings is replaced, inside its parent, by its one string leaf, so
    the token ends up directly under the former grandparent.

    The traversal relies on ``subtrees()`` yielding a node before its
    descendants: a parent is inspected while its children are still
    un-collapsed, so only one level is removed.

    Args:
        _tree: tree object offering ``copy(deep=True)``, ``subtrees()``,
            iteration, ``len`` and item assignment (an nltk-style ``Tree``,
            judging by how it is used here).

    Returns:
        The modified copy.

    Raises:
        AssertionError: if a node consisting only of strings holds more
            than one of them.
    """
    tree = _tree.copy(deep=True)
    for node in tree.subtrees():
        for i, child in enumerate(node):
            if isinstance(child, str) or len(child) == 0:
                continue
            if not all(isinstance(leaf, str) for leaf in child):
                continue
            if len(child) != 1:
                # Raised explicitly (not via `assert`) so the check survives
                # `python -O`; the exception type is unchanged for callers.
                raise AssertionError(
                    f'tag node {child.label()!r} has {len(child)} leaves, '
                    'expected exactly 1'
                )
            # Same-length replacement, so enumerate() stays valid.
            node[i] = child[0]
    return tree
319 | + "\n", | |
320 | + "'''\n", | |
321 | + "\n", | |
322 | + "ZDANIE_HEADS = {'*ff', '*spójnik', '*przec', '*zdanie', '*formawykrz'}\n", | |
323 | + "ZDANIE_HEADS2 = set(l.strip('*') for l in ZDANIE_HEADS)\n", | |
324 | + "ZDANIE_HEAD_HIERARCHY = ('ff', 'spójnik', 'przec', 'zdanie')\n", | |
325 | + "\n", | |
326 | + "\n", | |
327 | + "def correct(_tree):\n", | |
328 | + " tree = _tree.copy(deep=True)\n", | |
329 | + " for node in tree.subtrees():\n", | |
330 | + " if len(node) == 1 and type(node[0]) != str and node.label() == node[0].label():\n", | |
331 | + " new_children = [child for child in node[0]]\n", | |
332 | + " node.pop()\n", | |
333 | + " node += new_children\n", | |
334 | + " for node in tree.subtrees():\n", | |
335 | + " if 'zdanie' in node.label():\n", | |
336 | + " heads = []\n", | |
337 | + " non_heads = []\n", | |
338 | + " for child in node:\n", | |
339 | + " if child.label().startswith('*') and child.label() not in ZDANIE_HEADS:\n", | |
340 | + " child.set_label(child.label()[1:])\n", | |
341 | + " (heads if child.label().startswith('*') else non_heads).append(child)\n", | |
342 | + " \n", | |
343 | + " return tree\n", | |
344 | + "''';" | |
345 | + ] | |
346 | + }, | |
347 | + { | |
348 | + "cell_type": "code", | |
349 | + "execution_count": 35, | |
350 | + "id": "4a26b2e1", | |
351 | + "metadata": { | |
352 | + "scrolled": false | |
353 | + }, | |
354 | + "outputs": [], | |
355 | + "source": [ | |
356 | + "from collections import Counter, defaultdict\n", | |
357 | + "\n", | |
358 | + "c = Counter()\n", | |
359 | + "\n", | |
def tree2spans(_tree):
    """Convert a tree into a set of ``(label, children, text)`` spans.

    Works on ``_tree.copy(deep=True)``. Each token is first rewritten as
    ``'<index>##<token>'`` so that repeated words stay distinguishable.
    Indices follow the left-to-right order of the tokens; numbering in node
    visiting order would give a bare token sitting next to a subtree a lower
    index than tokens to its left, so two trees over the same sentence could
    disagree on token ids.

    Args:
        _tree: tree object offering ``copy(deep=True)``, ``subtrees()``,
            ``label()``, ``leaves()``, iteration and item assignment.

    Returns:
        A set of triples: the node label, a tuple of its children (the
        indexed token for string children, the label otherwise) and the
        space-joined indexed tokens the node covers.

    Raises:
        AssertionError: if two nodes produce the same triple.
    """
    tree = _tree.copy(deep=True)

    def number_leaves(node, start):
        # Depth-first, left-to-right; returns the next unused index.
        for i, child in enumerate(node):
            if isinstance(child, str):
                node[i] = f'{start}##{child}'
                start += 1
            else:
                start = number_leaves(child, start)
        return start

    number_leaves(tree, 0)

    spans = [
        (
            node.label(),
            tuple(
                child if isinstance(child, str) else child.label()
                for child in node
            ),
            ' '.join(node.leaves()),
        )
        for node in tree.subtrees()
    ]
    unique_spans = set(spans)
    if len(unique_spans) != len(spans):
        # Explicit raise so the check is not stripped under `python -O`.
        raise AssertionError(
            'tree contains nodes with identical label, children and text'
        )
    return unique_spans
378 | + "\n", | |
def spans2dict(spans):
    """Group span labels by the text they cover.

    Args:
        spans: iterable of ``(label, children, text)`` triples; the
            ``children`` element is not used here.

    Returns:
        A ``defaultdict(set)`` mapping each text to the set of labels seen
        for it. When a label turns up a second time for the same text, a
        warning line is printed and the whole ``spans`` collection is
        displayed; the label is still recorded once.
    """
    labels_by_text = defaultdict(set)
    for label, _children, text in spans:
        seen = labels_by_text[text]
        if label in seen:
            print('!!!!!!!!!!!!!!!', label, text)
            display(spans)
        seen.add(label)
    return labels_by_text
387 | + "\n", | |
def spans2errors(spans_gold, spans_pred):
    """Compare gold and predicted spans and collect the mismatches.

    Both span collections are grouped by covered text with ``spans2dict``;
    labels are then compared text by text.

    Args:
        spans_gold: ``(label, children, text)`` triples of the gold tree.
        spans_pred: ``(label, children, text)`` triples of the predicted tree.

    Returns:
        A pair ``((tp, fp, fn), errors)``. ``tp`` counts labels present on
        both sides for a text, ``fp`` labels only predicted and ``fn`` labels
        only in gold. ``errors`` holds one ``(marks, text)`` entry per text
        with a mismatch; ``marks`` is a sorted tuple of ``'-label'`` (missing
        from the prediction) and ``'+label'`` (spurious). Entries are ordered
        by ``text`` so the result is reproducible between runs.
    """
    gold = spans2dict(spans_gold)
    pred = spans2dict(spans_pred)
    tp = fp = fn = 0
    errors = []
    for text in sorted(gold.keys() | pred.keys()):
        # .get() avoids inserting empty entries into the defaultdicts.
        gold_labels = gold.get(text, set())
        pred_labels = pred.get(text, set())
        missing = gold_labels - pred_labels
        spurious = pred_labels - gold_labels
        tp += len(gold_labels & pred_labels)
        fn += len(missing)
        fp += len(spurious)
        marks = [f'-{label}' for label in missing]
        marks += [f'+{label}' for label in spurious]
        if marks:
            errors.append((tuple(sorted(marks)), text))
    return (tp, fp, fn), errors
415 | + ] | |
416 | + }, | |
417 | + { | |
418 | + "cell_type": "code", | |
419 | + "execution_count": 54, | |
420 | + "id": "397e3750", | |
421 | + "metadata": {}, | |
422 | + "outputs": [], | |
423 | + "source": [ | |
def eval_trees(trees_gold, trees_pred):
    """Score predicted trees against gold trees, pair by pair.

    Each pair is checked to cover the same text, stripped of its tag level
    with ``untag``, turned into spans with ``tree2spans`` and compared with
    ``spans2errors``.

    Args:
        trees_gold: iterable of gold trees.
        trees_pred: iterable of predicted trees, aligned with ``trees_gold``.
            Pairing uses ``zip``, so surplus trees on either side are
            ignored.

    Returns:
        A pair ``(evaluation, errors)``: ``evaluation`` holds one
        ``(tp, fp, fn)`` tuple per tree pair; ``errors`` holds
        ``(error, gold_tree, pred_tree)`` triples with the original
        (still tagged) trees.

    Raises:
        AssertionError: if the concatenated leaves of a pair differ.
    """
    evaluation, errors = [], []
    # enumerate() supplies the pair index that the error handler reports.
    for i, (_tree_gold, _tree_pred) in enumerate(zip(trees_gold, trees_pred)):
        if ''.join(_tree_gold.leaves()) != ''.join(_tree_pred.leaves()):
            print(_tree_gold.leaves())
            print(_tree_pred.leaves())
            # Explicit raise so the check is not stripped under `python -O`.
            raise AssertionError(
                f'tree pair {i}: gold and predicted trees cover different text'
            )
        tree_gold = untag(_tree_gold)
        tree_pred = untag(_tree_pred)
        spans_gold = tree2spans(tree_gold)
        spans_pred = tree2spans(tree_pred)
        # Debug output for one specific sentence, kept as in the notebook.
        if tree_gold.leaves() == ['Poszedł', 'em', 'do', 'adwokata', '.']:
            display(tree_gold)
            display(tree_pred)
            print(spans_gold)
            print(spans_pred)
        try:
            evl, errs = spans2errors(spans_gold, spans_pred)
        except Exception:
            # Show the offending prediction and its index, then re-raise.
            display(tree_pred)
            # NOTE(review): display2 is not defined in the cells visible
            # here; presumably a helper from an earlier cell. Confirm.
            display2(_tree_pred)
            print(i)
            raise
        evaluation.append(evl)
        errors += [(err, _tree_gold, _tree_pred) for err in errs]
    return evaluation, errors
452 | + ] | |
453 | + }, | |
454 | + { | |
455 | + "cell_type": "code", | |
456 | + "execution_count": 55, | |
457 | + "id": "5dcd68fd", | |
458 | + "metadata": {}, | |
459 | + "outputs": [ | |
460 | + { | |
461 | + "data": { | |
462 | + "image/svg+xml": [ | |
463 | + "<svg baseProfile=\"full\" height=\"264px\" preserveAspectRatio=\"xMidYMid meet\" style=\"font-family: times, serif; font-weight:normal; font-style: normal; font-size: 16px;\" version=\"1.1\" viewBox=\"0,0,296.0,264.0\" width=\"296px\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:ev=\"http://www.w3.org/2001/xml-events\" xmlns:xlink=\"http://www.w3.org/1999/xlink\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">ROOT</text></svg><svg width=\"81.0811%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*S</text></svg><svg width=\"43.3333%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*VP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*V</text></svg><svg width=\"69.2308%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">Poszedł</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"34.6154%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"30.7692%\" x=\"69.2308%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">em</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"84.6154%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"21.6667%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"56.6667%\" x=\"43.3333%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">PrepNP</text></svg><svg width=\"41.1765%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*Prep</text></svg><svg width=\"100%\" x=\"0%\" 
y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">do</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"20.5882%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"58.8235%\" x=\"41.1765%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">NP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*N</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">adwokata</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"70.5882%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"71.6667%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"40.5405%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"18.9189%\" x=\"81.0811%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">Punct</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">.</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"90.5405%\" y1=\"1.2em\" y2=\"3em\" /></svg>" | |
464 | + ], | |
465 | + "text/plain": [ | |
466 | + "Tree('ROOT', [Tree('*S', [Tree('*VP', [Tree('*V', ['Poszedł', 'em'])]), Tree('PrepNP', [Tree('*Prep', ['do']), Tree('NP', [Tree('*N', ['adwokata'])])])]), Tree('Punct', ['.'])])" | |
467 | + ] | |
468 | + }, | |
469 | + "metadata": {}, | |
470 | + "output_type": "display_data" | |
471 | + }, | |
472 | + { | |
473 | + "data": { | |
474 | + "image/svg+xml": [ | |
475 | + "<svg baseProfile=\"full\" height=\"264px\" preserveAspectRatio=\"xMidYMid meet\" style=\"font-family: times, serif; font-weight:normal; font-style: normal; font-size: 16px;\" version=\"1.1\" viewBox=\"0,0,296.0,264.0\" width=\"296px\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:ev=\"http://www.w3.org/2001/xml-events\" xmlns:xlink=\"http://www.w3.org/1999/xlink\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">ROOT</text></svg><svg width=\"81.0811%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*S</text></svg><svg width=\"43.3333%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*VP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*V</text></svg><svg width=\"69.2308%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">Poszedł</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"34.6154%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"30.7692%\" x=\"69.2308%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">em</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"84.6154%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"21.6667%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"56.6667%\" x=\"43.3333%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">PrepNP</text></svg><svg width=\"41.1765%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*Prep</text></svg><svg width=\"100%\" x=\"0%\" 
y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">do</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"20.5882%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"58.8235%\" x=\"41.1765%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">NP</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">*N</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">adwokata</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"70.5882%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"71.6667%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"40.5405%\" y1=\"1.2em\" y2=\"3em\" /><svg width=\"18.9189%\" x=\"81.0811%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">Punct</text></svg><svg width=\"100%\" x=\"0%\" y=\"3em\"><defs /><svg width=\"100%\" x=\"0\" y=\"0em\"><defs /><text text-anchor=\"middle\" x=\"50%\" y=\"1em\">.</text></svg></svg><line stroke=\"black\" x1=\"50%\" x2=\"50%\" y1=\"1.2em\" y2=\"3em\" /></svg><line stroke=\"black\" x1=\"50%\" x2=\"90.5405%\" y1=\"1.2em\" y2=\"3em\" /></svg>" | |
476 | + ], | |
477 | + "text/plain": [ | |
478 | + "Tree('ROOT', [Tree('*S', [Tree('*VP', [Tree('*V', ['Poszedł', 'em'])]), Tree('PrepNP', [Tree('*Prep', ['do']), Tree('NP', [Tree('*N', ['adwokata'])])])]), Tree('Punct', ['.'])])" | |
479 | + ] | |
480 | + }, | |
481 | + "metadata": {}, | |
482 | + "output_type": "display_data" | |
483 | + }, | |
484 | + { | |
485 | + "name": "stdout", | |
486 | + "output_type": "stream", | |
487 | + "text": [ | |
488 | + "{('Punct', ('4##.',), '4##.'), ('*Prep', ('2##do',), '2##do'), ('ROOT', ('*S', 'Punct'), '0##Poszedł 1##em 2##do 3##adwokata 4##.'), ('NP', ('*N',), '3##adwokata'), ('*V', ('0##Poszedł', '1##em'), '0##Poszedł 1##em'), ('PrepNP', ('*Prep', 'NP'), '2##do 3##adwokata'), ('*S', ('*VP', 'PrepNP'), '0##Poszedł 1##em 2##do 3##adwokata'), ('*N', ('3##adwokata',), '3##adwokata'), ('*VP', ('*V',), '0##Poszedł 1##em')}\n", | |
489 | + "{('Punct', ('4##.',), '4##.'), ('*Prep', ('2##do',), '2##do'), ('ROOT', ('*S', 'Punct'), '0##Poszedł 1##em 2##do 3##adwokata 4##.'), ('NP', ('*N',), '3##adwokata'), ('*V', ('0##Poszedł', '1##em'), '0##Poszedł 1##em'), ('PrepNP', ('*Prep', 'NP'), '2##do 3##adwokata'), ('*S', ('*VP', 'PrepNP'), '0##Poszedł 1##em 2##do 3##adwokata'), ('*N', ('3##adwokata',), '3##adwokata'), ('*VP', ('*V',), '0##Poszedł 1##em')}\n", | |
490 | + "!!!!!!!!!!!!!!! *AdvP 0##Trudno\n" | |
491 | + ] | |
492 | + }, | |
493 | + { | |
494 | + "data": { | |
495 | + "text/plain": [ | |
496 | + "{('*Adv', ('0##Trudno',), '0##Trudno'),\n", | |
497 | + " ('*AdvP', ('*Adv',), '0##Trudno'),\n", | |
498 | + " ('*AdvP', ('*AdvP',), '0##Trudno'),\n", | |
499 | + " ('Punct', ('1##.',), '1##.'),\n", | |
500 | + " ('ROOT', ('*AdvP', 'Punct'), '0##Trudno 1##.')}" | |
501 | + ] | |
502 | + }, | |
503 | + "metadata": {}, | |
504 | + "output_type": "display_data" | |
505 | + }, | |
506 | + { | |
507 | + "name": "stdout", | |
508 | + "output_type": "stream", | |
509 | + "text": [ | |
510 | + "!!!!!!!!!!!!!!! *AdvP 0##Trudno\n" | |
511 | + ] | |
512 | + }, | |
513 | + { | |
514 | + "data": { | |
515 | + "text/plain": [ | |
516 | + "{('*Adv', ('0##Trudno',), '0##Trudno'),\n", | |
517 | + " ('*AdvP', ('*Adv',), '0##Trudno'),\n", | |
518 | + " ('*AdvP', ('*AdvP',), '0##Trudno'),\n", | |
519 | + " ('Punct', ('1##.',), '1##.'),\n", | |
520 | + " ('ROOT', ('*AdvP', 'Punct'), '0##Trudno 1##.')}" | |
521 | + ] | |
522 | + }, | |
523 | + "metadata": {}, | |
524 | + "output_type": "display_data" | |
525 | + }, | |
526 | + { | |
527 | + "name": "stdout", | |
528 | + "output_type": "stream", | |
529 | + "text": [ | |
530 | + "!!!!!!!!!!!!!!! *PrepNP 0##O 1##tym 2##, 3##iż 4##jesteś 5##nikim 6##, 7##iż 8##mają 9##cię 10##za 11##nic\n" | |
531 | + ] | |
532 | + }, | |
533 | + { | |
534 | + "data": { | |
535 | + "text/plain": [ | |
536 | + "{('*Comp', ('3##iż',), '3##iż'),\n", | |
537 | + " ('*Comp', ('7##iż',), '7##iż'),\n", | |
538 | + " ('*Conj', ('6##,',), '6##,'),\n", | |
539 | + " ('*N', ('1##tym',), '1##tym'),\n", | |
540 | + " ('*N', ('11##nic',), '11##nic'),\n", | |
541 | + " ('*N', ('5##nikim',), '5##nikim'),\n", | |
542 | + " ('*N', ('9##cię',), '9##cię'),\n", | |
543 | + " ('*NP', ('*N',), '1##tym'),\n", | |
544 | + " ('*Prep', ('0##O',), '0##O'),\n", | |
545 | + " ('*Prep', ('10##za',), '10##za'),\n", | |
546 | + " ('*PrepNP',\n", | |
547 | + " ('*Prep', 'NP'),\n", | |
548 | + " '0##O 1##tym 2##, 3##iż 4##jesteś 5##nikim 6##, 7##iż 8##mają 9##cię 10##za 11##nic'),\n", | |
549 | + " ('*PrepNP',\n", | |
550 | + " ('*PrepNP',),\n", | |
551 | + " '0##O 1##tym 2##, 3##iż 4##jesteś 5##nikim 6##, 7##iż 8##mają 9##cię 10##za 11##nic'),\n", | |
552 | + " ('*V', ('4##jesteś',), '4##jesteś'),\n", | |
553 | + " ('*V', ('8##mają',), '8##mają'),\n", | |
554 | + " ('*VP', ('*V',), '4##jesteś'),\n", | |
555 | + " ('*VP', ('*V',), '8##mają'),\n", | |
556 | + " ('CP', ('*Comp', 'S'), '7##iż 8##mają 9##cię 10##za 11##nic'),\n", | |
557 | + " ('CP',\n", | |
558 | + " ('CP', '*Conj', 'CP'),\n", | |
559 | + " '2##, 3##iż 4##jesteś 5##nikim 6##, 7##iż 8##mają 9##cię 10##za 11##nic'),\n", | |
560 | + " ('CP', ('Punct', '*Comp', 'S'), '2##, 3##iż 4##jesteś 5##nikim'),\n", | |
561 | + " ('NP', ('*N',), '11##nic'),\n", | |
562 | + " ('NP', ('*N',), '5##nikim'),\n", | |
563 | + " ('NP', ('*N',), '9##cię'),\n", | |
564 | + " ('NP',\n", | |
565 | + " ('*NP', 'CP'),\n", | |
566 | + " '1##tym 2##, 3##iż 4##jesteś 5##nikim 6##, 7##iż 8##mają 9##cię 10##za 11##nic'),\n", | |
567 | + " ('PrepNP', ('*Prep', 'NP'), '10##za 11##nic'),\n", | |
568 | + " ('Punct', ('12##!',), '12##!'),\n", | |
569 | + " ('Punct', ('2##,',), '2##,'),\n", | |
570 | + " ('ROOT',\n", | |
571 | + " ('*PrepNP', 'Punct'),\n", | |
572 | + " '0##O 1##tym 2##, 3##iż 4##jesteś 5##nikim 6##, 7##iż 8##mają 9##cię 10##za 11##nic 12##!'),\n", | |
573 | + " ('S', ('*VP', 'NP'), '4##jesteś 5##nikim'),\n", | |
574 | + " ('S', ('*VP', 'NP', 'PrepNP'), '8##mają 9##cię 10##za 11##nic')}" | |
575 | + ] | |
576 | + }, | |
577 | + "metadata": {}, | |
578 | + "output_type": "display_data" | |
579 | + }, | |
580 | + { | |
581 | + "name": "stdout", | |
582 | + "output_type": "stream", | |
583 | + "text": [ | |
584 | + "!!!!!!!!!!!!!!! *PrepNP 0##O 1##tym 2##, 3##iż 4##jesteś 5##nikim 6##, 7##iż 8##mają 9##cię 10##za 11##nic\n" | |
585 | + ] | |
586 | + }, | |
587 | + { | |
588 | + "data": { | |
589 | + "text/plain": [ | |
590 | + "{('*Comp', ('3##iż',), '3##iż'),\n", | |
591 | + " ('*Comp', ('7##iż',), '7##iż'),\n", | |
592 | + " ('*Conj', ('6##,',), '6##,'),\n", | |
593 | + " ('*N', ('1##tym',), '1##tym'),\n", | |
594 | + " ('*N', ('11##nic',), '11##nic'),\n", | |
595 | + " ('*N', ('5##nikim',), '5##nikim'),\n", | |
596 | + " ('*N', ('9##cię',), '9##cię'),\n", | |
597 | + " ('*NP', ('*N',), '1##tym'),\n", | |
598 | + " ('*Prep', ('0##O',), '0##O'),\n", | |
599 | + " ('*Prep', ('10##za',), '10##za'),\n", | |
600 | + " ('*PrepNP',\n", | |
601 | + " ('*Prep', 'NP'),\n", | |
602 | + " '0##O 1##tym 2##, 3##iż 4##jesteś 5##nikim 6##, 7##iż 8##mają 9##cię 10##za 11##nic'),\n", | |
603 | + " ('*PrepNP',\n", | |
604 | + " ('*PrepNP',),\n", | |
605 | + " '0##O 1##tym 2##, 3##iż 4##jesteś 5##nikim 6##, 7##iż 8##mają 9##cię 10##za 11##nic'),\n", | |
606 | + " ('*V', ('4##jesteś',), '4##jesteś'),\n", | |
607 | + " ('*V', ('8##mają',), '8##mają'),\n", | |
608 | + " ('*VP', ('*V',), '4##jesteś'),\n", | |
609 | + " ('*VP', ('*V',), '8##mają'),\n", | |
610 | + " ('CP', ('*Comp', 'S'), '7##iż 8##mają 9##cię 10##za 11##nic'),\n", | |
611 | + " ('CP',\n", | |
612 | + " ('CP', '*Conj', 'CP'),\n", | |
613 | + " '2##, 3##iż 4##jesteś 5##nikim 6##, 7##iż 8##mają 9##cię 10##za 11##nic'),\n", | |
614 | + " ('CP', ('Punct', '*Comp', 'S'), '2##, 3##iż 4##jesteś 5##nikim'),\n", | |
615 | + " ('NP', ('*N',), '11##nic'),\n", | |
616 | + " ('NP', ('*N',), '5##nikim'),\n", | |
617 | + " ('NP', ('*N',), '9##cię'),\n", | |
618 | + " ('NP',\n", | |
619 | + " ('*NP', 'CP'),\n", | |
620 | + " '1##tym 2##, 3##iż 4##jesteś 5##nikim 6##, 7##iż 8##mają 9##cię 10##za 11##nic'),\n", | |
621 | + " ('PrepNP', ('*Prep', 'NP'), '10##za 11##nic'),\n", | |
622 | + " ('Punct', ('12##!',), '12##!'),\n", | |
623 | + " ('Punct', ('2##,',), '2##,'),\n", | |
624 | + " ('ROOT',\n", | |
625 | + " ('*PrepNP', 'Punct'),\n", | |
626 | + " '0##O 1##tym 2##, 3##iż 4##jesteś 5##nikim 6##, 7##iż 8##mają 9##cię 10##za 11##nic 12##!'),\n", | |
627 | + " ('S', ('*VP', 'NP'), '4##jesteś 5##nikim'),\n", | |
628 | + " ('S', ('*VP', 'NP', 'PrepNP'), '8##mają 9##cię 10##za 11##nic')}" | |
629 | + ] | |
630 | + }, | |
631 | + "metadata": {}, | |
632 | + "output_type": "display_data" | |
633 | + }, | |
634 | + { | |
635 | + "name": "stdout", | |
636 | + "output_type": "stream", | |
637 | + "text": [ | |
638 | + "!!!!!!!!!!!!!!! *NP 0##Cztery 1##małe 2##groszki 3##w 4##strąku 5##w 6##tunelu 7##miłości\n" | |
639 | + ] | |
640 | + }, | |
641 | + { | |
642 | + "data": { | |
643 | + "text/plain": [ | |
644 | + "{('*Adj', ('1##małe',), '1##małe'),\n", | |
645 | + " ('*N', ('2##groszki',), '2##groszki'),\n", | |
646 | + " ('*N', ('4##strąku',), '4##strąku'),\n", | |
647 | + " ('*N', ('6##tunelu',), '6##tunelu'),\n", | |
648 | + " ('*N', ('7##miłości',), '7##miłości'),\n", | |
649 | + " ('*NP', ('*N',), '2##groszki'),\n", | |
650 | + " ('*NP', ('*N',), '6##tunelu'),\n", | |
651 | + " ('*NP',\n", | |
652 | + " ('*NP',),\n", | |
653 | + " '0##Cztery 1##małe 2##groszki 3##w 4##strąku 5##w 6##tunelu 7##miłości'),\n", | |
654 | + " ('*NP',\n", | |
655 | + " ('*NumP', 'NP'),\n", | |
656 | + " '0##Cztery 1##małe 2##groszki 3##w 4##strąku 5##w 6##tunelu 7##miłości'),\n", | |
657 | + " ('*Num', ('0##Cztery',), '0##Cztery'),\n", | |
658 | + " ('*NumP', ('*Num',), '0##Cztery'),\n", | |
659 | + " ('*Prep', ('3##w',), '3##w'),\n", | |
660 | + " ('*Prep', ('5##w',), '5##w'),\n", | |
661 | + " ('AdjP', ('*Adj',), '1##małe'),\n", | |
662 | + " ('NP', ('*N',), '4##strąku'),\n", | |
663 | + " ('NP', ('*N',), '7##miłości'),\n", | |
664 | + " ('NP', ('*NP', 'NP'), '6##tunelu 7##miłości'),\n", | |
665 | + " ('NP',\n", | |
666 | + " ('AdjP', '*NP', 'PrepNP', 'PrepNP'),\n", | |
667 | + " '1##małe 2##groszki 3##w 4##strąku 5##w 6##tunelu 7##miłości'),\n", | |
668 | + " ('PrepNP', ('*Prep', 'NP'), '3##w 4##strąku'),\n", | |
669 | + " ('PrepNP', ('*Prep', 'NP'), '5##w 6##tunelu 7##miłości'),\n", | |
670 | + " ('Punct', ('8##.',), '8##.'),\n", | |
671 | + " ('ROOT',\n", | |
672 | + " ('*NP', 'Punct'),\n", | |
673 | + " '0##Cztery 1##małe 2##groszki 3##w 4##strąku 5##w 6##tunelu 7##miłości 8##.')}" | |
674 | + ] | |
675 | + }, | |
676 | + "metadata": {}, | |
677 | + "output_type": "display_data" | |
678 | + }, | |
679 | + { | |
680 | + "name": "stdout", | |
681 | + "output_type": "stream", | |
682 | + "text": [ | |
683 | + "!!!!!!!!!!!!!!! *PrepNP 0##W 1##chwili 2##, 3##gdy 4##przeprowadzał 5##em 6##przez 7##Alpy 8##słonie 9##Hannibala\n" | |
684 | + ] | |
685 | + }, | |
686 | + { | |
687 | + "data": { | |
688 | + "text/plain": [ | |
689 | + "{('*Adv', ('3##gdy',), '3##gdy'),\n", | |
690 | + " ('*N', ('1##chwili',), '1##chwili'),\n", | |
691 | + " ('*N', ('7##Alpy',), '7##Alpy'),\n", | |
692 | + " ('*N', ('8##słonie',), '8##słonie'),\n", | |
693 | + " ('*N', ('9##Hannibala',), '9##Hannibala'),\n", | |
694 | + " ('*NP', ('*N',), '1##chwili'),\n", | |
695 | + " ('*NP', ('*N',), '8##słonie'),\n", | |
696 | + " ('*Prep', ('0##W',), '0##W'),\n", | |
697 | + " ('*Prep', ('6##przez',), '6##przez'),\n", | |
698 | + " ('*PrepNP',\n", | |
699 | + " ('*Prep', 'NP'),\n", | |
700 | + " '0##W 1##chwili 2##, 3##gdy 4##przeprowadzał 5##em 6##przez 7##Alpy 8##słonie 9##Hannibala'),\n", | |
701 | + " ('*PrepNP',\n", | |
702 | + " ('*PrepNP',),\n", | |
703 | + " '0##W 1##chwili 2##, 3##gdy 4##przeprowadzał 5##em 6##przez 7##Alpy 8##słonie 9##Hannibala'),\n", | |
704 | + " ('*S',\n", | |
705 | + " ('AdvP', '*VP', 'PrepNP', 'NP'),\n", | |
706 | + " '3##gdy 4##przeprowadzał 5##em 6##przez 7##Alpy 8##słonie 9##Hannibala'),\n", | |
707 | + " ('*V', ('4##przeprowadzał', '5##em'), '4##przeprowadzał 5##em'),\n", | |
708 | + " ('*VP', ('*V',), '4##przeprowadzał 5##em'),\n", | |
709 | + " ('AdvP', ('*Adv',), '3##gdy'),\n", | |
710 | + " ('CP',\n", | |
711 | + " ('Punct', '*S'),\n", | |
712 | + " '2##, 3##gdy 4##przeprowadzał 5##em 6##przez 7##Alpy 8##słonie 9##Hannibala'),\n", | |
713 | + " ('NP', ('*N',), '7##Alpy'),\n", | |
714 | + " ('NP', ('*N',), '9##Hannibala'),\n", | |
715 | + " ('NP',\n", | |
716 | + " ('*NP', 'CP'),\n", | |
717 | + " '1##chwili 2##, 3##gdy 4##przeprowadzał 5##em 6##przez 7##Alpy 8##słonie 9##Hannibala'),\n", | |
718 | + " ('NP', ('*NP', 'NP'), '8##słonie 9##Hannibala'),\n", | |
719 | + " ('PrepNP', ('*Prep', 'NP'), '6##przez 7##Alpy'),\n", | |
720 | + " ('Punct', ('10##.',), '10##.'),\n", | |
721 | + " ('Punct', ('2##,',), '2##,'),\n", | |
722 | + " ('ROOT',\n", | |
723 | + " ('*PrepNP', 'Punct'),\n", | |
724 | + " '0##W 1##chwili 2##, 3##gdy 4##przeprowadzał 5##em 6##przez 7##Alpy 8##słonie 9##Hannibala 10##.')}" | |
725 | + ] | |
726 | + }, | |
727 | + "metadata": {}, | |
728 | + "output_type": "display_data" | |
729 | + }, | |
730 | + { | |
731 | + "name": "stdout", | |
732 | + "output_type": "stream", | |
733 | + "text": [ | |
734 | + "!!!!!!!!!!!!!!! *PrepNP 0##W 1##chwili 2##, 3##gdy 4##przeprowadzał 5##em 6##przez 7##Alpy 8##słonie 9##Hannibala\n" | |
735 | + ] | |
736 | + }, | |
737 | + { | |
738 | + "data": { | |
739 | + "text/plain": [ | |
740 | + "{('*Adv', ('3##gdy',), '3##gdy'),\n", | |
741 | + " ('*N', ('1##chwili',), '1##chwili'),\n", | |
742 | + " ('*N', ('7##Alpy',), '7##Alpy'),\n", | |
743 | + " ('*N', ('8##słonie',), '8##słonie'),\n", | |
744 | + " ('*N', ('9##Hannibala',), '9##Hannibala'),\n", | |
745 | + " ('*NP', ('*N',), '1##chwili'),\n", | |
746 | + " ('*NP', ('*N',), '8##słonie'),\n", | |
747 | + " ('*Prep', ('0##W',), '0##W'),\n", | |
748 | + " ('*Prep', ('6##przez',), '6##przez'),\n", | |
749 | + " ('*PrepNP',\n", | |
750 | + " ('*Prep', 'NP'),\n", | |
751 | + " '0##W 1##chwili 2##, 3##gdy 4##przeprowadzał 5##em 6##przez 7##Alpy 8##słonie 9##Hannibala'),\n", | |
752 | + " ('*PrepNP',\n", | |
753 | + " ('*PrepNP',),\n", | |
754 | + " '0##W 1##chwili 2##, 3##gdy 4##przeprowadzał 5##em 6##przez 7##Alpy 8##słonie 9##Hannibala'),\n", | |
755 | + " ('*S',\n", | |
756 | + " ('AdvP', '*VP', 'PrepNP', 'NP'),\n", | |
757 | + " '3##gdy 4##przeprowadzał 5##em 6##przez 7##Alpy 8##słonie 9##Hannibala'),\n", | |
758 | + " ('*V', ('4##przeprowadzał', '5##em'), '4##przeprowadzał 5##em'),\n", | |
759 | + " ('*VP', ('*V',), '4##przeprowadzał 5##em'),\n", | |
760 | + " ('AdvP', ('*Adv',), '3##gdy'),\n", | |
761 | + " ('CP',\n", | |
762 | + " ('Punct', '*S'),\n", | |
763 | + " '2##, 3##gdy 4##przeprowadzał 5##em 6##przez 7##Alpy 8##słonie 9##Hannibala'),\n", | |
764 | + " ('NP', ('*N',), '7##Alpy'),\n", | |
765 | + " ('NP', ('*N',), '9##Hannibala'),\n", | |
766 | + " ('NP',\n", | |
767 | + " ('*NP', 'CP'),\n", | |
768 | + " '1##chwili 2##, 3##gdy 4##przeprowadzał 5##em 6##przez 7##Alpy 8##słonie 9##Hannibala'),\n", | |
769 | + " ('NP', ('*NP', 'NP'), '8##słonie 9##Hannibala'),\n", | |
770 | + " ('PrepNP', ('*Prep', 'NP'), '6##przez 7##Alpy'),\n", | |
771 | + " ('Punct', ('10##.',), '10##.'),\n", | |
772 | + " ('Punct', ('2##,',), '2##,'),\n", | |
773 | + " ('ROOT',\n", | |
774 | + " ('*PrepNP', 'Punct'),\n", | |
775 | + " '0##W 1##chwili 2##, 3##gdy 4##przeprowadzał 5##em 6##przez 7##Alpy 8##słonie 9##Hannibala 10##.')}" | |
776 | + ] | |
777 | + }, | |
778 | + "metadata": {}, | |
779 | + "output_type": "display_data" | |
780 | + }, | |
781 | + { | |
782 | + "name": "stdout", | |
783 | + "output_type": "stream", | |
784 | + "text": [ | |
785 | + "!!!!!!!!!!!!!!! *PrepNP 1##na 2##roczną 3##misję 4##na 5##Tytana 6##, 7##czternasty 8##księżyc 9##Saturna\n" | |
786 | + ] | |
787 | + }, | |
788 | + { | |
789 | + "data": { | |
790 | + "text/plain": [ | |
791 | + "{('*Adj', ('2##roczną',), '2##roczną'),\n", | |
792 | + " ('*Adj', ('7##czternasty',), '7##czternasty'),\n", | |
793 | + " ('*N', ('3##misję',), '3##misję'),\n", | |
794 | + " ('*N', ('5##Tytana',), '5##Tytana'),\n", | |
795 | + " ('*N', ('8##księżyc',), '8##księżyc'),\n", | |
796 | + " ('*N', ('9##Saturna',), '9##Saturna'),\n", | |
797 | + " ('*NP', ('*N',), '3##misję'),\n", | |
798 | + " ('*NP', ('*N',), '5##Tytana'),\n", | |
799 | + " ('*NP', ('*N',), '8##księżyc'),\n", | |
800 | + " ('*Prep', ('1##na',), '1##na'),\n", | |
801 | + " ('*Prep', ('4##na',), '4##na'),\n", | |
802 | + " ('*PrepNP',\n", | |
803 | + " ('*Prep', 'NP'),\n", | |
804 | + " '1##na 2##roczną 3##misję 4##na 5##Tytana 6##, 7##czternasty 8##księżyc 9##Saturna'),\n", | |
805 | + " ('*PrepNP',\n", | |
806 | + " ('*PrepNP',),\n", | |
807 | + " '1##na 2##roczną 3##misję 4##na 5##Tytana 6##, 7##czternasty 8##księżyc 9##Saturna'),\n", | |
808 | + " ('AdjP', ('*Adj',), '2##roczną'),\n", | |
809 | + " ('AdjP', ('*Adj',), '7##czternasty'),\n", | |
810 | + " ('NP', ('*N',), '9##Saturna'),\n", | |
811 | + " ('NP', ('*NP', 'NP'), '5##Tytana 6##, 7##czternasty 8##księżyc 9##Saturna'),\n", | |
812 | + " ('NP',\n", | |
813 | + " ('AdjP', '*NP', 'PrepNP'),\n", | |
814 | + " '2##roczną 3##misję 4##na 5##Tytana 6##, 7##czternasty 8##księżyc 9##Saturna'),\n", | |
815 | + " ('NP',\n", | |
816 | + " ('Punct', 'AdjP', '*NP', 'NP'),\n", | |
817 | + " '6##, 7##czternasty 8##księżyc 9##Saturna'),\n", | |
818 | + " ('PrepNP',\n", | |
819 | + " ('*Prep', 'NP'),\n", | |
820 | + " '4##na 5##Tytana 6##, 7##czternasty 8##księżyc 9##Saturna'),\n", | |
821 | + " ('Punct', ('0##-',), '0##-'),\n", | |
822 | + " ('Punct', ('10##.',), '10##.'),\n", | |
823 | + " ('Punct', ('6##,',), '6##,'),\n", | |
824 | + " ('ROOT',\n", | |
825 | + " ('Punct', '*PrepNP', 'Punct'),\n", | |
826 | + " '0##- 1##na 2##roczną 3##misję 4##na 5##Tytana 6##, 7##czternasty 8##księżyc 9##Saturna 10##.')}" | |
827 | + ] | |
828 | + }, | |
829 | + "metadata": {}, | |
830 | + "output_type": "display_data" | |
831 | + }, | |
832 | + { | |
833 | + "name": "stdout", | |
834 | + "output_type": "stream", | |
835 | + "text": [ | |
836 | + "!!!!!!!!!!!!!!! *PrepNP 1##na 2##roczną 3##misję 4##na 5##Tytana 6##, 7##czternasty 8##księżyc 9##Saturna\n" | |
837 | + ] | |
838 | + }, | |
839 | + { | |
840 | + "data": { | |
841 | + "text/plain": [ | |
842 | + "{('*Adj', ('2##roczną',), '2##roczną'),\n", | |
843 | + " ('*Adj', ('7##czternasty',), '7##czternasty'),\n", | |
844 | + " ('*N', ('3##misję',), '3##misję'),\n", | |
845 | + " ('*N', ('5##Tytana',), '5##Tytana'),\n", | |
846 | + " ('*N', ('8##księżyc',), '8##księżyc'),\n", | |
847 | + " ('*N', ('9##Saturna',), '9##Saturna'),\n", | |
848 | + " ('*NP', ('*N',), '3##misję'),\n", | |
849 | + " ('*NP', ('*N',), '5##Tytana'),\n", | |
850 | + " ('*NP', ('*N',), '8##księżyc'),\n", | |
851 | + " ('*Prep', ('1##na',), '1##na'),\n", | |
852 | + " ('*Prep', ('4##na',), '4##na'),\n", | |
853 | + " ('*PrepNP',\n", | |
854 | + " ('*Prep', 'NP'),\n", | |
855 | + " '1##na 2##roczną 3##misję 4##na 5##Tytana 6##, 7##czternasty 8##księżyc 9##Saturna'),\n", | |
856 | + " ('*PrepNP',\n", | |
857 | + " ('*PrepNP',),\n", | |
858 | + " '1##na 2##roczną 3##misję 4##na 5##Tytana 6##, 7##czternasty 8##księżyc 9##Saturna'),\n", | |
859 | + " ('AdjP', ('*Adj',), '2##roczną'),\n", | |
860 | + " ('AdjP', ('*Adj',), '7##czternasty'),\n", | |
861 | + " ('NP', ('*N',), '9##Saturna'),\n", | |
862 | + " ('NP', ('*NP', 'NP'), '5##Tytana 6##, 7##czternasty 8##księżyc 9##Saturna'),\n", | |
863 | + " ('NP',\n", | |
864 | + " ('AdjP', '*NP', 'PrepNP'),\n", | |
865 | + " '2##roczną 3##misję 4##na 5##Tytana 6##, 7##czternasty 8##księżyc 9##Saturna'),\n", | |
866 | + " ('NP',\n", | |
867 | + " ('Punct', 'AdjP', '*NP', 'NP'),\n", | |
868 | + " '6##, 7##czternasty 8##księżyc 9##Saturna'),\n", | |
869 | + " ('PrepNP',\n", | |
870 | + " ('*Prep', 'NP'),\n", | |
871 | + " '4##na 5##Tytana 6##, 7##czternasty 8##księżyc 9##Saturna'),\n", | |
872 | + " ('Punct', ('0##-',), '0##-'),\n", | |
873 | + " ('Punct', ('10##.',), '10##.'),\n", | |
874 | + " ('Punct', ('6##,',), '6##,'),\n", | |
875 | + " ('ROOT',\n", | |
876 | + " ('Punct', '*PrepNP', 'Punct'),\n", | |
877 | + " '0##- 1##na 2##roczną 3##misję 4##na 5##Tytana 6##, 7##czternasty 8##księżyc 9##Saturna 10##.')}" | |
878 | + ] | |
879 | + }, | |
880 | + "metadata": {}, | |
881 | + "output_type": "display_data" | |
882 | + } | |
883 | + ], | |
884 | + "source": [ | |
885 | + "evaluation_val, errors_val = eval_trees(val_trees, val_pred_trees)\n", | |
886 | + "evaluation_test, errors_test = eval_trees(test_trees, test_pred_trees)" | |
887 | + ] | |
888 | + }, | |
889 | + { | |
890 | + "cell_type": "code", | |
891 | + "execution_count": 56, | |
892 | + "id": "65af3522", | |
893 | + "metadata": {}, | |
894 | + "outputs": [ | |
895 | + { | |
896 | + "name": "stdout", | |
897 | + "output_type": "stream", | |
898 | + "text": [ | |
899 | + "DEV:\n", | |
900 | + "precision: 0.974400637684714\n", | |
901 | + "recall: 0.9742960930674555\n", | |
902 | + "f1: 0.9743483625717548\n" | |
903 | + ] | |
904 | + } | |
905 | + ], | |
906 | + "source": [ | |
907 | + "tp, fp, fn = list(map(sum, zip(*evaluation_val)))\n", | |
908 | + "p, r = tp / (tp + fp), tp / (tp + fn)\n", | |
909 | + "f1 = 2 * tp / (2 * tp + fp + fn)\n", | |
910 | + "print('DEV:')\n", | |
911 | + "print('precision: ', p)\n", | |
912 | + "print('recall: ', r)\n", | |
913 | + "print('f1: ', f1)" | |
914 | + ] | |
915 | + }, | |
916 | + { | |
917 | + "cell_type": "code", | |
918 | + "execution_count": 57, | |
919 | + "id": "8e0f3f93", | |
920 | + "metadata": {}, | |
921 | + "outputs": [ | |
922 | + { | |
923 | + "name": "stdout", | |
924 | + "output_type": "stream", | |
925 | + "text": [ | |
926 | + "TEST:\n", | |
927 | + "precision: 0.9774147274466051\n", | |
928 | + "recall: 0.9775082092645137\n", | |
929 | + "f1: 0.9774614661204711\n" | |
930 | + ] | |
931 | + } | |
932 | + ], | |
933 | + "source": [ | |
934 | + "tp, fp, fn = list(map(sum, zip(*evaluation_test)))\n", | |
935 | + "p, r = tp / (tp + fp), tp / (tp + fn)\n", | |
936 | + "f1 = 2 * tp / (2 * tp + fp + fn)\n", | |
937 | + "print('TEST:')\n", | |
938 | + "print('precision: ', p)\n", | |
939 | + "print('recall: ', r)\n", | |
940 | + "print('f1: ', f1)" | |
941 | + ] | |
942 | + }, | |
943 | + { | |
944 | + "cell_type": "code", | |
945 | + "execution_count": null, | |
946 | + "id": "302b2333", | |
947 | + "metadata": {}, | |
948 | + "outputs": [], | |
949 | + "source": [] | |
950 | + } | |
951 | + ], | |
952 | + "metadata": { | |
953 | + "kernelspec": { | |
954 | + "display_name": "torch_benepar", | |
955 | + "language": "python", | |
956 | + "name": "torch_benepar" | |
957 | + }, | |
958 | + "language_info": { | |
959 | + "codemirror_mode": { | |
960 | + "name": "ipython", | |
961 | + "version": 3 | |
962 | + }, | |
963 | + "file_extension": ".py", | |
964 | + "mimetype": "text/x-python", | |
965 | + "name": "python", | |
966 | + "nbconvert_exporter": "python", | |
967 | + "pygments_lexer": "ipython3", | |
968 | + "version": "3.10.6" | |
969 | + } | |
970 | + }, | |
971 | + "nbformat": 4, | |
972 | + "nbformat_minor": 5 | |
973 | +} | |
... | ... |
COMBO/DataPreparation.ipynb
0 → 100644
1 | +++ a/COMBO/DataPreparation.ipynb | |
1 | +{ | |
2 | + "cells": [ | |
3 | + { | |
4 | + "cell_type": "code", | |
5 | + "execution_count": 46, | |
6 | + "id": "5cd26f6f", | |
7 | + "metadata": {}, | |
8 | + "outputs": [], | |
9 | + "source": [ | |
10 | + "import os\n", | |
11 | + "\n", | |
12 | + "from datasets import load_dataset\n", | |
13 | + "\n", | |
14 | + "from IPython.display import display" | |
15 | + ] | |
16 | + }, | |
17 | + { | |
18 | + "cell_type": "code", | |
19 | + "execution_count": 47, | |
20 | + "id": "fecef4af", | |
21 | + "metadata": {}, | |
22 | + "outputs": [ | |
23 | + { | |
24 | + "name": "stderr", | |
25 | + "output_type": "stream", | |
26 | + "text": [ | |
27 | + "Found cached dataset pdb_c_beta (/home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1)\n" | |
28 | + ] | |
29 | + }, | |
30 | + { | |
31 | + "data": { | |
32 | + "application/vnd.jupyter.widget-view+json": { | |
33 | + "model_id": "1c89c7103bba4347a3fa7d23cac42cfe", | |
34 | + "version_major": 2, | |
35 | + "version_minor": 0 | |
36 | + }, | |
37 | + "text/plain": [ | |
38 | + " 0%| | 0/3 [00:00<?, ?it/s]" | |
39 | + ] | |
40 | + }, | |
41 | + "metadata": {}, | |
42 | + "output_type": "display_data" | |
43 | + } | |
44 | + ], | |
45 | + "source": [ | |
46 | + "pdbc_dataset = load_dataset('../pdb_c_beta')" | |
47 | + ] | |
48 | + }, | |
49 | + { | |
50 | + "cell_type": "code", | |
51 | + "execution_count": 48, | |
52 | + "id": "23da801f", | |
53 | + "metadata": {}, | |
54 | + "outputs": [], | |
55 | + "source": [ | |
56 | + "CONLLU_DIR = 'connlu'\n", | |
57 | + "! rm -r {CONLLU_DIR}\n", | |
58 | + "! mkdir {CONLLU_DIR}" | |
59 | + ] | |
60 | + }, | |
61 | + { | |
62 | + "cell_type": "code", | |
63 | + "execution_count": 50, | |
64 | + "id": "91fb3bf3", | |
65 | + "metadata": {}, | |
66 | + "outputs": [], | |
67 | + "source": [ | |
68 | + "import sys\n", | |
69 | + "sys.path.append('../')\n", | |
70 | + "from neural_parser.hybrid_tree_utils import tree_from_dataset_instance" | |
71 | + ] | |
72 | + }, | |
73 | + { | |
74 | + "cell_type": "code", | |
75 | + "execution_count": 60, | |
76 | + "id": "c105feff", | |
77 | + "metadata": {}, | |
78 | + "outputs": [ | |
79 | + { | |
80 | + "name": "stdout", | |
81 | + "output_type": "stream", | |
82 | + "text": [ | |
83 | + "train\n", | |
84 | + " connlu/pdbc-train.conllu\n", | |
85 | + " 17659\n", | |
86 | + " connlu/pdbc-cont-train.conllu\n", | |
87 | + " 15903\n", | |
88 | + "validation\n", | |
89 | + " connlu/pdbc-validation.conllu\n", | |
90 | + " 2211\n", | |
91 | + " connlu/pdbc-cont-validation.conllu\n", | |
92 | + " 1980\n", | |
93 | + "test\n", | |
94 | + " connlu/pdbc-test.conllu\n", | |
95 | + " 2205\n", | |
96 | + " connlu/pdbc-cont-test.conllu\n", | |
97 | + " 1990\n" | |
98 | + ] | |
99 | + } | |
100 | + ], | |
101 | + "source": [ | |
102 | + "features = pdbc_dataset['train'].features\n", | |
103 | + "\n", | |
104 | + "for part, dataset in pdbc_dataset.items():\n", | |
105 | + " print(part)\n", | |
106 | + " s_cont, s_all = [], [] \n", | |
107 | + " for sentence in dataset:\n", | |
108 | + " # TODO! check if discont\n", | |
109 | + " tokens = sentence['tokens']\n", | |
110 | + " lemmas = sentence['lemmas']\n", | |
111 | + " heads = sentence['heads']\n", | |
112 | + " heads = [h + 1 if h is not None else 0 for i, h in enumerate(heads)]\n", | |
113 | + " deprels = [features['deprels'].feature.int2str(d) for d in sentence['deprels']]\n", | |
114 | + " deprels = ['root' if deprel == 'ROOT' else deprel for deprel in deprels]\n", | |
115 | + " rows = [f'# text = {\" \".join(tokens)}'] + [\n", | |
116 | + " f'{i + 1}\\t{t}\\t{l}\\t_\\t_\\t_\\t{h}\\t{d}\\t{h}:{d}\\t_'\n", | |
117 | + " for i, (t, l, h, d) in enumerate(zip(tokens, lemmas, heads, deprels))\n", | |
118 | + " ]\n", | |
119 | + " s_all.append(rows)\n", | |
120 | + " if tree_from_dataset_instance(sentence, features).is_continuous():\n", | |
121 | + " s_cont.append(rows)\n", | |
122 | + " f_all = os.path.join(CONLLU_DIR, f'pdbc-{part}.conllu')\n", | |
123 | + " f_cont = os.path.join(CONLLU_DIR, f'pdbc-cont-{part}.conllu')\n", | |
124 | + " with open(f_all, 'w') as f:\n", | |
125 | + " print(' ', f_all)\n", | |
126 | + " print(' ', len(s_all))\n", | |
127 | + " for rows in s_all:\n", | |
128 | + " print('\\n'.join(rows), end='\\n\\n', file=f)\n", | |
129 | + " with open(f_cont, 'w') as f:\n", | |
130 | + " print(' ', f_cont)\n", | |
131 | + " print(' ', len(s_cont))\n", | |
132 | + " for rows in s_cont:\n", | |
133 | + " print('\\n'.join(rows), end='\\n\\n', file=f)" | |
134 | + ] | |
135 | + }, | |
136 | + { | |
137 | + "cell_type": "code", | |
138 | + "execution_count": 61, | |
139 | + "id": "c849233c", | |
140 | + "metadata": {}, | |
141 | + "outputs": [ | |
142 | + { | |
143 | + "name": "stdout", | |
144 | + "output_type": "stream", | |
145 | + "text": [ | |
146 | + " 32509 319813 1398303 connlu/pdbc-cont-test.conllu\n", | |
147 | + " 32509 319813 1198902 connlu/pdbc-cont-test-pred.conllu\n", | |
148 | + " 271337 2682725 11781617 connlu/pdbc-cont-train.conllu\n", | |
149 | + " 33491 330792 1452373 connlu/pdbc-cont-validation.conllu\n", | |
150 | + " 33491 330792 1244192 connlu/pdbc-cont-validation-pred.conllu\n", | |
151 | + " 37754 373431 1639937 connlu/pdbc-test.conllu\n", | |
152 | + " 37754 373431 1406776 connlu/pdbc-test-pred.conllu\n", | |
153 | + " 315364 3133712 13808053 connlu/pdbc-train.conllu\n", | |
154 | + " 38987 386865 1704685 connlu/pdbc-validation.conllu\n", | |
155 | + " 38987 386865 1461922 connlu/pdbc-validation-pred.conllu\n", | |
156 | + " 872183 8638239 37096760 total\n" | |
157 | + ] | |
158 | + } | |
159 | + ], | |
160 | + "source": [ | |
161 | + "! wc {CONLLU_DIR}/*.conllu" | |
162 | + ] | |
163 | + }, | |
164 | + { | |
165 | + "cell_type": "code", | |
166 | + "execution_count": 62, | |
167 | + "id": "6b571716", | |
168 | + "metadata": {}, | |
169 | + "outputs": [ | |
170 | + { | |
171 | + "name": "stdout", | |
172 | + "output_type": "stream", | |
173 | + "text": [ | |
174 | + "# text = Skośnooka dziewczynka trzyma w rękach drewniane pałeczki , a przed nią znajdują się naczynia kuchenne .\r\n", | |
175 | + "1\tSkośnooka\tskośnooki\t_\t_\t_\t2\tadjunct\t2:adjunct\t_\r\n", | |
176 | + "2\tdziewczynka\tdziewczynka\t_\t_\t_\t3\tsubj\t3:subj\t_\r\n", | |
177 | + "3\ttrzyma\ttrzymać\t_\t_\t_\t9\tconjunct\t9:conjunct\t_\r\n", | |
178 | + "4\tw\tw\t_\t_\t_\t3\tadjunct_locat\t3:adjunct_locat\t_\r\n", | |
179 | + "5\trękach\tręka\t_\t_\t_\t4\tcomp\t4:comp\t_\r\n", | |
180 | + "6\tdrewniane\tdrewniany\t_\t_\t_\t7\tadjunct\t7:adjunct\t_\r\n", | |
181 | + "7\tpałeczki\tpałeczka\t_\t_\t_\t3\tobj\t3:obj\t_\r\n", | |
182 | + "8\t,\t,\t_\t_\t_\t9\tpunct\t9:punct\t_\r\n", | |
183 | + "9\ta\ta\t_\t_\t_\t0\troot\t0:root\t_\r\n" | |
184 | + ] | |
185 | + } | |
186 | + ], | |
187 | + "source": [ | |
188 | + "! head {CONLLU_DIR}/pdbc-train.conllu" | |
189 | + ] | |
190 | + } | |
191 | + ], | |
192 | + "metadata": { | |
193 | + "kernelspec": { | |
194 | + "display_name": "TF_zajecia", | |
195 | + "language": "python", | |
196 | + "name": "tf_zajecia" | |
197 | + }, | |
198 | + "language_info": { | |
199 | + "codemirror_mode": { | |
200 | + "name": "ipython", | |
201 | + "version": 3 | |
202 | + }, | |
203 | + "file_extension": ".py", | |
204 | + "mimetype": "text/x-python", | |
205 | + "name": "python", | |
206 | + "nbconvert_exporter": "python", | |
207 | + "pygments_lexer": "ipython3", | |
208 | + "version": "3.10.6" | |
209 | + } | |
210 | + }, | |
211 | + "nbformat": 4, | |
212 | + "nbformat_minor": 5 | |
213 | +} | |
... | ... |
COMBO/ParseValAndTrain.ipynb
0 → 100644
1 | +++ a/COMBO/ParseValAndTrain.ipynb | |
1 | +{ | |
2 | + "cells": [ | |
3 | + { | |
4 | + "cell_type": "code", | |
5 | + "execution_count": 1, | |
6 | + "id": "aabfb24b", | |
7 | + "metadata": {}, | |
8 | + "outputs": [], | |
9 | + "source": [ | |
10 | + "COMBO = '/home/kkrasnowska/anaconda3/envs/combo_p39/bin/combo'" | |
11 | + ] | |
12 | + }, | |
13 | + { | |
14 | + "cell_type": "markdown", | |
15 | + "id": "787fff78", | |
16 | + "metadata": {}, | |
17 | + "source": [ | |
18 | + "Main model" | |
19 | + ] | |
20 | + }, | |
21 | + { | |
22 | + "cell_type": "code", | |
23 | + "execution_count": 2, | |
24 | + "id": "1d9daaa9", | |
25 | + "metadata": {}, | |
26 | + "outputs": [ | |
27 | + { | |
28 | + "name": "stdout", | |
29 | + "output_type": "stream", | |
30 | + "text": [ | |
31 | + "I0407 10:49:31.448594 140072765682752 archival.py:184] loading archive file model-pdbc/model.tar.gz\n", | |
32 | + "I0407 10:49:31.449148 140072765682752 archival.py:263] extracting archive file model-pdbc/model.tar.gz to temp dir /tmp/tmp_htckuhc\n", | |
33 | + "I0407 10:49:48.075045 140072765682752 params.py:248] dataset_reader.type = conllu\n", | |
34 | + "I0407 10:49:48.075561 140072765682752 params.py:248] dataset_reader.lazy = False\n", | |
35 | + "I0407 10:49:48.075693 140072765682752 params.py:248] dataset_reader.cache_directory = None\n", | |
36 | + "I0407 10:49:48.075764 140072765682752 params.py:248] dataset_reader.max_instances = None\n", | |
37 | + "I0407 10:49:48.075832 140072765682752 params.py:248] dataset_reader.manual_distributed_sharding = False\n", | |
38 | + "I0407 10:49:48.075901 140072765682752 params.py:248] dataset_reader.manual_multi_process_sharding = False\n", | |
39 | + "I0407 10:49:48.076193 140072765682752 params.py:248] dataset_reader.token_indexers.char.type = characters_const_padding\n", | |
40 | + "I0407 10:49:48.076388 140072765682752 params.py:248] dataset_reader.token_indexers.char.namespace = token_characters\n", | |
41 | + "I0407 10:49:48.076621 140072765682752 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.byte_encoding = None\n", | |
42 | + "I0407 10:49:48.076697 140072765682752 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.lowercase_characters = False\n", | |
43 | + "I0407 10:49:48.076790 140072765682752 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.start_tokens = ['__START__']\n", | |
44 | + "I0407 10:49:48.076939 140072765682752 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.end_tokens = ['__END__']\n", | |
45 | + "I0407 10:49:48.077063 140072765682752 params.py:248] dataset_reader.token_indexers.char.start_tokens = None\n", | |
46 | + "I0407 10:49:48.077118 140072765682752 params.py:248] dataset_reader.token_indexers.char.end_tokens = None\n", | |
47 | + "I0407 10:49:48.077185 140072765682752 params.py:248] dataset_reader.token_indexers.char.min_padding_length = 32\n", | |
48 | + "I0407 10:49:48.077238 140072765682752 params.py:248] dataset_reader.token_indexers.char.token_min_padding_length = 0\n", | |
49 | + "I0407 10:49:48.077383 140072765682752 params.py:248] dataset_reader.token_indexers.feats.type = feats_indexer\n", | |
50 | + "I0407 10:49:48.077555 140072765682752 params.py:248] dataset_reader.token_indexers.feats.namespace = feats\n", | |
51 | + "I0407 10:49:48.077628 140072765682752 params.py:248] dataset_reader.token_indexers.feats.feature_name = feats_\n", | |
52 | + "I0407 10:49:48.077702 140072765682752 params.py:248] dataset_reader.token_indexers.feats.token_min_padding_length = 0\n", | |
53 | + "I0407 10:49:48.077838 140072765682752 params.py:248] dataset_reader.token_indexers.lemma.type = characters_const_padding\n", | |
54 | + "I0407 10:49:48.078031 140072765682752 params.py:248] dataset_reader.token_indexers.lemma.namespace = token_characters\n", | |
55 | + "I0407 10:49:48.078231 140072765682752 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.byte_encoding = None\n", | |
56 | + "I0407 10:49:48.078300 140072765682752 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.lowercase_characters = False\n", | |
57 | + "I0407 10:49:48.078378 140072765682752 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.start_tokens = ['__START__']\n", | |
58 | + "I0407 10:49:48.078666 140072765682752 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.end_tokens = ['__END__']\n", | |
59 | + "I0407 10:49:48.078786 140072765682752 params.py:248] dataset_reader.token_indexers.lemma.start_tokens = None\n", | |
60 | + "I0407 10:49:48.078862 140072765682752 params.py:248] dataset_reader.token_indexers.lemma.end_tokens = None\n", | |
61 | + "I0407 10:49:48.078916 140072765682752 params.py:248] dataset_reader.token_indexers.lemma.min_padding_length = 32\n", | |
62 | + "I0407 10:49:48.078969 140072765682752 params.py:248] dataset_reader.token_indexers.lemma.token_min_padding_length = 0\n", | |
63 | + "I0407 10:49:48.079103 140072765682752 params.py:248] dataset_reader.token_indexers.token.type = pretrained_transformer_mismatched_fixed\n", | |
64 | + "I0407 10:49:48.079328 140072765682752 params.py:248] dataset_reader.token_indexers.token.token_min_padding_length = 0\n", | |
65 | + "I0407 10:49:48.079406 140072765682752 params.py:248] dataset_reader.token_indexers.token.model_name = allegro/herbert-large-cased\n", | |
66 | + "I0407 10:49:48.079461 140072765682752 params.py:248] dataset_reader.token_indexers.token.namespace = tags\n", | |
67 | + "I0407 10:49:48.079525 140072765682752 params.py:248] dataset_reader.token_indexers.token.max_length = None\n", | |
68 | + "I0407 10:49:48.079628 140072765682752 params.py:384] dataset_reader.token_indexers.token.tokenizer_kwargs.use_fast = False\n", | |
69 | + "I0407 10:49:51.185825 140072765682752 params.py:248] dataset_reader.token_indexers.upostag.type = single_id\n", | |
70 | + "I0407 10:49:51.186234 140072765682752 params.py:248] dataset_reader.token_indexers.upostag.namespace = upostag\n", | |
71 | + "I0407 10:49:51.186336 140072765682752 params.py:248] dataset_reader.token_indexers.upostag.lowercase_tokens = False\n", | |
72 | + "I0407 10:49:51.186398 140072765682752 params.py:248] dataset_reader.token_indexers.upostag.start_tokens = None\n", | |
73 | + "I0407 10:49:51.186465 140072765682752 params.py:248] dataset_reader.token_indexers.upostag.end_tokens = None\n", | |
74 | + "I0407 10:49:51.186517 140072765682752 params.py:248] dataset_reader.token_indexers.upostag.feature_name = pos_\n", | |
75 | + "I0407 10:49:51.186579 140072765682752 params.py:248] dataset_reader.token_indexers.upostag.default_value = THIS IS A REALLY UNLIKELY VALUE THAT HAS TO BE A STRING\n", | |
76 | + "I0407 10:49:51.186631 140072765682752 params.py:248] dataset_reader.token_indexers.upostag.token_min_padding_length = 0\n", | |
77 | + "I0407 10:49:51.186791 140072765682752 params.py:248] dataset_reader.token_indexers.xpostag.type = single_id\n", | |
78 | + "I0407 10:49:51.186975 140072765682752 params.py:248] dataset_reader.token_indexers.xpostag.namespace = xpostag\n", | |
79 | + "I0407 10:49:51.187041 140072765682752 params.py:248] dataset_reader.token_indexers.xpostag.lowercase_tokens = False\n", | |
80 | + "I0407 10:49:51.187107 140072765682752 params.py:248] dataset_reader.token_indexers.xpostag.start_tokens = None\n", | |
81 | + "I0407 10:49:51.187170 140072765682752 params.py:248] dataset_reader.token_indexers.xpostag.end_tokens = None\n", | |
82 | + "I0407 10:49:51.187220 140072765682752 params.py:248] dataset_reader.token_indexers.xpostag.feature_name = tag_\n", | |
83 | + "I0407 10:49:51.187275 140072765682752 params.py:248] dataset_reader.token_indexers.xpostag.default_value = THIS IS A REALLY UNLIKELY VALUE THAT HAS TO BE A STRING\n", | |
84 | + "I0407 10:49:51.187334 140072765682752 params.py:248] dataset_reader.token_indexers.xpostag.token_min_padding_length = 0\n", | |
85 | + "I0407 10:49:51.187556 140072765682752 params.py:248] dataset_reader.lemma_indexers.char.type = characters_const_padding\n", | |
86 | + "I0407 10:49:51.187731 140072765682752 params.py:248] dataset_reader.lemma_indexers.char.namespace = lemma_characters\n", | |
87 | + "I0407 10:49:51.187935 140072765682752 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.byte_encoding = None\n", | |
88 | + "I0407 10:49:51.187995 140072765682752 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.lowercase_characters = False\n", | |
89 | + "I0407 10:49:51.188073 140072765682752 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.start_tokens = ['__START__']\n", | |
90 | + "I0407 10:49:51.188217 140072765682752 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.end_tokens = ['__END__']\n", | |
91 | + "I0407 10:49:51.188334 140072765682752 params.py:248] dataset_reader.lemma_indexers.char.start_tokens = None\n", | |
92 | + "I0407 10:49:51.188398 140072765682752 params.py:248] dataset_reader.lemma_indexers.char.end_tokens = None\n", | |
93 | + "I0407 10:49:51.188460 140072765682752 params.py:248] dataset_reader.lemma_indexers.char.min_padding_length = 32\n", | |
94 | + "I0407 10:49:51.188522 140072765682752 params.py:248] dataset_reader.lemma_indexers.char.token_min_padding_length = 0\n", | |
95 | + "I0407 10:49:51.188614 140072765682752 params.py:248] dataset_reader.features = ['token', 'char']\n", | |
96 | + "I0407 10:49:51.188712 140072765682752 params.py:248] dataset_reader.targets = ['head', 'deprel']\n", | |
97 | + "I0407 10:49:51.188802 140072765682752 params.py:248] dataset_reader.use_sem = False\n", | |
98 | + "I0407 10:49:51.188952 140072765682752 params.py:248] dataset_reader.type = conllu\n", | |
99 | + "I0407 10:49:51.189191 140072765682752 params.py:248] dataset_reader.lazy = False\n", | |
100 | + "I0407 10:49:51.189266 140072765682752 params.py:248] dataset_reader.cache_directory = None\n", | |
101 | + "I0407 10:49:51.189324 140072765682752 params.py:248] dataset_reader.max_instances = None\n", | |
102 | + "I0407 10:49:51.189382 140072765682752 params.py:248] dataset_reader.manual_distributed_sharding = False\n", | |
103 | + "I0407 10:49:51.189436 140072765682752 params.py:248] dataset_reader.manual_multi_process_sharding = False\n", | |
104 | + "I0407 10:49:51.189675 140072765682752 params.py:248] dataset_reader.token_indexers.char.type = characters_const_padding\n", | |
105 | + "I0407 10:49:51.189843 140072765682752 params.py:248] dataset_reader.token_indexers.char.namespace = token_characters\n", | |
106 | + "I0407 10:49:51.190060 140072765682752 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.byte_encoding = None\n", | |
107 | + "I0407 10:49:51.190128 140072765682752 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.lowercase_characters = False\n", | |
108 | + "I0407 10:49:51.190197 140072765682752 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.start_tokens = ['__START__']\n", | |
109 | + "I0407 10:49:51.190324 140072765682752 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.end_tokens = ['__END__']\n", | |
110 | + "I0407 10:49:51.190443 140072765682752 params.py:248] dataset_reader.token_indexers.char.start_tokens = None\n", | |
111 | + "I0407 10:49:51.190508 140072765682752 params.py:248] dataset_reader.token_indexers.char.end_tokens = None\n", | |
112 | + "I0407 10:49:51.190564 140072765682752 params.py:248] dataset_reader.token_indexers.char.min_padding_length = 32\n", | |
113 | + "I0407 10:49:51.190627 140072765682752 params.py:248] dataset_reader.token_indexers.char.token_min_padding_length = 0\n", | |
114 | + "I0407 10:49:51.190772 140072765682752 params.py:248] dataset_reader.token_indexers.feats.type = feats_indexer\n", | |
115 | + "I0407 10:49:51.190932 140072765682752 params.py:248] dataset_reader.token_indexers.feats.namespace = feats\n", | |
116 | + "I0407 10:49:51.191003 140072765682752 params.py:248] dataset_reader.token_indexers.feats.feature_name = feats_\n", | |
117 | + "I0407 10:49:51.191065 140072765682752 params.py:248] dataset_reader.token_indexers.feats.token_min_padding_length = 0\n", | |
118 | + "I0407 10:49:51.191206 140072765682752 params.py:248] dataset_reader.token_indexers.lemma.type = characters_const_padding\n", | |
119 | + "I0407 10:49:51.191369 140072765682752 params.py:248] dataset_reader.token_indexers.lemma.namespace = token_characters\n", | |
120 | + "I0407 10:49:51.191561 140072765682752 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.byte_encoding = None\n", | |
121 | + "I0407 10:49:51.191629 140072765682752 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.lowercase_characters = False\n", | |
122 | + "I0407 10:49:51.191706 140072765682752 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.start_tokens = ['__START__']\n", | |
123 | + "I0407 10:49:51.191827 140072765682752 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.end_tokens = ['__END__']\n", | |
124 | + "I0407 10:49:51.191938 140072765682752 params.py:248] dataset_reader.token_indexers.lemma.start_tokens = None\n", | |
125 | + "I0407 10:49:51.191999 140072765682752 params.py:248] dataset_reader.token_indexers.lemma.end_tokens = None\n", | |
126 | + "I0407 10:49:51.192067 140072765682752 params.py:248] dataset_reader.token_indexers.lemma.min_padding_length = 32\n", | |
127 | + "I0407 10:49:51.192142 140072765682752 params.py:248] dataset_reader.token_indexers.lemma.token_min_padding_length = 0\n", | |
128 | + "I0407 10:49:51.192281 140072765682752 params.py:248] dataset_reader.token_indexers.token.type = pretrained_transformer_mismatched_fixed\n", | |
129 | + "I0407 10:49:51.192501 140072765682752 params.py:248] dataset_reader.token_indexers.token.token_min_padding_length = 0\n", | |
130 | + "I0407 10:49:51.192575 140072765682752 params.py:248] dataset_reader.token_indexers.token.model_name = allegro/herbert-large-cased\n", | |
131 | + "I0407 10:49:51.192638 140072765682752 params.py:248] dataset_reader.token_indexers.token.namespace = tags\n", | |
132 | + "I0407 10:49:51.192698 140072765682752 params.py:248] dataset_reader.token_indexers.token.max_length = None\n", | |
133 | + "I0407 10:49:51.192795 140072765682752 params.py:384] dataset_reader.token_indexers.token.tokenizer_kwargs.use_fast = False\n", | |
134 | + "I0407 10:49:51.194080 140072765682752 params.py:248] dataset_reader.token_indexers.upostag.type = single_id\n" | |
135 | + ] | |
136 | + }, | |
137 | + { | |
138 | + "name": "stdout", | |
139 | + "output_type": "stream", | |
140 | + "text": [ | |
141 | + "I0407 10:49:51.194318 140072765682752 params.py:248] dataset_reader.token_indexers.upostag.namespace = upostag\n", | |
142 | + "I0407 10:49:51.194404 140072765682752 params.py:248] dataset_reader.token_indexers.upostag.lowercase_tokens = False\n", | |
143 | + "I0407 10:49:51.194471 140072765682752 params.py:248] dataset_reader.token_indexers.upostag.start_tokens = None\n", | |
144 | + "I0407 10:49:51.194532 140072765682752 params.py:248] dataset_reader.token_indexers.upostag.end_tokens = None\n", | |
145 | + "I0407 10:49:51.194586 140072765682752 params.py:248] dataset_reader.token_indexers.upostag.feature_name = pos_\n", | |
146 | + "I0407 10:49:51.194648 140072765682752 params.py:248] dataset_reader.token_indexers.upostag.default_value = THIS IS A REALLY UNLIKELY VALUE THAT HAS TO BE A STRING\n", | |
147 | + "I0407 10:49:51.194708 140072765682752 params.py:248] dataset_reader.token_indexers.upostag.token_min_padding_length = 0\n", | |
148 | + "I0407 10:49:51.194854 140072765682752 params.py:248] dataset_reader.token_indexers.xpostag.type = single_id\n", | |
149 | + "I0407 10:49:51.195033 140072765682752 params.py:248] dataset_reader.token_indexers.xpostag.namespace = xpostag\n", | |
150 | + "I0407 10:49:51.195105 140072765682752 params.py:248] dataset_reader.token_indexers.xpostag.lowercase_tokens = False\n", | |
151 | + "I0407 10:49:51.195167 140072765682752 params.py:248] dataset_reader.token_indexers.xpostag.start_tokens = None\n", | |
152 | + "I0407 10:49:51.195222 140072765682752 params.py:248] dataset_reader.token_indexers.xpostag.end_tokens = None\n", | |
153 | + "I0407 10:49:51.195280 140072765682752 params.py:248] dataset_reader.token_indexers.xpostag.feature_name = tag_\n", | |
154 | + "I0407 10:49:51.195338 140072765682752 params.py:248] dataset_reader.token_indexers.xpostag.default_value = THIS IS A REALLY UNLIKELY VALUE THAT HAS TO BE A STRING\n", | |
155 | + "I0407 10:49:51.195398 140072765682752 params.py:248] dataset_reader.token_indexers.xpostag.token_min_padding_length = 0\n", | |
156 | + "I0407 10:49:51.195601 140072765682752 params.py:248] dataset_reader.lemma_indexers.char.type = characters_const_padding\n", | |
157 | + "I0407 10:49:51.195774 140072765682752 params.py:248] dataset_reader.lemma_indexers.char.namespace = lemma_characters\n", | |
158 | + "I0407 10:49:51.195971 140072765682752 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.byte_encoding = None\n", | |
159 | + "I0407 10:49:51.196039 140072765682752 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.lowercase_characters = False\n", | |
160 | + "I0407 10:49:51.196113 140072765682752 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.start_tokens = ['__START__']\n", | |
161 | + "I0407 10:49:51.196244 140072765682752 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.end_tokens = ['__END__']\n", | |
162 | + "I0407 10:49:51.196364 140072765682752 params.py:248] dataset_reader.lemma_indexers.char.start_tokens = None\n", | |
163 | + "I0407 10:49:51.196430 140072765682752 params.py:248] dataset_reader.lemma_indexers.char.end_tokens = None\n", | |
164 | + "I0407 10:49:51.196492 140072765682752 params.py:248] dataset_reader.lemma_indexers.char.min_padding_length = 32\n", | |
165 | + "I0407 10:49:51.196552 140072765682752 params.py:248] dataset_reader.lemma_indexers.char.token_min_padding_length = 0\n", | |
166 | + "I0407 10:49:51.196640 140072765682752 params.py:248] dataset_reader.features = ['token', 'char']\n", | |
167 | + "I0407 10:49:51.196732 140072765682752 params.py:248] dataset_reader.targets = ['head', 'deprel']\n", | |
168 | + "I0407 10:49:51.196815 140072765682752 params.py:248] dataset_reader.use_sem = False\n", | |
169 | + "I0407 10:49:51.197346 140072765682752 params.py:248] vocabulary.type = from_instances_extended\n", | |
170 | + "I0407 10:49:51.197421 140072765682752 vocabulary.py:323] Loading token dictionary from /tmp/tmp_htckuhc/vocabulary.\n", | |
171 | + "I0407 10:49:51.197736 140072765682752 filelock.py:254] Lock 140069359832176 acquired on /tmp/tmp_htckuhc/vocabulary/.lock\n", | |
172 | + "I0407 10:49:51.198361 140072765682752 filelock.py:317] Lock 140069359832176 released on /tmp/tmp_htckuhc/vocabulary/.lock\n", | |
173 | + "I0407 10:49:51.198865 140072765682752 params.py:248] model.type = semantic_multitask\n", | |
174 | + "I0407 10:49:51.199399 140072765682752 params.py:248] model.text_field_embedder.type = basic\n", | |
175 | + "I0407 10:49:51.199762 140072765682752 params.py:248] model.text_field_embedder.token_embedders.char.type = char_embeddings_from_config\n", | |
176 | + "I0407 10:49:51.199955 140072765682752 params.py:248] model.text_field_embedder.token_embedders.char.embedding_dim = 64\n", | |
177 | + "I0407 10:49:51.200206 140072765682752 params.py:248] model.text_field_embedder.token_embedders.char.dilated_cnn_encoder.input_dim = 64\n", | |
178 | + "I0407 10:49:51.200286 140072765682752 params.py:248] model.text_field_embedder.token_embedders.char.dilated_cnn_encoder.filters = [512, 256, 64]\n", | |
179 | + "I0407 10:49:51.200380 140072765682752 params.py:248] model.text_field_embedder.token_embedders.char.dilated_cnn_encoder.kernel_size = [3, 3, 3]\n", | |
180 | + "I0407 10:49:51.200467 140072765682752 params.py:248] model.text_field_embedder.token_embedders.char.dilated_cnn_encoder.stride = [1, 1, 1]\n", | |
181 | + "I0407 10:49:51.200556 140072765682752 params.py:248] model.text_field_embedder.token_embedders.char.dilated_cnn_encoder.padding = [1, 2, 4]\n", | |
182 | + "I0407 10:49:51.200649 140072765682752 params.py:248] model.text_field_embedder.token_embedders.char.dilated_cnn_encoder.dilation = [1, 2, 4]\n", | |
183 | + "I0407 10:49:51.200745 140072765682752 params.py:248] model.text_field_embedder.token_embedders.char.dilated_cnn_encoder.activations = ['relu', 'relu', 'linear']\n", | |
184 | + "I0407 10:49:51.200886 140072765682752 params.py:248] type = relu\n", | |
185 | + "I0407 10:49:51.201073 140072765682752 params.py:248] type = relu\n", | |
186 | + "I0407 10:49:51.201222 140072765682752 params.py:248] type = linear\n", | |
187 | + "I0407 10:49:51.208180 140072765682752 params.py:248] model.text_field_embedder.token_embedders.char.vocab_namespace = token_characters\n", | |
188 | + "I0407 10:49:51.208718 140072765682752 params.py:248] model.text_field_embedder.token_embedders.token.type = transformers_word_embeddings\n", | |
189 | + "I0407 10:49:51.208946 140072765682752 params.py:248] model.text_field_embedder.token_embedders.token.model_name = allegro/herbert-large-cased\n", | |
190 | + "I0407 10:49:51.209028 140072765682752 params.py:248] model.text_field_embedder.token_embedders.token.projection_dim = 100\n", | |
191 | + "I0407 10:49:51.209110 140072765682752 params.py:248] model.text_field_embedder.token_embedders.token.projection_activation = <function TransformersWordEmbedder.<lambda> at 0x7f646dd85280>\n", | |
192 | + "I0407 10:49:51.209182 140072765682752 params.py:248] model.text_field_embedder.token_embedders.token.projection_dropout_rate = 0.0\n", | |
193 | + "I0407 10:49:51.209239 140072765682752 params.py:248] model.text_field_embedder.token_embedders.token.freeze_transformer = True\n", | |
194 | + "I0407 10:49:51.209295 140072765682752 params.py:248] model.text_field_embedder.token_embedders.token.last_layer_only = True\n", | |
195 | + "I0407 10:49:51.209401 140072765682752 params.py:384] model.text_field_embedder.token_embedders.token.tokenizer_kwargs.use_fast = False\n", | |
196 | + "I0407 10:49:51.209471 140072765682752 params.py:248] model.text_field_embedder.token_embedders.token.transformer_kwargs = None\n", | |
197 | + "I0407 10:49:58.747374 140072765682752 params.py:248] model.seq_encoder.type = combo_encoder\n", | |
198 | + "I0407 10:49:58.747746 140072765682752 params.py:248] model.seq_encoder.stacked_bilstm.input_size = 164\n", | |
199 | + "I0407 10:49:58.747819 140072765682752 params.py:248] model.seq_encoder.stacked_bilstm.hidden_size = 512\n", | |
200 | + "I0407 10:49:58.747869 140072765682752 params.py:248] model.seq_encoder.stacked_bilstm.num_layers = 2\n", | |
201 | + "I0407 10:49:58.747919 140072765682752 params.py:248] model.seq_encoder.stacked_bilstm.recurrent_dropout_probability = 0.33\n", | |
202 | + "I0407 10:49:58.747966 140072765682752 params.py:248] model.seq_encoder.stacked_bilstm.layer_dropout_probability = 0.33\n", | |
203 | + "I0407 10:49:58.748013 140072765682752 params.py:248] model.seq_encoder.stacked_bilstm.use_highway = False\n", | |
204 | + "I0407 10:49:59.084017 140072765682752 params.py:248] model.seq_encoder.layer_dropout_probability = 0.33\n", | |
205 | + "I0407 10:49:59.084280 140072765682752 params.py:248] model.use_sample_weight = True\n", | |
206 | + "I0407 10:49:59.084377 140072765682752 params.py:248] model.lemmatizer = None\n", | |
207 | + "I0407 10:49:59.084436 140072765682752 params.py:248] model.upos_tagger = None\n", | |
208 | + "I0407 10:49:59.084487 140072765682752 params.py:248] model.xpos_tagger = None\n", | |
209 | + "I0407 10:49:59.084537 140072765682752 params.py:248] model.semantic_relation = None\n", | |
210 | + "I0407 10:49:59.084585 140072765682752 params.py:248] model.morphological_feat = None\n", | |
211 | + "I0407 10:49:59.084832 140072765682752 params.py:248] model.dependency_relation.type = combo_dependency_parsing_from_vocab\n", | |
212 | + "I0407 10:49:59.085025 140072765682752 params.py:248] model.dependency_relation.vocab_namespace = deprel_labels\n", | |
213 | + "I0407 10:49:59.085301 140072765682752 params.py:248] model.dependency_relation.head_predictor.head_projection_layer.in_features = 1024\n", | |
214 | + "I0407 10:49:59.085365 140072765682752 params.py:248] model.dependency_relation.head_predictor.head_projection_layer.out_features = 512\n", | |
215 | + "I0407 10:49:59.085421 140072765682752 params.py:248] model.dependency_relation.head_predictor.head_projection_layer.activation = tanh\n", | |
216 | + "I0407 10:49:59.085520 140072765682752 params.py:248] type = tanh\n", | |
217 | + "I0407 10:49:59.085608 140072765682752 params.py:248] model.dependency_relation.head_predictor.head_projection_layer.dropout_rate = 0.0\n", | |
218 | + "I0407 10:49:59.089095 140072765682752 params.py:248] model.dependency_relation.head_predictor.dependency_projection_layer.in_features = 1024\n", | |
219 | + "I0407 10:49:59.089183 140072765682752 params.py:248] model.dependency_relation.head_predictor.dependency_projection_layer.out_features = 512\n", | |
220 | + "I0407 10:49:59.089244 140072765682752 params.py:248] model.dependency_relation.head_predictor.dependency_projection_layer.activation = tanh\n", | |
221 | + "I0407 10:49:59.089346 140072765682752 params.py:248] type = tanh\n", | |
222 | + "I0407 10:49:59.089423 140072765682752 params.py:248] model.dependency_relation.head_predictor.dependency_projection_layer.dropout_rate = 0.0\n", | |
223 | + "I0407 10:49:59.092701 140072765682752 params.py:248] model.dependency_relation.head_predictor.cycle_loss_n = 0\n", | |
224 | + "I0407 10:49:59.092917 140072765682752 params.py:248] model.dependency_relation.head_projection_layer.in_features = 1024\n", | |
225 | + "I0407 10:49:59.092972 140072765682752 params.py:248] model.dependency_relation.head_projection_layer.out_features = 128\n", | |
226 | + "I0407 10:49:59.093022 140072765682752 params.py:248] model.dependency_relation.head_projection_layer.activation = tanh\n", | |
227 | + "I0407 10:49:59.093108 140072765682752 params.py:248] type = tanh\n", | |
228 | + "I0407 10:49:59.093183 140072765682752 params.py:248] model.dependency_relation.head_projection_layer.dropout_rate = 0.25\n", | |
229 | + "I0407 10:49:59.094336 140072765682752 params.py:248] model.dependency_relation.dependency_projection_layer.in_features = 1024\n", | |
230 | + "I0407 10:49:59.094411 140072765682752 params.py:248] model.dependency_relation.dependency_projection_layer.out_features = 128\n", | |
231 | + "I0407 10:49:59.094463 140072765682752 params.py:248] model.dependency_relation.dependency_projection_layer.activation = tanh\n", | |
232 | + "I0407 10:49:59.094551 140072765682752 params.py:248] type = tanh\n", | |
233 | + "I0407 10:49:59.094618 140072765682752 params.py:248] model.dependency_relation.dependency_projection_layer.dropout_rate = 0.25\n", | |
234 | + "I0407 10:49:59.095806 140072765682752 params.py:248] model.enhanced_dependency_relation = None\n", | |
235 | + "I0407 10:49:59.096206 140072765682752 params.py:248] model.regularizer.regexes.0.1.type = l2\n", | |
236 | + "I0407 10:49:59.096345 140072765682752 params.py:248] model.regularizer.regexes.0.1.alpha = 1e-06\n", | |
237 | + "I0407 10:49:59.096471 140072765682752 params.py:248] model.regularizer.regexes.1.1.type = l2\n", | |
238 | + "I0407 10:49:59.096584 140072765682752 params.py:248] model.regularizer.regexes.1.1.alpha = 1e-06\n", | |
239 | + "I0407 10:49:59.096696 140072765682752 params.py:248] model.regularizer.regexes.2.1.type = l2\n", | |
240 | + "I0407 10:49:59.096809 140072765682752 params.py:248] model.regularizer.regexes.2.1.alpha = 1e-06\n", | |
241 | + "I0407 10:49:59.096917 140072765682752 params.py:248] model.regularizer.regexes.3.1.type = l2\n", | |
242 | + "I0407 10:49:59.097025 140072765682752 params.py:248] model.regularizer.regexes.3.1.alpha = 1e-05\n" | |
243 | + ] | |
244 | + }, | |
245 | + { | |
246 | + "name": "stdout", | |
247 | + "output_type": "stream", | |
248 | + "text": [ | |
249 | + "I0407 10:50:01.854557 140072765682752 archival.py:211] removing temporary unarchived model dir at /tmp/tmp_htckuhc\n", | |
250 | + "reading instances: 2211it [01:52, 19.69it/s]\n" | |
251 | + ] | |
252 | + } | |
253 | + ], | |
254 | + "source": [ | |
255 | + "! {COMBO} --mode predict \\\n", | |
256 | + " --cuda_device 0 \\\n", | |
257 | + " --model_path model-pdbc/model.tar.gz \\\n", | |
258 | + " --input_file connlu/pdbc-validation.conllu \\\n", | |
259 | + " --output_file connlu/pdbc-validation-pred.conllu" | |
260 | + ] | |
261 | + }, | |
262 | + { | |
263 | + "cell_type": "code", | |
264 | + "execution_count": 3, | |
265 | + "id": "11f1b7b1", | |
266 | + "metadata": {}, | |
267 | + "outputs": [ | |
268 | + { | |
269 | + "name": "stdout", | |
270 | + "output_type": "stream", | |
271 | + "text": [ | |
272 | + "# text = Dwie dziewczynki opierają się o dach kapliczki , chłopiec wspina się na niego , a trzecia dziewczynka stoi obok .\r\n", | |
273 | + "1\tDwie\tdwa\t_\t_\t_\t3\tsubj\t3:subj\t_\r\n", | |
274 | + "2\tdziewczynki\tdziewczynka\t_\t_\t_\t1\tcomp\t1:comp\t_\r\n", | |
275 | + "3\topierają\topierać\t_\t_\t_\t15\tconjunct\t15:conjunct\t_\r\n", | |
276 | + "4\tsię\tsię\t_\t_\t_\t3\trefl\t3:refl\t_\r\n", | |
277 | + "5\to\to\t_\t_\t_\t3\tcomp\t3:comp\t_\r\n", | |
278 | + "6\tdach\tdach\t_\t_\t_\t5\tcomp\t5:comp\t_\r\n", | |
279 | + "7\tkapliczki\tkapliczka\t_\t_\t_\t6\tadjunct\t6:adjunct\t_\r\n", | |
280 | + "8\t,\t,\t_\t_\t_\t15\tpunct\t15:punct\t_\r\n", | |
281 | + "9\tchłopiec\tchłopiec\t_\t_\t_\t10\tsubj\t10:subj\t_\r\n" | |
282 | + ] | |
283 | + } | |
284 | + ], | |
285 | + "source": [ | |
286 | + "! head connlu/pdbc-validation.conllu" | |
287 | + ] | |
288 | + }, | |
289 | + { | |
290 | + "cell_type": "code", | |
291 | + "execution_count": 4, | |
292 | + "id": "8fa72124", | |
293 | + "metadata": {}, | |
294 | + "outputs": [ | |
295 | + { | |
296 | + "name": "stdout", | |
297 | + "output_type": "stream", | |
298 | + "text": [ | |
299 | + "# text = Dwie dziewczynki opierają się o dach kapliczki , chłopiec wspina się na niego , a trzecia dziewczynka stoi obok .\r\n", | |
300 | + "1\tDwie\tdwa\t_\t_\t_\t3\tsubj\t3:subj\t_\r\n", | |
301 | + "2\tdziewczynki\tdziewczynka\t_\t_\t_\t1\tcomp\t1:comp\t_\r\n", | |
302 | + "3\topierają\topierać\t_\t_\t_\t15\tconjunct\t15:conjunct\t_\r\n", | |
303 | + "4\tsię\tsię\t_\t_\t_\t3\trefl\t3:refl\t_\r\n", | |
304 | + "5\to\to\t_\t_\t_\t3\tcomp\t3:comp\t_\r\n", | |
305 | + "6\tdach\tdach\t_\t_\t_\t5\tcomp\t5:comp\t_\r\n", | |
306 | + "7\tkapliczki\tkapliczka\t_\t_\t_\t6\tadjunct\t6:adjunct\t_\r\n", | |
307 | + "8\t,\t,\t_\t_\t_\t15\tpunct\t15:punct\t_\r\n", | |
308 | + "9\tchłopiec\tchłopiec\t_\t_\t_\t10\tsubj\t10:subj\t_\r\n" | |
309 | + ] | |
310 | + } | |
311 | + ], | |
312 | + "source": [ | |
313 | + "! head connlu/pdbc-validation-pred.conllu" | |
314 | + ] | |
315 | + }, | |
316 | + { | |
317 | + "cell_type": "code", | |
318 | + "execution_count": 5, | |
319 | + "id": "dde6dd31", | |
320 | + "metadata": {}, | |
321 | + "outputs": [ | |
322 | + { | |
323 | + "name": "stdout", | |
324 | + "output_type": "stream", | |
325 | + "text": [ | |
326 | + "I0407 10:52:00.220404 139754138821696 archival.py:184] loading archive file model-pdbc/model.tar.gz\n", | |
327 | + "I0407 10:52:00.221079 139754138821696 archival.py:263] extracting archive file model-pdbc/model.tar.gz to temp dir /tmp/tmp2jhqu3i6\n", | |
328 | + "I0407 10:52:16.996590 139754138821696 params.py:248] dataset_reader.type = conllu\n", | |
329 | + "I0407 10:52:16.997079 139754138821696 params.py:248] dataset_reader.lazy = False\n", | |
330 | + "I0407 10:52:16.997236 139754138821696 params.py:248] dataset_reader.cache_directory = None\n", | |
331 | + "I0407 10:52:16.997326 139754138821696 params.py:248] dataset_reader.max_instances = None\n", | |
332 | + "I0407 10:52:16.997391 139754138821696 params.py:248] dataset_reader.manual_distributed_sharding = False\n", | |
333 | + "I0407 10:52:16.997456 139754138821696 params.py:248] dataset_reader.manual_multi_process_sharding = False\n", | |
334 | + "I0407 10:52:16.997756 139754138821696 params.py:248] dataset_reader.token_indexers.char.type = characters_const_padding\n", | |
335 | + "I0407 10:52:16.997950 139754138821696 params.py:248] dataset_reader.token_indexers.char.namespace = token_characters\n", | |
336 | + "I0407 10:52:16.998211 139754138821696 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.byte_encoding = None\n", | |
337 | + "I0407 10:52:16.998285 139754138821696 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.lowercase_characters = False\n", | |
338 | + "I0407 10:52:16.998367 139754138821696 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.start_tokens = ['__START__']\n", | |
339 | + "I0407 10:52:16.998522 139754138821696 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.end_tokens = ['__END__']\n", | |
340 | + "I0407 10:52:16.998643 139754138821696 params.py:248] dataset_reader.token_indexers.char.start_tokens = None\n", | |
341 | + "I0407 10:52:16.998707 139754138821696 params.py:248] dataset_reader.token_indexers.char.end_tokens = None\n", | |
342 | + "I0407 10:52:16.998770 139754138821696 params.py:248] dataset_reader.token_indexers.char.min_padding_length = 32\n", | |
343 | + "I0407 10:52:16.998831 139754138821696 params.py:248] dataset_reader.token_indexers.char.token_min_padding_length = 0\n", | |
344 | + "I0407 10:52:16.998980 139754138821696 params.py:248] dataset_reader.token_indexers.feats.type = feats_indexer\n", | |
345 | + "I0407 10:52:16.999143 139754138821696 params.py:248] dataset_reader.token_indexers.feats.namespace = feats\n", | |
346 | + "I0407 10:52:16.999213 139754138821696 params.py:248] dataset_reader.token_indexers.feats.feature_name = feats_\n", | |
347 | + "I0407 10:52:16.999269 139754138821696 params.py:248] dataset_reader.token_indexers.feats.token_min_padding_length = 0\n", | |
348 | + "I0407 10:52:16.999412 139754138821696 params.py:248] dataset_reader.token_indexers.lemma.type = characters_const_padding\n", | |
349 | + "I0407 10:52:16.999578 139754138821696 params.py:248] dataset_reader.token_indexers.lemma.namespace = token_characters\n", | |
350 | + "I0407 10:52:16.999774 139754138821696 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.byte_encoding = None\n", | |
351 | + "I0407 10:52:16.999842 139754138821696 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.lowercase_characters = False\n", | |
352 | + "I0407 10:52:16.999923 139754138821696 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.start_tokens = ['__START__']\n", | |
353 | + "I0407 10:52:17.000045 139754138821696 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.end_tokens = ['__END__']\n", | |
354 | + "I0407 10:52:17.000156 139754138821696 params.py:248] dataset_reader.token_indexers.lemma.start_tokens = None\n", | |
355 | + "I0407 10:52:17.000220 139754138821696 params.py:248] dataset_reader.token_indexers.lemma.end_tokens = None\n", | |
356 | + "I0407 10:52:17.000282 139754138821696 params.py:248] dataset_reader.token_indexers.lemma.min_padding_length = 32\n", | |
357 | + "I0407 10:52:17.000344 139754138821696 params.py:248] dataset_reader.token_indexers.lemma.token_min_padding_length = 0\n", | |
358 | + "I0407 10:52:17.000521 139754138821696 params.py:248] dataset_reader.token_indexers.token.type = pretrained_transformer_mismatched_fixed\n", | |
359 | + "I0407 10:52:17.000770 139754138821696 params.py:248] dataset_reader.token_indexers.token.token_min_padding_length = 0\n", | |
360 | + "I0407 10:52:17.000865 139754138821696 params.py:248] dataset_reader.token_indexers.token.model_name = allegro/herbert-large-cased\n", | |
361 | + "I0407 10:52:17.000947 139754138821696 params.py:248] dataset_reader.token_indexers.token.namespace = tags\n", | |
362 | + "I0407 10:52:17.001028 139754138821696 params.py:248] dataset_reader.token_indexers.token.max_length = None\n", | |
363 | + "I0407 10:52:17.001172 139754138821696 params.py:384] dataset_reader.token_indexers.token.tokenizer_kwargs.use_fast = False\n", | |
364 | + "I0407 10:52:20.459573 139754138821696 params.py:248] dataset_reader.token_indexers.upostag.type = single_id\n", | |
365 | + "I0407 10:52:20.459947 139754138821696 params.py:248] dataset_reader.token_indexers.upostag.namespace = upostag\n", | |
366 | + "I0407 10:52:20.460046 139754138821696 params.py:248] dataset_reader.token_indexers.upostag.lowercase_tokens = False\n", | |
367 | + "I0407 10:52:20.460119 139754138821696 params.py:248] dataset_reader.token_indexers.upostag.start_tokens = None\n", | |
368 | + "I0407 10:52:20.460172 139754138821696 params.py:248] dataset_reader.token_indexers.upostag.end_tokens = None\n", | |
369 | + "I0407 10:52:20.460235 139754138821696 params.py:248] dataset_reader.token_indexers.upostag.feature_name = pos_\n", | |
370 | + "I0407 10:52:20.460288 139754138821696 params.py:248] dataset_reader.token_indexers.upostag.default_value = THIS IS A REALLY UNLIKELY VALUE THAT HAS TO BE A STRING\n", | |
371 | + "I0407 10:52:20.460351 139754138821696 params.py:248] dataset_reader.token_indexers.upostag.token_min_padding_length = 0\n", | |
372 | + "I0407 10:52:20.460508 139754138821696 params.py:248] dataset_reader.token_indexers.xpostag.type = single_id\n", | |
373 | + "I0407 10:52:20.460695 139754138821696 params.py:248] dataset_reader.token_indexers.xpostag.namespace = xpostag\n", | |
374 | + "I0407 10:52:20.460773 139754138821696 params.py:248] dataset_reader.token_indexers.xpostag.lowercase_tokens = False\n", | |
375 | + "I0407 10:52:20.460840 139754138821696 params.py:248] dataset_reader.token_indexers.xpostag.start_tokens = None\n", | |
376 | + "I0407 10:52:20.460901 139754138821696 params.py:248] dataset_reader.token_indexers.xpostag.end_tokens = None\n", | |
377 | + "I0407 10:52:20.460962 139754138821696 params.py:248] dataset_reader.token_indexers.xpostag.feature_name = tag_\n", | |
378 | + "I0407 10:52:20.461021 139754138821696 params.py:248] dataset_reader.token_indexers.xpostag.default_value = THIS IS A REALLY UNLIKELY VALUE THAT HAS TO BE A STRING\n", | |
379 | + "I0407 10:52:20.461083 139754138821696 params.py:248] dataset_reader.token_indexers.xpostag.token_min_padding_length = 0\n", | |
380 | + "I0407 10:52:20.461313 139754138821696 params.py:248] dataset_reader.lemma_indexers.char.type = characters_const_padding\n", | |
381 | + "I0407 10:52:20.461496 139754138821696 params.py:248] dataset_reader.lemma_indexers.char.namespace = lemma_characters\n", | |
382 | + "I0407 10:52:20.461706 139754138821696 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.byte_encoding = None\n", | |
383 | + "I0407 10:52:20.461774 139754138821696 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.lowercase_characters = False\n", | |
384 | + "I0407 10:52:20.461853 139754138821696 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.start_tokens = ['__START__']\n", | |
385 | + "I0407 10:52:20.462028 139754138821696 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.end_tokens = ['__END__']\n", | |
386 | + "I0407 10:52:20.462157 139754138821696 params.py:248] dataset_reader.lemma_indexers.char.start_tokens = None\n", | |
387 | + "I0407 10:52:20.462226 139754138821696 params.py:248] dataset_reader.lemma_indexers.char.end_tokens = None\n", | |
388 | + "I0407 10:52:20.462283 139754138821696 params.py:248] dataset_reader.lemma_indexers.char.min_padding_length = 32\n", | |
389 | + "I0407 10:52:20.462336 139754138821696 params.py:248] dataset_reader.lemma_indexers.char.token_min_padding_length = 0\n", | |
390 | + "I0407 10:52:20.462417 139754138821696 params.py:248] dataset_reader.features = ['token', 'char']\n", | |
391 | + "I0407 10:52:20.462514 139754138821696 params.py:248] dataset_reader.targets = ['head', 'deprel']\n", | |
392 | + "I0407 10:52:20.462607 139754138821696 params.py:248] dataset_reader.use_sem = False\n", | |
393 | + "I0407 10:52:20.462767 139754138821696 params.py:248] dataset_reader.type = conllu\n", | |
394 | + "I0407 10:52:20.463083 139754138821696 params.py:248] dataset_reader.lazy = False\n", | |
395 | + "I0407 10:52:20.463172 139754138821696 params.py:248] dataset_reader.cache_directory = None\n", | |
396 | + "I0407 10:52:20.463237 139754138821696 params.py:248] dataset_reader.max_instances = None\n", | |
397 | + "I0407 10:52:20.463301 139754138821696 params.py:248] dataset_reader.manual_distributed_sharding = False\n", | |
398 | + "I0407 10:52:20.463361 139754138821696 params.py:248] dataset_reader.manual_multi_process_sharding = False\n", | |
399 | + "I0407 10:52:20.463605 139754138821696 params.py:248] dataset_reader.token_indexers.char.type = characters_const_padding\n", | |
400 | + "I0407 10:52:20.463779 139754138821696 params.py:248] dataset_reader.token_indexers.char.namespace = token_characters\n", | |
401 | + "I0407 10:52:20.463980 139754138821696 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.byte_encoding = None\n", | |
402 | + "I0407 10:52:20.464051 139754138821696 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.lowercase_characters = False\n", | |
403 | + "I0407 10:52:20.464129 139754138821696 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.start_tokens = ['__START__']\n", | |
404 | + "I0407 10:52:20.464254 139754138821696 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.end_tokens = ['__END__']\n", | |
405 | + "I0407 10:52:20.464366 139754138821696 params.py:248] dataset_reader.token_indexers.char.start_tokens = None\n", | |
406 | + "I0407 10:52:20.464429 139754138821696 params.py:248] dataset_reader.token_indexers.char.end_tokens = None\n", | |
407 | + "I0407 10:52:20.464490 139754138821696 params.py:248] dataset_reader.token_indexers.char.min_padding_length = 32\n", | |
408 | + "I0407 10:52:20.464552 139754138821696 params.py:248] dataset_reader.token_indexers.char.token_min_padding_length = 0\n", | |
409 | + "I0407 10:52:20.464691 139754138821696 params.py:248] dataset_reader.token_indexers.feats.type = feats_indexer\n", | |
410 | + "I0407 10:52:20.464847 139754138821696 params.py:248] dataset_reader.token_indexers.feats.namespace = feats\n", | |
411 | + "I0407 10:52:20.464918 139754138821696 params.py:248] dataset_reader.token_indexers.feats.feature_name = feats_\n", | |
412 | + "I0407 10:52:20.464980 139754138821696 params.py:248] dataset_reader.token_indexers.feats.token_min_padding_length = 0\n", | |
413 | + "I0407 10:52:20.465120 139754138821696 params.py:248] dataset_reader.token_indexers.lemma.type = characters_const_padding\n", | |
414 | + "I0407 10:52:20.465285 139754138821696 params.py:248] dataset_reader.token_indexers.lemma.namespace = token_characters\n", | |
415 | + "I0407 10:52:20.465479 139754138821696 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.byte_encoding = None\n", | |
416 | + "I0407 10:52:20.465544 139754138821696 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.lowercase_characters = False\n", | |
417 | + "I0407 10:52:20.465618 139754138821696 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.start_tokens = ['__START__']\n", | |
418 | + "I0407 10:52:20.465741 139754138821696 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.end_tokens = ['__END__']\n", | |
419 | + "I0407 10:52:20.465851 139754138821696 params.py:248] dataset_reader.token_indexers.lemma.start_tokens = None\n", | |
420 | + "I0407 10:52:20.465914 139754138821696 params.py:248] dataset_reader.token_indexers.lemma.end_tokens = None\n", | |
421 | + "I0407 10:52:20.466024 139754138821696 params.py:248] dataset_reader.token_indexers.lemma.min_padding_length = 32\n", | |
422 | + "I0407 10:52:20.466112 139754138821696 params.py:248] dataset_reader.token_indexers.lemma.token_min_padding_length = 0\n", | |
423 | + "I0407 10:52:20.466268 139754138821696 params.py:248] dataset_reader.token_indexers.token.type = pretrained_transformer_mismatched_fixed\n", | |
424 | + "I0407 10:52:20.466485 139754138821696 params.py:248] dataset_reader.token_indexers.token.token_min_padding_length = 0\n", | |
425 | + "I0407 10:52:20.466559 139754138821696 params.py:248] dataset_reader.token_indexers.token.model_name = allegro/herbert-large-cased\n", | |
426 | + "I0407 10:52:20.466621 139754138821696 params.py:248] dataset_reader.token_indexers.token.namespace = tags\n", | |
427 | + "I0407 10:52:20.466682 139754138821696 params.py:248] dataset_reader.token_indexers.token.max_length = None\n", | |
428 | + "I0407 10:52:20.466777 139754138821696 params.py:384] dataset_reader.token_indexers.token.tokenizer_kwargs.use_fast = False\n", | |
429 | + "I0407 10:52:20.468071 139754138821696 params.py:248] dataset_reader.token_indexers.upostag.type = single_id\n", | |
430 | + "I0407 10:52:20.468319 139754138821696 params.py:248] dataset_reader.token_indexers.upostag.namespace = upostag\n", | |
431 | + "I0407 10:52:20.468404 139754138821696 params.py:248] dataset_reader.token_indexers.upostag.lowercase_tokens = False\n", | |
432 | + "I0407 10:52:20.468464 139754138821696 params.py:248] dataset_reader.token_indexers.upostag.start_tokens = None\n", | |
433 | + "I0407 10:52:20.468523 139754138821696 params.py:248] dataset_reader.token_indexers.upostag.end_tokens = None\n", | |
434 | + "I0407 10:52:20.468573 139754138821696 params.py:248] dataset_reader.token_indexers.upostag.feature_name = pos_\n", | |
435 | + "I0407 10:52:20.468636 139754138821696 params.py:248] dataset_reader.token_indexers.upostag.default_value = THIS IS A REALLY UNLIKELY VALUE THAT HAS TO BE A STRING\n", | |
436 | + "I0407 10:52:20.468697 139754138821696 params.py:248] dataset_reader.token_indexers.upostag.token_min_padding_length = 0\n", | |
437 | + "I0407 10:52:20.468832 139754138821696 params.py:248] dataset_reader.token_indexers.xpostag.type = single_id\n", | |
438 | + "I0407 10:52:20.469012 139754138821696 params.py:248] dataset_reader.token_indexers.xpostag.namespace = xpostag\n", | |
439 | + "I0407 10:52:20.469086 139754138821696 params.py:248] dataset_reader.token_indexers.xpostag.lowercase_tokens = False\n", | |
440 | + "I0407 10:52:20.469144 139754138821696 params.py:248] dataset_reader.token_indexers.xpostag.start_tokens = None\n", | |
441 | + "I0407 10:52:20.469196 139754138821696 params.py:248] dataset_reader.token_indexers.xpostag.end_tokens = None\n", | |
442 | + "I0407 10:52:20.469256 139754138821696 params.py:248] dataset_reader.token_indexers.xpostag.feature_name = tag_\n", | |
443 | + "I0407 10:52:20.469320 139754138821696 params.py:248] dataset_reader.token_indexers.xpostag.default_value = THIS IS A REALLY UNLIKELY VALUE THAT HAS TO BE A STRING\n", | |
444 | + "I0407 10:52:20.469382 139754138821696 params.py:248] dataset_reader.token_indexers.xpostag.token_min_padding_length = 0\n", | |
445 | + "I0407 10:52:20.469586 139754138821696 params.py:248] dataset_reader.lemma_indexers.char.type = characters_const_padding\n", | |
446 | + "I0407 10:52:20.469758 139754138821696 params.py:248] dataset_reader.lemma_indexers.char.namespace = lemma_characters\n", | |
447 | + "I0407 10:52:20.469957 139754138821696 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.byte_encoding = None\n", | |
448 | + "I0407 10:52:20.470050 139754138821696 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.lowercase_characters = False\n", | |
449 | + "I0407 10:52:20.470128 139754138821696 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.start_tokens = ['__START__']\n", | |
450 | + "I0407 10:52:20.470261 139754138821696 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.end_tokens = ['__END__']\n", | |
451 | + "I0407 10:52:20.470381 139754138821696 params.py:248] dataset_reader.lemma_indexers.char.start_tokens = None\n", | |
452 | + "I0407 10:52:20.470448 139754138821696 params.py:248] dataset_reader.lemma_indexers.char.end_tokens = None\n", | |
453 | + "I0407 10:52:20.470509 139754138821696 params.py:248] dataset_reader.lemma_indexers.char.min_padding_length = 32\n", | |
454 | + "I0407 10:52:20.470579 139754138821696 params.py:248] dataset_reader.lemma_indexers.char.token_min_padding_length = 0\n", | |
455 | + "I0407 10:52:20.470668 139754138821696 params.py:248] dataset_reader.features = ['token', 'char']\n", | |
456 | + "I0407 10:52:20.470764 139754138821696 params.py:248] dataset_reader.targets = ['head', 'deprel']\n", | |
457 | + "I0407 10:52:20.470849 139754138821696 params.py:248] dataset_reader.use_sem = False\n", | |
458 | + "I0407 10:52:20.471387 139754138821696 params.py:248] vocabulary.type = from_instances_extended\n", | |
459 | + "I0407 10:52:20.471461 139754138821696 vocabulary.py:323] Loading token dictionary from /tmp/tmp2jhqu3i6/vocabulary.\n", | |
460 | + "I0407 10:52:20.471798 139754138821696 filelock.py:254] Lock 139750732975216 acquired on /tmp/tmp2jhqu3i6/vocabulary/.lock\n", | |
461 | + "I0407 10:52:20.472387 139754138821696 filelock.py:317] Lock 139750732975216 released on /tmp/tmp2jhqu3i6/vocabulary/.lock\n", | |
462 | + "I0407 10:52:20.472922 139754138821696 params.py:248] model.type = semantic_multitask\n", | |
463 | + "I0407 10:52:20.473455 139754138821696 params.py:248] model.text_field_embedder.type = basic\n", | |
464 | + "I0407 10:52:20.473808 139754138821696 params.py:248] model.text_field_embedder.token_embedders.char.type = char_embeddings_from_config\n", | |
465 | + "I0407 10:52:20.474030 139754138821696 params.py:248] model.text_field_embedder.token_embedders.char.embedding_dim = 64\n", | |
466 | + "I0407 10:52:20.474286 139754138821696 params.py:248] model.text_field_embedder.token_embedders.char.dilated_cnn_encoder.input_dim = 64\n", | |
467 | + "I0407 10:52:20.474377 139754138821696 params.py:248] model.text_field_embedder.token_embedders.char.dilated_cnn_encoder.filters = [512, 256, 64]\n", | |
468 | + "I0407 10:52:20.474480 139754138821696 params.py:248] model.text_field_embedder.token_embedders.char.dilated_cnn_encoder.kernel_size = [3, 3, 3]\n", | |
469 | + "I0407 10:52:20.474578 139754138821696 params.py:248] model.text_field_embedder.token_embedders.char.dilated_cnn_encoder.stride = [1, 1, 1]\n", | |
470 | + "I0407 10:52:20.474673 139754138821696 params.py:248] model.text_field_embedder.token_embedders.char.dilated_cnn_encoder.padding = [1, 2, 4]\n", | |
471 | + "I0407 10:52:20.474768 139754138821696 params.py:248] model.text_field_embedder.token_embedders.char.dilated_cnn_encoder.dilation = [1, 2, 4]\n", | |
472 | + "I0407 10:52:20.474864 139754138821696 params.py:248] model.text_field_embedder.token_embedders.char.dilated_cnn_encoder.activations = ['relu', 'relu', 'linear']\n", | |
473 | + "I0407 10:52:20.475005 139754138821696 params.py:248] type = relu\n", | |
474 | + "I0407 10:52:20.475197 139754138821696 params.py:248] type = relu\n", | |
475 | + "I0407 10:52:20.475347 139754138821696 params.py:248] type = linear\n", | |
476 | + "I0407 10:52:20.481609 139754138821696 params.py:248] model.text_field_embedder.token_embedders.char.vocab_namespace = token_characters\n", | |
477 | + "I0407 10:52:20.482178 139754138821696 params.py:248] model.text_field_embedder.token_embedders.token.type = transformers_word_embeddings\n", | |
478 | + "I0407 10:52:20.482446 139754138821696 params.py:248] model.text_field_embedder.token_embedders.token.model_name = allegro/herbert-large-cased\n", | |
479 | + "I0407 10:52:20.482533 139754138821696 params.py:248] model.text_field_embedder.token_embedders.token.projection_dim = 100\n", | |
480 | + "I0407 10:52:20.482632 139754138821696 params.py:248] model.text_field_embedder.token_embedders.token.projection_activation = <function TransformersWordEmbedder.<lambda> at 0x7f1a3e346280>\n", | |
481 | + "I0407 10:52:20.482703 139754138821696 params.py:248] model.text_field_embedder.token_embedders.token.projection_dropout_rate = 0.0\n", | |
482 | + "I0407 10:52:20.482769 139754138821696 params.py:248] model.text_field_embedder.token_embedders.token.freeze_transformer = True\n", | |
483 | + "I0407 10:52:20.482831 139754138821696 params.py:248] model.text_field_embedder.token_embedders.token.last_layer_only = True\n", | |
484 | + "I0407 10:52:20.482933 139754138821696 params.py:384] model.text_field_embedder.token_embedders.token.tokenizer_kwargs.use_fast = False\n", | |
485 | + "I0407 10:52:20.483003 139754138821696 params.py:248] model.text_field_embedder.token_embedders.token.transformer_kwargs = None\n" | |
486 | + ] | |
487 | + }, | |
488 | + { | |
489 | + "name": "stdout", | |
490 | + "output_type": "stream", | |
491 | + "text": [ | |
492 | + "I0407 10:52:28.699278 139754138821696 params.py:248] model.seq_encoder.type = combo_encoder\n", | |
493 | + "I0407 10:52:28.699747 139754138821696 params.py:248] model.seq_encoder.stacked_bilstm.input_size = 164\n", | |
494 | + "I0407 10:52:28.699841 139754138821696 params.py:248] model.seq_encoder.stacked_bilstm.hidden_size = 512\n", | |
495 | + "I0407 10:52:28.699910 139754138821696 params.py:248] model.seq_encoder.stacked_bilstm.num_layers = 2\n", | |
496 | + "I0407 10:52:28.699976 139754138821696 params.py:248] model.seq_encoder.stacked_bilstm.recurrent_dropout_probability = 0.33\n", | |
497 | + "I0407 10:52:28.700042 139754138821696 params.py:248] model.seq_encoder.stacked_bilstm.layer_dropout_probability = 0.33\n", | |
498 | + "I0407 10:52:28.700106 139754138821696 params.py:248] model.seq_encoder.stacked_bilstm.use_highway = False\n", | |
499 | + "I0407 10:52:29.089101 139754138821696 params.py:248] model.seq_encoder.layer_dropout_probability = 0.33\n", | |
500 | + "I0407 10:52:29.089426 139754138821696 params.py:248] model.use_sample_weight = True\n", | |
501 | + "I0407 10:52:29.089556 139754138821696 params.py:248] model.lemmatizer = None\n", | |
502 | + "I0407 10:52:29.089638 139754138821696 params.py:248] model.upos_tagger = None\n", | |
503 | + "I0407 10:52:29.089704 139754138821696 params.py:248] model.xpos_tagger = None\n", | |
504 | + "I0407 10:52:29.089766 139754138821696 params.py:248] model.semantic_relation = None\n", | |
505 | + "I0407 10:52:29.089827 139754138821696 params.py:248] model.morphological_feat = None\n", | |
506 | + "I0407 10:52:29.090160 139754138821696 params.py:248] model.dependency_relation.type = combo_dependency_parsing_from_vocab\n", | |
507 | + "I0407 10:52:29.090409 139754138821696 params.py:248] model.dependency_relation.vocab_namespace = deprel_labels\n", | |
508 | + "I0407 10:52:29.090762 139754138821696 params.py:248] model.dependency_relation.head_predictor.head_projection_layer.in_features = 1024\n", | |
509 | + "I0407 10:52:29.090843 139754138821696 params.py:248] model.dependency_relation.head_predictor.head_projection_layer.out_features = 512\n", | |
510 | + "I0407 10:52:29.090915 139754138821696 params.py:248] model.dependency_relation.head_predictor.head_projection_layer.activation = tanh\n", | |
511 | + "I0407 10:52:29.091041 139754138821696 params.py:248] type = tanh\n", | |
512 | + "I0407 10:52:29.091149 139754138821696 params.py:248] model.dependency_relation.head_predictor.head_projection_layer.dropout_rate = 0.0\n", | |
513 | + "I0407 10:52:29.096003 139754138821696 params.py:248] model.dependency_relation.head_predictor.dependency_projection_layer.in_features = 1024\n", | |
514 | + "I0407 10:52:29.096106 139754138821696 params.py:248] model.dependency_relation.head_predictor.dependency_projection_layer.out_features = 512\n", | |
515 | + "I0407 10:52:29.096185 139754138821696 params.py:248] model.dependency_relation.head_predictor.dependency_projection_layer.activation = tanh\n", | |
516 | + "I0407 10:52:29.096311 139754138821696 params.py:248] type = tanh\n", | |
517 | + "I0407 10:52:29.096407 139754138821696 params.py:248] model.dependency_relation.head_predictor.dependency_projection_layer.dropout_rate = 0.0\n", | |
518 | + "I0407 10:52:29.101276 139754138821696 params.py:248] model.dependency_relation.head_predictor.cycle_loss_n = 0\n", | |
519 | + "I0407 10:52:29.101581 139754138821696 params.py:248] model.dependency_relation.head_projection_layer.in_features = 1024\n", | |
520 | + "I0407 10:52:29.101692 139754138821696 params.py:248] model.dependency_relation.head_projection_layer.out_features = 128\n", | |
521 | + "I0407 10:52:29.101771 139754138821696 params.py:248] model.dependency_relation.head_projection_layer.activation = tanh\n", | |
522 | + "I0407 10:52:29.101904 139754138821696 params.py:248] type = tanh\n", | |
523 | + "I0407 10:52:29.102032 139754138821696 params.py:248] model.dependency_relation.head_projection_layer.dropout_rate = 0.25\n", | |
524 | + "I0407 10:52:29.103649 139754138821696 params.py:248] model.dependency_relation.dependency_projection_layer.in_features = 1024\n", | |
525 | + "I0407 10:52:29.103747 139754138821696 params.py:248] model.dependency_relation.dependency_projection_layer.out_features = 128\n", | |
526 | + "I0407 10:52:29.103819 139754138821696 params.py:248] model.dependency_relation.dependency_projection_layer.activation = tanh\n", | |
527 | + "I0407 10:52:29.103948 139754138821696 params.py:248] type = tanh\n", | |
528 | + "I0407 10:52:29.104044 139754138821696 params.py:248] model.dependency_relation.dependency_projection_layer.dropout_rate = 0.25\n", | |
529 | + "I0407 10:52:29.105780 139754138821696 params.py:248] model.enhanced_dependency_relation = None\n", | |
530 | + "I0407 10:52:29.106371 139754138821696 params.py:248] model.regularizer.regexes.0.1.type = l2\n", | |
531 | + "I0407 10:52:29.106555 139754138821696 params.py:248] model.regularizer.regexes.0.1.alpha = 1e-06\n", | |
532 | + "I0407 10:52:29.106724 139754138821696 params.py:248] model.regularizer.regexes.1.1.type = l2\n", | |
533 | + "I0407 10:52:29.106879 139754138821696 params.py:248] model.regularizer.regexes.1.1.alpha = 1e-06\n", | |
534 | + "I0407 10:52:29.107035 139754138821696 params.py:248] model.regularizer.regexes.2.1.type = l2\n", | |
535 | + "I0407 10:52:29.107207 139754138821696 params.py:248] model.regularizer.regexes.2.1.alpha = 1e-06\n", | |
536 | + "I0407 10:52:29.107368 139754138821696 params.py:248] model.regularizer.regexes.3.1.type = l2\n", | |
537 | + "I0407 10:52:29.107544 139754138821696 params.py:248] model.regularizer.regexes.3.1.alpha = 1e-05\n", | |
538 | + "I0407 10:52:32.063793 139754138821696 archival.py:211] removing temporary unarchived model dir at /tmp/tmp2jhqu3i6\n", | |
539 | + "reading instances: 2205it [01:49, 20.15it/s]\n" | |
540 | + ] | |
541 | + } | |
542 | + ], | |
543 | + "source": [ | |
544 | + "! {COMBO} --mode predict \\\n", | |
545 | + " --cuda_device 0 \\\n", | |
546 | + " --model_path model-pdbc/model.tar.gz \\\n", | |
547 | + " --input_file connlu/pdbc-test.conllu \\\n", | |
548 | + " --output_file connlu/pdbc-test-pred.conllu" | |
549 | + ] | |
550 | + }, | |
551 | + { | |
552 | + "cell_type": "code", | |
553 | + "execution_count": 6, | |
554 | + "id": "13748ca1", | |
555 | + "metadata": {}, | |
556 | + "outputs": [ | |
557 | + { | |
558 | + "name": "stdout", | |
559 | + "output_type": "stream", | |
560 | + "text": [ | |
561 | + "# text = Mały chłopiec patrzy w bok po ściągnięciu okularów .\r\n", | |
562 | + "1\tMały\tmały\t_\t_\t_\t2\tadjunct\t2:adjunct\t_\r\n", | |
563 | + "2\tchłopiec\tchłopiec\t_\t_\t_\t3\tsubj\t3:subj\t_\r\n", | |
564 | + "3\tpatrzy\tpatrzeć\t_\t_\t_\t0\troot\t0:root\t_\r\n", | |
565 | + "4\tw\tw\t_\t_\t_\t3\tadjunct_adl\t3:adjunct_adl\t_\r\n", | |
566 | + "5\tbok\tbok\t_\t_\t_\t4\tcomp\t4:comp\t_\r\n", | |
567 | + "6\tpo\tpo\t_\t_\t_\t3\tadjunct_temp\t3:adjunct_temp\t_\r\n", | |
568 | + "7\tściągnięciu\tściągnąć\t_\t_\t_\t6\tcomp\t6:comp\t_\r\n", | |
569 | + "8\tokularów\tokulary\t_\t_\t_\t7\tobj\t7:obj\t_\r\n", | |
570 | + "9\t.\t.\t_\t_\t_\t3\tpunct\t3:punct\t_\r\n" | |
571 | + ] | |
572 | + } | |
573 | + ], | |
574 | + "source": [ | |
575 | + "! head connlu/pdbc-test.conllu" | |
576 | + ] | |
577 | + }, | |
578 | + { | |
579 | + "cell_type": "code", | |
580 | + "execution_count": 7, | |
581 | + "id": "30021124", | |
582 | + "metadata": {}, | |
583 | + "outputs": [ | |
584 | + { | |
585 | + "name": "stdout", | |
586 | + "output_type": "stream", | |
587 | + "text": [ | |
588 | + "# text = Mały chłopiec patrzy w bok po ściągnięciu okularów .\r\n", | |
589 | + "1\tMały\tmały\t_\t_\t_\t2\tadjunct\t2:adjunct\t_\r\n", | |
590 | + "2\tchłopiec\tchłopiec\t_\t_\t_\t3\tsubj\t3:subj\t_\r\n", | |
591 | + "3\tpatrzy\tpatrzeć\t_\t_\t_\t0\troot\t0:root\t_\r\n", | |
592 | + "4\tw\tw\t_\t_\t_\t3\tcomp\t3:adjunct_adl\t_\r\n", | |
593 | + "5\tbok\tbok\t_\t_\t_\t4\tcomp\t4:comp\t_\r\n", | |
594 | + "6\tpo\tpo\t_\t_\t_\t3\tadjunct_temp\t3:adjunct_temp\t_\r\n", | |
595 | + "7\tściągnięciu\tściągnąć\t_\t_\t_\t6\tcomp\t6:comp\t_\r\n", | |
596 | + "8\tokularów\tokulary\t_\t_\t_\t7\tobj\t7:obj\t_\r\n", | |
597 | + "9\t.\t.\t_\t_\t_\t3\tpunct\t3:punct\t_\r\n" | |
598 | + ] | |
599 | + } | |
600 | + ], | |
601 | + "source": [ | |
602 | + "! head connlu/pdbc-test-pred.conllu" | |
603 | + ] | |
604 | + }, | |
605 | + { | |
606 | + "cell_type": "markdown", | |
607 | + "id": "99359d8c", | |
608 | + "metadata": {}, | |
609 | + "source": [ | |
610 | + "Continuous-only model" | |
611 | + ] | |
612 | + }, | |
613 | + { | |
614 | + "cell_type": "code", | |
615 | + "execution_count": 8, | |
616 | + "id": "30a66da6", | |
617 | + "metadata": {}, | |
618 | + "outputs": [ | |
619 | + { | |
620 | + "name": "stdout", | |
621 | + "output_type": "stream", | |
622 | + "text": [ | |
623 | + "I0407 10:54:27.401382 140321380496448 archival.py:184] loading archive file model-pdbc-cont/model.tar.gz\n", | |
624 | + "I0407 10:54:27.402150 140321380496448 archival.py:263] extracting archive file model-pdbc-cont/model.tar.gz to temp dir /tmp/tmpuvesoi4q\n", | |
625 | + "I0407 10:54:43.091615 140321380496448 params.py:248] dataset_reader.type = conllu\n", | |
626 | + "I0407 10:54:43.092000 140321380496448 params.py:248] dataset_reader.lazy = False\n", | |
627 | + "I0407 10:54:43.092082 140321380496448 params.py:248] dataset_reader.cache_directory = None\n", | |
628 | + "I0407 10:54:43.092129 140321380496448 params.py:248] dataset_reader.max_instances = None\n", | |
629 | + "I0407 10:54:43.092173 140321380496448 params.py:248] dataset_reader.manual_distributed_sharding = False\n", | |
630 | + "I0407 10:54:43.092208 140321380496448 params.py:248] dataset_reader.manual_multi_process_sharding = False\n", | |
631 | + "I0407 10:54:43.092409 140321380496448 params.py:248] dataset_reader.token_indexers.char.type = characters_const_padding\n", | |
632 | + "I0407 10:54:43.092535 140321380496448 params.py:248] dataset_reader.token_indexers.char.namespace = token_characters\n", | |
633 | + "I0407 10:54:43.092682 140321380496448 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.byte_encoding = None\n", | |
634 | + "I0407 10:54:43.092730 140321380496448 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.lowercase_characters = False\n", | |
635 | + "I0407 10:54:43.092786 140321380496448 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.start_tokens = ['__START__']\n", | |
636 | + "I0407 10:54:43.092888 140321380496448 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.end_tokens = ['__END__']\n", | |
637 | + "I0407 10:54:43.092970 140321380496448 params.py:248] dataset_reader.token_indexers.char.start_tokens = None\n", | |
638 | + "I0407 10:54:43.093014 140321380496448 params.py:248] dataset_reader.token_indexers.char.end_tokens = None\n", | |
639 | + "I0407 10:54:43.093051 140321380496448 params.py:248] dataset_reader.token_indexers.char.min_padding_length = 32\n", | |
640 | + "I0407 10:54:43.093093 140321380496448 params.py:248] dataset_reader.token_indexers.char.token_min_padding_length = 0\n", | |
641 | + "I0407 10:54:43.093198 140321380496448 params.py:248] dataset_reader.token_indexers.feats.type = feats_indexer\n", | |
642 | + "I0407 10:54:43.093306 140321380496448 params.py:248] dataset_reader.token_indexers.feats.namespace = feats\n", | |
643 | + "I0407 10:54:43.093353 140321380496448 params.py:248] dataset_reader.token_indexers.feats.feature_name = feats_\n", | |
644 | + "I0407 10:54:43.093388 140321380496448 params.py:248] dataset_reader.token_indexers.feats.token_min_padding_length = 0\n", | |
645 | + "I0407 10:54:43.093482 140321380496448 params.py:248] dataset_reader.token_indexers.lemma.type = characters_const_padding\n", | |
646 | + "I0407 10:54:43.093593 140321380496448 params.py:248] dataset_reader.token_indexers.lemma.namespace = token_characters\n", | |
647 | + "I0407 10:54:43.093723 140321380496448 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.byte_encoding = None\n", | |
648 | + "I0407 10:54:43.093769 140321380496448 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.lowercase_characters = False\n", | |
649 | + "I0407 10:54:43.093816 140321380496448 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.start_tokens = ['__START__']\n", | |
650 | + "I0407 10:54:43.093899 140321380496448 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.end_tokens = ['__END__']\n", | |
651 | + "I0407 10:54:43.093993 140321380496448 params.py:248] dataset_reader.token_indexers.lemma.start_tokens = None\n", | |
652 | + "I0407 10:54:43.094043 140321380496448 params.py:248] dataset_reader.token_indexers.lemma.end_tokens = None\n", | |
653 | + "I0407 10:54:43.094079 140321380496448 params.py:248] dataset_reader.token_indexers.lemma.min_padding_length = 32\n", | |
654 | + "I0407 10:54:43.094121 140321380496448 params.py:248] dataset_reader.token_indexers.lemma.token_min_padding_length = 0\n", | |
655 | + "I0407 10:54:43.094226 140321380496448 params.py:248] dataset_reader.token_indexers.token.type = pretrained_transformer_mismatched_fixed\n", | |
656 | + "I0407 10:54:43.094377 140321380496448 params.py:248] dataset_reader.token_indexers.token.token_min_padding_length = 0\n", | |
657 | + "I0407 10:54:43.094430 140321380496448 params.py:248] dataset_reader.token_indexers.token.model_name = allegro/herbert-large-cased\n", | |
658 | + "I0407 10:54:43.094474 140321380496448 params.py:248] dataset_reader.token_indexers.token.namespace = tags\n", | |
659 | + "I0407 10:54:43.094522 140321380496448 params.py:248] dataset_reader.token_indexers.token.max_length = None\n", | |
660 | + "I0407 10:54:43.094592 140321380496448 params.py:384] dataset_reader.token_indexers.token.tokenizer_kwargs.use_fast = False\n", | |
661 | + "I0407 10:54:45.858621 140321380496448 params.py:248] dataset_reader.token_indexers.upostag.type = single_id\n", | |
662 | + "I0407 10:54:45.858990 140321380496448 params.py:248] dataset_reader.token_indexers.upostag.namespace = upostag\n", | |
663 | + "I0407 10:54:45.859087 140321380496448 params.py:248] dataset_reader.token_indexers.upostag.lowercase_tokens = False\n", | |
664 | + "I0407 10:54:45.859157 140321380496448 params.py:248] dataset_reader.token_indexers.upostag.start_tokens = None\n", | |
665 | + "I0407 10:54:45.859210 140321380496448 params.py:248] dataset_reader.token_indexers.upostag.end_tokens = None\n", | |
666 | + "I0407 10:54:45.859268 140321380496448 params.py:248] dataset_reader.token_indexers.upostag.feature_name = pos_\n", | |
667 | + "I0407 10:54:45.859321 140321380496448 params.py:248] dataset_reader.token_indexers.upostag.default_value = THIS IS A REALLY UNLIKELY VALUE THAT HAS TO BE A STRING\n", | |
668 | + "I0407 10:54:45.859382 140321380496448 params.py:248] dataset_reader.token_indexers.upostag.token_min_padding_length = 0\n", | |
669 | + "I0407 10:54:45.859541 140321380496448 params.py:248] dataset_reader.token_indexers.xpostag.type = single_id\n", | |
670 | + "I0407 10:54:45.859729 140321380496448 params.py:248] dataset_reader.token_indexers.xpostag.namespace = xpostag\n", | |
671 | + "I0407 10:54:45.859802 140321380496448 params.py:248] dataset_reader.token_indexers.xpostag.lowercase_tokens = False\n", | |
672 | + "I0407 10:54:45.859875 140321380496448 params.py:248] dataset_reader.token_indexers.xpostag.start_tokens = None\n", | |
673 | + "I0407 10:54:45.859931 140321380496448 params.py:248] dataset_reader.token_indexers.xpostag.end_tokens = None\n", | |
674 | + "I0407 10:54:45.859991 140321380496448 params.py:248] dataset_reader.token_indexers.xpostag.feature_name = tag_\n", | |
675 | + "I0407 10:54:45.860045 140321380496448 params.py:248] dataset_reader.token_indexers.xpostag.default_value = THIS IS A REALLY UNLIKELY VALUE THAT HAS TO BE A STRING\n", | |
676 | + "I0407 10:54:45.860103 140321380496448 params.py:248] dataset_reader.token_indexers.xpostag.token_min_padding_length = 0\n", | |
677 | + "I0407 10:54:45.860332 140321380496448 params.py:248] dataset_reader.lemma_indexers.char.type = characters_const_padding\n", | |
678 | + "I0407 10:54:45.860523 140321380496448 params.py:248] dataset_reader.lemma_indexers.char.namespace = lemma_characters\n", | |
679 | + "I0407 10:54:45.860739 140321380496448 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.byte_encoding = None\n", | |
680 | + "I0407 10:54:45.860809 140321380496448 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.lowercase_characters = False\n", | |
681 | + "I0407 10:54:45.860888 140321380496448 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.start_tokens = ['__START__']\n", | |
682 | + "I0407 10:54:45.861032 140321380496448 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.end_tokens = ['__END__']\n", | |
683 | + "I0407 10:54:45.861149 140321380496448 params.py:248] dataset_reader.lemma_indexers.char.start_tokens = None\n", | |
684 | + "I0407 10:54:45.861213 140321380496448 params.py:248] dataset_reader.lemma_indexers.char.end_tokens = None\n", | |
685 | + "I0407 10:54:45.861277 140321380496448 params.py:248] dataset_reader.lemma_indexers.char.min_padding_length = 32\n", | |
686 | + "I0407 10:54:45.861337 140321380496448 params.py:248] dataset_reader.lemma_indexers.char.token_min_padding_length = 0\n", | |
687 | + "I0407 10:54:45.861427 140321380496448 params.py:248] dataset_reader.features = ['token', 'char']\n", | |
688 | + "I0407 10:54:45.861522 140321380496448 params.py:248] dataset_reader.targets = ['head', 'deprel']\n", | |
689 | + "I0407 10:54:45.861611 140321380496448 params.py:248] dataset_reader.use_sem = False\n", | |
690 | + "I0407 10:54:45.861762 140321380496448 params.py:248] dataset_reader.type = conllu\n", | |
691 | + "I0407 10:54:45.862029 140321380496448 params.py:248] dataset_reader.lazy = False\n", | |
692 | + "I0407 10:54:45.862116 140321380496448 params.py:248] dataset_reader.cache_directory = None\n", | |
693 | + "I0407 10:54:45.862177 140321380496448 params.py:248] dataset_reader.max_instances = None\n", | |
694 | + "I0407 10:54:45.862234 140321380496448 params.py:248] dataset_reader.manual_distributed_sharding = False\n", | |
695 | + "I0407 10:54:45.862295 140321380496448 params.py:248] dataset_reader.manual_multi_process_sharding = False\n", | |
696 | + "I0407 10:54:45.862535 140321380496448 params.py:248] dataset_reader.token_indexers.char.type = characters_const_padding\n", | |
697 | + "I0407 10:54:45.862701 140321380496448 params.py:248] dataset_reader.token_indexers.char.namespace = token_characters\n", | |
698 | + "I0407 10:54:45.862900 140321380496448 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.byte_encoding = None\n", | |
699 | + "I0407 10:54:45.862966 140321380496448 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.lowercase_characters = False\n", | |
700 | + "I0407 10:54:45.863043 140321380496448 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.start_tokens = ['__START__']\n", | |
701 | + "I0407 10:54:45.863168 140321380496448 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.end_tokens = ['__END__']\n", | |
702 | + "I0407 10:54:45.863281 140321380496448 params.py:248] dataset_reader.token_indexers.char.start_tokens = None\n", | |
703 | + "I0407 10:54:45.863344 140321380496448 params.py:248] dataset_reader.token_indexers.char.end_tokens = None\n", | |
704 | + "I0407 10:54:45.863406 140321380496448 params.py:248] dataset_reader.token_indexers.char.min_padding_length = 32\n", | |
705 | + "I0407 10:54:45.863469 140321380496448 params.py:248] dataset_reader.token_indexers.char.token_min_padding_length = 0\n", | |
706 | + "I0407 10:54:45.863596 140321380496448 params.py:248] dataset_reader.token_indexers.feats.type = feats_indexer\n", | |
707 | + "I0407 10:54:45.863752 140321380496448 params.py:248] dataset_reader.token_indexers.feats.namespace = feats\n", | |
708 | + "I0407 10:54:45.863821 140321380496448 params.py:248] dataset_reader.token_indexers.feats.feature_name = feats_\n", | |
709 | + "I0407 10:54:45.863883 140321380496448 params.py:248] dataset_reader.token_indexers.feats.token_min_padding_length = 0\n", | |
710 | + "I0407 10:54:45.864030 140321380496448 params.py:248] dataset_reader.token_indexers.lemma.type = characters_const_padding\n", | |
711 | + "I0407 10:54:45.864196 140321380496448 params.py:248] dataset_reader.token_indexers.lemma.namespace = token_characters\n", | |
712 | + "I0407 10:54:45.864392 140321380496448 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.byte_encoding = None\n", | |
713 | + "I0407 10:54:45.864460 140321380496448 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.lowercase_characters = False\n", | |
714 | + "I0407 10:54:45.864540 140321380496448 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.start_tokens = ['__START__']\n", | |
715 | + "I0407 10:54:45.864660 140321380496448 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.end_tokens = ['__END__']\n", | |
716 | + "I0407 10:54:45.864772 140321380496448 params.py:248] dataset_reader.token_indexers.lemma.start_tokens = None\n", | |
717 | + "I0407 10:54:45.864835 140321380496448 params.py:248] dataset_reader.token_indexers.lemma.end_tokens = None\n", | |
718 | + "I0407 10:54:45.864896 140321380496448 params.py:248] dataset_reader.token_indexers.lemma.min_padding_length = 32\n", | |
719 | + "I0407 10:54:45.864965 140321380496448 params.py:248] dataset_reader.token_indexers.lemma.token_min_padding_length = 0\n", | |
720 | + "I0407 10:54:45.865104 140321380496448 params.py:248] dataset_reader.token_indexers.token.type = pretrained_transformer_mismatched_fixed\n", | |
721 | + "I0407 10:54:45.865323 140321380496448 params.py:248] dataset_reader.token_indexers.token.token_min_padding_length = 0\n", | |
722 | + "I0407 10:54:45.865396 140321380496448 params.py:248] dataset_reader.token_indexers.token.model_name = allegro/herbert-large-cased\n", | |
723 | + "I0407 10:54:45.865460 140321380496448 params.py:248] dataset_reader.token_indexers.token.namespace = tags\n", | |
724 | + "I0407 10:54:45.865518 140321380496448 params.py:248] dataset_reader.token_indexers.token.max_length = None\n", | |
725 | + "I0407 10:54:45.865614 140321380496448 params.py:384] dataset_reader.token_indexers.token.tokenizer_kwargs.use_fast = False\n", | |
726 | + "I0407 10:54:45.866884 140321380496448 params.py:248] dataset_reader.token_indexers.upostag.type = single_id\n", | |
727 | + "I0407 10:54:45.867116 140321380496448 params.py:248] dataset_reader.token_indexers.upostag.namespace = upostag\n", | |
728 | + "I0407 10:54:45.867190 140321380496448 params.py:248] dataset_reader.token_indexers.upostag.lowercase_tokens = False\n", | |
729 | + "I0407 10:54:45.867258 140321380496448 params.py:248] dataset_reader.token_indexers.upostag.start_tokens = None\n", | |
730 | + "I0407 10:54:45.867316 140321380496448 params.py:248] dataset_reader.token_indexers.upostag.end_tokens = None\n", | |
731 | + "I0407 10:54:45.867376 140321380496448 params.py:248] dataset_reader.token_indexers.upostag.feature_name = pos_\n", | |
732 | + "I0407 10:54:45.867437 140321380496448 params.py:248] dataset_reader.token_indexers.upostag.default_value = THIS IS A REALLY UNLIKELY VALUE THAT HAS TO BE A STRING\n", | |
733 | + "I0407 10:54:45.867497 140321380496448 params.py:248] dataset_reader.token_indexers.upostag.token_min_padding_length = 0\n", | |
734 | + "I0407 10:54:45.867640 140321380496448 params.py:248] dataset_reader.token_indexers.xpostag.type = single_id\n", | |
735 | + "I0407 10:54:45.867815 140321380496448 params.py:248] dataset_reader.token_indexers.xpostag.namespace = xpostag\n", | |
736 | + "I0407 10:54:45.867887 140321380496448 params.py:248] dataset_reader.token_indexers.xpostag.lowercase_tokens = False\n", | |
737 | + "I0407 10:54:45.867951 140321380496448 params.py:248] dataset_reader.token_indexers.xpostag.start_tokens = None\n", | |
738 | + "I0407 10:54:45.868006 140321380496448 params.py:248] dataset_reader.token_indexers.xpostag.end_tokens = None\n", | |
739 | + "I0407 10:54:45.868063 140321380496448 params.py:248] dataset_reader.token_indexers.xpostag.feature_name = tag_\n", | |
740 | + "I0407 10:54:45.868122 140321380496448 params.py:248] dataset_reader.token_indexers.xpostag.default_value = THIS IS A REALLY UNLIKELY VALUE THAT HAS TO BE A STRING\n", | |
741 | + "I0407 10:54:45.868181 140321380496448 params.py:248] dataset_reader.token_indexers.xpostag.token_min_padding_length = 0\n", | |
742 | + "I0407 10:54:45.868388 140321380496448 params.py:248] dataset_reader.lemma_indexers.char.type = characters_const_padding\n", | |
743 | + "I0407 10:54:45.868559 140321380496448 params.py:248] dataset_reader.lemma_indexers.char.namespace = lemma_characters\n", | |
744 | + "I0407 10:54:45.868757 140321380496448 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.byte_encoding = None\n", | |
745 | + "I0407 10:54:45.868824 140321380496448 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.lowercase_characters = False\n", | |
746 | + "I0407 10:54:45.868897 140321380496448 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.start_tokens = ['__START__']\n", | |
747 | + "I0407 10:54:45.869028 140321380496448 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.end_tokens = ['__END__']\n", | |
748 | + "I0407 10:54:45.869139 140321380496448 params.py:248] dataset_reader.lemma_indexers.char.start_tokens = None\n", | |
749 | + "I0407 10:54:45.869202 140321380496448 params.py:248] dataset_reader.lemma_indexers.char.end_tokens = None\n", | |
750 | + "I0407 10:54:45.869256 140321380496448 params.py:248] dataset_reader.lemma_indexers.char.min_padding_length = 32\n", | |
751 | + "I0407 10:54:45.869315 140321380496448 params.py:248] dataset_reader.lemma_indexers.char.token_min_padding_length = 0\n", | |
752 | + "I0407 10:54:45.869398 140321380496448 params.py:248] dataset_reader.features = ['token', 'char']\n", | |
753 | + "I0407 10:54:45.869489 140321380496448 params.py:248] dataset_reader.targets = ['head', 'deprel']\n", | |
754 | + "I0407 10:54:45.869572 140321380496448 params.py:248] dataset_reader.use_sem = False\n", | |
755 | + "I0407 10:54:45.870136 140321380496448 params.py:248] vocabulary.type = from_instances_extended\n", | |
756 | + "I0407 10:54:45.870218 140321380496448 vocabulary.py:323] Loading token dictionary from /tmp/tmpuvesoi4q/vocabulary.\n", | |
757 | + "I0407 10:54:45.870543 140321380496448 filelock.py:254] Lock 140317974842768 acquired on /tmp/tmpuvesoi4q/vocabulary/.lock\n", | |
758 | + "I0407 10:54:45.871132 140321380496448 filelock.py:317] Lock 140317974842768 released on /tmp/tmpuvesoi4q/vocabulary/.lock\n", | |
759 | + "I0407 10:54:45.871641 140321380496448 params.py:248] model.type = semantic_multitask\n", | |
760 | + "I0407 10:54:45.872183 140321380496448 params.py:248] model.text_field_embedder.type = basic\n", | |
761 | + "I0407 10:54:45.872548 140321380496448 params.py:248] model.text_field_embedder.token_embedders.char.type = char_embeddings_from_config\n", | |
762 | + "I0407 10:54:45.872749 140321380496448 params.py:248] model.text_field_embedder.token_embedders.char.embedding_dim = 64\n", | |
763 | + "I0407 10:54:45.873004 140321380496448 params.py:248] model.text_field_embedder.token_embedders.char.dilated_cnn_encoder.input_dim = 64\n", | |
764 | + "I0407 10:54:45.873091 140321380496448 params.py:248] model.text_field_embedder.token_embedders.char.dilated_cnn_encoder.filters = [512, 256, 64]\n", | |
765 | + "I0407 10:54:45.873195 140321380496448 params.py:248] model.text_field_embedder.token_embedders.char.dilated_cnn_encoder.kernel_size = [3, 3, 3]\n", | |
766 | + "I0407 10:54:45.873291 140321380496448 params.py:248] model.text_field_embedder.token_embedders.char.dilated_cnn_encoder.stride = [1, 1, 1]\n", | |
767 | + "I0407 10:54:45.873384 140321380496448 params.py:248] model.text_field_embedder.token_embedders.char.dilated_cnn_encoder.padding = [1, 2, 4]\n", | |
768 | + "I0407 10:54:45.873478 140321380496448 params.py:248] model.text_field_embedder.token_embedders.char.dilated_cnn_encoder.dilation = [1, 2, 4]\n", | |
769 | + "I0407 10:54:45.873572 140321380496448 params.py:248] model.text_field_embedder.token_embedders.char.dilated_cnn_encoder.activations = ['relu', 'relu', 'linear']\n", | |
770 | + "I0407 10:54:45.873714 140321380496448 params.py:248] type = relu\n", | |
771 | + "I0407 10:54:45.873904 140321380496448 params.py:248] type = relu\n", | |
772 | + "I0407 10:54:45.874098 140321380496448 params.py:248] type = linear\n", | |
773 | + "I0407 10:54:45.880232 140321380496448 params.py:248] model.text_field_embedder.token_embedders.char.vocab_namespace = token_characters\n", | |
774 | + "I0407 10:54:45.880783 140321380496448 params.py:248] model.text_field_embedder.token_embedders.token.type = transformers_word_embeddings\n", | |
775 | + "I0407 10:54:45.881011 140321380496448 params.py:248] model.text_field_embedder.token_embedders.token.model_name = allegro/herbert-large-cased\n", | |
776 | + "I0407 10:54:45.881093 140321380496448 params.py:248] model.text_field_embedder.token_embedders.token.projection_dim = 100\n", | |
777 | + "I0407 10:54:45.881184 140321380496448 params.py:248] model.text_field_embedder.token_embedders.token.projection_activation = <function TransformersWordEmbedder.<lambda> at 0x7f9e50745280>\n", | |
778 | + "I0407 10:54:45.881261 140321380496448 params.py:248] model.text_field_embedder.token_embedders.token.projection_dropout_rate = 0.0\n", | |
779 | + "I0407 10:54:45.881328 140321380496448 params.py:248] model.text_field_embedder.token_embedders.token.freeze_transformer = True\n", | |
780 | + "I0407 10:54:45.881389 140321380496448 params.py:248] model.text_field_embedder.token_embedders.token.last_layer_only = True\n", | |
781 | + "I0407 10:54:45.881492 140321380496448 params.py:384] model.text_field_embedder.token_embedders.token.tokenizer_kwargs.use_fast = False\n", | |
782 | + "I0407 10:54:45.881562 140321380496448 params.py:248] model.text_field_embedder.token_embedders.token.transformer_kwargs = None\n" | |
783 | + ] | |
784 | + }, | |
785 | + { | |
786 | + "name": "stdout", | |
787 | + "output_type": "stream", | |
788 | + "text": [ | |
789 | + "I0407 10:54:52.911276 140321380496448 params.py:248] model.seq_encoder.type = combo_encoder\n", | |
790 | + "I0407 10:54:52.911743 140321380496448 params.py:248] model.seq_encoder.stacked_bilstm.input_size = 164\n", | |
791 | + "I0407 10:54:52.911836 140321380496448 params.py:248] model.seq_encoder.stacked_bilstm.hidden_size = 512\n", | |
792 | + "I0407 10:54:52.911902 140321380496448 params.py:248] model.seq_encoder.stacked_bilstm.num_layers = 2\n", | |
793 | + "I0407 10:54:52.911965 140321380496448 params.py:248] model.seq_encoder.stacked_bilstm.recurrent_dropout_probability = 0.33\n", | |
794 | + "I0407 10:54:52.912029 140321380496448 params.py:248] model.seq_encoder.stacked_bilstm.layer_dropout_probability = 0.33\n", | |
795 | + "I0407 10:54:52.912090 140321380496448 params.py:248] model.seq_encoder.stacked_bilstm.use_highway = False\n", | |
796 | + "I0407 10:54:53.279199 140321380496448 params.py:248] model.seq_encoder.layer_dropout_probability = 0.33\n", | |
797 | + "I0407 10:54:53.279505 140321380496448 params.py:248] model.use_sample_weight = True\n", | |
798 | + "I0407 10:54:53.279624 140321380496448 params.py:248] model.lemmatizer = None\n", | |
799 | + "I0407 10:54:53.279695 140321380496448 params.py:248] model.upos_tagger = None\n", | |
800 | + "I0407 10:54:53.279757 140321380496448 params.py:248] model.xpos_tagger = None\n", | |
801 | + "I0407 10:54:53.279815 140321380496448 params.py:248] model.semantic_relation = None\n", | |
802 | + "I0407 10:54:53.279873 140321380496448 params.py:248] model.morphological_feat = None\n", | |
803 | + "I0407 10:54:53.280155 140321380496448 params.py:248] model.dependency_relation.type = combo_dependency_parsing_from_vocab\n", | |
804 | + "I0407 10:54:53.280393 140321380496448 params.py:248] model.dependency_relation.vocab_namespace = deprel_labels\n", | |
805 | + "I0407 10:54:53.280741 140321380496448 params.py:248] model.dependency_relation.head_predictor.head_projection_layer.in_features = 1024\n", | |
806 | + "I0407 10:54:53.280819 140321380496448 params.py:248] model.dependency_relation.head_predictor.head_projection_layer.out_features = 512\n", | |
807 | + "I0407 10:54:53.280887 140321380496448 params.py:248] model.dependency_relation.head_predictor.head_projection_layer.activation = tanh\n", | |
808 | + "I0407 10:54:53.281012 140321380496448 params.py:248] type = tanh\n", | |
809 | + "I0407 10:54:53.281121 140321380496448 params.py:248] model.dependency_relation.head_predictor.head_projection_layer.dropout_rate = 0.0\n", | |
810 | + "I0407 10:54:53.285843 140321380496448 params.py:248] model.dependency_relation.head_predictor.dependency_projection_layer.in_features = 1024\n", | |
811 | + "I0407 10:54:53.286010 140321380496448 params.py:248] model.dependency_relation.head_predictor.dependency_projection_layer.out_features = 512\n", | |
812 | + "I0407 10:54:53.286088 140321380496448 params.py:248] model.dependency_relation.head_predictor.dependency_projection_layer.activation = tanh\n", | |
813 | + "I0407 10:54:53.286234 140321380496448 params.py:248] type = tanh\n", | |
814 | + "I0407 10:54:53.286334 140321380496448 params.py:248] model.dependency_relation.head_predictor.dependency_projection_layer.dropout_rate = 0.0\n", | |
815 | + "I0407 10:54:53.290788 140321380496448 params.py:248] model.dependency_relation.head_predictor.cycle_loss_n = 0\n", | |
816 | + "I0407 10:54:53.291093 140321380496448 params.py:248] model.dependency_relation.head_projection_layer.in_features = 1024\n", | |
817 | + "I0407 10:54:53.291184 140321380496448 params.py:248] model.dependency_relation.head_projection_layer.out_features = 128\n", | |
818 | + "I0407 10:54:53.291281 140321380496448 params.py:248] model.dependency_relation.head_projection_layer.activation = tanh\n", | |
819 | + "I0407 10:54:53.291444 140321380496448 params.py:248] type = tanh\n", | |
820 | + "I0407 10:54:53.291567 140321380496448 params.py:248] model.dependency_relation.head_projection_layer.dropout_rate = 0.25\n", | |
821 | + "I0407 10:54:53.293048 140321380496448 params.py:248] model.dependency_relation.dependency_projection_layer.in_features = 1024\n", | |
822 | + "I0407 10:54:53.293147 140321380496448 params.py:248] model.dependency_relation.dependency_projection_layer.out_features = 128\n", | |
823 | + "I0407 10:54:53.293218 140321380496448 params.py:248] model.dependency_relation.dependency_projection_layer.activation = tanh\n", | |
824 | + "I0407 10:54:53.293342 140321380496448 params.py:248] type = tanh\n", | |
825 | + "I0407 10:54:53.293437 140321380496448 params.py:248] model.dependency_relation.dependency_projection_layer.dropout_rate = 0.25\n", | |
826 | + "I0407 10:54:53.295091 140321380496448 params.py:248] model.enhanced_dependency_relation = None\n", | |
827 | + "I0407 10:54:53.295609 140321380496448 params.py:248] model.regularizer.regexes.0.1.type = l2\n", | |
828 | + "I0407 10:54:53.295784 140321380496448 params.py:248] model.regularizer.regexes.0.1.alpha = 1e-06\n", | |
829 | + "I0407 10:54:53.295953 140321380496448 params.py:248] model.regularizer.regexes.1.1.type = l2\n", | |
830 | + "I0407 10:54:53.296107 140321380496448 params.py:248] model.regularizer.regexes.1.1.alpha = 1e-06\n", | |
831 | + "I0407 10:54:53.296261 140321380496448 params.py:248] model.regularizer.regexes.2.1.type = l2\n", | |
832 | + "I0407 10:54:53.296412 140321380496448 params.py:248] model.regularizer.regexes.2.1.alpha = 1e-06\n", | |
833 | + "I0407 10:54:53.296564 140321380496448 params.py:248] model.regularizer.regexes.3.1.type = l2\n", | |
834 | + "I0407 10:54:53.296715 140321380496448 params.py:248] model.regularizer.regexes.3.1.alpha = 1e-05\n", | |
835 | + "I0407 10:54:56.194218 140321380496448 archival.py:211] removing temporary unarchived model dir at /tmp/tmpuvesoi4q\n", | |
836 | + "reading instances: 1980it [01:33, 21.15it/s]\n" | |
837 | + ] | |
838 | + } | |
839 | + ], | |
840 | + "source": [ | |
841 | + "! {COMBO} --mode predict \\\n", | |
842 | + " --cuda_device 0 \\\n", | |
843 | + " --model_path model-pdbc-cont/model.tar.gz \\\n", | |
844 | + " --input_file connlu/pdbc-cont-validation.conllu \\\n", | |
845 | + " --output_file connlu/pdbc-cont-validation-pred.conllu" | |
846 | + ] | |
847 | + }, | |
848 | + { | |
849 | + "cell_type": "code", | |
850 | + "execution_count": 9, | |
851 | + "id": "cfe7a3c4", | |
852 | + "metadata": {}, | |
853 | + "outputs": [ | |
854 | + { | |
855 | + "name": "stdout", | |
856 | + "output_type": "stream", | |
857 | + "text": [ | |
858 | + "# text = Dwie dziewczynki opierają się o dach kapliczki , chłopiec wspina się na niego , a trzecia dziewczynka stoi obok .\r\n", | |
859 | + "1\tDwie\tdwa\t_\t_\t_\t3\tsubj\t3:subj\t_\r\n", | |
860 | + "2\tdziewczynki\tdziewczynka\t_\t_\t_\t1\tcomp\t1:comp\t_\r\n", | |
861 | + "3\topierają\topierać\t_\t_\t_\t15\tconjunct\t15:conjunct\t_\r\n", | |
862 | + "4\tsię\tsię\t_\t_\t_\t3\trefl\t3:refl\t_\r\n", | |
863 | + "5\to\to\t_\t_\t_\t3\tcomp\t3:comp\t_\r\n", | |
864 | + "6\tdach\tdach\t_\t_\t_\t5\tcomp\t5:comp\t_\r\n", | |
865 | + "7\tkapliczki\tkapliczka\t_\t_\t_\t6\tadjunct\t6:adjunct\t_\r\n", | |
866 | + "8\t,\t,\t_\t_\t_\t15\tpunct\t15:punct\t_\r\n", | |
867 | + "9\tchłopiec\tchłopiec\t_\t_\t_\t10\tsubj\t10:subj\t_\r\n" | |
868 | + ] | |
869 | + } | |
870 | + ], | |
871 | + "source": [ | |
872 | + "! head connlu/pdbc-cont-validation.conllu" | |
873 | + ] | |
874 | + }, | |
875 | + { | |
876 | + "cell_type": "code", | |
877 | + "execution_count": 10, | |
878 | + "id": "7dba9571", | |
879 | + "metadata": {}, | |
880 | + "outputs": [ | |
881 | + { | |
882 | + "name": "stdout", | |
883 | + "output_type": "stream", | |
884 | + "text": [ | |
885 | + "# text = Dwie dziewczynki opierają się o dach kapliczki , chłopiec wspina się na niego , a trzecia dziewczynka stoi obok .\r\n", | |
886 | + "1\tDwie\tdwa\t_\t_\t_\t3\tsubj\t3:subj\t_\r\n", | |
887 | + "2\tdziewczynki\tdziewczynka\t_\t_\t_\t1\tcomp\t1:comp\t_\r\n", | |
888 | + "3\topierają\topierać\t_\t_\t_\t15\tconjunct\t15:conjunct\t_\r\n", | |
889 | + "4\tsię\tsię\t_\t_\t_\t3\trefl\t3:refl\t_\r\n", | |
890 | + "5\to\to\t_\t_\t_\t3\tcomp\t3:comp\t_\r\n", | |
891 | + "6\tdach\tdach\t_\t_\t_\t5\tcomp\t5:comp\t_\r\n", | |
892 | + "7\tkapliczki\tkapliczka\t_\t_\t_\t6\tadjunct\t6:adjunct\t_\r\n", | |
893 | + "8\t,\t,\t_\t_\t_\t15\tpunct\t15:punct\t_\r\n", | |
894 | + "9\tchłopiec\tchłopiec\t_\t_\t_\t10\tsubj\t10:subj\t_\r\n" | |
895 | + ] | |
896 | + } | |
897 | + ], | |
898 | + "source": [ | |
899 | + "! head connlu/pdbc-cont-validation-pred.conllu" | |
900 | + ] | |
901 | + }, | |
902 | + { | |
903 | + "cell_type": "code", | |
904 | + "execution_count": 11, | |
905 | + "id": "679601c2", | |
906 | + "metadata": {}, | |
907 | + "outputs": [ | |
908 | + { | |
909 | + "name": "stdout", | |
910 | + "output_type": "stream", | |
911 | + "text": [ | |
912 | + "I0407 10:56:35.295660 140254825452608 archival.py:184] loading archive file model-pdbc-cont/model.tar.gz\n", | |
913 | + "I0407 10:56:35.296370 140254825452608 archival.py:263] extracting archive file model-pdbc-cont/model.tar.gz to temp dir /tmp/tmpdhtf4et1\n", | |
914 | + "I0407 10:56:52.876630 140254825452608 params.py:248] dataset_reader.type = conllu\n", | |
915 | + "I0407 10:56:52.877122 140254825452608 params.py:248] dataset_reader.lazy = False\n", | |
916 | + "I0407 10:56:52.877243 140254825452608 params.py:248] dataset_reader.cache_directory = None\n", | |
917 | + "I0407 10:56:52.877313 140254825452608 params.py:248] dataset_reader.max_instances = None\n", | |
918 | + "I0407 10:56:52.877380 140254825452608 params.py:248] dataset_reader.manual_distributed_sharding = False\n", | |
919 | + "I0407 10:56:52.877446 140254825452608 params.py:248] dataset_reader.manual_multi_process_sharding = False\n", | |
920 | + "I0407 10:56:52.877737 140254825452608 params.py:248] dataset_reader.token_indexers.char.type = characters_const_padding\n", | |
921 | + "I0407 10:56:52.877938 140254825452608 params.py:248] dataset_reader.token_indexers.char.namespace = token_characters\n", | |
922 | + "I0407 10:56:52.878201 140254825452608 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.byte_encoding = None\n", | |
923 | + "I0407 10:56:52.878276 140254825452608 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.lowercase_characters = False\n", | |
924 | + "I0407 10:56:52.878360 140254825452608 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.start_tokens = ['__START__']\n", | |
925 | + "I0407 10:56:52.878507 140254825452608 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.end_tokens = ['__END__']\n", | |
926 | + "I0407 10:56:52.878633 140254825452608 params.py:248] dataset_reader.token_indexers.char.start_tokens = None\n", | |
927 | + "I0407 10:56:52.878702 140254825452608 params.py:248] dataset_reader.token_indexers.char.end_tokens = None\n", | |
928 | + "I0407 10:56:52.878761 140254825452608 params.py:248] dataset_reader.token_indexers.char.min_padding_length = 32\n", | |
929 | + "I0407 10:56:52.878825 140254825452608 params.py:248] dataset_reader.token_indexers.char.token_min_padding_length = 0\n", | |
930 | + "I0407 10:56:52.878969 140254825452608 params.py:248] dataset_reader.token_indexers.feats.type = feats_indexer\n", | |
931 | + "I0407 10:56:52.879144 140254825452608 params.py:248] dataset_reader.token_indexers.feats.namespace = feats\n", | |
932 | + "I0407 10:56:52.879218 140254825452608 params.py:248] dataset_reader.token_indexers.feats.feature_name = feats_\n", | |
933 | + "I0407 10:56:52.879282 140254825452608 params.py:248] dataset_reader.token_indexers.feats.token_min_padding_length = 0\n", | |
934 | + "I0407 10:56:52.879426 140254825452608 params.py:248] dataset_reader.token_indexers.lemma.type = characters_const_padding\n", | |
935 | + "I0407 10:56:52.879594 140254825452608 params.py:248] dataset_reader.token_indexers.lemma.namespace = token_characters\n", | |
936 | + "I0407 10:56:52.879792 140254825452608 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.byte_encoding = None\n", | |
937 | + "I0407 10:56:52.879862 140254825452608 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.lowercase_characters = False\n", | |
938 | + "I0407 10:56:52.879944 140254825452608 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.start_tokens = ['__START__']\n", | |
939 | + "I0407 10:56:52.880068 140254825452608 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.end_tokens = ['__END__']\n", | |
940 | + "I0407 10:56:52.880184 140254825452608 params.py:248] dataset_reader.token_indexers.lemma.start_tokens = None\n", | |
941 | + "I0407 10:56:52.880254 140254825452608 params.py:248] dataset_reader.token_indexers.lemma.end_tokens = None\n", | |
942 | + "I0407 10:56:52.880316 140254825452608 params.py:248] dataset_reader.token_indexers.lemma.min_padding_length = 32\n", | |
943 | + "I0407 10:56:52.880378 140254825452608 params.py:248] dataset_reader.token_indexers.lemma.token_min_padding_length = 0\n", | |
944 | + "I0407 10:56:52.880523 140254825452608 params.py:248] dataset_reader.token_indexers.token.type = pretrained_transformer_mismatched_fixed\n", | |
945 | + "I0407 10:56:52.880748 140254825452608 params.py:248] dataset_reader.token_indexers.token.token_min_padding_length = 0\n", | |
946 | + "I0407 10:56:52.880829 140254825452608 params.py:248] dataset_reader.token_indexers.token.model_name = allegro/herbert-large-cased\n", | |
947 | + "I0407 10:56:52.880893 140254825452608 params.py:248] dataset_reader.token_indexers.token.namespace = tags\n", | |
948 | + "I0407 10:56:52.880957 140254825452608 params.py:248] dataset_reader.token_indexers.token.max_length = None\n", | |
949 | + "I0407 10:56:52.881069 140254825452608 params.py:384] dataset_reader.token_indexers.token.tokenizer_kwargs.use_fast = False\n", | |
950 | + "I0407 10:56:55.893562 140254825452608 params.py:248] dataset_reader.token_indexers.upostag.type = single_id\n", | |
951 | + "I0407 10:56:55.894115 140254825452608 params.py:248] dataset_reader.token_indexers.upostag.namespace = upostag\n", | |
952 | + "I0407 10:56:55.894256 140254825452608 params.py:248] dataset_reader.token_indexers.upostag.lowercase_tokens = False\n", | |
953 | + "I0407 10:56:55.894343 140254825452608 params.py:248] dataset_reader.token_indexers.upostag.start_tokens = None\n", | |
954 | + "I0407 10:56:55.894395 140254825452608 params.py:248] dataset_reader.token_indexers.upostag.end_tokens = None\n", | |
955 | + "I0407 10:56:55.894465 140254825452608 params.py:248] dataset_reader.token_indexers.upostag.feature_name = pos_\n", | |
956 | + "I0407 10:56:55.894520 140254825452608 params.py:248] dataset_reader.token_indexers.upostag.default_value = THIS IS A REALLY UNLIKELY VALUE THAT HAS TO BE A STRING\n", | |
957 | + "I0407 10:56:55.894590 140254825452608 params.py:248] dataset_reader.token_indexers.upostag.token_min_padding_length = 0\n", | |
958 | + "I0407 10:56:55.894762 140254825452608 params.py:248] dataset_reader.token_indexers.xpostag.type = single_id\n", | |
959 | + "I0407 10:56:55.894958 140254825452608 params.py:248] dataset_reader.token_indexers.xpostag.namespace = xpostag\n", | |
960 | + "I0407 10:56:55.895048 140254825452608 params.py:248] dataset_reader.token_indexers.xpostag.lowercase_tokens = False\n", | |
961 | + "I0407 10:56:55.895111 140254825452608 params.py:248] dataset_reader.token_indexers.xpostag.start_tokens = None\n", | |
962 | + "I0407 10:56:55.895176 140254825452608 params.py:248] dataset_reader.token_indexers.xpostag.end_tokens = None\n", | |
963 | + "I0407 10:56:55.895228 140254825452608 params.py:248] dataset_reader.token_indexers.xpostag.feature_name = tag_\n", | |
964 | + "I0407 10:56:55.895297 140254825452608 params.py:248] dataset_reader.token_indexers.xpostag.default_value = THIS IS A REALLY UNLIKELY VALUE THAT HAS TO BE A STRING\n", | |
965 | + "I0407 10:56:55.895349 140254825452608 params.py:248] dataset_reader.token_indexers.xpostag.token_min_padding_length = 0\n", | |
966 | + "I0407 10:56:55.895593 140254825452608 params.py:248] dataset_reader.lemma_indexers.char.type = characters_const_padding\n", | |
967 | + "I0407 10:56:55.895786 140254825452608 params.py:248] dataset_reader.lemma_indexers.char.namespace = lemma_characters\n", | |
968 | + "I0407 10:56:55.896016 140254825452608 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.byte_encoding = None\n", | |
969 | + "I0407 10:56:55.896095 140254825452608 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.lowercase_characters = False\n", | |
970 | + "I0407 10:56:55.896188 140254825452608 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.start_tokens = ['__START__']\n", | |
971 | + "I0407 10:56:55.896353 140254825452608 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.end_tokens = ['__END__']\n", | |
972 | + "I0407 10:56:55.896480 140254825452608 params.py:248] dataset_reader.lemma_indexers.char.start_tokens = None\n", | |
973 | + "I0407 10:56:55.896552 140254825452608 params.py:248] dataset_reader.lemma_indexers.char.end_tokens = None\n", | |
974 | + "I0407 10:56:55.896607 140254825452608 params.py:248] dataset_reader.lemma_indexers.char.min_padding_length = 32\n", | |
975 | + "I0407 10:56:55.896675 140254825452608 params.py:248] dataset_reader.lemma_indexers.char.token_min_padding_length = 0\n", | |
976 | + "I0407 10:56:55.896760 140254825452608 params.py:248] dataset_reader.features = ['token', 'char']\n", | |
977 | + "I0407 10:56:55.896864 140254825452608 params.py:248] dataset_reader.targets = ['head', 'deprel']\n", | |
978 | + "I0407 10:56:55.896962 140254825452608 params.py:248] dataset_reader.use_sem = False\n", | |
979 | + "I0407 10:56:55.897153 140254825452608 params.py:248] dataset_reader.type = conllu\n", | |
980 | + "I0407 10:56:55.897414 140254825452608 params.py:248] dataset_reader.lazy = False\n", | |
981 | + "I0407 10:56:55.897499 140254825452608 params.py:248] dataset_reader.cache_directory = None\n", | |
982 | + "I0407 10:56:55.897570 140254825452608 params.py:248] dataset_reader.max_instances = None\n", | |
983 | + "I0407 10:56:55.897637 140254825452608 params.py:248] dataset_reader.manual_distributed_sharding = False\n", | |
984 | + "I0407 10:56:55.897707 140254825452608 params.py:248] dataset_reader.manual_multi_process_sharding = False\n", | |
985 | + "I0407 10:56:55.897995 140254825452608 params.py:248] dataset_reader.token_indexers.char.type = characters_const_padding\n", | |
986 | + "I0407 10:56:55.898183 140254825452608 params.py:248] dataset_reader.token_indexers.char.namespace = token_characters\n", | |
987 | + "I0407 10:56:55.898398 140254825452608 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.byte_encoding = None\n", | |
988 | + "I0407 10:56:55.898473 140254825452608 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.lowercase_characters = False\n", | |
989 | + "I0407 10:56:55.898542 140254825452608 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.start_tokens = ['__START__']\n", | |
990 | + "I0407 10:56:55.898677 140254825452608 params.py:248] dataset_reader.token_indexers.char.character_tokenizer.end_tokens = ['__END__']\n", | |
991 | + "I0407 10:56:55.898799 140254825452608 params.py:248] dataset_reader.token_indexers.char.start_tokens = None\n", | |
992 | + "I0407 10:56:55.898869 140254825452608 params.py:248] dataset_reader.token_indexers.char.end_tokens = None\n", | |
993 | + "I0407 10:56:55.898936 140254825452608 params.py:248] dataset_reader.token_indexers.char.min_padding_length = 32\n", | |
994 | + "I0407 10:56:55.898998 140254825452608 params.py:248] dataset_reader.token_indexers.char.token_min_padding_length = 0\n", | |
995 | + "I0407 10:56:55.899158 140254825452608 params.py:248] dataset_reader.token_indexers.feats.type = feats_indexer\n", | |
996 | + "I0407 10:56:55.899337 140254825452608 params.py:248] dataset_reader.token_indexers.feats.namespace = feats\n", | |
997 | + "I0407 10:56:55.899414 140254825452608 params.py:248] dataset_reader.token_indexers.feats.feature_name = feats_\n", | |
998 | + "I0407 10:56:55.899485 140254825452608 params.py:248] dataset_reader.token_indexers.feats.token_min_padding_length = 0\n", | |
999 | + "I0407 10:56:55.899629 140254825452608 params.py:248] dataset_reader.token_indexers.lemma.type = characters_const_padding\n", | |
1000 | + "I0407 10:56:55.899797 140254825452608 params.py:248] dataset_reader.token_indexers.lemma.namespace = token_characters\n", | |
1001 | + "I0407 10:56:55.899995 140254825452608 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.byte_encoding = None\n", | |
1002 | + "I0407 10:56:55.900055 140254825452608 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.lowercase_characters = False\n", | |
1003 | + "I0407 10:56:55.900130 140254825452608 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.start_tokens = ['__START__']\n", | |
1004 | + "I0407 10:56:55.900250 140254825452608 params.py:248] dataset_reader.token_indexers.lemma.character_tokenizer.end_tokens = ['__END__']\n", | |
1005 | + "I0407 10:56:55.900363 140254825452608 params.py:248] dataset_reader.token_indexers.lemma.start_tokens = None\n", | |
1006 | + "I0407 10:56:55.900426 140254825452608 params.py:248] dataset_reader.token_indexers.lemma.end_tokens = None\n", | |
1007 | + "I0407 10:56:55.900486 140254825452608 params.py:248] dataset_reader.token_indexers.lemma.min_padding_length = 32\n", | |
1008 | + "I0407 10:56:55.900547 140254825452608 params.py:248] dataset_reader.token_indexers.lemma.token_min_padding_length = 0\n", | |
1009 | + "I0407 10:56:55.900689 140254825452608 params.py:248] dataset_reader.token_indexers.token.type = pretrained_transformer_mismatched_fixed\n", | |
1010 | + "I0407 10:56:55.900916 140254825452608 params.py:248] dataset_reader.token_indexers.token.token_min_padding_length = 0\n", | |
1011 | + "I0407 10:56:55.900995 140254825452608 params.py:248] dataset_reader.token_indexers.token.model_name = allegro/herbert-large-cased\n", | |
1012 | + "I0407 10:56:55.901061 140254825452608 params.py:248] dataset_reader.token_indexers.token.namespace = tags\n", | |
1013 | + "I0407 10:56:55.901125 140254825452608 params.py:248] dataset_reader.token_indexers.token.max_length = None\n", | |
1014 | + "I0407 10:56:55.901226 140254825452608 params.py:384] dataset_reader.token_indexers.token.tokenizer_kwargs.use_fast = False\n", | |
1015 | + "I0407 10:56:55.902561 140254825452608 params.py:248] dataset_reader.token_indexers.upostag.type = single_id\n", | |
1016 | + "I0407 10:56:55.902824 140254825452608 params.py:248] dataset_reader.token_indexers.upostag.namespace = upostag\n", | |
1017 | + "I0407 10:56:55.902909 140254825452608 params.py:248] dataset_reader.token_indexers.upostag.lowercase_tokens = False\n", | |
1018 | + "I0407 10:56:55.902969 140254825452608 params.py:248] dataset_reader.token_indexers.upostag.start_tokens = None\n", | |
1019 | + "I0407 10:56:55.903034 140254825452608 params.py:248] dataset_reader.token_indexers.upostag.end_tokens = None\n", | |
1020 | + "I0407 10:56:55.903095 140254825452608 params.py:248] dataset_reader.token_indexers.upostag.feature_name = pos_\n", | |
1021 | + "I0407 10:56:55.903159 140254825452608 params.py:248] dataset_reader.token_indexers.upostag.default_value = THIS IS A REALLY UNLIKELY VALUE THAT HAS TO BE A STRING\n", | |
1022 | + "I0407 10:56:55.903219 140254825452608 params.py:248] dataset_reader.token_indexers.upostag.token_min_padding_length = 0\n", | |
1023 | + "I0407 10:56:55.903364 140254825452608 params.py:248] dataset_reader.token_indexers.xpostag.type = single_id\n", | |
1024 | + "I0407 10:56:55.903547 140254825452608 params.py:248] dataset_reader.token_indexers.xpostag.namespace = xpostag\n", | |
1025 | + "I0407 10:56:55.903621 140254825452608 params.py:248] dataset_reader.token_indexers.xpostag.lowercase_tokens = False\n", | |
1026 | + "I0407 10:56:55.903687 140254825452608 params.py:248] dataset_reader.token_indexers.xpostag.start_tokens = None\n", | |
1027 | + "I0407 10:56:55.903748 140254825452608 params.py:248] dataset_reader.token_indexers.xpostag.end_tokens = None\n", | |
1028 | + "I0407 10:56:55.903811 140254825452608 params.py:248] dataset_reader.token_indexers.xpostag.feature_name = tag_\n", | |
1029 | + "I0407 10:56:55.903868 140254825452608 params.py:248] dataset_reader.token_indexers.xpostag.default_value = THIS IS A REALLY UNLIKELY VALUE THAT HAS TO BE A STRING\n", | |
1030 | + "I0407 10:56:55.903931 140254825452608 params.py:248] dataset_reader.token_indexers.xpostag.token_min_padding_length = 0\n", | |
1031 | + "I0407 10:56:55.904146 140254825452608 params.py:248] dataset_reader.lemma_indexers.char.type = characters_const_padding\n", | |
1032 | + "I0407 10:56:55.904325 140254825452608 params.py:248] dataset_reader.lemma_indexers.char.namespace = lemma_characters\n", | |
1033 | + "I0407 10:56:55.904539 140254825452608 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.byte_encoding = None\n", | |
1034 | + "I0407 10:56:55.904611 140254825452608 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.lowercase_characters = False\n", | |
1035 | + "I0407 10:56:55.904691 140254825452608 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.start_tokens = ['__START__']\n", | |
1036 | + "I0407 10:56:55.904827 140254825452608 params.py:248] dataset_reader.lemma_indexers.char.character_tokenizer.end_tokens = ['__END__']\n", | |
1037 | + "I0407 10:56:55.904946 140254825452608 params.py:248] dataset_reader.lemma_indexers.char.start_tokens = None\n", | |
1038 | + "I0407 10:56:55.905013 140254825452608 params.py:248] dataset_reader.lemma_indexers.char.end_tokens = None\n", | |
1039 | + "I0407 10:56:55.905084 140254825452608 params.py:248] dataset_reader.lemma_indexers.char.min_padding_length = 32\n", | |
1040 | + "I0407 10:56:55.905149 140254825452608 params.py:248] dataset_reader.lemma_indexers.char.token_min_padding_length = 0\n", | |
1041 | + "I0407 10:56:55.905237 140254825452608 params.py:248] dataset_reader.features = ['token', 'char']\n", | |
1042 | + "I0407 10:56:55.905334 140254825452608 params.py:248] dataset_reader.targets = ['head', 'deprel']\n", | |
1043 | + "I0407 10:56:55.905422 140254825452608 params.py:248] dataset_reader.use_sem = False\n", | |
1044 | + "I0407 10:56:55.906047 140254825452608 params.py:248] vocabulary.type = from_instances_extended\n", | |
1045 | + "I0407 10:56:55.906157 140254825452608 vocabulary.py:323] Loading token dictionary from /tmp/tmpdhtf4et1/vocabulary.\n", | |
1046 | + "I0407 10:56:55.906635 140254825452608 filelock.py:254] Lock 140251419626896 acquired on /tmp/tmpdhtf4et1/vocabulary/.lock\n", | |
1047 | + "I0407 10:56:55.907354 140254825452608 filelock.py:317] Lock 140251419626896 released on /tmp/tmpdhtf4et1/vocabulary/.lock\n", | |
1048 | + "I0407 10:56:55.907914 140254825452608 params.py:248] model.type = semantic_multitask\n", | |
1049 | + "I0407 10:56:55.908506 140254825452608 params.py:248] model.text_field_embedder.type = basic\n", | |
1050 | + "I0407 10:56:55.908878 140254825452608 params.py:248] model.text_field_embedder.token_embedders.char.type = char_embeddings_from_config\n", | |
1051 | + "I0407 10:56:55.909080 140254825452608 params.py:248] model.text_field_embedder.token_embedders.char.embedding_dim = 64\n", | |
1052 | + "I0407 10:56:55.909353 140254825452608 params.py:248] model.text_field_embedder.token_embedders.char.dilated_cnn_encoder.input_dim = 64\n", | |
1053 | + "I0407 10:56:55.909446 140254825452608 params.py:248] model.text_field_embedder.token_embedders.char.dilated_cnn_encoder.filters = [512, 256, 64]\n", | |
1054 | + "I0407 10:56:55.909554 140254825452608 params.py:248] model.text_field_embedder.token_embedders.char.dilated_cnn_encoder.kernel_size = [3, 3, 3]\n" | |
1055 | + ] | |
1056 | + }, | |
1057 | + { | |
1058 | + "name": "stdout", | |
1059 | + "output_type": "stream", | |
1060 | + "text": [ | |
1061 | + "I0407 10:56:55.909654 140254825452608 params.py:248] model.text_field_embedder.token_embedders.char.dilated_cnn_encoder.stride = [1, 1, 1]\n", | |
1062 | + "I0407 10:56:55.909750 140254825452608 params.py:248] model.text_field_embedder.token_embedders.char.dilated_cnn_encoder.padding = [1, 2, 4]\n", | |
1063 | + "I0407 10:56:55.909847 140254825452608 params.py:248] model.text_field_embedder.token_embedders.char.dilated_cnn_encoder.dilation = [1, 2, 4]\n", | |
1064 | + "I0407 10:56:55.909946 140254825452608 params.py:248] model.text_field_embedder.token_embedders.char.dilated_cnn_encoder.activations = ['relu', 'relu', 'linear']\n", | |
1065 | + "I0407 10:56:55.910176 140254825452608 params.py:248] type = relu\n", | |
1066 | + "I0407 10:56:55.910410 140254825452608 params.py:248] type = relu\n", | |
1067 | + "I0407 10:56:55.910567 140254825452608 params.py:248] type = linear\n", | |
1068 | + "I0407 10:56:55.917278 140254825452608 params.py:248] model.text_field_embedder.token_embedders.char.vocab_namespace = token_characters\n", | |
1069 | + "I0407 10:56:55.917941 140254825452608 params.py:248] model.text_field_embedder.token_embedders.token.type = transformers_word_embeddings\n", | |
1070 | + "I0407 10:56:55.918267 140254825452608 params.py:248] model.text_field_embedder.token_embedders.token.model_name = allegro/herbert-large-cased\n", | |
1071 | + "I0407 10:56:55.918358 140254825452608 params.py:248] model.text_field_embedder.token_embedders.token.projection_dim = 100\n", | |
1072 | + "I0407 10:56:55.918458 140254825452608 params.py:248] model.text_field_embedder.token_embedders.token.projection_activation = <function TransformersWordEmbedder.<lambda> at 0x7f8ed1745280>\n", | |
1073 | + "I0407 10:56:55.918541 140254825452608 params.py:248] model.text_field_embedder.token_embedders.token.projection_dropout_rate = 0.0\n", | |
1074 | + "I0407 10:56:55.918609 140254825452608 params.py:248] model.text_field_embedder.token_embedders.token.freeze_transformer = True\n", | |
1075 | + "I0407 10:56:55.918674 140254825452608 params.py:248] model.text_field_embedder.token_embedders.token.last_layer_only = True\n", | |
1076 | + "I0407 10:56:55.918785 140254825452608 params.py:384] model.text_field_embedder.token_embedders.token.tokenizer_kwargs.use_fast = False\n", | |
1077 | + "I0407 10:56:55.918858 140254825452608 params.py:248] model.text_field_embedder.token_embedders.token.transformer_kwargs = None\n", | |
1078 | + "I0407 10:57:03.624983 140254825452608 params.py:248] model.seq_encoder.type = combo_encoder\n", | |
1079 | + "I0407 10:57:03.625626 140254825452608 params.py:248] model.seq_encoder.stacked_bilstm.input_size = 164\n", | |
1080 | + "I0407 10:57:03.625742 140254825452608 params.py:248] model.seq_encoder.stacked_bilstm.hidden_size = 512\n", | |
1081 | + "I0407 10:57:03.625796 140254825452608 params.py:248] model.seq_encoder.stacked_bilstm.num_layers = 2\n", | |
1082 | + "I0407 10:57:03.625844 140254825452608 params.py:248] model.seq_encoder.stacked_bilstm.recurrent_dropout_probability = 0.33\n", | |
1083 | + "I0407 10:57:03.625942 140254825452608 params.py:248] model.seq_encoder.stacked_bilstm.layer_dropout_probability = 0.33\n", | |
1084 | + "I0407 10:57:03.626068 140254825452608 params.py:248] model.seq_encoder.stacked_bilstm.use_highway = False\n", | |
1085 | + "I0407 10:57:03.933019 140254825452608 params.py:248] model.seq_encoder.layer_dropout_probability = 0.33\n", | |
1086 | + "I0407 10:57:03.933302 140254825452608 params.py:248] model.use_sample_weight = True\n", | |
1087 | + "I0407 10:57:03.933391 140254825452608 params.py:248] model.lemmatizer = None\n", | |
1088 | + "I0407 10:57:03.933440 140254825452608 params.py:248] model.upos_tagger = None\n", | |
1089 | + "I0407 10:57:03.933486 140254825452608 params.py:248] model.xpos_tagger = None\n", | |
1090 | + "I0407 10:57:03.933528 140254825452608 params.py:248] model.semantic_relation = None\n", | |
1091 | + "I0407 10:57:03.933570 140254825452608 params.py:248] model.morphological_feat = None\n", | |
1092 | + "I0407 10:57:03.933835 140254825452608 params.py:248] model.dependency_relation.type = combo_dependency_parsing_from_vocab\n", | |
1093 | + "I0407 10:57:03.934096 140254825452608 params.py:248] model.dependency_relation.vocab_namespace = deprel_labels\n", | |
1094 | + "I0407 10:57:03.934389 140254825452608 params.py:248] model.dependency_relation.head_predictor.head_projection_layer.in_features = 1024\n", | |
1095 | + "I0407 10:57:03.934459 140254825452608 params.py:248] model.dependency_relation.head_predictor.head_projection_layer.out_features = 512\n", | |
1096 | + "I0407 10:57:03.934515 140254825452608 params.py:248] model.dependency_relation.head_predictor.head_projection_layer.activation = tanh\n", | |
1097 | + "I0407 10:57:03.934614 140254825452608 params.py:248] type = tanh\n", | |
1098 | + "I0407 10:57:03.934703 140254825452608 params.py:248] model.dependency_relation.head_predictor.head_projection_layer.dropout_rate = 0.0\n", | |
1099 | + "I0407 10:57:03.938141 140254825452608 params.py:248] model.dependency_relation.head_predictor.dependency_projection_layer.in_features = 1024\n", | |
1100 | + "I0407 10:57:03.938247 140254825452608 params.py:248] model.dependency_relation.head_predictor.dependency_projection_layer.out_features = 512\n", | |
1101 | + "I0407 10:57:03.938306 140254825452608 params.py:248] model.dependency_relation.head_predictor.dependency_projection_layer.activation = tanh\n", | |
1102 | + "I0407 10:57:03.938404 140254825452608 params.py:248] type = tanh\n", | |
1103 | + "I0407 10:57:03.938489 140254825452608 params.py:248] model.dependency_relation.head_predictor.dependency_projection_layer.dropout_rate = 0.0\n", | |
1104 | + "I0407 10:57:03.941669 140254825452608 params.py:248] model.dependency_relation.head_predictor.cycle_loss_n = 0\n", | |
1105 | + "I0407 10:57:03.941908 140254825452608 params.py:248] model.dependency_relation.head_projection_layer.in_features = 1024\n", | |
1106 | + "I0407 10:57:03.941985 140254825452608 params.py:248] model.dependency_relation.head_projection_layer.out_features = 128\n", | |
1107 | + "I0407 10:57:03.942037 140254825452608 params.py:248] model.dependency_relation.head_projection_layer.activation = tanh\n", | |
1108 | + "I0407 10:57:03.942123 140254825452608 params.py:248] type = tanh\n", | |
1109 | + "I0407 10:57:03.942194 140254825452608 params.py:248] model.dependency_relation.head_projection_layer.dropout_rate = 0.25\n", | |
1110 | + "I0407 10:57:03.943288 140254825452608 params.py:248] model.dependency_relation.dependency_projection_layer.in_features = 1024\n", | |
1111 | + "I0407 10:57:03.943376 140254825452608 params.py:248] model.dependency_relation.dependency_projection_layer.out_features = 128\n", | |
1112 | + "I0407 10:57:03.943423 140254825452608 params.py:248] model.dependency_relation.dependency_projection_layer.activation = tanh\n", | |
1113 | + "I0407 10:57:03.943510 140254825452608 params.py:248] type = tanh\n", | |
1114 | + "I0407 10:57:03.943577 140254825452608 params.py:248] model.dependency_relation.dependency_projection_layer.dropout_rate = 0.25\n", | |
1115 | + "I0407 10:57:03.944838 140254825452608 params.py:248] model.enhanced_dependency_relation = None\n", | |
1116 | + "I0407 10:57:03.945286 140254825452608 params.py:248] model.regularizer.regexes.0.1.type = l2\n", | |
1117 | + "I0407 10:57:03.945443 140254825452608 params.py:248] model.regularizer.regexes.0.1.alpha = 1e-06\n", | |
1118 | + "I0407 10:57:03.945568 140254825452608 params.py:248] model.regularizer.regexes.1.1.type = l2\n", | |
1119 | + "I0407 10:57:03.945679 140254825452608 params.py:248] model.regularizer.regexes.1.1.alpha = 1e-06\n", | |
1120 | + "I0407 10:57:03.945787 140254825452608 params.py:248] model.regularizer.regexes.2.1.type = l2\n", | |
1121 | + "I0407 10:57:03.945892 140254825452608 params.py:248] model.regularizer.regexes.2.1.alpha = 1e-06\n", | |
1122 | + "I0407 10:57:03.946047 140254825452608 params.py:248] model.regularizer.regexes.3.1.type = l2\n", | |
1123 | + "I0407 10:57:03.946158 140254825452608 params.py:248] model.regularizer.regexes.3.1.alpha = 1e-05\n", | |
1124 | + "I0407 10:57:06.549506 140254825452608 archival.py:211] removing temporary unarchived model dir at /tmp/tmpdhtf4et1\n", | |
1125 | + "reading instances: 1990it [01:39, 20.00it/s]\n" | |
1126 | + ] | |
1127 | + } | |
1128 | + ], | |
1129 | + "source": [ | |
1130 | + "! {COMBO} --mode predict \\\n", | |
1131 | + " --cuda_device 0 \\\n", | |
1132 | + " --model_path model-pdbc-cont/model.tar.gz \\\n", | |
1133 | + " --input_file connlu/pdbc-cont-test.conllu \\\n", | |
1134 | + " --output_file connlu/pdbc-cont-test-pred.conllu" | |
1135 | + ] | |
1136 | + }, | |
1137 | + { | |
1138 | + "cell_type": "code", | |
1139 | + "execution_count": 12, | |
1140 | + "id": "ddc3986b", | |
1141 | + "metadata": {}, | |
1142 | + "outputs": [ | |
1143 | + { | |
1144 | + "name": "stdout", | |
1145 | + "output_type": "stream", | |
1146 | + "text": [ | |
1147 | + "# text = Mały chłopiec patrzy w bok po ściągnięciu okularów .\r\n", | |
1148 | + "1\tMały\tmały\t_\t_\t_\t2\tadjunct\t2:adjunct\t_\r\n", | |
1149 | + "2\tchłopiec\tchłopiec\t_\t_\t_\t3\tsubj\t3:subj\t_\r\n", | |
1150 | + "3\tpatrzy\tpatrzeć\t_\t_\t_\t0\troot\t0:root\t_\r\n", | |
1151 | + "4\tw\tw\t_\t_\t_\t3\tadjunct_adl\t3:adjunct_adl\t_\r\n", | |
1152 | + "5\tbok\tbok\t_\t_\t_\t4\tcomp\t4:comp\t_\r\n", | |
1153 | + "6\tpo\tpo\t_\t_\t_\t3\tadjunct_temp\t3:adjunct_temp\t_\r\n", | |
1154 | + "7\tściągnięciu\tściągnąć\t_\t_\t_\t6\tcomp\t6:comp\t_\r\n", | |
1155 | + "8\tokularów\tokulary\t_\t_\t_\t7\tobj\t7:obj\t_\r\n", | |
1156 | + "9\t.\t.\t_\t_\t_\t3\tpunct\t3:punct\t_\r\n" | |
1157 | + ] | |
1158 | + } | |
1159 | + ], | |
1160 | + "source": [ | |
1161 | + "! head connlu/pdbc-cont-test.conllu" | |
1162 | + ] | |
1163 | + }, | |
1164 | + { | |
1165 | + "cell_type": "code", | |
1166 | + "execution_count": 13, | |
1167 | + "id": "34aa16d9", | |
1168 | + "metadata": {}, | |
1169 | + "outputs": [ | |
1170 | + { | |
1171 | + "name": "stdout", | |
1172 | + "output_type": "stream", | |
1173 | + "text": [ | |
1174 | + "# text = Mały chłopiec patrzy w bok po ściągnięciu okularów .\r\n", | |
1175 | + "1\tMały\tmały\t_\t_\t_\t2\tadjunct\t2:adjunct\t_\r\n", | |
1176 | + "2\tchłopiec\tchłopiec\t_\t_\t_\t3\tsubj\t3:subj\t_\r\n", | |
1177 | + "3\tpatrzy\tpatrzeć\t_\t_\t_\t0\troot\t0:root\t_\r\n", | |
1178 | + "4\tw\tw\t_\t_\t_\t3\tcomp\t3:adjunct_adl\t_\r\n", | |
1179 | + "5\tbok\tbok\t_\t_\t_\t4\tcomp\t4:comp\t_\r\n", | |
1180 | + "6\tpo\tpo\t_\t_\t_\t3\tadjunct_temp\t3:adjunct_temp\t_\r\n", | |
1181 | + "7\tściągnięciu\tściągnąć\t_\t_\t_\t6\tcomp\t6:comp\t_\r\n", | |
1182 | + "8\tokularów\tokulary\t_\t_\t_\t7\tobj\t7:obj\t_\r\n", | |
1183 | + "9\t.\t.\t_\t_\t_\t3\tpunct\t3:punct\t_\r\n" | |
1184 | + ] | |
1185 | + } | |
1186 | + ], | |
1187 | + "source": [ | |
1188 | + "! head connlu/pdbc-cont-test-pred.conllu" | |
1189 | + ] | |
1190 | + } | |
1191 | + ], | |
1192 | + "metadata": { | |
1193 | + "kernelspec": { | |
1194 | + "display_name": "combo_python39", | |
1195 | + "language": "python", | |
1196 | + "name": "combo_python39" | |
1197 | + }, | |
1198 | + "language_info": { | |
1199 | + "codemirror_mode": { | |
1200 | + "name": "ipython", | |
1201 | + "version": 3 | |
1202 | + }, | |
1203 | + "file_extension": ".py", | |
1204 | + "mimetype": "text/x-python", | |
1205 | + "name": "python", | |
1206 | + "nbconvert_exporter": "python", | |
1207 | + "pygments_lexer": "ipython3", | |
1208 | + "version": "3.8.16" | |
1209 | + } | |
1210 | + }, | |
1211 | + "nbformat": 4, | |
1212 | + "nbformat_minor": 5 | |
1213 | +} | |
... | ... |