DataPreparation.ipynb 8.93 KB
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "5cd26f6f",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "from datasets import load_dataset\n",
    "\n",
    "from IPython.display import display\n",
    "\n",
    "import sys\n",
    "sys.path.append('../')\n",
    "from neural_parser import hybrid_tree_utils"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "fecef4af",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Found cached dataset pdb_c_beta (/home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "d6fc0deda216433982f304d7451158b2",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/3 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Load the dataset from the local `pdb_c_beta` directory one level up;\n",
    "# later cells iterate over the result by split name.\n",
    "pdbc_dataset = load_dataset('../pdb_c_beta/')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "23da801f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Directory that receives the bracketed-tree files written by this notebook.\n",
    "BRACKETS_DIR = 'brackets'\n",
    "# Start from an empty directory on every run. `-f` keeps `rm` from erroring\n",
    "# when the directory does not exist yet (fresh checkout); `-p` keeps `mkdir`\n",
    "# from failing if the directory is already there.\n",
    "! rm -rf {BRACKETS_DIR}\n",
    "! mkdir -p {BRACKETS_DIR}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "c105feff",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "train\n",
      "    brackets/pdbc-cont-train.dat\n",
      "        15903\n",
      "validation\n",
      "    brackets/pdbc-cont-validation.dat\n",
      "        1980\n",
      "test\n",
      "    brackets/pdbc-cont-test.dat\n",
      "        1990\n"
     ]
    }
   ],
   "source": [
    "# Feature description of the train split, passed to the tree builder for\n",
    "# every split below (assumes all splits share the same features - TODO confirm).\n",
    "features = pdbc_dataset['train'].features\n",
    "\n",
    "# For each split: build a tree per sentence, keep only the continuous ones,\n",
    "# and write them one per line in bracket notation wrapped in a TOP node.\n",
    "for part, dataset in pdbc_dataset.items():\n",
    "    print(part)\n",
    "    b_cont = []\n",
    "    for sentence in dataset:\n",
    "        tree = hybrid_tree_utils.tree_from_dataset_instance(sentence, features)\n",
    "        # Trees that are not continuous are skipped and do not reach the output file.\n",
    "        if tree.is_continuous():\n",
    "            b_cont.append(f'(TOP {tree.to_brackets(morph_tags=True)})')\n",
    "    filepath = os.path.join(BRACKETS_DIR, f'pdbc-cont-{part}.dat')\n",
    "    # The trees contain Polish text with diacritics: pin UTF-8 explicitly instead\n",
    "    # of relying on the locale's default encoding, which may fail to encode them.\n",
    "    with open(filepath, 'w', encoding='utf-8') as f:\n",
    "        # Report the output path and the number of trees kept for this split.\n",
    "        print('   ', filepath)\n",
    "        print('       ', len(b_cont))\n",
    "        for row in b_cont:\n",
    "            print(row, file=f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "c849233c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "    1990   121784  1024525 brackets/pdbc-cont-test.dat\n",
      "   15903  1022627  8620535 brackets/pdbc-cont-train.dat\n",
      "    1980   126288  1065593 brackets/pdbc-cont-validation.dat\n",
      "   19873  1270699 10710653 total\n"
     ]
    }
   ],
   "source": [
    "# Sanity check: line, word and byte counts of every .dat file in the output directory.\n",
    "! wc {BRACKETS_DIR}/*.dat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "679b9f10",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(TOP (ROOT (*S (S (NP (AdjP (*Adj (adj:sg:nom:f:pos Skośnooka))) (*NP (*N (subst:sg:nom:f dziewczynka)))) (*VP (*V (fin:sg:ter:imperf trzyma))) (PrepNP (*Prep (prep:loc:nwok w)) (NP (*N (subst:pl:loc:f rękach)))) (NP (AdjP (*Adj (adj:pl:nom:f:pos drewniane))) (*NP (*N (subst:pl:nom:f pałeczki))))) (Punct (interp ,)) (*Conj (conj a)) (S (PrepNP (*Prep (prep:inst:nwok przed)) (NP (*N (ppron3:sg:inst:f:ter:akc:praep nią)))) (*VP (*V (fin:pl:ter:imperf znajdują))) (Part (part się)) (NP (*NP (*N (subst:pl:nom:n:ncol naczynia))) (AdjP (*Adj (adj:pl:nom:n:pos kuchenne)))))) (Punct (interp .))))\r\n",
      "(TOP (ROOT (*S (NP (*NP (*N (subst:sg:nom:n:col Dziecko))) (PrepNP (*Prep (prep:loc:nwok w)) (NP (AdjP (*Adj (adj:sg:loc:f:pos różowej))) (*NP (*N (subst:sg:loc:f opasce)))))) (*VP (*V (fin:sg:ter:imperf unosi))) (PrepNP (*Prep (prep:loc:nwok w)) (NP (*N (subst:pl:loc:f rękach)))) (NP (AdjP (*Adj (adj:pl:acc:m3:pos drewniane))) (*NP (*N (subst:pl:acc:m3 patyczki)))) (PrepNP (*Prep (prep:inst:nwok nad)) (NP (AdjP (*AdjP (*Adj (ppas:pl:inst:n:perf:aff postawionymi))) (NP (PrepNP (*Prep (prep:gen do)) (NP (*N (subst:sg:gen:f góry)))) (*NP (*N (subst:sg:inst:n:ncol dnem))))) (*NP (NP (*N (subst:sg:inst:f miską))) (*Conj (conj i)) (NP (*N (subst:sg:inst:m3 garnkiem))))))) (Punct (interp .))))\r\n",
      "(TOP (ROOT (*S (NP (*NP (*N (subst:pl:nom:m1 Zawodnicy))) (PrepNP (*Prep (prep:loc:nwok w)) (NP (*NP (*N (subst:sg:loc:n:ncol pobliżu))) (NP (*N (subst:sg:gen:f piłki)))))) (*VP (*V (fin:pl:ter:imperf przepychają))) (Part (part się)) (PrepNP (*Prep (prep:inst między)) (NP (*N (siebie:inst sobą)))) (PrepNP (*Prep (prep:loc na)) (NP (*N (subst:sg:loc:n:ncol boisku))))) (Punct (interp .))))\r\n",
      "(TOP (ROOT (*S (S (NP (*NP (*N (subst:sg:nom:f Dziewczynka))) (PrepNP (*Prep (prep:loc:nwok w)) (NP (*N (subst:sg:loc:f sukience))))) (*VP (*V (fin:sg:ter:imperf puszcza))) (NP (*NP (*N (subst:pl:acc:f bańki))) (AdjP (*Adj (adj:pl:acc:f:pos mydlane)))) (PrepNP (*Prep (prep:loc na)) (NP (*N (subst:sg:loc:f trawie))))) (Punct (interp ,)) (*Conj (conj a)) (S (PrepNP (*Prep (prep:inst za)) (NP (*N (ppron3:sg:inst:f:ter:akc:praep nią)))) (*VP (*V (fin:sg:ter:imperf stoi))) (NP (AdjP (*Adj (adj:sg:nom:f:pos druga))) (*NP (*N (subst:sg:nom:f dziewczynka)))))) (Punct (interp .))))\r\n",
      "(TOP (ROOT (*S (NP (*NP (*N (subst:pl:nom:f Dziewczynki))) (PrepNP (*Prep (prep:loc:nwok w)) (NP (AdjP (*Adj (adj:pl:loc:f:pos kolorowych))) (*NP (*N (subst:pl:loc:f sukienkach)))))) (*VP (*V (fin:pl:ter:imperf stoją))) (PrepNP (*Prep (prep:loc na)) (NP (*N (subst:sg:loc:f trawie)))) (VP (Punct (interp ,)) (*VP (*V (pcon:imperf puszczając))) (NP (*NP (*N (subst:pl:acc:f bańki))) (AdjP (*Adj (adj:pl:acc:f:pos mydlane)))))) (Punct (interp .))))\r\n",
      "(TOP (ROOT (*S (NP (*NP (*N (subst:sg:nom:f Grupa))) (NP (*N (subst:pl:gen:n:col dzieci)))) (*VP (*V (fin:sg:ter:imperf moczy))) (Part (part się)) (PrepNP (*Prep (prep:loc:nwok w)) (NP (*NP (*N (subst:sg:loc:f wodzie))) (PrepNP (*Prep (prep:gen:nwok z)) (NP (*N (subst:sg:gen:f fontanny))))))) (Punct (interp .))))\r\n",
      "(TOP (ROOT (*S (NP (*NumP (*Num (num:pl:nom:m1:rec:ncol Kilku))) (NP (*N (subst:pl:gen:m1 chłopców)))) (*VP (*V (fin:sg:ter:imperf kąpie))) (Part (part się)) (PrepNP (*Prep (prep:loc:nwok w)) (NP (*NP (*N (subst:sg:loc:f fontannie))) (PrepNP (*Prep (prep:gen obok)) (NP (*NP (*N (subst:pl:gen:m3 stolików))) (CP (Punct (interp ,)) (*S (PrepAdjP (*Prep (prep:loc przy)) (AdjP (*Adj (adj:pl:loc:m3:pos których)))) (*VP (*V (fin:pl:ter:imperf siedzą))) (NP (*N (subst:pl:nom:m1 ludzie)))))))))) (Punct (interp .))))\r\n",
      "(TOP (ROOT (*S (NP (*NP (*N (subst:sg:nom:f Dwójka))) (NP (AdjP (*Adj (adj:pl:gen:n:pos nagich))) (*NP (*N (subst:pl:gen:n:col dzieci))) (AdjP (*AdjP (*Adj (ppas:pl:gen:n:perf:aff ubrudzonych))) (NP (*N (subst:pl:inst:f farbkami)))))) (*VP (*V (fin:sg:ter:imperf siedzi))) (PrepNP (*Prep (prep:loc na)) (NP (*N (subst:sg:loc:f podłodze)))) (PrepNP (*Prep (prep:gen obok)) (NP (AdjP (*Adj (adj:pl:gen:f:pos porozrzucanych))) (*NP (*N (subst:pl:gen:f kartek)))))) (Punct (interp .))))\r\n",
      "(TOP (ROOT (*S (S (NP (*NumP (*Num (num:pl:nom:n:rec:col Dwoje))) (NP (AdjP (AdjP (*Adj (adj:pl:gen:n:pos nagich))) (*Conj (interp ,)) (AdjP (*Adj (adj:pl:gen:n:pos małych)))) (*NP (*N (subst:pl:gen:n:col dzieci))))) (*VP (*V (fin:sg:ter:imperf siedzi))) (PrepNP (*Prep (prep:gen naprzeciwko)) (NP (*N (siebie:gen siebie))))) (*Conj (conj i)) (S (NP (AdjP (*Adj (adj:sg:nom:n:com większe))) (*NP (*N (subst:sg:nom:n:col dziecko)))) (*VP (*V (fin:sg:ter:imperf smaruje))) (NP (*N (subst:sg:inst:f farbą))) (NP (AdjP (*Adj (adj:sg:acc:n:com mniejsze))) (*NP (*N (subst:sg:acc:n:col dziecko)))))) (Punct (interp .))))\r\n",
      "(TOP (ROOT (*S (NP (*NP (*N (subst:sg:nom:f Dziewczynka))) (PrepNP (*Prep (prep:loc o)) (NP (AdjP (*Adj (adj:pl:loc:n:pos ciemnych))) (*NP (*N (subst:pl:loc:n:col oczach)))))) (*VP (*V (fin:sg:ter:imperf patrzy))) (PrepNP (*Prep (prep:acc na)) (NP (AdjP (*Adj (adj:sg:acc:m3:pos czarny))) (*NP (*N (subst:sg:acc:m3 przedmiot))) (CP (Punct (interp ,)) (*S (AdjP (*Adj (adj:sg:acc:m3:pos który))) (*VP (*V (fin:sg:ter:imperf trzyma))) (PrepNP (*Prep (prep:loc:nwok w)) (NP (*N (subst:sg:loc:f ręce))))))))) (Punct (interp .))))\r\n"
     ]
    }
   ],
   "source": [
    "# Peek at the first lines of the train file to eyeball the bracket format.\n",
    "! head {BRACKETS_DIR}/pdbc-cont-train.dat"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "TF_zajecia",
   "language": "python",
   "name": "tf_zajecia"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}