DataPreparation.ipynb 6.2 KB
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 46,
   "id": "5cd26f6f",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "from datasets import load_dataset\n",
    "\n",
    "from IPython.display import display"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "id": "fecef4af",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Found cached dataset pdb_c_beta (/home/kkrasnowska/.cache/huggingface/datasets/pdb_c_beta/pdb_c_beta/0.2.0/d9c6dc764ae2a3483fa112c6159db4a0342dba8083bdb3b5981c45435b0692e1)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "1c89c7103bba4347a3fa7d23cac42cfe",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/3 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Load the dataset from the local path ../pdb_c_beta. Later cells index the\n",
    "# result by split name (pdbc_dataset['train']) and iterate over .items().\n",
    "pdbc_dataset = load_dataset('../pdb_c_beta')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "23da801f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Output directory for the generated .conllu files (path is relative to the\n",
    "# notebook's working directory).\n",
    "CONLLU_DIR = 'connlu'\n",
    "# Start from an empty directory on every run. -f and -p keep the cell from\n",
    "# erroring on a fresh checkout (directory missing) or on a re-run.\n",
    "! rm -rf {CONLLU_DIR}\n",
    "! mkdir -p {CONLLU_DIR}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "id": "91fb3bf3",
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "# Add the parent directory to the import path before importing from\n",
    "# neural_parser (presumably that is where the package lives -- confirm).\n",
    "sys.path.append('../')\n",
    "from neural_parser.hybrid_tree_utils import tree_from_dataset_instance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "id": "c105feff",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "train\n",
      "    connlu/pdbc-train.conllu\n",
      "        17659\n",
      "    connlu/pdbc-cont-train.conllu\n",
      "        15903\n",
      "validation\n",
      "    connlu/pdbc-validation.conllu\n",
      "        2211\n",
      "    connlu/pdbc-cont-validation.conllu\n",
      "        1980\n",
      "test\n",
      "    connlu/pdbc-test.conllu\n",
      "        2205\n",
      "    connlu/pdbc-cont-test.conllu\n",
      "        1990\n"
     ]
    }
   ],
   "source": [
    "features = pdbc_dataset['train'].features\n",
    "# Label decoder for dependency relations; looked up once, outside the loops.\n",
    "deprel_names = features['deprels'].feature\n",
    "\n",
    "\n",
    "def write_conllu(path, sentences):\n",
    "    \"\"\"Write sentences to ``path`` and echo the path and sentence count.\n",
    "\n",
    "    ``sentences`` is a list of sentences, each a list of ready-made text\n",
    "    lines (the ``# text`` comment followed by one line per token). Every\n",
    "    sentence is followed by a blank line in the file.\n",
    "    \"\"\"\n",
    "    # Explicit UTF-8: the text is Polish, and the platform default encoding\n",
    "    # is not guaranteed to be able to represent it.\n",
    "    with open(path, 'w', encoding='utf-8') as f:\n",
    "        print('   ', path)\n",
    "        print('       ', len(sentences))\n",
    "        for rows in sentences:\n",
    "            print('\\n'.join(rows), end='\\n\\n', file=f)\n",
    "\n",
    "\n",
    "# For every split, write two files: all sentences, and only those whose tree\n",
    "# is reported as continuous by tree_from_dataset_instance(...).is_continuous().\n",
    "for part, dataset in pdbc_dataset.items():\n",
    "    print(part)\n",
    "    s_cont, s_all = [], []\n",
    "    for sentence in dataset:\n",
    "        tokens = sentence['tokens']\n",
    "        lemmas = sentence['lemmas']\n",
    "        # Token IDs are written 1-based below, so heads are shifted by one as\n",
    "        # well; a missing head (None) is written as 0.\n",
    "        heads = [h + 1 if h is not None else 0 for h in sentence['heads']]\n",
    "        deprels = [deprel_names.int2str(d) for d in sentence['deprels']]\n",
    "        deprels = ['root' if deprel == 'ROOT' else deprel for deprel in deprels]\n",
    "        # Ten tab-separated columns per token; only ID, FORM, LEMMA, HEAD,\n",
    "        # DEPREL and DEPS (head:deprel) are filled, the rest are '_'.\n",
    "        rows = [f'# text = {\" \".join(tokens)}']\n",
    "        rows += [\n",
    "            f'{i}\\t{tok}\\t{lem}\\t_\\t_\\t_\\t{head}\\t{rel}\\t{head}:{rel}\\t_'\n",
    "            for i, (tok, lem, head, rel) in enumerate(zip(tokens, lemmas, heads, deprels), start=1)\n",
    "        ]\n",
    "        s_all.append(rows)\n",
    "        if tree_from_dataset_instance(sentence, features).is_continuous():\n",
    "            s_cont.append(rows)\n",
    "    write_conllu(os.path.join(CONLLU_DIR, f'pdbc-{part}.conllu'), s_all)\n",
    "    write_conllu(os.path.join(CONLLU_DIR, f'pdbc-cont-{part}.conllu'), s_cont)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "id": "c849233c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   32509   319813  1398303 connlu/pdbc-cont-test.conllu\n",
      "   32509   319813  1198902 connlu/pdbc-cont-test-pred.conllu\n",
      "  271337  2682725 11781617 connlu/pdbc-cont-train.conllu\n",
      "   33491   330792  1452373 connlu/pdbc-cont-validation.conllu\n",
      "   33491   330792  1244192 connlu/pdbc-cont-validation-pred.conllu\n",
      "   37754   373431  1639937 connlu/pdbc-test.conllu\n",
      "   37754   373431  1406776 connlu/pdbc-test-pred.conllu\n",
      "  315364  3133712 13808053 connlu/pdbc-train.conllu\n",
      "   38987   386865  1704685 connlu/pdbc-validation.conllu\n",
      "   38987   386865  1461922 connlu/pdbc-validation-pred.conllu\n",
      "  872183  8638239 37096760 total\n"
     ]
    }
   ],
   "source": [
    "# Line, word and byte counts for every .conllu file in the output directory.\n",
    "! wc {CONLLU_DIR}/*.conllu"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "id": "6b571716",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "# text = Skośnooka dziewczynka trzyma w rękach drewniane pałeczki , a przed nią znajdują się naczynia kuchenne .\r\n",
      "1\tSkośnooka\tskośnooki\t_\t_\t_\t2\tadjunct\t2:adjunct\t_\r\n",
      "2\tdziewczynka\tdziewczynka\t_\t_\t_\t3\tsubj\t3:subj\t_\r\n",
      "3\ttrzyma\ttrzymać\t_\t_\t_\t9\tconjunct\t9:conjunct\t_\r\n",
      "4\tw\tw\t_\t_\t_\t3\tadjunct_locat\t3:adjunct_locat\t_\r\n",
      "5\trękach\tręka\t_\t_\t_\t4\tcomp\t4:comp\t_\r\n",
      "6\tdrewniane\tdrewniany\t_\t_\t_\t7\tadjunct\t7:adjunct\t_\r\n",
      "7\tpałeczki\tpałeczka\t_\t_\t_\t3\tobj\t3:obj\t_\r\n",
      "8\t,\t,\t_\t_\t_\t9\tpunct\t9:punct\t_\r\n",
      "9\ta\ta\t_\t_\t_\t0\troot\t0:root\t_\r\n"
     ]
    }
   ],
   "source": [
    "# Show the first lines of the training file to eyeball the column layout.\n",
    "! head {CONLLU_DIR}/pdbc-train.conllu"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "TF_zajecia",
   "language": "python",
   "name": "tf_zajecia"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}