diff --git a/notebooks/ex1_Inspect_DistilBERT_Vocabulary.ipynb b/notebooks/ex1_Inspect_DistilBERT_Vocabulary.ipynb index 08634e7..b043406 100644 --- a/notebooks/ex1_Inspect_DistilBERT_Vocabulary.ipynb +++ b/notebooks/ex1_Inspect_DistilBERT_Vocabulary.ipynb @@ -5,7 +5,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This notebook is copied from Chris McCormick's blog post: https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/ and applied to Distilbert so we can compare the vocabulary size and the embeddings of distilbert to BERT." + "This notebook is copied from Chris McCormick's blog post (https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/), updated to use the transformers library and adapted for DistilBERT (TODO: finish adapting).\n", + "\n", + "DistilBERT should have the same vocabulary size and embeddings as BERT (TODO: verify)." ] }, { @@ -28,7 +30,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -37,39 +39,10 @@ "id": "PzCjfNB6jksJ", "outputId": "9aa79438-898e-4d43-8487-b0684dfe2623" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Collecting pytorch-pretrained-bert\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/d7/e0/c08d5553b89973d9a240605b9c12404bcf8227590de62bae27acbcfe076b/pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123kB)\n", - "\r\u001b[K |██▋ | 10kB 18.2MB/s eta 0:00:01\r\u001b[K |█████▎ | 20kB 6.7MB/s eta 0:00:01\r\u001b[K |████████ | 30kB 9.4MB/s eta 0:00:01\r\u001b[K |██████████▋ | 40kB 5.7MB/s eta 0:00:01\r\u001b[K |█████████████▎ | 51kB 6.9MB/s eta 0:00:01\r\u001b[K |███████████████▉ | 61kB 8.1MB/s eta 0:00:01\r\u001b[K |██████████████████▌ | 71kB 9.2MB/s eta 0:00:01\r\u001b[K |█████████████████████▏ | 81kB 10.3MB/s eta 0:00:01\r\u001b[K |███████████████████████▉ | 92kB 11.4MB/s eta 0:00:01\r\u001b[K |██████████████████████████▌ | 102kB 9.1MB/s eta 0:00:01\r\u001b[K 
|█████████████████████████████▏ | 112kB 9.1MB/s eta 0:00:01\r\u001b[K |███████████████████████████████▊| 122kB 9.1MB/s eta 0:00:01\r\u001b[K |████████████████████████████████| 133kB 9.1MB/s \n", - "\u001b[?25hRequirement already satisfied: torch>=0.4.1 in /usr/local/lib/python3.6/dist-packages (from pytorch-pretrained-bert) (1.3.1)\n", - "Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from pytorch-pretrained-bert) (2.21.0)\n", - "Requirement already satisfied: tqdm in /usr/local/lib/python3.6/dist-packages (from pytorch-pretrained-bert) (4.28.1)\n", - "Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from pytorch-pretrained-bert) (1.17.4)\n", - "Collecting regex\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/e3/8e/cbf2295643d7265e7883326fb4654e643bfc93b3a8a8274d8010a39d8804/regex-2019.11.1-cp36-cp36m-manylinux1_x86_64.whl (643kB)\n", - "\u001b[K |████████████████████████████████| 645kB 31.3MB/s \n", - "\u001b[?25hRequirement already satisfied: boto3 in /usr/local/lib/python3.6/dist-packages (from pytorch-pretrained-bert) (1.10.18)\n", - "Requirement already satisfied: urllib3<1.25,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests->pytorch-pretrained-bert) (1.24.3)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->pytorch-pretrained-bert) (2019.9.11)\n", - "Requirement already satisfied: idna<2.9,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->pytorch-pretrained-bert) (2.8)\n", - "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->pytorch-pretrained-bert) (3.0.4)\n", - "Requirement already satisfied: s3transfer<0.3.0,>=0.2.0 in /usr/local/lib/python3.6/dist-packages (from boto3->pytorch-pretrained-bert) (0.2.1)\n", - "Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /usr/local/lib/python3.6/dist-packages (from 
boto3->pytorch-pretrained-bert) (0.9.4)\n", - "Requirement already satisfied: botocore<1.14.0,>=1.13.18 in /usr/local/lib/python3.6/dist-packages (from boto3->pytorch-pretrained-bert) (1.13.18)\n", - "Requirement already satisfied: python-dateutil<2.8.1,>=2.1; python_version >= \"2.7\" in /usr/local/lib/python3.6/dist-packages (from botocore<1.14.0,>=1.13.18->boto3->pytorch-pretrained-bert) (2.6.1)\n", - "Requirement already satisfied: docutils<0.16,>=0.10 in /usr/local/lib/python3.6/dist-packages (from botocore<1.14.0,>=1.13.18->boto3->pytorch-pretrained-bert) (0.15.2)\n", - "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.6/dist-packages (from python-dateutil<2.8.1,>=2.1; python_version >= \"2.7\"->botocore<1.14.0,>=1.13.18->boto3->pytorch-pretrained-bert) (1.12.0)\n", - "Installing collected packages: regex, pytorch-pretrained-bert\n", - "Successfully installed pytorch-pretrained-bert-0.6.2 regex-2019.11.1\n" - ] - } - ], + "outputs": [], "source": [ - "# !pip install pytorch-pretrained-bert" + "# !pip install transformers\n", + "# !pip install torch" ] }, { @@ -137,7 +110,7 @@ " for token in tokenizer.vocab.keys():\n", " \n", " # Write it out and escape any unicode characters. \n", - " f.write(token + '\\n')\n" + " f.write(token + '\\n')" ] }, {