diff --git a/HW_2.ipynb b/HW_2.ipynb new file mode 100644 index 0000000..6b85c7c --- /dev/null +++ b/HW_2.ipynb @@ -0,0 +1,512 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.7" + }, + "colab": { + "name": "HW 2.ipynb", + "provenance": [], + "include_colab_link": true + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DfUHKA19ifXX" + }, + "source": [ + "# Homework Assignment 2\n", + "### [The Art of Analyzing Big Data - The Data Scientist’s Toolbox](https://www.ise.bgu.ac.il/labs/fire/lectures.html)\n", + "#### By Dr. Michael Fire " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "D9hGo7nXifXa" + }, + "source": [ + "## Dataset Collecting" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "D15-Sk2lifXc" + }, + "source": [ + "**Question 1**: Write a function that collects all titles and number of votes for each title of a given [hacker news page](http://news.ycombinator.com) (15pt)\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "dR0z-yztifXe" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "fpKRAkjtifXk" + }, + "source": [ + "**Question 2**: Write a function that collects data on four James Bond movies from [The Movie Database](https://www.themoviedb.org). 
def question_number(student_id):
    """Return which Kickstarter question a student has to answer.

    Implements the assignment rule ``(ID number + 1) mod 3 + 1``.

    Args:
        student_id: The ID number, as a string of digits or an int.

    Returns:
        int: 1, 2 or 3.

    Raises:
        ValueError: If ``student_id`` cannot be converted by ``int()``.
    """
    return (int(student_id) + 1) % 3 + 1


# which question to answer - put your ID number and run the code
your_id = ""

if str(your_id).strip():
    q = question_number(your_id)
    print("You need to answer questions %s and 4" % q)
else:
    # The empty placeholder used to crash on int("") and stop "Run All";
    # report the missing input instead and leave q unset.
    q = None
    print("Set your_id to your ID number and re-run this cell.")
# %% Imports used by the data-setup cells
import json  # only needed for the alternative token setup described below
import os
import shutil
import subprocess
import sys
import zipfile

import pandas as pd  # we will talk more about pandas in our next lecture
from google.colab import drive

# %% Mount Google Drive (the Kaggle token is read from it)
drive.mount('/content/drive')

# %% Install the Kaggle client and put the API token in place
# Install through the running interpreter so the package lands in the
# kernel's own environment.
subprocess.run([sys.executable, "-m", "pip", "install", "-q", "kaggle"], check=True)

KAGGLE_DIR = "/root/.kaggle"
KAGGLE_TOKEN = os.path.join(KAGGLE_DIR, "kaggle.json")
DRIVE_TOKEN = "/content/drive/MyDrive/kaggle.json"

# exist_ok keeps the cell re-runnable (a plain mkdir fails the second time).
os.makedirs(KAGGLE_DIR, exist_ok=True)

# Alternative without Google Drive: write the token file directly.
# Important note: complete this with your own key, and after running it for
# the first time remember to **remove** your API key from the notebook.
#   api_token = {"username": "", "key": ""}
#   with open(KAGGLE_TOKEN, "w") as file:
#       json.dump(api_token, file)
shutil.copyfile(DRIVE_TOKEN, KAGGLE_TOKEN)  # raises if the token is missing
os.chmod(KAGGLE_TOKEN, 0o600)  # owner read/write only, same as `chmod 600`

# %% Download the Marvel hero network from Kaggle and unzip it
MARVEL_DIR = os.path.join(".", "datasets", "marvel")
os.makedirs(MARVEL_DIR, exist_ok=True)  # creates ./datasets as well

subprocess.run(
    [
        "kaggle", "datasets", "download",
        "csanhueza/the-marvel-universe-social-network",
        "-f", "hero-network.csv",
        "-p", MARVEL_DIR,
    ],
    check=True,  # fail loudly instead of continuing without the data
)

# extractall() overwrites an earlier extraction without prompting, so the
# cell can be re-run.
with zipfile.ZipFile(os.path.join(MARVEL_DIR, "hero-network.csv.zip")) as archive:
    archive.extractall(MARVEL_DIR)

# %% Load the edge list
# Read from the directory the download step wrote to, rather than from a
# separate hard-coded absolute path.
df = pd.read_csv(os.path.join(MARVEL_DIR, "hero-network.csv"), engine='python')
df.head()
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
hero1hero2
0LITTLE, ABNERPRINCESS ZANDA
1LITTLE, ABNERBLACK PANTHER/T'CHAL
2BLACK PANTHER/T'CHALPRINCESS ZANDA
3LITTLE, ABNERPRINCESS ZANDA
4LITTLE, ABNERBLACK PANTHER/T'CHAL
def build_friend_lists(edges):
    """Group the ``hero2`` values of an edge table by their ``hero1`` value.

    Args:
        edges: Table with ``hero1`` and ``hero2`` columns (for example the
            ``df`` read from hero-network.csv). Only lookup of these two
            columns by name is used.

    Returns:
        dict: Maps each distinct ``hero1`` value to the list of ``hero2``
        values found on its rows, in row order. Repeated rows give repeated
        entries, and a character that appears only under ``hero2`` gets no
        key of its own.
    """
    friends = {}
    # Walk the two columns directly; iterrows() builds a Series per row just
    # to read these two fields.
    for first, second in zip(edges['hero1'], edges['hero2']):
        # Key on the hero's name. The earlier membership test used the string
        # literal "hero1" instead of the variable, so the list was re-created
        # on every row and only the last hero2 per hero was kept.
        friends.setdefault(first, []).append(second)
    return friends


hero_dic = build_friend_lists(df)

# NOTE(review): hero_dic is directional (hero1 -> hero2 only) and may hold
# duplicates; the top-10 ranking asked for by the question is not computed
# in this cell yet.
new_df = {'hero1': [], 'hero2': []}
null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ELLg4egiifYA" + }, + "source": [ + "**Bonus:** Visualize the above network using [Cytoscape](https://cytoscape.org) or [Gephi](https://gephi.org) (10pt)" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "TQvUWrqWifYB" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file