From f028915426c928864b36bc144056df12b0e7c7da Mon Sep 17 00:00:00 2001
From: southjohn64 <61773763+southjohn64@users.noreply.github.com>
Date: Tue, 16 Nov 2021 10:18:47 +0200
Subject: [PATCH] Created using Colaboratory
---
Copy_of_HW_4.ipynb | 1410 ++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 1410 insertions(+)
create mode 100644 Copy_of_HW_4.ipynb
diff --git a/Copy_of_HW_4.ipynb b/Copy_of_HW_4.ipynb
new file mode 100644
index 0000000..6dfaf71
--- /dev/null
+++ b/Copy_of_HW_4.ipynb
@@ -0,0 +1,1410 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.7"
+ },
+ "colab": {
+ "name": "Copy of HW-4.ipynb",
+ "provenance": [],
+ "collapsed_sections": [],
+ "include_colab_link": true
+ }
+ },
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "view-in-github",
+ "colab_type": "text"
+ },
+ "source": [
+ "
"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "pOXV8fOLj-7Y"
+ },
+ "source": [
+ "%pip install -q turicreate"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "4eZPBuMDkGiN"
+ },
+ "source": [
+ "import json\n",
+ "import os\n",
+ "import networkx as nx\n",
+ "import turicreate as tc\n",
+ "import pandas as pd\n",
+ "import seaborn as sns\n",
+ "import matplotlib.pyplot as plt\n",
+ "%matplotlib inline"
+ ],
+ "execution_count": 60,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "3wyT54-KkH2K"
+ },
+ "source": [
+ "# Start from a clean ./datasets/reddit directory; -p keeps the cell re-runnable.\n",
+ "!mkdir -p ./datasets/\n",
+ "!rm -rf ./datasets/reddit/"
+ ],
+ "execution_count": 3,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "wXV4CqgFkK1O"
+ },
+ "source": [
+ "# Download and unpack the Dexter subreddit networks, using paths relative to the working directory.\n",
+ "!mkdir -p ./datasets/reddit\n",
+ "!wget http://dynamics.cs.washington.edu/nobackup/reddit/Dexter.tar.gz -O ./datasets/reddit/Dexter.tar.gz\n",
+ "!tar -xf ./datasets/reddit/Dexter.tar.gz -C ./datasets/reddit/\n",
+ "!ls ./datasets/reddit/"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "qBOVT9Z2kNql",
+ "outputId": "aed728a3-8adc-4e4e-8429-e613218e6db3"
+ },
+ "source": [
+ "# Load one snapshot of the Dexter network from the directory the download cell unpacked into.\n",
+ "sg = tc.load_sgraph('./datasets/reddit/Dexter.2009-10-23.2009-11-20.sgraph')\n",
+ "sg"
+ ],
+ "execution_count": 51,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "SGraph({'num_edges': 16, 'num_vertices': 8})\n",
+ "Vertex Fields:['__id', 'mindate', 'maxdate']\n",
+ "Edge Fields:['__src_id', '__dst_id', 'maxdate', 'weight', 'mindate']"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 51
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "yCmgortah1Iz"
+ },
+ "source": [
+ "# Homework Assignment 4\n",
+ "### The Art of Analyzing Big Data - The Data Scientist’s Toolbox "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "7A2wSKHih1I3"
+ },
+ "source": [
+ "## Reddit Networks\n",
+ "Using the [Reddit networks dataset](http://dynamics.cs.washington.edu/nobackup/reddit/) select the subreddit of your favorite TV show. Using the data of the selected subreddit, \n",
+ "answer the following questions:"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "1yz7w-sXh1I5"
+ },
+ "source": [
+ "**Task 1:** Calculate and visualize the degree distribution of the vertices in the network (15pt)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "PJe9yE6xh1I7",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 207
+ },
+ "outputId": "1567458b-a3e4-4357-c9d4-96af04c2fb3c"
+ },
+ "source": [
+ "# Run the degree-counting toolkit through the already imported `tc` namespace.\n",
+ "deg = tc.degree_counting.create(sg)\n",
+ "deg_graph = deg['graph']  # a new SGraph with degree data attached to each vertex\n",
+ "deg_graph.vertices[['__id', 'total_degree']]"
+ ],
+ "execution_count": 4,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/html": [
+ "
\n",
+ " \n",
+ " __id | \n",
+ " total_degree | \n",
+ "
\n",
+ " \n",
+ " ContentWithOurDecay | \n",
+ " 8 | \n",
+ "
\n",
+ " \n",
+ " chroniq | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ " aenea | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " Bbaily | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ " DoeL | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " Surcam | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " chime | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " apmihal | \n",
+ " 2 | \n",
+ "
\n",
+ "
\n",
+ "[8 rows x 2 columns]
\n",
+ "
"
+ ],
+ "text/plain": [
+ "Columns:\n",
+ "\t__id\tstr\n",
+ "\ttotal_degree\tint\n",
+ "\n",
+ "Rows: 8\n",
+ "\n",
+ "Data:\n",
+ "+---------------------+--------------+\n",
+ "| __id | total_degree |\n",
+ "+---------------------+--------------+\n",
+ "| ContentWithOurDecay | 8 |\n",
+ "| chroniq | 6 |\n",
+ "| aenea | 4 |\n",
+ "| Bbaily | 6 |\n",
+ "| DoeL | 2 |\n",
+ "| Surcam | 2 |\n",
+ "| chime | 2 |\n",
+ "| apmihal | 2 |\n",
+ "+---------------------+--------------+\n",
+ "[8 rows x 2 columns]"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 4
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "u4FKPZhhkx83"
+ },
+ "source": [
+ "# Copy the id and total-degree columns of every vertex into a pandas DataFrame for plotting.\n",
+ "graph_deg_df = deg_graph.vertices.select_columns(['__id', 'total_degree']).to_dataframe()"
+ ],
+ "execution_count": 5,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 354
+ },
+ "id": "7R5nwl4IkzRq",
+ "outputId": "98ffb6e3-eddd-42b2-9110-b30e2c311acc"
+ },
+ "source": [
+ "sns.set(rc={'figure.figsize':(15.7,5.27)})\n",
+ "\n",
+ "# A degree distribution counts how many vertices have each degree value,\n",
+ "# so plot vertex counts per degree rather than the degree of each named vertex.\n",
+ "ax = sns.countplot(data=graph_deg_df, x='total_degree')\n",
+ "ax.set(title='Degree distribution', xlabel='total degree', ylabel='number of vertices');"
+ ],
+ "execution_count": 18,
+ "outputs": [
+ {
+ "output_type": "display_data",
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {}
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "dL0vRMqQh1I-"
+ },
+ "source": [
+ "**Task 2:** Create a subgraph of the top-40 users according to a selected centrality algorithm. Draw the subgraph using a circular layout (15pt)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "cNRkbMHQh1I_",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 649
+ },
+ "outputId": "5277cb3b-407b-46ea-9c20-8a18c9d80765"
+ },
+ "source": [
+ "# Score every vertex with PageRank and list (at most) the 40 highest-ranked users.\n",
+ "# verbose=False keeps the per-iteration progress table out of the notebook output.\n",
+ "pr = tc.pagerank.create(sg, verbose=False)\n",
+ "sg.vertices['pagerank'] = pr['graph'].vertices['pagerank']  # pr['graph'] is a graph in which each vertex has a pagerank value\n",
+ "sg.vertices.sort('pagerank', ascending=False)[:40]"
+ ],
+ "execution_count": 7,
+ "outputs": [
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/html": [
+ "Counting out degree
"
+ ],
+ "text/plain": [
+ "Counting out degree"
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/html": [
+ "Done counting out degree
"
+ ],
+ "text/plain": [
+ "Done counting out degree"
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/html": [
+ "+-----------+-----------------------+
"
+ ],
+ "text/plain": [
+ "+-----------+-----------------------+"
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/html": [
+ "| Iteration | L1 change in pagerank |
"
+ ],
+ "text/plain": [
+ "| Iteration | L1 change in pagerank |"
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/html": [
+ "+-----------+-----------------------+
"
+ ],
+ "text/plain": [
+ "+-----------+-----------------------+"
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/html": [
+ "| 1 | 5.24167 |
"
+ ],
+ "text/plain": [
+ "| 1 | 5.24167 |"
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/html": [
+ "| 2 | 3.13083 |
"
+ ],
+ "text/plain": [
+ "| 2 | 3.13083 |"
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/html": [
+ "| 3 | 2.25606 |
"
+ ],
+ "text/plain": [
+ "| 3 | 2.25606 |"
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/html": [
+ "| 4 | 1.72552 |
"
+ ],
+ "text/plain": [
+ "| 4 | 1.72552 |"
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/html": [
+ "| 5 | 1.40263 |
"
+ ],
+ "text/plain": [
+ "| 5 | 1.40263 |"
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/html": [
+ "| 6 | 1.17117 |
"
+ ],
+ "text/plain": [
+ "| 6 | 1.17117 |"
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/html": [
+ "| 7 | 0.985853 |
"
+ ],
+ "text/plain": [
+ "| 7 | 0.985853 |"
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/html": [
+ "| 8 | 0.834804 |
"
+ ],
+ "text/plain": [
+ "| 8 | 0.834804 |"
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/html": [
+ "| 9 | 0.708132 |
"
+ ],
+ "text/plain": [
+ "| 9 | 0.708132 |"
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/html": [
+ "| 10 | 0.601435 |
"
+ ],
+ "text/plain": [
+ "| 10 | 0.601435 |"
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/html": [
+ "| 11 | 0.511002 |
"
+ ],
+ "text/plain": [
+ "| 11 | 0.511002 |"
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/html": [
+ "| 12 | 0.434279 |
"
+ ],
+ "text/plain": [
+ "| 12 | 0.434279 |"
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/html": [
+ "| 13 | 0.369105 |
"
+ ],
+ "text/plain": [
+ "| 13 | 0.369105 |"
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/html": [
+ "| 14 | 0.313728 |
"
+ ],
+ "text/plain": [
+ "| 14 | 0.313728 |"
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/html": [
+ "| 15 | 0.266664 |
"
+ ],
+ "text/plain": [
+ "| 15 | 0.266664 |"
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/html": [
+ "| 16 | 0.226663 |
"
+ ],
+ "text/plain": [
+ "| 16 | 0.226663 |"
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/html": [
+ "| 17 | 0.192663 |
"
+ ],
+ "text/plain": [
+ "| 17 | 0.192663 |"
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/html": [
+ "| 18 | 0.163763 |
"
+ ],
+ "text/plain": [
+ "| 18 | 0.163763 |"
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/html": [
+ "| 19 | 0.139198 |
"
+ ],
+ "text/plain": [
+ "| 19 | 0.139198 |"
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/html": [
+ "| 20 | 0.118319 |
"
+ ],
+ "text/plain": [
+ "| 20 | 0.118319 |"
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/html": [
+ "+-----------+-----------------------+
"
+ ],
+ "text/plain": [
+ "+-----------+-----------------------+"
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " __id | \n",
+ " mindate | \n",
+ " maxdate | \n",
+ " pagerank | \n",
+ "
\n",
+ " \n",
+ " chroniq | \n",
+ " 2009-10-23 05:56:41 | \n",
+ " 2013-09-23 16:09:20 | \n",
+ " 2.565090740523448 | \n",
+ "
\n",
+ " \n",
+ " aenea | \n",
+ " 2009-10-23 10:43:06 | \n",
+ " 2011-01-31 13:34:33 | \n",
+ " 2.4913315301242274 | \n",
+ "
\n",
+ " \n",
+ " ContentWithOurDecay | \n",
+ " 2009-10-23 06:26:33 | \n",
+ " 2015-03-01 17:30:59 | \n",
+ " 0.38014102452563364 | \n",
+ "
\n",
+ " \n",
+ " Bbaily | \n",
+ " 2009-10-23 16:27:38 | \n",
+ " 2013-06-13 21:28:03 | \n",
+ " 0.26760331552902794 | \n",
+ "
\n",
+ " \n",
+ " DoeL | \n",
+ " 2009-10-23 12:26:49 | \n",
+ " 2009-10-23 12:26:49 | \n",
+ " 0.2607190204341546 | \n",
+ "
\n",
+ " \n",
+ " chime | \n",
+ " 2009-11-12 00:56:01 | \n",
+ " 2012-11-21 05:54:21 | \n",
+ " 0.2038533155290279 | \n",
+ "
\n",
+ " \n",
+ " apmihal | \n",
+ " 2009-10-24 01:19:37 | \n",
+ " 2010-07-23 04:42:27 | \n",
+ " 0.2038533155290279 | \n",
+ "
\n",
+ " \n",
+ " Surcam | \n",
+ " 2009-10-27 14:28:49 | \n",
+ " 2009-10-27 14:43:13 | \n",
+ " 0.15 | \n",
+ "
\n",
+ "
\n",
+ "[8 rows x 4 columns]
\n",
+ "
"
+ ],
+ "text/plain": [
+ "Columns:\n",
+ "\t__id\tstr\n",
+ "\tmindate\tdatetime\n",
+ "\tmaxdate\tdatetime\n",
+ "\tpagerank\tfloat\n",
+ "\n",
+ "Rows: 8\n",
+ "\n",
+ "Data:\n",
+ "+---------------------+---------------------+---------------------+\n",
+ "| __id | mindate | maxdate |\n",
+ "+---------------------+---------------------+---------------------+\n",
+ "| chroniq | 2009-10-23 05:56:41 | 2013-09-23 16:09:20 |\n",
+ "| aenea | 2009-10-23 10:43:06 | 2011-01-31 13:34:33 |\n",
+ "| ContentWithOurDecay | 2009-10-23 06:26:33 | 2015-03-01 17:30:59 |\n",
+ "| Bbaily | 2009-10-23 16:27:38 | 2013-06-13 21:28:03 |\n",
+ "| DoeL | 2009-10-23 12:26:49 | 2009-10-23 12:26:49 |\n",
+ "| chime | 2009-11-12 00:56:01 | 2012-11-21 05:54:21 |\n",
+ "| apmihal | 2009-10-24 01:19:37 | 2010-07-23 04:42:27 |\n",
+ "| Surcam | 2009-10-27 14:28:49 | 2009-10-27 14:43:13 |\n",
+ "+---------------------+---------------------+---------------------+\n",
+ "+---------------------+\n",
+ "| pagerank |\n",
+ "+---------------------+\n",
+ "| 2.565090740523448 |\n",
+ "| 2.4913315301242274 |\n",
+ "| 0.38014102452563364 |\n",
+ "| 0.26760331552902794 |\n",
+ "| 0.2607190204341546 |\n",
+ "| 0.2038533155290279 |\n",
+ "| 0.2038533155290279 |\n",
+ "| 0.15 |\n",
+ "+---------------------+\n",
+ "[8 rows x 4 columns]"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 7
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "F4VpJrZplTcW",
+ "outputId": "d4d079ad-aea4-4c24-f823-b7882681a1ff"
+ },
+ "source": [
+ "def sgraph2nxgraph(sgraph, is_directed=True, add_vertices_attributes=True, add_edges_attributes=True):\n",
+ "    \"\"\"Convert a Turi Create SGraph into a networkx graph.\n",
+ "\n",
+ "    sgraph: graph to convert; its vertices carry '__id' and its edges '__src_id' / '__dst_id'.\n",
+ "    is_directed: build an nx.DiGraph when True, otherwise an nx.Graph.\n",
+ "    add_vertices_attributes: attach each vertex row as the node's attribute dict.\n",
+ "    add_edges_attributes: attach each edge row as the edge's attribute dict.\n",
+ "    Returns the populated networkx graph.\n",
+ "    \"\"\"\n",
+ "    nx_g = nx.DiGraph() if is_directed else nx.Graph()\n",
+ "\n",
+ "    if add_vertices_attributes:\n",
+ "        vertices = [(row['__id'], row) for row in sgraph.vertices]\n",
+ "    else:\n",
+ "        vertices = list(sgraph.get_vertices()['__id'])\n",
+ "\n",
+ "    if add_edges_attributes:\n",
+ "        edges = [(row['__src_id'], row['__dst_id'], row) for row in sgraph.edges]\n",
+ "    else:\n",
+ "        edges = [(row['__src_id'], row['__dst_id']) for row in sgraph.get_edges()]\n",
+ "\n",
+ "    nx_g.add_nodes_from(vertices)\n",
+ "    nx_g.add_edges_from(edges)\n",
+ "    return nx_g\n",
+ "\n",
+ "\n",
+ "ng = sgraph2nxgraph(sg)\n",
+ "# nx.info() no longer exists in NetworkX 3.x; build the same one-line summary by hand.\n",
+ "print('Networkx: %s with %d nodes and %d edges' % (type(ng).__name__, ng.number_of_nodes(), ng.number_of_edges()))"
+ ],
+ "execution_count": 8,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Networkx: DiGraph with 8 nodes and 16 edges\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 591
+ },
+ "id": "fjHK9XgEmfLq",
+ "outputId": "80a5a127-994d-433e-f721-08e1a36e6ed3"
+ },
+ "source": [
+ "def get_top_40_users_by_selected_centrality(algo_name, network_graph, top_n=40):\n",
+ "    \"\"\"Draw, in a circular layout, the subgraph of the most central users.\n",
+ "\n",
+ "    algo_name: 'page_rank' or 'closeness_centrality'.\n",
+ "    network_graph: networkx graph whose nodes are the users.\n",
+ "    top_n: how many of the highest-scoring users to keep (40 by default).\n",
+ "    Returns the selected users, most central first.\n",
+ "    Raises ValueError when algo_name is not one of the supported names.\n",
+ "    \"\"\"\n",
+ "    # Resolve the algorithm here rather than through helpers defined in later\n",
+ "    # cells, so this cell also works on a fresh kernel run from top to bottom.\n",
+ "    centrality_functions = {\n",
+ "        'page_rank': ('page rank', nx.pagerank),\n",
+ "        'closeness_centrality': ('closeness centrality', nx.closeness_centrality),\n",
+ "    }\n",
+ "    if algo_name not in centrality_functions:\n",
+ "        raise ValueError('unknown centrality algorithm: %r' % (algo_name,))\n",
+ "    title, centrality = centrality_functions[algo_name]\n",
+ "\n",
+ "    # Both functions return {node: score}; rank the nodes by that score.\n",
+ "    scores = centrality(network_graph)\n",
+ "    top_users = sorted(scores, key=scores.get, reverse=True)[:top_n]\n",
+ "\n",
+ "    plt.figure(figsize=(10, 10))\n",
+ "    plt.gca().set_title(title)\n",
+ "    nx.draw_circular(network_graph.subgraph(top_users), with_labels=True)\n",
+ "    return top_users\n",
+ "\n",
+ "\n",
+ "get_top_40_users_by_selected_centrality('page_rank', ng);"
+ ],
+ "execution_count": 30,
+ "outputs": [
+ {
+ "output_type": "display_data",
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {}
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "zXpGJCTkmLQk"
+ },
+ "source": [
+ "**page rank**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "VtHLxXcRlWMp"
+ },
+ "source": [
+ "def do_page_rank(ng):\n",
+ "    \"\"\"Draw the 40 users with the highest PageRank in a circular layout.\n",
+ "\n",
+ "    ng: networkx graph whose nodes are the users.\n",
+ "    \"\"\"\n",
+ "    scores = nx.pagerank(ng)  # {node: pagerank score}\n",
+ "    # Rank by score: iterating the dict yields node names, so key=len would\n",
+ "    # order the users by the length of their name instead.\n",
+ "    top40_pagerank = sorted(scores, key=scores.get, reverse=True)[:40]\n",
+ "    h = ng.subgraph(top40_pagerank)\n",
+ "    plt.figure(figsize=(10, 10))\n",
+ "    ax = plt.gca()\n",
+ "    ax.set_title('page rank')\n",
+ "    nx.draw_circular(h, with_labels=True)"
+ ],
+ "execution_count": 29,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "Zi-tma3plmxd"
+ },
+ "source": [
+ "**closeness_centrality**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "mzvgDdJMllm8"
+ },
+ "source": [
+ "def do_closeness_centrality(ng):\n",
+ "    \"\"\"Draw the 40 users with the highest closeness centrality in a circular layout.\n",
+ "\n",
+ "    ng: networkx graph whose nodes are the users.\n",
+ "    \"\"\"\n",
+ "    scores = nx.closeness_centrality(ng)  # {node: closeness}; can take some time to run\n",
+ "    # Rank by score: iterating the dict yields node names, so key=len would\n",
+ "    # order the users by the length of their name instead.\n",
+ "    top40_closeness_centrality = sorted(scores, key=scores.get, reverse=True)[:40]\n",
+ "    h = ng.subgraph(top40_closeness_centrality)\n",
+ "    plt.figure(figsize=(10, 10))\n",
+ "    ax = plt.gca()\n",
+ "    ax.set_title('closeness centrality')\n",
+ "    nx.draw_circular(h, with_labels=True)"
+ ],
+ "execution_count": 31,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "QFNABWiwh1JC"
+ },
+ "source": [
+ "**Task 3:** Calculate the PageRank, triangles, and average shortest path of each vertex in the graph (15pt)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "OCPK34iAosrX",
+ "outputId": "05a14739-0ec9-45e3-ff62-6b39dc33799c"
+ },
+ "source": [
+ "# Show the scores as a Series: a plain dict is displayed re-sorted by key, which hides the ranking.\n",
+ "pd.Series(nx.pagerank(ng), name='pagerank').sort_values(ascending=False)"
+ ],
+ "execution_count": 37,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "{'Bbaily': 0.06206216197944935,\n",
+ " 'ContentWithOurDecay': 0.0828947633011499,\n",
+ " 'DoeL': 0.033988252927105625,\n",
+ " 'Surcam': 0.022361251873504986,\n",
+ " 'aenea': 0.36057615050300873,\n",
+ " 'apmihal': 0.027394148216789108,\n",
+ " 'chime': 0.03242704456007323,\n",
+ " 'chroniq': 0.3782962266389188}"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 37
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 292
+ },
+ "id": "gCNwALjpr6GA",
+ "outputId": "4a574498-87ed-40e7-c531-87ed7abdc67e"
+ },
+ "source": [
+ "# Run triangle counting and copy the resulting per-vertex column onto sg, then list it in descending order.\n",
+ "triangle_counting_sg = tc.triangle_counting.create(sg)\n",
+ "triangle_counts = triangle_counting_sg['graph'].vertices['triangle_count']\n",
+ "sg.vertices['triangle_count'] = triangle_counts\n",
+ "sg.vertices.sort('triangle_count', ascending=False)"
+ ],
+ "execution_count": 52,
+ "outputs": [
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/html": [
+ "Initializing vertex ids.
"
+ ],
+ "text/plain": [
+ "Initializing vertex ids."
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/html": [
+ "Removing duplicate (bidirectional) edges.
"
+ ],
+ "text/plain": [
+ "Removing duplicate (bidirectional) edges."
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/html": [
+ "Counting triangles...
"
+ ],
+ "text/plain": [
+ "Counting triangles..."
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/html": [
+ "Finished in 0.083595 secs.
"
+ ],
+ "text/plain": [
+ "Finished in 0.083595 secs."
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/html": [
+ "Total triangles in the graph : 7
"
+ ],
+ "text/plain": [
+ "Total triangles in the graph : 7"
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " __id | \n",
+ " mindate | \n",
+ " maxdate | \n",
+ " triangle_count | \n",
+ "
\n",
+ " \n",
+ " ContentWithOurDecay | \n",
+ " 2009-10-23 06:26:33 | \n",
+ " 2015-03-01 17:30:59 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " chroniq | \n",
+ " 2009-10-23 05:56:41 | \n",
+ " 2013-09-23 16:09:20 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " Bbaily | \n",
+ " 2009-10-23 16:27:38 | \n",
+ " 2013-06-13 21:28:03 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " aenea | \n",
+ " 2009-10-23 10:43:06 | \n",
+ " 2011-01-31 13:34:33 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " DoeL | \n",
+ " 2009-10-23 12:26:49 | \n",
+ " 2009-10-23 12:26:49 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " Surcam | \n",
+ " 2009-10-27 14:28:49 | \n",
+ " 2009-10-27 14:43:13 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " apmihal | \n",
+ " 2009-10-24 01:19:37 | \n",
+ " 2010-07-23 04:42:27 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " chime | \n",
+ " 2009-11-12 00:56:01 | \n",
+ " 2012-11-21 05:54:21 | \n",
+ " 0 | \n",
+ "
\n",
+ "
\n",
+ "[8 rows x 4 columns]
\n",
+ "
"
+ ],
+ "text/plain": [
+ "Columns:\n",
+ "\t__id\tstr\n",
+ "\tmindate\tdatetime\n",
+ "\tmaxdate\tdatetime\n",
+ "\ttriangle_count\tint\n",
+ "\n",
+ "Rows: 8\n",
+ "\n",
+ "Data:\n",
+ "+---------------------+---------------------+---------------------+----------------+\n",
+ "| __id | mindate | maxdate | triangle_count |\n",
+ "+---------------------+---------------------+---------------------+----------------+\n",
+ "| ContentWithOurDecay | 2009-10-23 06:26:33 | 2015-03-01 17:30:59 | 5 |\n",
+ "| chroniq | 2009-10-23 05:56:41 | 2013-09-23 16:09:20 | 5 |\n",
+ "| Bbaily | 2009-10-23 16:27:38 | 2013-06-13 21:28:03 | 5 |\n",
+ "| aenea | 2009-10-23 10:43:06 | 2011-01-31 13:34:33 | 3 |\n",
+ "| DoeL | 2009-10-23 12:26:49 | 2009-10-23 12:26:49 | 1 |\n",
+ "| Surcam | 2009-10-27 14:28:49 | 2009-10-27 14:43:13 | 1 |\n",
+ "| apmihal | 2009-10-24 01:19:37 | 2010-07-23 04:42:27 | 1 |\n",
+ "| chime | 2009-11-12 00:56:01 | 2012-11-21 05:54:21 | 0 |\n",
+ "+---------------------+---------------------+---------------------+----------------+\n",
+ "[8 rows x 4 columns]"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 52
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "gWExwapoxkkK"
+ },
+ "source": [
+ "**avg shortest path**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 407
+ },
+ "id": "DFa2XeESuARt",
+ "outputId": "819779ca-283b-4c91-bea1-513247278bf9"
+ },
+ "source": [
+ "# Average shortest-path length from every vertex to the vertices it can reach.\n",
+ "# A path's length is its number of edges, and the zero-length path from a\n",
+ "# vertex to itself is left out of the average.\n",
+ "avg_shortest_path_rows = []\n",
+ "for node, distances in dict(nx.shortest_path_length(ng)).items():\n",
+ "    other_distances = [dist for target, dist in distances.items() if target != node]\n",
+ "    # A vertex that reaches no other vertex has no defined average.\n",
+ "    avg = sum(other_distances) / len(other_distances) if other_distances else float('nan')\n",
+ "    avg_shortest_path_rows.append({'node': node, 'avg_shortest_path': avg})\n",
+ "\n",
+ "shortest_path_avg_df = pd.DataFrame(avg_shortest_path_rows, columns=['node', 'avg_shortest_path'])\n",
+ "shortest_path_avg_df.plot(kind='scatter', x='node', y='avg_shortest_path', title='avg shortest path');"
+ ],
+ "execution_count": 72,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "*c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*. Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points.\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {}
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "EQCElKdf-ltM",
+ "outputId": "41dc22f5-e30e-4303-c6e7-0268b8c7a329"
+ },
+ "source": [
+ "nx"
+ ],
+ "execution_count": 83,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "execution_count": 83
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "U6_x3mtlh1JG"
+ },
+ "source": [
+ "**Task 4:** Use Cytoscape and Gephi to visualize the network, where each vertex's size correlates with its degree (15pt)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "9Uc0HgFjx_PX",
+ "outputId": "59dad206-e59f-4dfa-f0eb-e9b63b908a6b"
+ },
+ "source": [
+ "# Collapse the directed network into an undirected one for export:\n",
+ "# reciprocal links (u,v) and (v,u) become a single edge whose weight is their sum.\n",
+ "dexter_graph = nx.Graph()\n",
+ "dexter_graph.add_nodes_from(ng.nodes())\n",
+ "\n",
+ "# default=0 keeps an edge without a 'weight' attribute from raising a KeyError.\n",
+ "for src, dst, weight in ng.edges(data='weight', default=0):\n",
+ "    if dexter_graph.has_edge(src, dst):\n",
+ "        dexter_graph[src][dst]['weight'] += weight\n",
+ "    else:\n",
+ "        dexter_graph.add_edge(src, dst, weight=weight)\n",
+ "\n",
+ "# Store each vertex's degree as a node attribute so it is written to the exported files.\n",
+ "nx.set_node_attributes(dexter_graph, dict(dexter_graph.degree()), name='degree')\n",
+ "\n",
+ "print('Graph with %d nodes and %d edges' % (dexter_graph.number_of_nodes(), dexter_graph.number_of_edges()))"
+ ],
+ "execution_count": 86,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Graph with 8 nodes and 13 edges\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "0QgVAODRh1JH"
+ },
+ "source": [
+ "# Write the undirected graph once as GEXF and once as GML.\n",
+ "for write_graph, target_path in ((nx.write_gexf, \"./datasets/dexter.gexf\"), (nx.write_gml, \"./datasets/dexter.gml\")):\n",
+ "    write_graph(dexter_graph, target_path)"
+ ],
+ "execution_count": 87,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "_xLZjqPDh1JL"
+ },
+ "source": [
+ "**Task 5:** Write a function that for a given vertex creates a subgraph of the selected vertex and all the vertex's in/out friends (10pt).\n",
+ "Draw the subgraph (5pt). Calculate the number of vertices and edges in the subgraph (5pt)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "KvocVBDeh1JM"
+ },
+ "source": [
+ ""
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "Al5dMlGJh1JP"
+ },
+ "source": [
+ "**Task 6:** Find the top-10 most central players at [The Free Internet Chess Server](http://dynamics.cs.washington.edu/nobackup/chess/fcis.tar.gz) (15 pt). Visualize part of the network (5pt).\n",
+ " \n",
+ "**Note:** The network has 429,747,476 edges"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "FIzUbrRch1JQ"
+ },
+ "source": [
+ ""
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "bbFabMLTh1JS"
+ },
+ "source": [
+ "**Task 7:** Use Cytoscape to draw the Lord of the Rings Couples network (see Lecture 2).\n",
+ "Fill the network's vertices in a different color according to the gender.\n",
+ "Select each vertex shape to be according to the vertex race. (7pt)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "SJjpZznsh1JT"
+ },
+ "source": [
+ ""
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "yZkceWDCh1JV"
+ },
+ "source": [
+ "### Additional Questions for Practice"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "iDH4cYwFh1JW"
+ },
+ "source": [
+ "**Task 1:** Visualize the distribution of the network's strongly and weakly connected components."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "3y8_gXHhh1JW"
+ },
+ "source": [
+ ""
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "H4ndNxk9h1JZ"
+ },
+ "source": [
+ "**Task 2:** Using Cytoscape, visualize the network's maximal strongly connected component (or part of it)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "SsMG9zLDh1Ja"
+ },
+ "source": [
+ ""
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "fxQMNlC8h1Jc"
+ },
+ "source": [
+ "**Task 3:** Draw a subgraph of all the vertices that have at least one reciprocal link, i.e., all the vertices _v_ for which there is at least one vertex _u_ such that both links (u,v) and (v,u) exist"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "olXHKSyTh1Jc"
+ },
+ "source": [
+ ""
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "RTdqzReMh1Je"
+ },
+ "source": [
+ "**Task 4:** Split the network into communities, and find the second most central vertex in each community\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "fQMV_Adfh1Je"
+ },
+ "source": [
+ ""
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "6BYt52YRh1Jg"
+ },
+ "source": [
+ "**Task 5:** Find the top-10 most central players at [The Free Internet Chess Server](http://dynamics.cs.washington.edu/nobackup/chess/fcis.tar.gz).\n",
+ " Visualize part of the network.\n",
+ " \n",
+ "**Note:** The network has 429,747,476 edges"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "Hx1az7w0h1Jg"
+ },
+ "source": [
+ ""
+ ],
+ "execution_count": null,
+ "outputs": []
+ }
+ ]
+}
\ No newline at end of file