Skip to content

Commit 156ed1c

Browse files
committed May 31, 2021
Revert "Revert "Merge branch 'main' of https://github.com/edofazza/PageRank-Spark into main""
This reverts commit 8ff1f85.
1 parent 8ff1f85 commit 156ed1c

File tree

11 files changed

+2469
-10
lines changed

11 files changed

+2469
-10
lines changed
 

‎README.md

+22
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,24 @@
11
# PageRank-Spark
2+
Implementation of the MapReduce PageRank algorithm using the Spark framework both in Python and in Java.
23

4+
## How to run the algorithm
5+
Python version: `spark-submit page_rank.py <input file> <output> <number of iterations>`
6+
7+
Java version: `spark-submit --class PageRank <app Jar> <input file> <output> <number of iterations>`
8+
9+
## Input file
10+
The inputs to the program are pages from the Simple English Wikipedia. We will be using a pre-processed version of the Simple Wikipedia corpus in which the pages are stored in an XML format.
11+
The XML file can be found [here](wiki-micro.txt).
12+
13+
Each page of Wikipedia is represented in XML as follows:
14+
15+
<title>page name</title>
16+
...
17+
<revision optionalVal="xxx">
18+
...
19+
<text optionalVal="yyy">page content</text>
20+
...
21+
</revision>
22+
23+
The pages have been "flattened" to be represented on a single line. The body text of the page also has all new lines converted to spaces to ensure it stays on one line in this representation.
24+
Links to other Wikipedia articles are of the form [[page name]] and **we considered only links in the _text_ section**.
473 Bytes
Binary file not shown.

‎pagerank_Java/src/main/java/PageRank.java

+6-5
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
import org.apache.spark.api.java.JavaPairRDD;
33
import org.apache.spark.api.java.JavaRDD;
44
import org.apache.spark.api.java.JavaSparkContext;
5-
import org.apache.spark.broadcast.Broadcast;
65
import scala.Tuple2;
76

87
import java.util.ArrayList;
@@ -13,16 +12,18 @@ public class PageRank {
1312
private static final double DUMPING_FACTOR = 0.8;
1413

1514
public static void main(String[] args) {
15+
if (args.length != 3) {
16+
System.err.println("Usage: PageRank <input path> <output path> <# of iterations>");
17+
System.exit(-1);
18+
}
19+
1620
// import context from Spark (distributed computing using yarn, set name of the application)
1721
SparkConf sparkConf = new SparkConf().setAppName("pageRankJava").setMaster("yarn");
1822
JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
1923

2024
// import input data from txt file to rdd
2125
JavaRDD<String> inputDataRDD = javaSparkContext.textFile(args[0]);
2226

23-
// the damping factor (static) is broadcast
24-
Broadcast<Double> DUMPING_FACTOR_BR = javaSparkContext.broadcast(DUMPING_FACTOR);
25-
2627
// count number of nodes in the input dataset (the N number)
2728
long nodesNumber = inputDataRDD.count();
2829

@@ -45,7 +46,7 @@ public static void main(String[] args) {
4546

4647
// aggregate contributions for each node, compute final ranks
4748
pageRankRDD = consideredContributionsRDD.reduceByKey(Double::sum)
48-
.mapValues(summedContributions -> (1 - DUMPING_FACTOR_BR.value()) / nodesNumber + DUMPING_FACTOR_BR.value() * summedContributions);
49+
.mapValues(summedContributions -> (1 - DUMPING_FACTOR) / nodesNumber + DUMPING_FACTOR * summedContributions);
4950
}
5051

5152
// sort by value (pagerank)
1.44 KB
Binary file not shown.
9.05 KB
Binary file not shown.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
#Generated by Maven
2+
#Mon May 31 17:47:46 WEST 2021
3+
version=1.0-SNAPSHOT
4+
groupId=baggins
5+
artifactId=pagerank_Java
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
PageRank.class
2+
DataParser.class
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
/home/hadoop/pagerank_Java/src/main/java/DataParser.java
2+
/home/hadoop/pagerank_Java/src/main/java/PageRank.java
Binary file not shown.

‎pagerank/page_rank.py ‎pagerank_Python/page_rank.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@
22
import re
33
import sys
44

5+
# the damping factor (static)
6+
DAMPING_FACTOR = 0.8
7+
58

69
def data_parser(line):
710
# get the index of the begin of the title
@@ -46,9 +49,6 @@ def spread_rank(node, outgoing_links, rank):
4649
# import input data from txt file to rdd
4750
input_data_rdd = sc.textFile(sys.argv[1], 2)
4851

49-
# the damping factor (static) is broadcast
50-
DAMPING_FACTOR_BR = sc.broadcast(0.8)
51-
5252
# count number of nodes in the input dataset (the N number)
5353
node_number = input_data_rdd.count()
5454

@@ -72,8 +72,8 @@ def spread_rank(node, outgoing_links, rank):
7272

7373
# aggregate contributions for each node, compute final ranks
7474
page_ranks = considered_contributions.reduceByKey(lambda x, y: x + y) \
75-
.mapValues(lambda summed_contributions: (float(1 - DAMPING_FACTOR_BR.value) / node_number) +
76-
(DAMPING_FACTOR_BR.value * float(summed_contributions)))
75+
.mapValues(lambda summed_contributions: (float(1 - DAMPING_FACTOR) / node_number) +
76+
(DAMPING_FACTOR * float(summed_contributions)))
7777

7878
# sort by value (pagerank)
7979
sorted_page_ranks = page_ranks.sortBy(lambda page: page[1], False, 12)

‎wiki-micro.txt

+2,427
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)
Please sign in to comment.