IBM · johanandren · Feb 22, 2018 · Feb 22, 2018 · Feb 22, 2018 · Feb 22, 2018
diff --git a/akka-epl/.gitignore b/akka-epl/.gitignore
@@ -0,0 +1,4 @@
+// you have checked in compiled class files by mistake
+// it's good practice to not do that, easiest is to mark
+// target as ignored
+target/
diff --git a/akka-epl/project/project/target/config-classes/$731f18e1a71d9df9db04.cache b/akka-epl/project/project/target/config-classes/$731f18e1a71d9df9db04.cache
diff --git a/akka-epl/src/main/scala/com/epl/akka/AkkaHTTPClient.scala b/akka-epl/src/main/scala/com/epl/akka/AkkaHTTPClient.scala
@@ -10,16 +10,27 @@ import com.epl.akka.AkkaHTTPClient.GET
 /**
   * Created by sanjeevghimire on 9/19/17.
   */
+// this actor seems somewhat redundant in general as all it does is passing requests to singleRequest
+// and then folding the response into memory (none of the actual work happens in the thread of the actor)
+// it is also not used anywhere, from what I can see, so maybe just completely remove it
 class AkkaHTTPClient() extends Actor
   with ActorLogging {
 
   import akka.pattern.pipe
   import context.dispatcher
 
+  // this materializer is bound to the lifecycle of the actor system, that means it will leak if
+  // this actor is stopped. Better either provide a single materializer to use across the app
+  // and take as a constructor parameter here, or to create the materializer with the actor context
+  // as actor-ref factory (then it will be bound to the actor)
+  // See docs here: https://doc.akka.io/docs/akka/current/stream/stream-flows-and-basics.html#actor-materializer-lifecycle
   final implicit val materializer: ActorMaterializer = ActorMaterializer(ActorMaterializerSettings(context.system))
 
   val http = Http(context.system)
 
+  // the commented out logic around this is incorrect as a GET message immediately triggers work
+  // done in a future and the actor then continues processing the next message, meaning that when the
+  // response comes back the originalSender field would contain the latest sender, not the original sender
   //var originalSender: Option[ActorRef] = None // <-- added
 
   override def receive: Receive = {
@@ -49,6 +60,7 @@ class AkkaHTTPClient() extends Actor
 
 object AkkaHTTPClient {
 
+  // use the lambda based props factory Props(new MyActor) instead of the reflection based one
   def props() =
     Props[AkkaHTTPClient]
 

diff --git a/akka-epl/src/main/scala/com/epl/akka/CloudantReaderWriter.scala b/akka-epl/src/main/scala/com/epl/akka/CloudantReaderWriter.scala
@@ -24,8 +24,15 @@ class CloudantReaderWriter extends Actor with ActorLogging{
 
   private val config = context.system.settings.config
 
+  // note that as each of these methods just trigger async work somewhere else, and there is
+  // no state this actor is superflous and could easily be done with regular methods returning Futures
   override def receive: Receive = {
     case SaveToCloudantDatabase(jsonString: String) =>
+      // in general side effecting from a Future callback like this is bad practice
+      // since it will execute on a thread that is not the actor, using `log` should be thread safe
+      // but it is not good to show in example code as it somewhat encourages doing the wrong thing(tm)
+      // if you want to react on the future completion in the actor you should use `pipeTo`
+      // see docs here: https://doc.akka.io/docs/akka/current/actors.html#ask-send-and-receive-future
       WebHttpClient.post(config.getString("cloudant.post_url"),jsonString,config.getString("cloudant.username"),config.getString("cloudant.password")) onComplete {
         case Success(body) =>
           log.info("Successfully Saved:: "+ body)

diff --git a/akka-epl/src/main/scala/com/epl/akka/CrawlingApp.scala b/akka-epl/src/main/scala/com/epl/akka/CrawlingApp.scala
@@ -5,7 +5,14 @@ import com.epl.akka.WebCrawler.CrawlRequest
 
 object CrawlingApp extends App {
   val system = ActorSystem("Crawler")
+  // don't ad-hoc create the Props here, define a props method in the companion object instead,
+  // and avoid using the reflection based props factory method
   val webCrawler = system.actorOf(Props[WebCrawler], "WebCrawler")
   // start the crawling
   webCrawler ! CrawlRequest("https://www.premierleague.com/", true)
+
+  // this app does a single "crawl", but it never completes, the application
+  // will live forever. That's not what I'd expect from such an app.
+  // It should somehow track if it is done or not and then terminate the actor system
+  // so that the JVM can shut down.
 }
diff --git a/akka-epl/src/main/scala/com/epl/akka/HTMLParser.scala b/akka-epl/src/main/scala/com/epl/akka/HTMLParser.scala
@@ -23,6 +23,8 @@ object HTMLParser {
 /**
   * Created by sanjeevghimire on 9/1/17.
   */
+// this name does not really encode what the actor does given that it fetches pages, parses content
+// and turns the result to JSON (the last is not a very good design, see comments below)
 class HTMLParser() extends Actor with ActorLogging{
 
   val EXPIRED_NO = "NO"
@@ -32,6 +34,9 @@ class HTMLParser() extends Actor with ActorLogging{
   import scala.collection.JavaConverters._
 
 
+  // in general it is good practice to move methods that require no context from the actor
+  // to the companion object (making them static) as that makes it clear that they are pure
+  // and also makes writing unit tests for them easier.
   def getValidLinks(content: String, url:String): Iterator[String] = {
     Jsoup.parse(content, url).select("a[href]").iterator().asScala.map(_.absUrl("href"))
   }
@@ -125,6 +130,13 @@ class HTMLParser() extends Actor with ActorLogging{
 
     val jsValue: JsValue = Json.toJson(resultsMap)
     val returnJson: JsObject = jsValue.as[JsObject] + ("expired" -> Json.toJson(EXPIRED_NO)) + ("documentType" -> Json.toJson(DocumentType.Results.toString))
+
+    // using strings (actually even the JSValue types) for internal communication inside
+    // of the app is in general bad practice, sometimes referred to as "stringly typed" programming,
+    // and is not good to use as an example for newcomers.
+    // A proper design would parse data into a specific type here, and then turn that
+    // into json encoded as a string in the CloudAntReaderWriter which knows what format it
+    // needs to save things in.
     Json.stringify(returnJson)
   }
 
@@ -140,6 +152,10 @@ class HTMLParser() extends Actor with ActorLogging{
             //.filter(link => link.endsWith("fixtures") || link.endsWith("tables") || link.endsWith("results"))
             .filter(link => link.endsWith("fixtures"))
             .foreach(context.parent ! CrawlRequest(_,false))
+
+        // rethrow instead and put the decision to stop in the supervision of the parent
+        // which would make it possible to recover from failure by starting a new actor etc.
+        // this will instead permanently break the app until the entire app is restarted
         case Failure(err) =>
           log.error(err, "Error while crawling url: "+ url)
           stop()
@@ -150,6 +166,10 @@ class HTMLParser() extends Actor with ActorLogging{
       val isFixtures = url.endsWith("fixtures")
       val isTables = url.endsWith("tables")
       val isResults = url.endsWith("results")
+      // this is a blocking call, which means it should be run on a separate threadpool
+      // or it may starve the actor system meaning that other actors do not get to execute
+      // read this section of the docs:
+      // https://doc.akka.io/docs/akka/current/dispatchers.html#problem-blocking-on-default-dispatcher
       val body: String = WebHttpClient.getUsingPhantom(url)
       var jsonString: String = null
       if(isFixtures) {
@@ -160,6 +180,14 @@ class HTMLParser() extends Actor with ActorLogging{
         jsonString = getResultsAndConvertToJson(body,url)
       }
       log.info("HTML to JSON: "+jsonString)
+
+      // this mixes conserns, why does the HTMLParser actor know that something should be saved to cloudant
+      // it should instead be a result of the requested work, it should instead be a response to the command
+      // and it should be sent to the sender(), using the parent makes it much harder than necessary to test
+      // case CrawlAndPrepare =>
+      //   ...
+      //   sender() ! PreparedResult(data)
+
       context.parent ! SaveJsonToCloudant(jsonString)
 
     case _: Status.Failure => stop()

diff --git a/akka-epl/src/main/scala/com/epl/akka/URLValidator.scala b/akka-epl/src/main/scala/com/epl/akka/URLValidator.scala
@@ -14,6 +14,10 @@ object URLValidator {
 /**
   * Created by sanjeevghimire on 9/1/17.
   */
+
+// Making this an actor doesn't really make sense, it introduces an async boundary from the parent
+// actor but there is no gain (neither more resilient nor more performant) than just keeping this state
+// directly in WebCrawler
 class URLValidator extends Actor with ActorLogging{
   var visitedUrl = Set.empty[String]
   var childUrls = Set.empty[ActorRef]

diff --git a/akka-epl/src/main/scala/com/epl/akka/WebCrawler.scala b/akka-epl/src/main/scala/com/epl/akka/WebCrawler.scala
@@ -9,6 +9,7 @@ import com.epl.akka.WebCrawler._
 
 object WebCrawler {
   case class CrawlRequest(url: String, isRootUrl: Boolean) {}
+  // Not used
   case class CrawlResponse(links: Set[String]) {}
   case class Crawl(url: String,isRootUrl: Boolean){}
   case class SaveJsonToCloudant(jsonString: String){}
@@ -21,6 +22,7 @@ object WebCrawler {
   */
 class WebCrawler extends Actor with ActorLogging{
 
+  // create a props factory for the actor companions  instead of an adhoc Props call here
   val urlValidator = context.actorOf(Props[URLValidator](new URLValidator()))
   val htmlParser = context.actorOf(Props[HTMLParser](new HTMLParser()))
   val cloudantWriter = context.actorOf(Props[CloudantReaderWriter](new CloudantReaderWriter()))

diff --git a/akka-epl/src/main/scala/com/epl/akka/WebHttpClient.scala b/akka-epl/src/main/scala/com/epl/akka/WebHttpClient.scala
@@ -29,6 +29,9 @@ import scala.concurrent.{ExecutionContext, Future, Promise}
   */
 object WebHttpClient {
 
+  // in general not a good idea to put state like this in a singleton, could be ok here, but what happens
+  // if the async http client crashes - no way to restart it as it is effectively a constant per JVM
+  // (that would be a great use case for using an actor)
   val PHANTOMJS_PAGE_CUSTOMHEADERS_PREFIX: String = "phantomjs.page.customHeaders."
   val PHANTOMJS_CLI_ARGS: String = "phantomjs.cli.args"
 
@@ -40,11 +43,15 @@ object WebHttpClient {
 
   val USER_AGENT: String= "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"
   System.setProperty("phantomjs.page.settings.userAgent", USER_AGENT)
+
+  // if this was instead
   var driver: WebDriver = null
 
 
   import scala.concurrent.duration._
 
+  // here you create a separate actor system and materializer that starts thread pools/uses resources but
+  // is not really used
   implicit val system: ActorSystem = ActorSystem()
   implicit val materializer: Materializer = ActorMaterializer()
 
@@ -90,7 +97,7 @@ object WebHttpClient {
     promise.future
   }
 
-
+// remove stuff rather than keep them commented out in a published tutorial sample
 //  def getUsingAkkaHttp(url: String)(implicit system: ActorSystem, mat: Materializer): Future[String] = {
 //    implicit val executionContext = system.dispatcher
 //
@@ -152,6 +159,8 @@ object WebHttpClient {
 
 
 
+  // another great reason to put this inside an actor, actors have explicit lifecycle,
+  // and concurrency is handled for you. What happens here if multiple threads calls this at the same time?
   def initPhantomJS(){
     val desiredCaps: DesiredCapabilities = DesiredCapabilities.phantomjs()
     desiredCaps.setJavascriptEnabled(true)
@@ -165,7 +174,8 @@ object WebHttpClient {
     driver = new PhantomJSDriver(desiredCaps);
   }
 
-
+  // and here as well, this is not thread safe, this will not work given that you call it from
+  // future callbacks in the rest of hte code
   def getUsingPhantom(url: String): String = {
     initPhantomJS()
     driver.get(url)