Skip to content
This repository has been archived by the owner on Feb 17, 2024. It is now read-only.

Commit

Permalink
Fixes #90: option to download index files locally from HDFS before opening them
Browse files Browse the repository at this point in the history
  • Loading branch information
phymbert committed Sep 28, 2021
1 parent 010f091 commit d79ea0c
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 3 deletions.
17 changes: 17 additions & 0 deletions core/src/main/java/org/apache/spark/search/IndexationOptions.java
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,8 @@ public BoxedUnit apply() {
});
private IndexDirectoryCleanupHandler indexDirectoryCleanupHandler = DEFAULT_SHUTDOWN_HOOK_HANDLER;

private boolean reloadIndexWithHdfsCopyToLocal = true;

private static final IndexationOptions DEFAULT = builder().build();

// Hidden, use builder or default.
Expand Down Expand Up @@ -175,6 +177,10 @@ public boolean isCacheSearchIndexRDD() {
return cacheSearchIndexRDD;
}

/**
 * Whether the zipped index should first be copied from HDFS to the local
 * filesystem before being opened for reload, instead of being streamed
 * directly from HDFS.
 *
 * @return true if indices are copied locally from hdfs before reloading
 */
public boolean isReloadIndexWithHdfsCopyToLocal() {
return reloadIndexWithHdfsCopyToLocal;
}

/**
* Indexation option builder.
*/
Expand Down Expand Up @@ -323,6 +329,17 @@ public Builder<T> cacheSearchIndexRDD(boolean cacheSearchIndexRDD) {
return this;
}

/**
 * Controls whether indices are first copied from hdfs to the local
 * filesystem before being reopened for reload.
 *
 * @param reloadIndexWithHdfsCopyToLocal true to download indices from hdfs
 *                                       to a local file before opening them
 * @return this builder, for call chaining
 */
public Builder<T> reloadIndexWithHdfsCopyToLocal(boolean reloadIndexWithHdfsCopyToLocal) {
this.options.reloadIndexWithHdfsCopyToLocal = reloadIndexWithHdfsCopyToLocal;
return this;
}

/**
* @return built options.
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@
*/
package org.apache.spark.search.rdd

import java.io.{File, FileInputStream, InputStream}

import org.apache.commons.io.FileUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path, PathFilter}
import org.apache.spark.search.SearchOptions
Expand Down Expand Up @@ -45,9 +48,18 @@ private[search] class SearchIndexReloadedRDD[S: ClassTag](sc: SparkContext,

/**
 * Reloads the zipped Lucene index for this partition and streams it back.
 *
 * When `reloadIndexWithHdfsCopyToLocal` is enabled, the zip is first
 * downloaded from HDFS to a local temporary file next to the target index
 * directory and read from there; otherwise it is streamed directly from HDFS.
 */
override def compute(split: Partition, context: TaskContext): Iterator[Array[Byte]] = {
val part = split.asInstanceOf[SearchIndexReloadedPartition]
val hdfs = FileSystem.get(new Configuration())
val path = new Path(part.zipPath)
val is: InputStream = if (options.getIndexationOptions.isReloadIndexWithHdfsCopyToLocal) {
  // Download next to the final index directory. Keep the FULL local path:
  // the original used tmpPath.getName, which drops parent directories and
  // made FileInputStream resolve against the working directory instead of
  // the location copyToLocalFile actually wrote to.
  val localTmp = s"${part.indexDir}.tmp"
  val tmpFile = new File(localTmp)
  context.addTaskCompletionListener[Unit] { _ =>
    // deleteQuietly: never throw from a completion listener.
    FileUtils.deleteQuietly(tmpFile)
    // copyToLocalFile goes through the local ChecksumFileSystem and may
    // leave a ".<name>.crc" sibling — clean it up as well.
    FileUtils.deleteQuietly(new File(tmpFile.getParent, s".${tmpFile.getName}.crc"))
  }
  hdfs.copyToLocalFile(path, new Path(localTmp))
  new FileInputStream(tmpFile)
} else {
  hdfs.open(path)
}
ZipUtils.unzipPartition(part.indexDir, is)
streamPartitionIndexZip(context, part.asInstanceOf[SearchPartitionIndex[S]])
}
}
Expand Down

0 comments on commit d79ea0c

Please sign in to comment.