Skip to content
This repository has been archived by the owner on Feb 17, 2024. It is now read-only.

Commit

Permalink
Fixes #90: option to download index files locally from HDFS before opening them
Browse files Browse the repository at this point in the history
  • Loading branch information
phymbert committed Sep 28, 2021
1 parent 010f091 commit d79ea0c
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 3 deletions.
17 changes: 17 additions & 0 deletions core/src/main/java/org/apache/spark/search/IndexationOptions.java
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,8 @@ public BoxedUnit apply() {
});
private IndexDirectoryCleanupHandler indexDirectoryCleanupHandler = DEFAULT_SHUTDOWN_HOOK_HANDLER;

private boolean reloadIndexWithHdfsCopyToLocal = true;

private static final IndexationOptions DEFAULT = builder().build();

// Hidden, use builder or default.
Expand Down Expand Up @@ -175,6 +177,10 @@ public boolean isCacheSearchIndexRDD() {
return cacheSearchIndexRDD;
}

/**
 * Whether the zipped index should first be copied from HDFS to the local
 * filesystem before being opened for reload, instead of being streamed
 * directly from HDFS.
 *
 * @return true if indices are copied locally from hdfs before reloading
 */
public boolean isReloadIndexWithHdfsCopyToLocal() {
return reloadIndexWithHdfsCopyToLocal;
}

/**
* Indexation option builder.
*/
Expand Down Expand Up @@ -323,6 +329,17 @@ public Builder<T> cacheSearchIndexRDD(boolean cacheSearchIndexRDD) {
return this;
}

/**
 * Controls whether indices are first copied from hdfs to the local
 * filesystem before being reopened for reload.
 *
 * @param reloadIndexWithHdfsCopyToLocal true to download indices from hdfs
 *                                       to a local file before opening them
 * @return this builder, for call chaining
 */
public Builder<T> reloadIndexWithHdfsCopyToLocal(boolean reloadIndexWithHdfsCopyToLocal) {
this.options.reloadIndexWithHdfsCopyToLocal = reloadIndexWithHdfsCopyToLocal;
return this;
}

/**
* @return built options.
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@
*/
package org.apache.spark.search.rdd

import java.io.{File, FileInputStream, InputStream}

import org.apache.commons.io.FileUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path, PathFilter}
import org.apache.spark.search.SearchOptions
Expand Down Expand Up @@ -45,9 +48,18 @@ private[search] class SearchIndexReloadedRDD[S: ClassTag](sc: SparkContext,

/**
 * Reloads the zipped Lucene index for this partition and streams it back.
 *
 * When `reloadIndexWithHdfsCopyToLocal` is enabled, the zip is first
 * downloaded from HDFS to a local temporary file next to the target index
 * directory and read from there; otherwise it is streamed directly from HDFS.
 */
override def compute(split: Partition, context: TaskContext): Iterator[Array[Byte]] = {
val part = split.asInstanceOf[SearchIndexReloadedPartition]
val hdfs = FileSystem.get(new Configuration())
val path = new Path(part.zipPath)
val is: InputStream = if (options.getIndexationOptions.isReloadIndexWithHdfsCopyToLocal) {
  // Download next to the final index directory. Keep the FULL local path:
  // the original used tmpPath.getName, which drops parent directories and
  // made FileInputStream resolve against the working directory instead of
  // the location copyToLocalFile actually wrote to.
  val localTmp = s"${part.indexDir}.tmp"
  val tmpFile = new File(localTmp)
  context.addTaskCompletionListener[Unit] { _ =>
    // deleteQuietly: never throw from a completion listener.
    FileUtils.deleteQuietly(tmpFile)
    // copyToLocalFile goes through the local ChecksumFileSystem and may
    // leave a ".<name>.crc" sibling — clean it up as well.
    FileUtils.deleteQuietly(new File(tmpFile.getParent, s".${tmpFile.getName}.crc"))
  }
  hdfs.copyToLocalFile(path, new Path(localTmp))
  new FileInputStream(tmpFile)
} else {
  hdfs.open(path)
}
ZipUtils.unzipPartition(part.indexDir, is)
streamPartitionIndexZip(context, part.asInstanceOf[SearchPartitionIndex[S]])
}
}
Expand Down

0 comments on commit d79ea0c

Please sign in to comment.