Skip to content

Commit

Permalink
[SPARK-23947][SQL] Add hashUTF8String convenience method to hasher classes
Browse files Browse the repository at this point in the history

## What changes were proposed in this pull request?

Add `hashUTF8String()` to the hasher classes to allow Spark SQL codegen to generate cleaner code for hashing `UTF8String`s. No change in behavior otherwise.

Although SPARK-10399 already reduced the size of the generated code for hashing `UTF8String`, it is still worthwhile to extract a separate function in the hasher classes so that the generated code stays clean.

## How was this patch tested?

Existing tests.

Author: Kris Mok <[email protected]>

Closes apache#21016 from rednaxelafx/hashutf8.
  • Loading branch information
rednaxelafx authored and gatorsmile committed Apr 10, 2018
1 parent 61b7247 commit f94f362
Show file tree
Hide file tree
Showing 4 changed files with 18 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
package org.apache.spark.sql.catalyst.expressions;

import org.apache.spark.unsafe.memory.MemoryBlock;
import org.apache.spark.unsafe.types.UTF8String;

/**
* Simulates Hive's hashing function from Hive v1.2.1
Expand Down Expand Up @@ -51,4 +52,8 @@ public static int hashUnsafeBytesBlock(MemoryBlock mb) {
/**
 * Hashes the region described by {@code base}, {@code offset} and {@code lengthInBytes}.
 *
 * <p>The region is first wrapped with {@code MemoryBlock.allocateFromObject} and the resulting
 * block is then handed to {@link #hashUnsafeBytesBlock}.
 *
 * @param base the object the region is addressed relative to
 * @param offset the offset of the region with respect to {@code base}
 * @param lengthInBytes the size of the region in bytes
 * @return the hash computed by {@code hashUnsafeBytesBlock} for the wrapped region
 */
public static int hashUnsafeBytes(Object base, long offset, int lengthInBytes) {
  MemoryBlock block = MemoryBlock.allocateFromObject(base, offset, lengthInBytes);
  return hashUnsafeBytesBlock(block);
}

/**
 * Convenience entry point for hashing a {@code UTF8String}.
 *
 * <p>Equivalent to calling {@link #hashUnsafeBytesBlock} on {@code str.getMemoryBlock()}; it
 * exists so that callers (notably generated code) do not have to unwrap the string themselves.
 *
 * @param str the string to hash
 * @return the hash of the memory block backing {@code str}
 */
public static int hashUTF8String(final UTF8String str) {
  final int hash = hashUnsafeBytesBlock(str.getMemoryBlock());
  return hash;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import com.google.common.primitives.Ints;

import org.apache.spark.unsafe.memory.MemoryBlock;
import org.apache.spark.unsafe.types.UTF8String;

/**
* 32-bit Murmur3 hasher. This is based on Guava's Murmur3_32HashFunction.
Expand Down Expand Up @@ -82,6 +83,10 @@ public static int hashUnsafeBytesBlock(MemoryBlock base, int seed) {
return fmix(h1, lengthInBytes);
}

/**
 * Convenience entry point for hashing a {@code UTF8String} with the given seed.
 *
 * <p>Equivalent to calling {@link #hashUnsafeBytesBlock} on {@code str.getMemoryBlock()} with
 * the same {@code seed}; it exists so that callers (notably generated code) do not have to
 * unwrap the string themselves.
 *
 * @param str the string to hash
 * @param seed the seed forwarded unchanged to {@code hashUnsafeBytesBlock}
 * @return the hash of the memory block backing {@code str}
 */
public static int hashUTF8String(final UTF8String str, final int seed) {
  final int hash = hashUnsafeBytesBlock(str.getMemoryBlock(), seed);
  return hash;
}

/**
 * Hashes the region described by {@code base}, {@code offset} and {@code lengthInBytes} using
 * the given seed.
 *
 * <p>The region is first wrapped with {@code MemoryBlock.allocateFromObject} and the resulting
 * block is then handed, together with {@code seed}, to {@link #hashUnsafeBytesBlock}.
 *
 * @param base the object the region is addressed relative to
 * @param offset the offset of the region with respect to {@code base}
 * @param lengthInBytes the size of the region in bytes
 * @param seed the seed forwarded unchanged to {@code hashUnsafeBytesBlock}
 * @return the hash computed by {@code hashUnsafeBytesBlock} for the wrapped region
 */
public static int hashUnsafeBytes(Object base, long offset, int lengthInBytes, int seed) {
  MemoryBlock block = MemoryBlock.allocateFromObject(base, offset, lengthInBytes);
  return hashUnsafeBytesBlock(block, seed);
}
Expand All @@ -91,7 +96,7 @@ public static int hashUnsafeBytes2(Object base, long offset, int lengthInBytes,
}

public static int hashUnsafeBytes2Block(MemoryBlock base, int seed) {
// This is compatible with original and another implementations.
// This is compatible with original and other implementations.
// Use this method for new components after Spark 2.3.
int lengthInBytes = Ints.checkedCast(base.size());
assert (lengthInBytes >= 0) : "lengthInBytes cannot be negative";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
package org.apache.spark.sql.catalyst.expressions;

import org.apache.spark.unsafe.memory.MemoryBlock;
import org.apache.spark.unsafe.types.UTF8String;

// scalastyle: off
/**
Expand Down Expand Up @@ -107,6 +108,10 @@ public static long hashUnsafeBytesBlock(MemoryBlock mb, long seed) {
return fmix(hash);
}

/**
 * Convenience entry point for hashing a {@code UTF8String} with the given seed.
 *
 * <p>Equivalent to calling {@link #hashUnsafeBytesBlock} on {@code str.getMemoryBlock()} with
 * the same {@code seed}; it exists so that callers (notably generated code) do not have to
 * unwrap the string themselves.
 *
 * @param str the string to hash
 * @param seed the seed forwarded unchanged to {@code hashUnsafeBytesBlock}
 * @return the 64-bit hash of the memory block backing {@code str}
 */
public static long hashUTF8String(final UTF8String str, final long seed) {
  final long hash = hashUnsafeBytesBlock(str.getMemoryBlock(), seed);
  return hash;
}

/**
 * Hashes the region described by {@code base}, {@code offset} and {@code length} using the
 * given seed.
 *
 * <p>The region is first wrapped with {@code MemoryBlock.allocateFromObject} and the resulting
 * block is then handed, together with {@code seed}, to {@link #hashUnsafeBytesBlock}.
 *
 * @param base the object the region is addressed relative to
 * @param offset the offset of the region with respect to {@code base}
 * @param length the size of the region in bytes
 * @param seed the seed forwarded unchanged to {@code hashUnsafeBytesBlock}
 * @return the 64-bit hash computed by {@code hashUnsafeBytesBlock} for the wrapped region
 */
public static long hashUnsafeBytes(Object base, long offset, int length, long seed) {
  MemoryBlock block = MemoryBlock.allocateFromObject(base, offset, length);
  return hashUnsafeBytesBlock(block, seed);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -361,8 +361,7 @@ abstract class HashExpression[E] extends Expression {
}

/**
 * Generates the Java statement that hashes a string value.
 *
 * The emitted statement calls the hasher's `hashUTF8String` with `input` and the current
 * value of `result` (used as the seed argument), and assigns the returned hash back to `result`.
 * Unwrapping the string's memory block is left to the hasher class, so no intermediate
 * `getMemoryBlock()` expression is built here.
 *
 * @param input  generated-code expression that evaluates to the string to hash
 * @param result name of the generated-code variable that supplies the seed and receives the hash
 * @return the generated Java statement
 */
protected def genHashString(input: String, result: String): String = {
  s"$result = $hasherClassName.hashUTF8String($input, $result);"
}

protected def genHashForMap(
Expand Down Expand Up @@ -725,8 +724,7 @@ case class HiveHash(children: Seq[Expression]) extends HashExpression[Int] {
"""

/**
 * Generates the Java statement that hashes a string value for Hive-compatible hashing.
 *
 * The emitted statement calls the hasher's seedless `hashUTF8String` with `input` and assigns
 * the returned hash to `result`. Unwrapping the string's memory block is left to the hasher
 * class, so no intermediate `getMemoryBlock()` expression is built here.
 *
 * @param input  generated-code expression that evaluates to the string to hash
 * @param result name of the generated-code variable that receives the hash
 * @return the generated Java statement
 */
override protected def genHashString(input: String, result: String): String = {
  s"$result = $hasherClassName.hashUTF8String($input);"
}

override protected def genHashForArray(
Expand Down

0 comments on commit f94f362

Please sign in to comment.