Skip to content

Commit

Permalink
[SPARK-16128][SQL] Allow setting length of characters to be truncated…
Browse files Browse the repository at this point in the history
… to, in Dataset.show function.

## What changes were proposed in this pull request?

Allowing truncation to a specific number of characters is convenient at times, especially while operating from the REPL. Sometimes those last few characters make all the difference, and showing everything brings in a whole lot of noise.

## How was this patch tested?
Existing tests. + 1 new test in DataFrameSuite.

For SparkR and pyspark, existing tests and manual testing.

Author: Prashant Sharma <[email protected]>
Author: Prashant Sharma <[email protected]>

Closes apache#13839 from ScrapCodes/add_truncateTo_DF.show.
  • Loading branch information
ScrapCodes committed Jun 28, 2016
1 parent 4cbf611 commit f6b497f
Show file tree
Hide file tree
Showing 7 changed files with 97 additions and 17 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -77,3 +77,4 @@ spark-warehouse/
# For R session data
.RData
.RHistory
.Rhistory
11 changes: 8 additions & 3 deletions R/pkg/R/DataFrame.R
Original file line number Diff line number Diff line change
Expand Up @@ -176,8 +176,8 @@ setMethod("isLocal",
#' @param x A SparkDataFrame
#' @param numRows The number of rows to print. Defaults to 20.
#' @param truncate Whether truncate long strings. If true, strings more than 20 characters will be
#' truncated and all cells will be aligned right
#'
#' truncated. However, if set greater than zero, truncates strings longer than `truncate`
#' characters and all cells will be aligned right.
#' @family SparkDataFrame functions
#' @rdname showDF
#' @name showDF
Expand All @@ -193,7 +193,12 @@ setMethod("isLocal",
setMethod("showDF",
          signature(x = "SparkDataFrame"),
          function(x, numRows = 20, truncate = TRUE) {
            # Map the R-level `truncate` argument onto the JVM-side
            # showString(int, int) contract: TRUE means the historical default
            # of 20 characters; FALSE (coerced to 0) disables truncation; a
            # number gives an explicit character width.
            # NOTE: the scraped diff interleaved the stale pre-change call
            # (which passed the raw logical straight through) with the new
            # body; only the new logic is kept here.
            if (is.logical(truncate) && truncate) {
              s <- callJMethod(x@sdf, "showString", numToInt(numRows), numToInt(20))
            } else {
              truncate2 <- as.numeric(truncate)
              s <- callJMethod(x@sdf, "showString", numToInt(numRows), numToInt(truncate2))
            }
            cat(s)
          })

Expand Down
8 changes: 8 additions & 0 deletions R/pkg/inst/tests/testthat/test_sparkSQL.R
Original file line number Diff line number Diff line change
Expand Up @@ -1582,7 +1582,15 @@ test_that("showDF()", {
"| 30| Andy|\n",
"| 19| Justin|\n",
"+----+-------+\n", sep = "")
expected2 <- paste("+---+----+\n",
"|age|name|\n",
"+---+----+\n",
"|nul| Mic|\n",
"| 30| And|\n",
"| 19| Jus|\n",
"+---+----+\n", sep = "")
expect_output(showDF(df), expected)
expect_output(showDF(df, truncate = 3), expected2)
})

test_that("isLocal()", {
Expand Down
18 changes: 15 additions & 3 deletions python/pyspark/sql/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,7 +271,9 @@ def show(self, n=20, truncate=True):
"""Prints the first ``n`` rows to the console.
:param n: Number of rows to show.
:param truncate: Whether truncate long strings and align cells right.
:param truncate: If set to True, truncate strings longer than 20 chars by default.
If set to a number greater than one, truncates long strings to length ``truncate``
and align cells right.
>>> df
DataFrame[age: int, name: string]
Expand All @@ -282,8 +284,18 @@ def show(self, n=20, truncate=True):
| 2|Alice|
| 5| Bob|
+---+-----+
"""
print(self._jdf.showString(n, truncate))
>>> df.show(truncate=3)
+---+----+
|age|name|
+---+----+
| 2| Ali|
| 5| Bob|
+---+----+
"""
if isinstance(truncate, bool) and truncate:
print(self._jdf.showString(n, 20))
else:
print(self._jdf.showString(n, int(truncate)))

def __repr__(self):
return "DataFrame[%s]" % (", ".join("%s: %s" % c for c in self.dtypes))
Expand Down
47 changes: 40 additions & 7 deletions sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
Original file line number Diff line number Diff line change
Expand Up @@ -232,16 +232,18 @@ class Dataset[T] private[sql](
* Compose the string representing rows for output
*
* @param _numRows Number of rows to show
* @param truncate Whether truncate long strings and align cells right
* @param truncate If set to more than 0, truncates strings to `truncate` characters and
* all cells will be aligned right.
*/
private[sql] def showString(_numRows: Int, truncate: Boolean = true): String = {
private[sql] def showString(_numRows: Int, truncate: Int = 20): String = {
val numRows = _numRows.max(0)
val takeResult = toDF().take(numRows + 1)
val hasMoreData = takeResult.length > numRows
val data = takeResult.take(numRows)

// For array values, replace Seq and Array with square brackets
// For cells that are beyond 20 characters, replace it with the first 17 and "..."
// For cells that are beyond `truncate` characters, replace it with the
// first `truncate-3` and "..."
val rows: Seq[Seq[String]] = schema.fieldNames.toSeq +: data.map { row =>
row.toSeq.map { cell =>
val str = cell match {
Expand All @@ -251,7 +253,13 @@ class Dataset[T] private[sql](
case seq: Seq[_] => seq.mkString("[", ", ", "]")
case _ => cell.toString
}
if (truncate && str.length > 20) str.substring(0, 17) + "..." else str
if (truncate > 0 && str.length > truncate) {
// do not show ellipses for strings shorter than 4 characters.
if (truncate < 4) str.substring(0, truncate)
else str.substring(0, truncate - 3) + "..."
} else {
str
}
}: Seq[String]
}

Expand All @@ -273,7 +281,7 @@ class Dataset[T] private[sql](

// column names
rows.head.zipWithIndex.map { case (cell, i) =>
if (truncate) {
if (truncate > 0) {
StringUtils.leftPad(cell, colWidths(i))
} else {
StringUtils.rightPad(cell, colWidths(i))
Expand All @@ -285,7 +293,7 @@ class Dataset[T] private[sql](
// data
rows.tail.map {
_.zipWithIndex.map { case (cell, i) =>
if (truncate) {
if (truncate > 0) {
StringUtils.leftPad(cell.toString, colWidths(i))
} else {
StringUtils.rightPad(cell.toString, colWidths(i))
Expand Down Expand Up @@ -523,7 +531,32 @@ class Dataset[T] private[sql](
* @since 1.6.0
*/
// scalastyle:off println
// Boolean overload kept for source compatibility: map `true` onto the
// historical 20-character width and `false` onto 0 (no truncation), then
// delegate to the Int-based showString. The stale pre-change one-liner that
// passed the Boolean straight through (a type error against the new
// showString(Int, Int) signature) is removed.
def show(numRows: Int, truncate: Boolean): Unit = {
  val width = if (truncate) 20 else 0
  println(showString(numRows, truncate = width))
}
// scalastyle:on println

/**
 * Displays the Dataset in a tabular form. For example:
 * {{{
 *   year  month AVG('Adj Close) MAX('Adj Close)
 *   1980  12    0.503218        0.595103
 *   1981  01    0.523289        0.570307
 *   1982  02    0.436504        0.475256
 *   1983  03    0.410516        0.442194
 *   1984  04    0.450090        0.483521
 * }}}
 *
 * @param numRows Number of rows to show
 * @param truncate If set to more than 0, truncates strings to `truncate` characters and
 *                 all cells will be aligned right.
 * @group action
 * @since 1.6.0
 */
// scalastyle:off println
def show(numRows: Int, truncate: Int): Unit = {
  val rendered = showString(numRows, truncate)
  println(rendered)
}
// scalastyle:on println

/**
Expand Down
27 changes: 24 additions & 3 deletions sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
Original file line number Diff line number Diff line change
Expand Up @@ -723,7 +723,7 @@ class DataFrameSuite extends QueryTest with SharedSQLContext {
testData.select($"*").show(1000)
}

test("showString: truncate = [true, false]") {
test("showString: truncate = [0, 20]") {
val longString = Array.fill(21)("1").mkString
val df = sparkContext.parallelize(Seq("1", longString)).toDF()
val expectedAnswerForFalse = """+---------------------+
Expand All @@ -733,15 +733,36 @@ class DataFrameSuite extends QueryTest with SharedSQLContext {
||111111111111111111111|
|+---------------------+
|""".stripMargin
assert(df.showString(10, false) === expectedAnswerForFalse)
assert(df.showString(10, truncate = 0) === expectedAnswerForFalse)
val expectedAnswerForTrue = """+--------------------+
|| value|
|+--------------------+
|| 1|
||11111111111111111...|
|+--------------------+
|""".stripMargin
assert(df.showString(10, true) === expectedAnswerForTrue)
assert(df.showString(10, truncate = 20) === expectedAnswerForTrue)
}

test("showString: truncate = [3, 17]") {
  val longString = Array.fill(21)("1").mkString
  val df = sparkContext.parallelize(Seq("1", longString)).toDF()
  // Vals renamed: the originals were called expectedAnswerForFalse/ForTrue,
  // copy-paste leftovers from the boolean-truncate test and misleading here.
  // truncate < 4: cells are hard-cut with no "..." ellipsis appended.
  val expectedAnswerForTruncate3 = """+-----+
                                     ||value|
                                     |+-----+
                                     ||    1|
                                     ||  111|
                                     |+-----+
                                     |""".stripMargin
  assert(df.showString(10, truncate = 3) === expectedAnswerForTruncate3)
  // truncate >= 4: the first `truncate - 3` characters are kept plus "...".
  val expectedAnswerForTruncate17 = """+-----------------+
                                      ||            value|
                                      |+-----------------+
                                      ||                1|
                                      ||11111111111111...|
                                      |+-----------------+
                                      |""".stripMargin
  assert(df.showString(10, truncate = 17) === expectedAnswerForTruncate17)
}

test("showString(negative)") {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -730,7 +730,7 @@ class DatasetSuite extends QueryTest with SharedSQLContext {

private def checkShowString[T](ds: Dataset[T], expected: String): Unit = {
val numRows = expected.split("\n").length - 4
val actual = ds.showString(numRows, truncate = true)
val actual = ds.showString(numRows, truncate = 20)

if (expected != actual) {
fail(
Expand Down

0 comments on commit f6b497f

Please sign in to comment.