Skip to content

Commit

Permalink
[SPARK-23847][PYTHON][SQL] Add asc_nulls_first, asc_nulls_last to PySpark
Browse files Browse the repository at this point in the history

## What changes were proposed in this pull request?

Column.scala and Functions.scala have asc_nulls_first, asc_nulls_last, desc_nulls_first and desc_nulls_last. Add the corresponding Python APIs in column.py and functions.py.

## How was this patch tested?
Add doctest

Author: Huaxin Gao <[email protected]>

Closes apache#20962 from huaxingao/spark-23847.
  • Loading branch information
huaxingao authored and HyukjinKwon committed Apr 8, 2018
1 parent 6ab134c commit 2c1fe64
Show file tree
Hide file tree
Showing 5 changed files with 82 additions and 7 deletions.
56 changes: 52 additions & 4 deletions python/pyspark/sql/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -447,24 +447,72 @@ def isin(self, *cols):

# order
_asc_doc = """
Returns a sort expression based on the ascending order of the given column name
Returns a sort expression based on ascending order of the column.
>>> from pyspark.sql import Row
>>> df = spark.createDataFrame([Row(name=u'Tom', height=80), Row(name=u'Alice', height=None)])
>>> df = spark.createDataFrame([('Tom', 80), ('Alice', None)], ["name", "height"])
>>> df.select(df.name).orderBy(df.name.asc()).collect()
[Row(name=u'Alice'), Row(name=u'Tom')]
"""
_asc_nulls_first_doc = """
Returns a sort expression based on ascending order of the column, and null values
return before non-null values.
>>> from pyspark.sql import Row
>>> df = spark.createDataFrame([('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"])
>>> df.select(df.name).orderBy(df.name.asc_nulls_first()).collect()
[Row(name=None), Row(name=u'Alice'), Row(name=u'Tom')]
.. versionadded:: 2.4
"""
_asc_nulls_last_doc = """
Returns a sort expression based on ascending order of the column, and null values
appear after non-null values.
>>> from pyspark.sql import Row
>>> df = spark.createDataFrame([('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"])
>>> df.select(df.name).orderBy(df.name.asc_nulls_last()).collect()
[Row(name=u'Alice'), Row(name=u'Tom'), Row(name=None)]
.. versionadded:: 2.4
"""
_desc_doc = """
Returns a sort expression based on the descending order of the given column name.
Returns a sort expression based on the descending order of the column.
>>> from pyspark.sql import Row
>>> df = spark.createDataFrame([Row(name=u'Tom', height=80), Row(name=u'Alice', height=None)])
>>> df = spark.createDataFrame([('Tom', 80), ('Alice', None)], ["name", "height"])
>>> df.select(df.name).orderBy(df.name.desc()).collect()
[Row(name=u'Tom'), Row(name=u'Alice')]
"""
_desc_nulls_first_doc = """
Returns a sort expression based on the descending order of the column, and null values
appear before non-null values.
>>> from pyspark.sql import Row
>>> df = spark.createDataFrame([('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"])
>>> df.select(df.name).orderBy(df.name.desc_nulls_first()).collect()
[Row(name=None), Row(name=u'Tom'), Row(name=u'Alice')]
.. versionadded:: 2.4
"""
_desc_nulls_last_doc = """
Returns a sort expression based on the descending order of the column, and null values
appear after non-null values.
>>> from pyspark.sql import Row
>>> df = spark.createDataFrame([('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"])
>>> df.select(df.name).orderBy(df.name.desc_nulls_last()).collect()
[Row(name=u'Tom'), Row(name=u'Alice'), Row(name=None)]
.. versionadded:: 2.4
"""

asc = ignore_unicode_prefix(_unary_op("asc", _asc_doc))
asc_nulls_first = ignore_unicode_prefix(_unary_op("asc_nulls_first", _asc_nulls_first_doc))
asc_nulls_last = ignore_unicode_prefix(_unary_op("asc_nulls_last", _asc_nulls_last_doc))
desc = ignore_unicode_prefix(_unary_op("desc", _desc_doc))
desc_nulls_first = ignore_unicode_prefix(_unary_op("desc_nulls_first", _desc_nulls_first_doc))
desc_nulls_last = ignore_unicode_prefix(_unary_op("desc_nulls_last", _desc_nulls_last_doc))

_isNull_doc = """
True if the current expression is null.
Expand Down
13 changes: 13 additions & 0 deletions python/pyspark/sql/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,17 @@ def _():
'bitwiseNOT': 'Computes bitwise not.',
}

# Descriptions for the sort-ordering functions added in Spark 2.4, keyed by
# function name.  Each entry is registered as a module-level function below
# (wrapped with ``since(2.4)``), mirroring the Column methods
# asc_nulls_first / asc_nulls_last / desc_nulls_first / desc_nulls_last.
# Implicit string-literal concatenation replaces the original ``+`` joins;
# the resulting strings are unchanged except where noted.
_functions_2_4 = {
    'asc_nulls_first': 'Returns a sort expression based on the ascending order of the given'
                       ' column name, and null values return before non-null values.',
    'asc_nulls_last': 'Returns a sort expression based on the ascending order of the given'
                      ' column name, and null values appear after non-null values.',
    'desc_nulls_first': 'Returns a sort expression based on the descending order of the given'
                        ' column name, and null values appear before non-null values.',
    # Trailing period added for consistency with the other three entries.
    'desc_nulls_last': 'Returns a sort expression based on the descending order of the given'
                       ' column name, and null values appear after non-null values.',
}

_collect_list_doc = """
Aggregate function: returns a list of objects with duplicates.
Expand Down Expand Up @@ -250,6 +261,8 @@ def _():
globals()[_name] = since(2.1)(_create_function(_name, _doc))
# Deprecated functions keep their original callables but are wrapped to emit
# a deprecation warning when called.
for _name, _message in _functions_deprecated.items():
    globals()[_name] = _wrap_deprecated_function(globals()[_name], _message)
# Sort-null-ordering functions new in 2.4: generate each from its name/doc
# pair and tag it with ``since(2.4)`` so the docs carry a versionadded marker.
for _name, _doc in _functions_2_4.items():
    globals()[_name] = since(2.4)(_create_function(_name, _doc))
del _name, _doc


Expand Down
14 changes: 14 additions & 0 deletions python/pyspark/sql/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -2991,6 +2991,20 @@ def test_create_dateframe_from_pandas_with_dst(self):
os.environ['TZ'] = orig_env_tz
time.tzset()

def test_2_4_functions(self):
    """Verify null ordering of the sort functions added in Spark 2.4.

    The original version collected each result and then listed the expected
    rows as bare expressions, so nothing was ever asserted and the test
    could not fail; each pair is now checked with assertEqual.
    """
    from pyspark.sql import functions

    df = self.spark.createDataFrame(
        [('Tom', 80), (None, 60), ('Alice', 50)], ["name", "height"])
    self.assertEqual(
        df.select(df.name).orderBy(functions.asc_nulls_first('name')).collect(),
        [Row(name=None), Row(name=u'Alice'), Row(name=u'Tom')])
    self.assertEqual(
        df.select(df.name).orderBy(functions.asc_nulls_last('name')).collect(),
        [Row(name=u'Alice'), Row(name=u'Tom'), Row(name=None)])
    self.assertEqual(
        df.select(df.name).orderBy(functions.desc_nulls_first('name')).collect(),
        [Row(name=None), Row(name=u'Tom'), Row(name=u'Alice')])
    self.assertEqual(
        df.select(df.name).orderBy(functions.desc_nulls_last('name')).collect(),
        [Row(name=u'Tom'), Row(name=u'Alice'), Row(name=None)])


class HiveSparkSubmitTests(SparkSubmitTests):

Expand Down
4 changes: 2 additions & 2 deletions sql/core/src/main/scala/org/apache/spark/sql/Column.scala
Original file line number Diff line number Diff line change
Expand Up @@ -1083,10 +1083,10 @@ class Column(val expr: Expression) extends Logging {
* and null values return before non-null values.
* {{{
* // Scala: sort a DataFrame by age column in ascending order and null values appearing first.
* df.sort(df("age").asc_nulls_last)
* df.sort(df("age").asc_nulls_first)
*
* // Java
* df.sort(df.col("age").asc_nulls_last());
* df.sort(df.col("age").asc_nulls_first());
* }}}
*
* @group expr_ops
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ object functions {
* Returns a sort expression based on ascending order of the column,
* and null values return before non-null values.
* {{{
* df.sort(asc_nulls_last("dept"), desc("age"))
* df.sort(asc_nulls_first("dept"), desc("age"))
* }}}
*
* @group sort_funcs
Expand Down

0 comments on commit 2c1fe64

Please sign in to comment.