47

After creating a Spark DataFrame from a CSV file, I would like to trim a column. I've tried:

df = df.withColumn("Product", df.Product.strip())

df is my data frame, Product is a column in my table.

But I get the error:

Column object is not callable

ZygD
  • 22,092
  • 39
  • 79
  • 102
minh-hieu.pham
  • 1,029
  • 2
  • 12
  • 21

10 Answers

35

The PySpark equivalent of Python's str.strip() is the trim function.

Trim the spaces from both ends for the specified string column.

Make sure to import the function first and to put the column you are trimming inside your function.

The following should work:

# trim() operates column-wise on the Spark side, unlike Python's str.strip(),
# which is why calling .strip() on a Column object fails.
from pyspark.sql.functions import trim
df = df.withColumn("Product", trim(df.Product))
Yadav
  • 129
  • 1
  • 11
jdhaen
  • 351
  • 3
  • 2
21

Starting from version 1.5, Spark SQL provides two specific functions for trimming white space, ltrim and rtrim (search for "trim" in the DataFrame documentation); you'll need to import pyspark.sql.functions first. Here is an example:

 # NOTE(review): SQLContext is the pre-2.0 entry point; newer code would use
 # SparkSession, but it is kept here to match the original example.
 from pyspark.sql import SQLContext
 from pyspark.sql.functions import *
 sqlContext = SQLContext(sc)

 df = sqlContext.createDataFrame([(' 2015-04-08 ',' 2015-05-10 ')], ['d1', 'd2']) # create a dataframe - notice the extra whitespaces in the date strings
 df.collect()
 # [Row(d1=u' 2015-04-08 ', d2=u' 2015-05-10 ')]
 df = df.withColumn('d1', ltrim(df.d1)) # trim left whitespace from column d1
 df.collect()
 # [Row(d1=u'2015-04-08 ', d2=u' 2015-05-10 ')]
 df = df.withColumn('d1', rtrim(df.d1))  # trim right whitespace from d1
 df.collect()
 # [Row(d1=u'2015-04-08', d2=u' 2015-05-10 ')]
desertnaut
  • 57,590
  • 26
  • 140
  • 166
  • 5
    Unfortunately Spark's trim() function doesn't trim white space, but only spaces. This behaviour was changed with https://issues.apache.org/jira/browse/SPARK-17299 . To also trim white space like "\t, \n," etc. one needs a regular expression. – asmaier Oct 23 '17 at 13:30
9

If you need to do it for all columns in the dataframe.

from pyspark.sql import functions as f

# Trim every column in ONE select projection. Calling withColumn once per
# column in a loop adds a projection to the logical plan for each column,
# which slows plan analysis badly on wide dataframes; a single select with
# aliased trim expressions produces the same result.
df = df.select([f.trim(f.col(colname)).alias(colname) for colname in df.columns])
DataDog
  • 475
  • 1
  • 9
  • 23
7

I did that with the udf like this:

from pyspark.sql.functions import udf

def trim(string):
    # NULL column values reach a Python udf as None; None.strip() would
    # raise AttributeError inside the executor, so pass NULLs through.
    return string.strip() if string is not None else None
trim = udf(trim)

df = sqlContext.createDataFrame([(' 2015-04-08 ',' 2015-05-10 ')], ['d1', 'd2'])

df2 = df.select(trim(df['d1']).alias('d1'), trim(df['d2']).alias('d2'))

output looks like this:

df.show()
df2.show()
+------------+------------+
|          d1|          d2|
+------------+------------+
| 2015-04-08 | 2015-05-10 |
+------------+------------+

+----------+----------+
|        d1|        d2|
+----------+----------+
|2015-04-08|2015-05-10|
+----------+----------+
Babu
  • 4,324
  • 6
  • 41
  • 60
4

Summary

Trim spaces:

  • on both sides:
    Spark 3.0.0+ F.trim("col_name")
    Spark 2.0.1+ F.trim(F.col("col_name"))
  • on left:
    Spark 3.0.0+ F.ltrim("col_name")
    Spark 2.0.1+ F.ltrim(F.col("col_name"))
  • on right:
    Spark 3.0.0+ F.rtrim("col_name")
    Spark 2.0.1+ F.rtrim(F.col("col_name"))

Trim specified symbols (e.g. spaces and tabs):

  • on both sides:
    Spark 3.2.0+ F.expr("BTRIM(col_name, ' \t')")
    Spark 2.3.0+ F.expr("TRIM(BOTH ' \t' FROM col_name)")
    F.regexp_replace("col_name", r"^[ \t]+|[ \t]+$", "")
  • on left:
    Spark 2.3.0+ F.expr("TRIM(LEADING ' \t' FROM col_name)")
    F.regexp_replace("col_name", r"^[ \t]+", "")
  • on right:
    Spark 2.3.0+ F.expr("TRIM(TRAILING ' \t' FROM col_name)")
    F.regexp_replace("col_name", r"[ \t]+$", "")

Trim white space:

  • on both sides:
    F.regexp_replace("col_name", r"^\s+|\s+$", "")
    Spark 1.5.0-2.0.0 F.trim(F.col("col_name"))
  • on left:
    F.regexp_replace("col_name", r"^\s+", "")
    Spark 1.5.0-2.0.0 F.ltrim(F.col("col_name"))
  • on right:
    F.regexp_replace("col_name", r"\s+$", "")
    Spark 1.5.0-2.0.0 F.rtrim(F.col("col_name"))

Examples

from pyspark.sql import SparkSession, functions as F
spark = SparkSession.builder.getOrCreate()

Trim spaces on both sides:

df = spark.range(1).withColumn("id", F.lit("trim_spaces_on_both_sides")) \
                   .withColumn("input", F.lit("  EXAMPLE_STRING_VALUE ")) \
                   .withColumn("output", F.trim("input"))
df.show(1, 0)
#+-------------------------+-----------------------+--------------------+
#|id                       |input                  |output              |
#+-------------------------+-----------------------+--------------------+
#|trim_spaces_on_both_sides|  EXAMPLE_STRING_VALUE |EXAMPLE_STRING_VALUE|
#+-------------------------+-----------------------+--------------------+

Trim spaces on left:

df = spark.range(1).withColumn("id", F.lit("trim_spaces_on_left")) \
                   .withColumn("input", F.lit("  EXAMPLE_STRING_VALUE ")) \
                   .withColumn("output", F.ltrim("input"))
df.show(1, 0)
#+-------------------+-----------------------+---------------------+
#|id                 |input                  |output               |
#+-------------------+-----------------------+---------------------+
#|trim_spaces_on_left|  EXAMPLE_STRING_VALUE |EXAMPLE_STRING_VALUE |
#+-------------------+-----------------------+---------------------+

Trim spaces on right:

df = spark.range(1).withColumn("id", F.lit("trim_spaces_on_right")) \
                   .withColumn("input", F.lit("  EXAMPLE_STRING_VALUE ")) \
                   .withColumn("output", F.rtrim("input"))
df.show(1, 0)
#+--------------------+-----------------------+----------------------+
#|id                  |input                  |output                |
#+--------------------+-----------------------+----------------------+
#|trim_spaces_on_right|  EXAMPLE_STRING_VALUE |  EXAMPLE_STRING_VALUE|
#+--------------------+-----------------------+----------------------+

Trim tabs and spaces on both sides:

df = spark.range(1).withColumn("id", F.lit("trim_tabs_and_spaces_on_both_sides")) \
                   .withColumn("input", F.lit("\t\t EXAMPLE_STRING_VALUE  \t")) \
                   .withColumn("output", F.expr("TRIM(BOTH ' \t' FROM input)")) \
                   .withColumn("output_spark_3_2", F.expr("BTRIM(input, ' \t')"))
df.show(1, 0)
#+----------------------------------+-----------------------------+--------------------+--------------------+
#|id                                |input                        |output              |output_spark_3_2    |
#+----------------------------------+-----------------------------+--------------------+--------------------+
#|trim_tabs_and_spaces_on_both_sides|\t\t EXAMPLE_STRING_VALUE  \t|EXAMPLE_STRING_VALUE|EXAMPLE_STRING_VALUE|
#+----------------------------------+-----------------------------+--------------------+--------------------+

Trim tabs and spaces on left:

df = spark.range(1).withColumn("id", F.lit("trim_tabs_and_spaces_on_left")) \
                   .withColumn("input", F.lit("\t\t EXAMPLE_STRING_VALUE  \t")) \
                   .withColumn("output", F.expr("TRIM(LEADING ' \t' FROM input)"))
df.show(1, 0)
#+----------------------------+-----------------------------+------------------------+
#|id                          |input                        |output                  |
#+----------------------------+-----------------------------+------------------------+
#|trim_tabs_and_spaces_on_left|\t\t EXAMPLE_STRING_VALUE  \t|EXAMPLE_STRING_VALUE  \t|
#+----------------------------+-----------------------------+------------------------+

Trim tabs and spaces on right:

df = spark.range(1).withColumn("id", F.lit("trim_tabs_and_spaces_on_right")) \
                   .withColumn("input", F.lit("\t\t EXAMPLE_STRING_VALUE  \t")) \
                   .withColumn("output", F.expr("TRIM(TRAILING ' \t' FROM input)"))
df.show(1, 0)
#+-----------------------------+-----------------------------+-------------------------+
#|id                           |input                        |output                   |
#+-----------------------------+-----------------------------+-------------------------+
#|trim_tabs_and_spaces_on_right|\t\t EXAMPLE_STRING_VALUE  \t|\t\t EXAMPLE_STRING_VALUE|
#+-----------------------------+-----------------------------+-------------------------+

Trim white space on both sides:

df = spark.range(1).withColumn("id", F.lit("trim_white_space_on_both_sides")) \
                   .withColumn("input", F.lit("\t\t EXAMPLE_STRING_VALUE  \t")) \
                   .withColumn("output", F.regexp_replace("input", r"^\s+|\s+$", ""))
df.show(1, 0)
#+------------------------------+-----------------------------+--------------------+
#|id                            |input                        |output              |
#+------------------------------+-----------------------------+--------------------+
#|trim_white_space_on_both_sides|\t\t EXAMPLE_STRING_VALUE  \t|EXAMPLE_STRING_VALUE|
#+------------------------------+-----------------------------+--------------------+

Trim white space on left:

df = spark.range(1).withColumn("id", F.lit("trim_white_space_on_left")) \
                   .withColumn("input", F.lit("\t\t EXAMPLE_STRING_VALUE  \t")) \
                   .withColumn("output", F.regexp_replace("input", r"^\s+", ""))
df.show(1, 0)
#+------------------------+-----------------------------+------------------------+
#|id                      |input                        |output                  |
#+------------------------+-----------------------------+------------------------+
#|trim_white_space_on_left|\t\t EXAMPLE_STRING_VALUE  \t|EXAMPLE_STRING_VALUE  \t|
#+------------------------+-----------------------------+------------------------+

Trim white space on right:

df = spark.range(1).withColumn("id", F.lit("trim_white_space_on_right")) \
                   .withColumn("input", F.lit("\t\t EXAMPLE_STRING_VALUE  \t")) \
                   .withColumn("output", F.regexp_replace("input", r"\s+$", ""))
df.show(1, 0)
#+-------------------------+-----------------------------+-------------------------+
#|id                       |input                        |output                   |
#+-------------------------+-----------------------------+-------------------------+
#|trim_white_space_on_right|\t\t EXAMPLE_STRING_VALUE  \t|\t\t EXAMPLE_STRING_VALUE|
#+-------------------------+-----------------------------+-------------------------+
ZygD
  • 22,092
  • 39
  • 79
  • 102
3

If your dataframe has columns with different datatypes and you just need to perform trim operation on the string columns only then you could do it as below dynamically:

#Getting all the string columns from the data frame
# NOTE(review): assumes `f` is pyspark.sql.functions (import not shown in
# this snippet) -- e.g. `from pyspark.sql import functions as f`.
string_cols = [c for c, t in df.dtypes if t =='string']
for colname in string_cols :
    df= df.withColumn(colname, f.trim(f.col(colname)))

One benefit it has is it does not convert data type of all other columns in the dataframe to string and retains the existing data type for other columns.

Nikunj Kakadiya
  • 2,689
  • 2
  • 20
  • 35
0

If you need to do it for all columns

    # Trim every column in a single projection.
    # NOTE(review): assumes `F` is pyspark.sql.functions (import not shown) --
    # e.g. `from pyspark.sql import functions as F`.
    df = df\
        .select(
            [F.trim(F.col(c)).alias(c) for c in df.columns]
        )
blud
  • 39
  • 3
0

Here's a well documented function which only modifies string-type columns (safer, since other column types keep their original datatypes):

from pyspark.sql import functions as F

def trimColumns(df, columns=None):
    """
    Strip leading and trailing spaces from string-typed column values.

    Columns of any other type are left untouched.

    - Parameters:
      df: The input dataframe.
      columns: Names of the columns to trim. Defaults to every column of
               the dataframe; only those of string type are modified.

    - Return:
      The cleaned dataframe.
    """
    if columns is None:
        columns = df.columns
    # Restrict to string-typed fields among the requested columns.
    targets = [
        field.name
        for field in df.schema.fields
        if field.name in columns and field.jsonValue().get("type") == "string"
    ]
    for name in targets:
        df = df.withColumn(name, F.trim(name))
    return df

# Usage example 1
myDf = trimColumns(myDf)

# Usage example 2
myDf = trimColumns(myDf, myStringColumns)
prossblad
  • 688
  • 1
  • 7
  • 11
-1
def trim(string):
    # NULL values and non-string types reach the udf as-is; return them
    # unchanged instead of failing on a missing .strip().
    try:
        return string.strip()
    except AttributeError:
        return string
trim1 = udf(trim)

# Select the string-typed columns with trim applied, constructing Column
# objects directly instead of assembling Python source text and eval()-ing
# it: eval is unsafe if a column name is attacker-controlled and breaks on
# names containing quotes.
source_str_col = [c for c, t in df.dtypes if t == 'string']
df = df.select([trim1(c).alias(c) for c in source_str_col])
print(df.show())
-1
# Apply trim to every string-typed column by chaining withColumn calls
# directly on the dataframe, instead of generating Python source text and
# eval()-ing it: eval is unsafe with untrusted column names and fails on
# names containing quotes.
source_str_col = [c for c, t in df.dtypes if t == 'string']
for name in source_str_col:
    df = df.withColumn(name, trim(col(name)))