47

After creating a Spark DataFrame from a CSV file, I would like to trim a column. I've tried:

df = df.withColumn("Product", df.Product.strip())

df is my data frame, Product is a column in my table.

But I get the error:

Column object is not callable

ZygD
  • 22,092
  • 39
  • 79
  • 102
minh-hieu.pham
  • 1,029
  • 2
  • 12
  • 21

10 Answers

35

The PySpark equivalent of Python's str.strip() is the trim function.

Trim the spaces from both ends for the specified string column.

Make sure to import the function first and to put the column you are trimming inside your function.

The following should work:

# trim() operates column-wise on the Spark side, unlike Python's str.strip(),
# which is why calling .strip() on a Column object fails.
from pyspark.sql.functions import trim
df = df.withColumn("Product", trim(df.Product))
Yadav
  • 129
  • 1
  • 11
jdhaen
  • 351
  • 3
  • 2
21

Starting from version 1.5, Spark SQL provides two specific functions for trimming white space, ltrim and rtrim (search for "trim" in the DataFrame documentation); you'll need to import pyspark.sql.functions first. Here is an example:

 # NOTE(review): SQLContext is the pre-2.0 entry point; newer code would use
 # SparkSession, but it is kept here to match the original example.
 from pyspark.sql import SQLContext
 from pyspark.sql.functions import *
 sqlContext = SQLContext(sc)

 df = sqlContext.createDataFrame([(' 2015-04-08 ',' 2015-05-10 ')], ['d1', 'd2']) # create a dataframe - notice the extra whitespaces in the date strings
 df.collect()
 # [Row(d1=u' 2015-04-08 ', d2=u' 2015-05-10 ')]
 df = df.withColumn('d1', ltrim(df.d1)) # trim left whitespace from column d1
 df.collect()
 # [Row(d1=u'2015-04-08 ', d2=u' 2015-05-10 ')]
 df = df.withColumn('d1', rtrim(df.d1))  # trim right whitespace from d1
 df.collect()
 # [Row(d1=u'2015-04-08', d2=u' 2015-05-10 ')]
desertnaut
  • 57,590
  • 26
  • 140
  • 166
  • 5
    Unfortunately Spark's trim() function doesn't trim white space, but only spaces. This behaviour was changed with https://issues.apache.org/jira/browse/SPARK-17299 . To also trim white space like "\t, \n," etc. one needs a regular expression. – asmaier Oct 23 '17 at 13:30
9

If you need to do it for all columns in the dataframe.

from pyspark.sql import functions as f

# Trim every column in ONE select projection. Calling withColumn once per
# column in a loop adds a projection to the logical plan for each column,
# which slows plan analysis badly on wide dataframes; a single select with
# aliased trim expressions produces the same result.
df = df.select([f.trim(f.col(colname)).alias(colname) for colname in df.columns])
DataDog
  • 475
  • 1
  • 9
  • 23
7

I did that with the udf like this:

from pyspark.sql.functions import udf

def trim(string):
    # NULL column values reach a Python udf as None; None.strip() would
    # raise AttributeError inside the executor, so pass NULLs through.
    return string.strip() if string is not None else None
trim = udf(trim)

df = sqlContext.createDataFrame([(' 2015-04-08 ',' 2015-05-10 ')], ['d1', 'd2'])

df2 = df.select(trim(df['d1']).alias('d1'), trim(df['d2']).alias('d2'))

output looks like this:

df.show()
df2.show()
+------------+------------+
|          d1|          d2|
+------------+------------+
| 2015-04-08 | 2015-05-10 |
+------------+------------+

+----------+----------+
|        d1|        d2|
+----------+----------+
|2015-04-08|2015-05-10|
+----------+----------+
Babu
  • 4,324
  • 6
  • 41
  • 60
4

Summary

Trim spaces:

  • on both sides:
    Spark 3.0.0+ F.trim("col_name")
    Spark 2.0.1+ F.trim(F.col("col_name"))
  • on left:
    Spark 3.0.0+ F.ltrim("col_name")
    Spark 2.0.1+ F.ltrim(F.col("col_name"))
  • on right:
    Spark 3.0.0+ F.rtrim("col_name")
    Spark 2.0.1+ F.rtrim(F.col("col_name"))

Trim specified symbols (e.g. spaces and tabs):

  • on both sides:
    Spark 3.2.0+ F.expr("BTRIM(col_name, ' \t')")
    Spark 2.3.0+ F.expr("TRIM(BOTH ' \t' FROM col_name)")
    F.regexp_replace("col_name", r"^[ \t]+|[ \t]+$", "")
  • on left:
    Spark 2.3.0+ F.expr("TRIM(LEADING ' \t' FROM col_name)")
    F.regexp_replace("col_name", r"^[ \t]+", "")
  • on right:
    Spark 2.3.0+ F.expr("TRIM(TRAILING ' \t' FROM col_name)")
    F.regexp_replace("col_name", r"[ \t]+$", "")

Trim white space:

  • on both sides:
    F.regexp_replace("col_name", r"^\s+|\s+$", "")
    Spark 1.5.0-2.0.0 F.trim(F.col("col_name"))
  • on left:
    F.regexp_replace("col_name", r"^\s+", "")
    Spark 1.5.0-2.0.0 F.ltrim(F.col("col_name"))
  • on right:
    F.regexp_replace("col_name", r"\s+$", "")
    Spark 1.5.0-2.0.0 F.rtrim(F.col("col_name"))

Examples

from pyspark.sql import SparkSession, functions as F
spark = SparkSession.builder.getOrCreate()

Trim spaces on both sides:

df = spark.range(1).withColumn("id", F.lit("trim_spaces_on_both_sides")) \
                   .withColumn("input", F.lit("  EXAMPLE_STRING_VALUE ")) \
                   .withColumn("output", F.trim("input"))
df.show(1, 0)
#+-------------------------+-----------------------+--------------------+
#|id                       |input                  |output              |
#+-------------------------+-----------------------+--------------------+
#|trim_spaces_on_both_sides|  EXAMPLE_STRING_VALUE |EXAMPLE_STRING_VALUE|
#+-------------------------+-----------------------+--------------------+

Trim spaces on left:

df = spark.range(1).withColumn("id", F.lit("trim_spaces_on_left")) \
                   .withColumn("input", F.lit("  EXAMPLE_STRING_VALUE ")) \
                   .withColumn("output", F.ltrim("input"))
df.show(1, 0)
#+-------------------+-----------------------+---------------------+
#|id                 |input                  |output               |
#+-------------------+-----------------------+---------------------+
#|trim_spaces_on_left|  EXAMPLE_STRING_VALUE |EXAMPLE_STRING_VALUE |
#+-------------------+-----------------------+---------------------+

Trim spaces on right:

df = spark.range(1).withColumn("id", F.lit("trim_spaces_on_right")) \
                   .withColumn("input", F.lit("  EXAMPLE_STRING_VALUE ")) \
                   .withColumn("output", F.rtrim("input"))
df.show(1, 0)
#+--------------------+-----------------------+----------------------+
#|id                  |input                  |output                |
#+--------------------+-----------------------+----------------------+
#|trim_spaces_on_right|  EXAMPLE_STRING_VALUE |  EXAMPLE_STRING_VALUE|
#+--------------------+-----------------------+----------------------+

Trim tabs and spaces on both sides:

df = spark.range(1).withColumn("id", F.lit("trim_tabs_and_spaces_on_both_sides")) \
                   .withColumn("input", F.lit("\t\t EXAMPLE_STRING_VALUE  \t")) \
                   .withColumn("output", F.expr("TRIM(BOTH ' \t' FROM input)")) \
                   .withColumn("output_spark_3_2", F.expr("BTRIM(input, ' \t')"))
df.show(1, 0)
#+----------------------------------+-----------------------------+--------------------+--------------------+
#|id                                |input                        |output              |output_spark_3_2    |
#+----------------------------------+-----------------------------+--------------------+--------------------+
#|trim_tabs_and_spaces_on_both_sides|\t\t EXAMPLE_STRING_VALUE  \t|EXAMPLE_STRING_VALUE|EXAMPLE_STRING_VALUE|
#+----------------------------------+-----------------------------+--------------------+--------------------+

Trim tabs and spaces on left:

df = spark.range(1).withColumn("id", F.lit("trim_tabs_and_spaces_on_left")) \
                   .withColumn("input", F.lit("\t\t EXAMPLE_STRING_VALUE  \t")) \
                   .withColumn("output", F.expr("TRIM(LEADING ' \t' FROM input)"))
df.show(1, 0)
#+----------------------------+-----------------------------+------------------------+
#|id                          |input                        |output                  |
#+----------------------------+-----------------------------+------------------------+
#|trim_tabs_and_spaces_on_left|\t\t EXAMPLE_STRING_VALUE  \t|EXAMPLE_STRING_VALUE  \t|
#+----------------------------+-----------------------------+------------------------+

Trim tabs and spaces on right:

df = spark.range(1).withColumn("id", F.lit("trim_tabs_and_spaces_on_right")) \
                   .withColumn("input", F.lit("\t\t EXAMPLE_STRING_VALUE  \t")) \
                   .withColumn("output", F.expr("TRIM(TRAILING ' \t' FROM input)"))
df.show(1, 0)
#+-----------------------------+-----------------------------+-------------------------+
#|id                           |input                        |output                   |
#+-----------------------------+-----------------------------+-------------------------+
#|trim_tabs_and_spaces_on_right|\t\t EXAMPLE_STRING_VALUE  \t|\t\t EXAMPLE_STRING_VALUE|
#+-----------------------------+-----------------------------+-------------------------+

Trim white space on both sides:

df = spark.range(1).withColumn("id", F.lit("trim_white_space_on_both_sides")) \
                   .withColumn("input", F.lit("\t\t EXAMPLE_STRING_VALUE  \t")) \
                   .withColumn("output", F.regexp_replace("input", r"^\s+|\s+$", ""))
df.show(1, 0)
#+------------------------------+-----------------------------+--------------------+
#|id                            |input                        |output              |
#+------------------------------+-----------------------------+--------------------+
#|trim_white_space_on_both_sides|\t\t EXAMPLE_STRING_VALUE  \t|EXAMPLE_STRING_VALUE|
#+------------------------------+-----------------------------+--------------------+

Trim white space on left:

df = spark.range(1).withColumn("id", F.lit("trim_white_space_on_left")) \
                   .withColumn("input", F.lit("\t\t EXAMPLE_STRING_VALUE  \t")) \
                   .withColumn("output", F.regexp_replace("input", r"^\s+", ""))
df.show(1, 0)
#+------------------------+-----------------------------+------------------------+
#|id                      |input                        |output                  |
#+------------------------+-----------------------------+------------------------+
#|trim_white_space_on_left|\t\t EXAMPLE_STRING_VALUE  \t|EXAMPLE_STRING_VALUE  \t|
#+------------------------+-----------------------------+------------------------+

Trim white space on right:

df = spark.range(1).withColumn("id", F.lit("trim_white_space_on_right")) \
                   .withColumn("input", F.lit("\t\t EXAMPLE_STRING_VALUE  \t")) \
                   .withColumn("output", F.regexp_replace("input", r"\s+$", ""))
df.show(1, 0)
#+-------------------------+-----------------------------+-------------------------+
#|id                       |input                        |output                   |
#+-------------------------+-----------------------------+-------------------------+
#|trim_white_space_on_right|\t\t EXAMPLE_STRING_VALUE  \t|\t\t EXAMPLE_STRING_VALUE|
#+-------------------------+-----------------------------+-------------------------+
ZygD
  • 22,092
  • 39
  • 79
  • 102
3

If your dataframe has columns with different datatypes and you just need to perform trim operation on the string columns only then you could do it as below dynamically:

#Getting all the string columns from the data frame
# NOTE(review): assumes `f` is pyspark.sql.functions (import not shown in
# this snippet) -- e.g. `from pyspark.sql import functions as f`.
string_cols = [c for c, t in df.dtypes if t =='string']
for colname in string_cols :
    df= df.withColumn(colname, f.trim(f.col(colname)))

One benefit it has is it does not convert data type of all other columns in the dataframe to string and retains the existing data type for other columns.

Nikunj Kakadiya
  • 2,689
  • 2
  • 20
  • 35
0

If you need to do it for all columns

    # Trim every column in a single projection.
    # NOTE(review): assumes `F` is pyspark.sql.functions (import not shown) --
    # e.g. `from pyspark.sql import functions as F`.
    df = df\
        .select(
            [F.trim(F.col(c)).alias(c) for c in df.columns]
        )
blud
  • 39
  • 3
0

Here's a well documented function which only modifies string-type columns (safer, since other column types keep their original datatypes):

from pyspark.sql import functions as F

def trimColumns(df, columns=None):
    """
    Strip leading and trailing spaces from string-typed column values.

    Columns of any other type are left untouched.

    - Parameters:
      df: The input dataframe.
      columns: Names of the columns to trim. Defaults to every column of
               the dataframe; only those of string type are modified.

    - Return:
      The cleaned dataframe.
    """
    if columns is None:
        columns = df.columns
    # Restrict to string-typed fields among the requested columns.
    targets = [
        field.name
        for field in df.schema.fields
        if field.name in columns and field.jsonValue().get("type") == "string"
    ]
    for name in targets:
        df = df.withColumn(name, F.trim(name))
    return df

# Usage example 1
myDf = trimColumns(myDf)

# Usage example 2
myDf = trimColumns(myDf, myStringColumns)
prossblad
  • 688
  • 1
  • 7
  • 11
-1
def trim(string):
    # NULL values and non-string types reach the udf as-is; return them
    # unchanged instead of failing on a missing .strip().
    try:
        return string.strip()
    except AttributeError:
        return string
trim1 = udf(trim)

# Select the string-typed columns with trim applied, constructing Column
# objects directly instead of assembling Python source text and eval()-ing
# it: eval is unsafe if a column name is attacker-controlled and breaks on
# names containing quotes.
source_str_col = [c for c, t in df.dtypes if t == 'string']
df = df.select([trim1(c).alias(c) for c in source_str_col])
print(df.show())
-1
# Apply trim to every string-typed column by chaining withColumn calls
# directly on the dataframe, instead of generating Python source text and
# eval()-ing it: eval is unsafe with untrusted column names and fails on
# names containing quotes.
source_str_col = [c for c, t in df.dtypes if t == 'string']
for name in source_str_col:
    df = df.withColumn(name, trim(col(name)))