Summary
Trim spaces:
- on both sides:
Spark 3.0.0+ F.trim("col_name")
Spark 2.0.1+ F.trim(F.col("col_name"))
- on left:
Spark 3.0.0+ F.ltrim("col_name")
Spark 2.0.1+ F.ltrim(F.col("col_name"))
- on right:
Spark 3.0.0+ F.rtrim("col_name")
Spark 2.0.1+ F.rtrim(F.col("col_name"))
Trim specified symbols (e.g. spaces and tabs):
- on both sides:
Spark 3.2.0+ F.expr("BTRIM(col_name, ' \t')")
Spark 2.3.0+ F.expr("TRIM(BOTH ' \t' FROM col_name)")
F.regexp_replace("col_name", r"^[ \t]+|[ \t]+$", "")
- on left:
Spark 2.3.0+ F.expr("TRIM(LEADING ' \t' FROM col_name)")
F.regexp_replace("col_name", r"^[ \t]+", "")
- on right:
Spark 2.3.0+ F.expr("TRIM(TRAILING ' \t' FROM col_name)")
F.regexp_replace("col_name", r"[ \t]+$", "")
Trim white space:
- on both sides:
F.regexp_replace("col_name", r"^\s+|\s+$", "")
Spark 1.5.0-2.0.0 F.trim(F.col("col_name"))
- on left:
F.regexp_replace("col_name", r"^\s+", "")
Spark 1.5.0-2.0.0 F.ltrim(F.col("col_name"))
- on right:
F.regexp_replace("col_name", r"\s+$", "")
Spark 1.5.0-2.0.0 F.rtrim(F.col("col_name"))
Examples
from pyspark.sql import SparkSession, functions as F
spark = SparkSession.builder.getOrCreate()
Trim spaces on both sides:
df = spark.range(1).withColumn("id", F.lit("trim_spaces_on_both_sides")) \
.withColumn("input", F.lit(" EXAMPLE_STRING_VALUE ")) \
.withColumn("output", F.trim("input"))
df.show(1, 0)
#+-------------------------+-----------------------+--------------------+
#|id |input |output |
#+-------------------------+-----------------------+--------------------+
#|trim_spaces_on_both_sides| EXAMPLE_STRING_VALUE |EXAMPLE_STRING_VALUE|
#+-------------------------+-----------------------+--------------------+
Trim spaces on left:
df = spark.range(1).withColumn("id", F.lit("trim_spaces_on_left")) \
.withColumn("input", F.lit(" EXAMPLE_STRING_VALUE ")) \
.withColumn("output", F.ltrim("input"))
df.show(1, 0)
#+-------------------+-----------------------+---------------------+
#|id |input |output |
#+-------------------+-----------------------+---------------------+
#|trim_spaces_on_left| EXAMPLE_STRING_VALUE |EXAMPLE_STRING_VALUE |
#+-------------------+-----------------------+---------------------+
Trim spaces on right:
df = spark.range(1).withColumn("id", F.lit("trim_spaces_on_right")) \
.withColumn("input", F.lit(" EXAMPLE_STRING_VALUE ")) \
.withColumn("output", F.rtrim("input"))
df.show(1, 0)
#+--------------------+-----------------------+----------------------+
#|id |input |output |
#+--------------------+-----------------------+----------------------+
#|trim_spaces_on_right| EXAMPLE_STRING_VALUE | EXAMPLE_STRING_VALUE|
#+--------------------+-----------------------+----------------------+
Trim tabs and spaces on both sides:
df = spark.range(1).withColumn("id", F.lit("trim_tabs_and_spaces_on_both_sides")) \
.withColumn("input", F.lit("\t\t EXAMPLE_STRING_VALUE \t")) \
.withColumn("output", F.expr("TRIM(BOTH ' \t' FROM input)")) \
.withColumn("output_spark_3_2", F.expr("BTRIM(input, ' \t')"))
df.show(1, 0)
#+----------------------------------+-----------------------------+--------------------+--------------------+
#|id |input |output |output_spark_3_2 |
#+----------------------------------+-----------------------------+--------------------+--------------------+
#|trim_tabs_and_spaces_on_both_sides|\t\t EXAMPLE_STRING_VALUE \t|EXAMPLE_STRING_VALUE|EXAMPLE_STRING_VALUE|
#+----------------------------------+-----------------------------+--------------------+--------------------+
Trim tabs and spaces on left:
df = spark.range(1).withColumn("id", F.lit("trim_tabs_and_spaces_on_left")) \
.withColumn("input", F.lit("\t\t EXAMPLE_STRING_VALUE \t")) \
.withColumn("output", F.expr("TRIM(LEADING ' \t' FROM input)"))
df.show(1, 0)
#+----------------------------+-----------------------------+------------------------+
#|id |input |output |
#+----------------------------+-----------------------------+------------------------+
#|trim_tabs_and_spaces_on_left|\t\t EXAMPLE_STRING_VALUE \t|EXAMPLE_STRING_VALUE \t|
#+----------------------------+-----------------------------+------------------------+
Trim tabs and spaces on right:
df = spark.range(1).withColumn("id", F.lit("trim_tabs_and_spaces_on_right")) \
.withColumn("input", F.lit("\t\t EXAMPLE_STRING_VALUE \t")) \
.withColumn("output", F.expr("TRIM(TRAILING ' \t' FROM input)"))
df.show(1, 0)
#+-----------------------------+-----------------------------+-------------------------+
#|id |input |output |
#+-----------------------------+-----------------------------+-------------------------+
#|trim_tabs_and_spaces_on_right|\t\t EXAMPLE_STRING_VALUE \t|\t\t EXAMPLE_STRING_VALUE|
#+-----------------------------+-----------------------------+-------------------------+
Trim white space on both sides:
df = spark.range(1).withColumn("id", F.lit("trim_white_space_on_both_sides")) \
.withColumn("input", F.lit("\t\t EXAMPLE_STRING_VALUE \t")) \
.withColumn("output", F.regexp_replace("input", r"^\s+|\s+$", ""))
df.show(1, 0)
#+------------------------------+-----------------------------+--------------------+
#|id |input |output |
#+------------------------------+-----------------------------+--------------------+
#|trim_white_space_on_both_sides|\t\t EXAMPLE_STRING_VALUE \t|EXAMPLE_STRING_VALUE|
#+------------------------------+-----------------------------+--------------------+
Trim white space on left:
df = spark.range(1).withColumn("id", F.lit("trim_white_space_on_left")) \
.withColumn("input", F.lit("\t\t EXAMPLE_STRING_VALUE \t")) \
.withColumn("output", F.regexp_replace("input", r"^\s+", ""))
df.show(1, 0)
#+------------------------+-----------------------------+------------------------+
#|id |input |output |
#+------------------------+-----------------------------+------------------------+
#|trim_white_space_on_left|\t\t EXAMPLE_STRING_VALUE \t|EXAMPLE_STRING_VALUE \t|
#+------------------------+-----------------------------+------------------------+
Trim white space on right:
df = spark.range(1).withColumn("id", F.lit("trim_white_space_on_right")) \
.withColumn("input", F.lit("\t\t EXAMPLE_STRING_VALUE \t")) \
.withColumn("output", F.regexp_replace("input", r"\s+$", ""))
df.show(1, 0)
#+-------------------------+-----------------------------+-------------------------+
#|id |input |output |
#+-------------------------+-----------------------------+-------------------------+
#|trim_white_space_on_right|\t\t EXAMPLE_STRING_VALUE \t|\t\t EXAMPLE_STRING_VALUE|
#+-------------------------+-----------------------------+-------------------------+