
I've been given a CSV file on which I need to perform certain cleanup tasks using PySpark. Before the cleanup, I'm doing a schema validation check. Below is my code.

from pyspark.sql.types import StructType, StructField, StringType, DateType

# schema for the input data
def get_input_schema():
    return StructType([StructField("Group ID", StringType(), True),
                       StructField("Start Date", DateType(), True),
                       StructField("Start Time", StringType(), True),
                       ...
                       StructField("malformed_rows", StringType(), True)
                       ])

from pyspark.sql import SparkSession

# basic cleanup logic
def main(argv):
    spark = SparkSession.builder.appName('cleaner_job').getOrCreate()
    spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

    # input_path, output_path and output_malformed_path are defined elsewhere
    # (e.g. parsed from argv)
    df = spark.read.option("mode", "PERMISSIVE") \
        .option("dateFormat", "yyyy-MM-dd") \
        .option("columnNameOfCorruptRecord", "malformed_rows") \
        .schema(get_input_schema()) \
        .csv(input_path, header=True)

    # this is where the error is happening
    df_bad = df.filter(df["malformed_rows"].isNotNull())
    df_good = df.filter(df["malformed_rows"].isNull())

    df_good.write.csv(output_path, header=True)
    df_bad.write.csv(output_malformed_path, header=True)

I'm using PERMISSIVE mode when reading the CSV and trying to split the input DataFrame into two DataFrames (df_good and df_bad) based on whether the malformed_rows column is null or not. If I don't split the DataFrame and write it directly to CSV, I can see the malformed_rows column in the output CSV. But the code above throws the following error:

ERROR Utils: Aborting task
java.lang.IllegalArgumentException: malformed_rows does not exist. Available: Group ID, Start Date, Start Time, ...,
    at org.apache.spark.sql.types.StructType.$anonfun$fieldIndex$1(StructType.scala:306)
    at scala.collection.MapLike.getOrElse(MapLike.scala:131)
    at scala.collection.MapLike.getOrElse$(MapLike.scala:129)
    at scala.collection.AbstractMap.getOrElse(Map.scala:63)
    at org.apache.spark.sql.types.StructType.fieldIndex(StructType.scala:305)
    at org.apache.spark.sql.catalyst.csv.CSVFilters.$anonfun$predicates$4(CSVFilters.scala:65)
    at org.apache.spark.sql.catalyst.csv.CSVFilters.$anonfun$predicates$4$adapted(CSVFilters.scala:65)
    at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:238)
    at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
    at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
    at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:198)
    at scala.collection.TraversableLike.map(TraversableLike.scala:238)
    at scala.collection.TraversableLike.map$(TraversableLike.scala:231)
    at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:198)
    at org.apache.spark.sql.catalyst.csv.CSVFilters.$anonfun$predicates$3(CSVFilters.scala:65)
    at org.apache.spark.sql.catalyst.csv.CSVFilters.$anonfun$predicates$3$adapted(CSVFilters.scala:54)
    at scala.collection.immutable.List.foreach(List.scala:392)
    at org.apache.spark.sql.catalyst.csv.CSVFilters.<init>(CSVFilters.scala:54)
    at org.apache.spark.sql.catalyst.csv.UnivocityParser.<init>(UnivocityParser.scala:101)
    at org.apache.spark.sql.execution.datasources.csv.CSVFileFormat.$anonfun$buildReader$1(CSVFileFormat.scala:138)
    at org.apache.spark.sql.execution.datasources.FileFormat$$anon$1.apply(FileFormat.scala:147)
    at org.apache.spark.sql.execution.datasources.FileFormat$$anon$1.apply(FileFormat.scala:132)
    at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:116)
    at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:169)
    at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:93)
    at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
    at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
    at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
    at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:729)
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:488)
    at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$executeTask$1(FileFormatWriter.scala:272)
    at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1411)
    at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeTask(FileFormatWriter.scala:281)
    at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$write$15(FileFormatWriter.scala:205)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
    at org.apache.spark.scheduler.Task.run(Task.scala:127)
    at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:446)
    at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:449)
    at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
    at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
    at java.base/java.lang.Thread.run(Thread.java:834)
ERROR FileFormatWriter: Job job_20210302150943_0000 aborted.
ERROR Executor: Exception in task 0.0 in stage 0.0 (TID 0)
org.apache.spark.SparkException: Task failed while writing rows.
    at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeTask(FileFormatWriter.scala:291)
    at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$write$15(FileFormatWriter.scala:205)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
    at org.apache.spark.scheduler.Task.run(Task.scala:127)
    at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:446)
    at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:449)
    at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
    at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
    at java.base/java.lang.Thread.run(Thread.java:834)

I've gone through the Spark docs, which say that to retain the corrupt-record column you need to define it in the schema, which I'm doing. What baffles me is that it fails only when I try to filter on that column. Any help in resolving this is much appreciated.

Bitswazsky

2 Answers


malformed_rows is the internal corrupt-record column, which is named _corrupt_record by default and which you renamed with:

.option("columnNameOfCorruptRecord", "malformed_rows")

But starting from Spark 2.3, you can't query the data using only this column, as stated in the docs; you need to cache the DataFrame first:

Since Spark 2.3, the queries from raw JSON/CSV files are disallowed when the referenced columns only include the internal corrupt record column (named _corrupt_record by default). For example, spark.read.schema(schema).json(file).filter($"_corrupt_record".isNotNull).count() and spark.read.schema(schema).json(file).select("_corrupt_record").show(). Instead, you can cache or save the parsed results and then send the same query. For example, val df = spark.read.schema(schema).json(file).cache() and then df.filter($"_corrupt_record".isNotNull).count().
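Applied to the code in the question, the suggested fix could look like the sketch below. It reuses the question's variable names (input_path, output_path, output_malformed_path), which are assumed to be defined as in the original snippet; the only change is the added cache() call before filtering.

df = spark.read.option("mode", "PERMISSIVE") \
    .option("dateFormat", "yyyy-MM-dd") \
    .option("columnNameOfCorruptRecord", "malformed_rows") \
    .schema(get_input_schema()) \
    .csv(input_path, header=True) \
    .cache()  # cache the parsed result, as the docs suggest, so the filter no longer queries the raw CSV directly

# filtering on the corrupt-record column now works
df_bad = df.filter(df["malformed_rows"].isNotNull())
df_good = df.filter(df["malformed_rows"].isNull())

df_good.write.csv(output_path, header=True)
df_bad.write.csv(output_malformed_path, header=True)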

blackbishop

If you concatenate the corrupt-record column with something else and then filter on that expression, it will work.

You can do it like this:

df.createOrReplaceTempView("df1")   
spark.sql("select *, concat('error',malformed_rows) from df1 where concat('error',malformed_rows) is not null").show(10,False)
pooja