I am reading a JSON file with 500 million records from an API and writing it to a blob in Azure. I have tried several approaches but keep getting the error below. I am using a PySpark notebook in Azure Synapse.
ValueError Traceback (most recent call last)
Cell In [17], line 45
41 total_results.append((parsed_data))
42 ##print(total_results)
43
44 ##RDD Spark creation
---> 45 rdd = spark.sparkContext.parallelize(total_results)
46 df = spark.read.option('multiLine','true').json(rdd)
48 #Create temporary view on dataframe
File /opt/spark/python/lib/pyspark.zip/pyspark/context.py:686, in SparkContext.parallelize(self, c, numSlices)
683 assert self._jvm is not None
684 return self._jvm.PythonParallelizeServer(self._jsc.sc(), numSlices)
--> 686 jrdd = self._serialize_to_jvm(c, serializer, reader_func, createRDDServer)
687 return RDD(jrdd, self, serializer)
File /opt/spark/python/lib/pyspark.zip/pyspark/context.py:729, in SparkContext._serialize_to_jvm(self, data, serializer, reader_func, createRDDServer)
727 try:
728 try:
--> 729 serializer.dump_stream(data, tempFile)
730 finally:
731 tempFile.close()
File /opt/spark/python/lib/pyspark.zip/pyspark/serializers.py:224, in BatchedSerializer.dump_stream(self, iterator, stream)
223 def dump_stream(self, iterator, stream):
--> 224 self.serializer.dump_stream(self._batched(iterator), stream)
File /opt/spark/python/lib/pyspark.zip/pyspark/serializers.py:146, in FramedSerializer.dump_stream(self, iterator, stream)
144 def dump_stream(self, iterator, stream):
145 for obj in iterator:
--> 146 self._write_with_length(obj, stream)
File /opt/spark/python/lib/pyspark.zip/pyspark/serializers.py:160, in FramedSerializer._write_with_length(self, obj, stream)
158 raise ValueError("serialized value should not be None")
159 if len(serialized) > (1 << 31):
--> 160 raise ValueError("can not serialize object larger than 2G")
161 write_int(len(serialized), stream)
162 stream.write(serialized)
ValueError: can not serialize object larger than 2G
My code collects the JSON into a Python list, creates an RDD from it, and writes the result to disk:
rdd = spark.sparkContext.parallelize(total_results)
df = spark.read.option('multiLine', 'true').json(rdd)

# Create a temporary view on the dataframe
df.createOrReplaceTempView('filter_view')

# SQL query to filter on the DeletedDate value
df_filter = spark.sql("""select * from filter_view where DeletedDate is null""")
df_filter.coalesce(800).write.format("parquet").save(stagingpath, mode="overwrite")
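For context, here is roughly how total_results is populated before the failing parallelize call. This is a simplified sketch: the endpoint URL, the paging loop, and the requests usage are placeholders rather than my exact code, but the shape matches the traceback (parsed_data appended to total_results in a loop).

import requests

# Hypothetical API endpoint and paging -- placeholders, not the real values
api_url = "https://example.com/api/records"

total_results = []
page = 1
while True:
    response = requests.get(api_url, params={"page": page})
    response.raise_for_status()
    parsed_data = response.json()        # one page of JSON records
    if not parsed_data:
        break
    total_results.append(parsed_data)    # everything accumulates in driver memory
    page += 1

# 'spark' is the session predefined in the Synapse notebook.
# parallelize() has to serialize this whole in-memory list in batches,
# which is where the "can not serialize object larger than 2G" error is raised.
rdd = spark.sparkContext.parallelize(total_results)

So the entire 500 million records end up in a single driver-side list before Spark ever sees them, and that is the point where the 2G serialization limit is hit.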