I'm new to Spark and Python. I'm given a dataframe of type pyspark.sql.dataframe.DataFrame and I'm trying to convert it to a Pandas data frame. I tried using toPandas(), but it didn't work:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

sc = SparkContext()
sqlContext = SQLContext(sc)

# read the file as a single-column Spark DataFrame of strings
data = sqlContext.read.text('filename')
# convert the Spark DataFrame to a Pandas data frame
df1 = data.toPandas()
Error message:
AttributeError                            Traceback (most recent call last)
<ipython-input-6-98675a0d0483> in <module>()
----> 1 data.toPandas()
/opt/spark/python/pyspark/sql/dataframe.py in toPandas(self)
1964 raise RuntimeError("%s\n%s" % (_exception_message(e), msg))
1965 else:
-> 1966 pdf = pd.DataFrame.from_records(self.collect(), columns=self.columns)
1967
1968 dtype = {}
/opt/spark/python/pyspark/sql/dataframe.py in collect(self)
463 [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')]
464 """
--> 465 with SCCallSiteSync(self._sc) as css:
466 port = self._jdf.collectToPython()
467 return list(_load_from_socket(port, BatchedSerializer(PickleSerializer())))
/opt/spark/python/pyspark/traceback_utils.py in __enter__(self)
70 def __enter__(self):
71 if SCCallSiteSync._spark_stack_depth == 0:
---> 72 self._context._jsc.setCallSite(self._call_site)
73 SCCallSiteSync._spark_stack_depth += 1
74
AttributeError: 'NoneType' object has no attribute 'setCallSite'
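My guess is that the underlying SparkContext has somehow gone away, since the failure is on self._context._jsc, which seems to be None. In case it helps with diagnosing, I assume I could check whether the context is still alive in the same session with something like the following (using the sc variable from above; I'm not sure this is the right way to verify it):

# my assumption: if the context were stopped, _jsc would be None and version would raise
print(sc._jsc)
print(sc.version)

But I don't understand why the context would be dead here in the first place.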
I'd appreciate any help with this.