I am new to spark and facing an error while converting .csv file to dataframe. I am using pyspark_csv module for the conversion but gives an error saying "module 'pyspark_csv' has no attribute 'csvToDataframe".
here is my code:
import findspark
findspark.init()
findspark.find()
import pyspark
sc=pyspark.SparkContext(appName="myAppName")
sqlCtx = pyspark.SQLContext
#csv to dataframe
sc.addPyFile('/usr/spark-1.5.0/python/pyspark_csv.py')
sc.addPyFile('https://raw.githubusercontent.com/seahboonsiew/pyspark-csv/master/pyspark_csv.py')
import pyspark_csv as pycsv
#skipping the header
def skip_header(idx, iterator):
if(idx == 0):
next(iterator)
return iterator
#loading the dataset
data=sc.textFile('gdeltdata/20160427.CSV')
data_header = data.first()
data_body = data.mapPartitionsWithIndex(skip_header)
data_df = pycsv.csvToDataframe(sqlctx, data_body, sep=",", columns=data_header.split('\t'))
AttributeError Traceback (most recent call last)
<ipython-input-10-8e47cd9759e6> in <module>()
----> 1 data_df = pycsv.csvToDataframe(sqlctx, data_body, sep=",", columns=data_header.split('\t'))
AttributeError: module 'pyspark_csv' has no attribute 'csvToDataframe'