
I am trying to write a dataframe from an AWS Glue ETL job in XML format, compressed as a zip file, and load it into an S3 folder. I have been able to write the code for JSON, Parquet and ORC, but I am unable to find anything for XML.

The main error was:

DataFrameWriter has no attribute 'xml'

d0.write.format("xml").option("compression", "gzip").save("/content/sample_data/compressed_xml")

d0.write.format("xml").option("codec", "gzip").save("/content/sample_data/compressed_xml")
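For comparison, the same pattern works fine for the other formats, e.g. (a minimal sketch with a placeholder output path):

d0.write.format("json").option("compression", "gzip").save("/content/sample_data/compressed_json")
d0.write.format("parquet").option("compression", "gzip").save("/content/sample_data/compressed_parquet")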

1 Answer


You can do it through a Lambda function:

import json
import os
import zipfile

import boto3

def lambda_handler(event, context):

    # Read the source system, file name and bucket/prefix details from the event
    sourceSystem            = str(event['gzip_args']['sourcesystem'])
    input_filename          = str(event['gzip_args']['filename'])
    bucket_in_name          = str(event['gzip_args']['bucket_in_name'])
    bucket_out_name         = str(event['gzip_args']['bucket_out_name'])
    xml_file_read_location  = str(event['gzip_args']['xml_file_read_location'])
    zip_file_write_location = str(event['gzip_args']['zip_file_write_location'])

    s3 = boto3.client('s3')

    # Full key of the XML file in the input bucket
    key = xml_file_read_location + sourceSystem + '/' + input_filename

    # Download the XML file to the Lambda's /tmp directory
    localFilename = '/tmp/{}'.format(os.path.basename(key))
    s3.download_file(Bucket=bucket_in_name, Key=key, Filename=localFilename)
    os.chdir('/tmp/')

    file_list = os.listdir()

    # Zip and compress the file: zf reads the XML file name and writes it
    # into a zip archive using DEFLATED compression
    zf = zipfile.ZipFile(input_filename.replace("xml", "zip"), mode='w', compression=zipfile.ZIP_DEFLATED)
    zf.write(input_filename)
    # zf.write(file_list[0])
    zf.close()

    # Upload the final zip file to the respective target location
    s3.upload_file(input_filename.replace("xml", "zip"), bucket_out_name,
                   zip_file_write_location + sourceSystem + '/' + input_filename.replace("xml", "zip"))

    os.remove(localFilename)

    return {
        'statusCode': 200,
        'body': json.dumps('HelloWorld!')
    }
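For reference, a minimal sketch of how the Glue job (or any other client) could invoke this function; the bucket names, prefixes and function name below are placeholders, not values from the original post:

import json
import boto3

# Placeholder values -- adjust to the real buckets and prefixes
event = {
    "gzip_args": {
        "sourcesystem": "sales",
        "filename": "report.xml",
        "bucket_in_name": "my-input-bucket",
        "bucket_out_name": "my-output-bucket",
        "xml_file_read_location": "xml/",
        "zip_file_write_location": "zip/"
    }
}

# 'xml-to-zip' is a hypothetical Lambda function name
boto3.client('lambda').invoke(
    FunctionName='xml-to-zip',
    InvocationType='RequestResponse',
    Payload=json.dumps(event)
)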