I'm running into an issue with AWS Glue: when I apply `Map.apply` to a DynamicFrame in order to decrypt a given column value, it throws the error PicklingError: Could not serialize object: TypeError: can't pickle _ModuleWithDeprecations objects
However, I don't think this is caused by the code itself, as it runs fine on my local machine with the same library versions. Rather, it seems related to the way Spark and Glue serialize and ship scripts to executors, combined with my UDF relying on the imported cryptography library to perform the decryption. Is there something obvious I'm missing in how I'm using this code, or is this simply a limitation of AWS Glue with Python — meaning I would have to switch to Scala and integrate a JAR with the necessary code pre-bundled to handle this use case?
Code with some slight simplifications is below:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from base64 import b64decode, b64encode
from cryptography.fernet import Fernet
from cryptography.hazmat.primitives import hashes
from cryptography.hazmat.primitives.kdf.pbkdf2 import PBKDF2HMAC
from cryptography.hazmat.backends import default_backend
from cryptography.hazmat.primitives.padding import PKCS7
from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
# Shared secret fed into PBKDF2 to derive the per-record AES key.
# NOTE(review): hard-coding a secret in the job script is a security risk —
# consider AWS Secrets Manager or an encrypted Glue job parameter.
KEY = b'our-secret-key-value'
def decrypt_pbe_with_hmac_sha512_aes_256(obj: str) -> str:
    """Decrypt a base64-encoded PBEWithHmacSHA512AndAES_256 payload.

    Decoded blob layout: bytes 0-15 are the PBKDF2 salt, bytes 16-31 the
    AES-CBC IV, and the remainder the ciphertext.

    :param obj: base64-encoded encrypted value (salt || iv || ciphertext).
    :return: the decrypted plaintext, decoded as UTF-8 text.
    :raises ValueError: if the padding or base64 input is malformed.
    """
    # Import the cryptography names *inside* the function. When Glue runs
    # Map.apply, Spark pickles this function's closure and ships it to the
    # executors; module-level references to cryptography modules make the
    # closure capture `_ModuleWithDeprecations` module objects, which cannot
    # be pickled ("PicklingError: ... can't pickle _ModuleWithDeprecations
    # objects"). Function-local imports mean only the import statements are
    # serialized, and each executor re-imports the library locally.
    from base64 import b64decode
    from cryptography.hazmat.backends import default_backend
    from cryptography.hazmat.primitives import hashes
    from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
    from cryptography.hazmat.primitives.kdf.pbkdf2 import PBKDF2HMAC
    from cryptography.hazmat.primitives.padding import PKCS7

    encrypted_obj = b64decode(obj)
    salt = encrypted_obj[0:16]
    iv = encrypted_obj[16:32]
    cypher_text = encrypted_obj[32:]
    # Re-derive the 256-bit AES key from the shared secret and this record's
    # salt (PBKDF2-HMAC-SHA512, 1000 iterations — must match the encryptor).
    kdf = PBKDF2HMAC(hashes.SHA512(), 32, salt, 1000, backend=default_backend())
    key = kdf.derive(KEY)
    # AES-256-CBC decrypt.
    cipher = Cipher(algorithms.AES(key), modes.CBC(iv), backend=default_backend())
    decryptor = cipher.decryptor()
    padded_text = decryptor.update(cypher_text) + decryptor.finalize()
    # Strip the PKCS7 padding (AES block size: 128 bits).
    unpadder = PKCS7(128).unpadder()
    clear_text = unpadder.update(padded_text) + unpadder.finalize()
    return clear_text.decode()
def decryptDescription(rec):
    """Map.apply callback: replace the encrypted ``description`` field of a
    record with its plaintext, stored under ``updated_description``."""
    plaintext = decrypt_pbe_with_hmac_sha512_aes_256(rec["description"])
    rec["updated_description"] = plaintext
    del rec["description"]
    return rec
# --- Glue job boilerplate -------------------------------------------------
# Resolve the job name passed in by the Glue runtime and initialise the job
# so bookmarks and metrics are tracked.
args = getResolvedOptions(sys.argv, ["JOB_NAME"])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args["JOB_NAME"], args)
# Script generated for node
# Source: read the input table from the Glue Data Catalog (arguments elided
# in this simplified snippet).
node1 = glueContext.create_dynamic_frame.from_catalog(...)
# Row-level transform: decrypt `description` into `updated_description`.
# NOTE(review): decryptDescription runs on the executors, so everything it
# references (imports included) must be picklable by Spark.
mapped_dyF = Map.apply(frame = node1, f = decryptDescription)
# Script generated for node ApplyMapping
# NOTE(review): presumably ApplyMapping consumes mapped_dyF and the sink
# below writes its output — confirm the elided arguments wire the frames
# together in that order.
ApplyMapping_node2 = ApplyMapping.apply(...)
# Script generated for node
# Sink: write the result back via the Glue Data Catalog (arguments elided).
node3 = glueContext.write_dynamic_frame.from_catalog(...)
# Commit so Glue job bookmarks advance.
job.commit()