
I am trying to run a simple pandas_udf example just to understand how it works; in my actual job I need to implement more complex logic in it.

But even for this simple UDF, which converts a column to upper case and appends 'hello' at the end, I am getting an error. To mention specifically: I collect my data directly into a PySpark DataFrame and do not use a pandas DataFrame anywhere in my own code.
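For reference, this is the transformation I expect the UDF to perform, shown on a plain pandas Series (a standalone sketch only; the sample values are made up for illustration):

import pandas as pd

# intended behaviour: upper-case each value and append 'hello' at the end
sample = pd.Series(["abc", "xyz"])   # made-up sample data
result = sample.str.upper() + "hello"
print(result.tolist())               # ['ABChello', 'XYZhello']

The full Glue job is below: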

import sys,os
import concurrent.futures
from concurrent.futures import *
import boto3
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from pyspark.context import SparkConf
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.dynamicframe import DynamicFrame
from datetime import datetime
from pyspark.sql.functions import array
from pyspark.sql.functions import sha2, concat_ws
from pyspark.sql.functions import  udf
from pyspark.sql.types import ArrayType
from pyspark.sql.types import StringType
#from pyspark.sql.functions import StringType
from pyspark.sql.functions import row_number,lit,col,expr
from pyspark.sql.window import Window
import requests
import json
import traceback
import base64
import pandas as pd 
from pyspark.sql.functions import pandas_udf
import pyspark.sql.types as T




###############################

class JobBase(object):
    
    # all UDFs have to be initialised as the first step
    def __init__(self):
        print("Inside the constructor of class JobBase")

        
    
    fair_scheduler_config_file= "fairscheduler.xml"
    rowAsDict={}
    listVendorDF=[]
    Oracle_Username=None
    Oracle_Password=None
    Oracle_jdbc_url=None
    futures=[]
    ataccama_url=None
    spark=None
    # all Spark configurations can be passed in via an object in an S3 bucket

             
    @staticmethod
    @pandas_udf(T.StringType())
    def upperval(Col: pd.Series) -> pd.Series:
        try:
           
            return (upper(Col )+'hello')
        except Exception as exp:
             exception_type, exception_value, exception_traceback = sys.exc_info()
             traceback_string = traceback.format_exception(exception_type, exception_value, exception_traceback)
             err_msg = json.dumps({
                    "errorType": exception_type.__name__,
                    "errorMessage": str(exception_value),
                    "stackTrace": traceback_string})
             print(err_msg)
             return(err_msg)

    def __start_spark_glue_context(self):
        conf = SparkConf().setAppName("python_thread") \
            .set('spark.scheduler.mode', 'FAIR') \
            .set("spark.scheduler.allocation.file", self.fair_scheduler_config_file) \
            .set('spark.sql.execution.arrow.enabled', 'true') \
            .set('spark.sql.execution.arrow.maxRecordsPerBatch', 100)
        self.sc = SparkContext(conf=conf)
        self.glueContext = GlueContext(self.sc)
        self.spark = self.glueContext.spark_session

    def execute(self):
        self.__start_spark_glue_context()
        args = getResolvedOptions(sys.argv, ['JOB_NAME', 'ataccma_cleanse_url'])
        self.ataccama_url = args['ataccma_cleanse_url']
        self.logger = self.glueContext.get_logger()
        self.logger.info("Starting Glue Threading job ")
        client = boto3.client('glue', region_name='XXXX')
        response = client.get_connection(Name='XXXX')
        connection_properties = response['Connection']['ConnectionProperties']
        URL = connection_properties['JDBC_CONNECTION_URL']
        url_list = URL.split("/")
        host = "{}".format(url_list[-2][:-5])
        new_host=host.split('@',1)[1]
        port = url_list[-2][-4:]
        database = "{}".format(url_list[-1])
        self.Oracle_Username = "{}".format(connection_properties['USERNAME'])
        self.Oracle_Password = "{}".format(connection_properties['PASSWORD'])
        # number of jobs that can run in parallel
        spark_pool_configuration=3
        print("Host:",host)
        print("New Host:",new_host)
        print("Port:",port)
        print("Database:",database)
        self.Oracle_jdbc_url="jdbc:oracle:thin:@//"+new_host+":"+port+"/"+database
        print("Oracle_jdbc_url:",self.Oracle_jdbc_url)
        arrow_enabled =self.spark.conf.get("spark.sql.execution.arrow.enabled")
        print (f' arrow_enabled {arrow_enabled}')
        arrow_enabled_batchsize =self.spark.conf.get("spark.sql.execution.arrow.maxRecordsPerBatch")
        print (f' arrow_enabled_batchsize {arrow_enabled_batchsize}')

        
        ############testing to check hash ############################
        source_df =self.spark.read.format("jdbc").option("url", self.Oracle_jdbc_url).option("dbtable", "(select c1,c2,c3  from table where rownum <=500) ").option("user", self.Oracle_Username).option("password", self.Oracle_Password).option("numPartitions",2)\
        .option("lowerBound", 1)\
        .option("upperBound",100000)\
        .option("partitionColumn", "c1").load()
        # source_df.show(truncate=False)
    

        w=Window.orderBy(lit(1))
        try:
            source_df.select('C2',JobBase.upperval('C2')).show()
        except Exception as exp:
            exception_type, exception_value, exception_traceback = sys.exc_info()
            traceback_string = traceback.format_exception(exception_type, exception_value, exception_traceback)
            err_msg = json.dumps({"errorType": exception_type.__name__,"errorMessage": str(exception_value),"stackTrace": traceback_string})
            print(err_msg)
            
                
def main():
    job = JobBase()
    job.execute() 
    

if __name__ == '__main__':
    main()


 

The select(...).show() call fails, and the except block prints the following error (cleaned up from the JSON it dumps):

Traceback (most recent call last):
  File "/tmp/TestGlue2.py", line 374, in execute
    source_df.select('ENTERPRISE_NAME',JobBase.upperval('ENTERPRISE_NAME')).show()
  File "/opt/amazon/spark/python/lib/pyspark.zip/pyspark/sql/dataframe.py", line 485, in show
    print(self._jdf.showString(n, 20, vertical))
  File "/opt/amazon/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py", line 1305, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/opt/amazon/spark/python/lib/pyspark.zip/pyspark/sql/utils.py", line 117, in deco
    raise converted from None
pyspark.sql.utils.PythonException:
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "/opt/amazon/spark/python/lib/pyspark.zip/pyspark/worker.py", line 604, in main
    process()
  File "/opt/amazon/spark/python/lib/pyspark.zip/pyspark/worker.py", line 596, in process
    serializer.dump_stream(out_iter, outfile)
  File "/opt/amazon/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 273, in dump_stream
    return ArrowStreamSerializer.dump_stream(self, init_stream_yield_batches(), stream)
  File "/opt/amazon/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 81, in dump_stream
    for batch in iterator:
  File "/opt/amazon/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 266, in init_stream_yield_batches
    for series in iterator:
  File "/opt/amazon/spark/python/lib/pyspark.zip/pyspark/worker.py", line 450, in mapper
    result = tuple(f(*[a[o] for o in arg_offsets]) for (arg_offsets, f) in udfs)
  File "/opt/amazon/spark/python/lib/pyspark.zip/pyspark/worker.py", line 450, in <genexpr>
    result = tuple(f(*[a[o] for o in arg_offsets]) for (arg_offsets, f) in udfs)
  File "/opt/amazon/spark/python/lib/pyspark.zip/pyspark/worker.py", line 105, in <lambda>
    verify_result_type(f(*a)), len(a[0])), arrow_return_type)
  File "/opt/amazon/spark/python/lib/pyspark.zip/pyspark/worker.py", line 101, in verify_result_length
    "expected %d, got %d" % (length, len(result)))
RuntimeError: Result vector from pandas_udf was not the required length: expected 100, got 229

(In the trace the real column name is ENTERPRISE_NAME and the script path is /tmp/TestGlue2.py; the code above uses placeholder column names.)
