I am trying to load data from AWS EMR (data stored in S3, with the Glue Data Catalog as the metastore) into Redshift.
import sys
import boto3
from datetime import datetime,date
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.dynamicframe import DynamicFrame
from pyspark.context import SparkContext
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.window import *
glueContext = GlueContext(SparkContext.getOrCreate())
spark = glueContext.sparkSession
# Read one day's worth of data from the Glue-Catalog-backed Hive table
df = spark.sql(
    "SELECT * FROM classic_models.orderdetails "
    "WHERE insert_date >= '2021-01-01' AND insert_date < '2021-01-02'"
)
# Convert to a DynamicFrame for the Glue Redshift writer
dynamic_df = DynamicFrame.fromDF(df, glueContext, "dynamic_df")
redshift_target_table = "classic_models.orderdetails"
# Empty the target table before each load so reruns don't duplicate rows
pre_actions = f"Truncate table {redshift_target_table};"
redshift_connection_opts = {
    "database": "dev",
    "dbtable": redshift_target_table,
    "aws_iam_role": "arn:aws:iam::*********",
    "preactions": pre_actions,
}
# Timestamped prefix so each run stages its COPY files in a fresh location
s3_temp_dir = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
glueContext.write_dynamic_frame.from_jdbc_conf(
    frame=dynamic_df,
    catalog_connection="redshift",
    connection_options=redshift_connection_opts,
    # Need change: point at a real staging bucket (note the s3:// scheme)
    redshift_tmp_dir=f"s3://staging/orderdetails/{s3_temp_dir}/",
)
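As far as I understand, from_jdbc_conf stages the frame as files under redshift_tmp_dir and then loads them with a single COPY statement, and the connection options accept an "extracopyoptions" key whose value is appended to that COPY. Disabling the automatic compression analysis and statistics update there might shave time off repeated loads; a variant of the options dict above (untested, same placeholders):

redshift_connection_opts = {
    "database": "dev",
    "dbtable": redshift_target_table,
    "aws_iam_role": "arn:aws:iam::*********",
    "preactions": pre_actions,
    # appended verbatim to the COPY statement Glue generates
    "extracopyoptions": "COMPUPDATE OFF STATUPDATE OFF",
}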
Extracting the data from Hive is quite fast, but loading it into Redshift takes a long time. For example, when I load the past 10 days of data, the Glue job takes about 16 minutes, of which less than a minute is spent extracting from Hive; the rest goes to loading into Redshift. More than half of the Hive table's columns are of String data type. Is there any other better and faster way to do it?
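One alternative I am considering is to bypass the Glue writer: write the DataFrame to S3 as Parquet (which compresses the String-heavy columns) and issue the COPY myself through the Redshift Data API. A rough sketch, where the staging bucket, cluster identifier, and DbUser are placeholders:

# All identifiers below (bucket, cluster, DB user) are placeholders
staging_path = f"s3://my-staging-bucket/orderdetails/{s3_temp_dir}/"

# Parquet keeps the mostly-String columns compressed on S3
df.write.mode("overwrite").parquet(staging_path)

client = boto3.client("redshift-data")
resp = client.batch_execute_statement(
    ClusterIdentifier="my-cluster",
    Database="dev",
    DbUser="awsuser",
    Sqls=[
        f"TRUNCATE TABLE {redshift_target_table}",
        f"COPY {redshift_target_table} "
        f"FROM '{staging_path}' "
        "IAM_ROLE 'arn:aws:iam::*********' "
        "FORMAT AS PARQUET",
    ],
)
# batch_execute_statement returns immediately; poll
# client.describe_statement(Id=resp["Id"]) if the job must wait for the COPY.

Note that COPY from Parquet maps columns by position, so the file columns have to match the target table's column order.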