Based on the documentation, it said that Glue able to convert Semi-Structured schema to relational schema,
Currently I'm able to create schema by using crawler, and able to store my data by the job script generated by AWS Glue from s3 to postgres DB,
But the auto created schema it's not relational, and all my array are converted to string instead of creating a new table to store it.
Below is the script:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
datasource0 = glueContext.create_dynamic_frame.from_catalog(
database = "coach-transformed",
table_name = "bill",
transformation_ctx = "datasource0"
)
applymapping1 = ApplyMapping.apply(
frame = datasource0,
mappings = [
("_id.$oid", "string", "`_id.$oid`", "string"),
("dbname", "string", "dbname", "string"),
("docname", "string", "docname", "string"),
("createdat.$date", "string", "`createdat.$date`", "string"),
("data.activeItems", "array", "`data.activeItems`", "string"),
("data.is.takeaway", "boolean", "`data.is.takeaway`", "boolean"),
("data.slot", "string", "`data.slot`", "string"),
("data.balance.amount", "int", "`data.balance.amount`", "int"),
("data.balance.currency", "string", "`data.balance.currency`", "string"),
("data.balance.precision", "int", "`data.balance.precision`", "int"),
("data.rounding.amount", "int", "`data.rounding.amount`", "int"),
("data.rounding.currency", "string", "`data.rounding.currency`", "string"),
("data.rounding.precision", "int", "`data.rounding.precision`", "int"),
("data.adjustments", "array", "`data.adjustments`", "string"),
("data.items", "array", "`data.items`", "string"),
("data.taxes", "array", "`data.taxes`", "string"),
("data.refund", "string", "`data.refund`", "string"),
("data.delivery", "string", "`data.delivery`", "string"),
("data.pickup", "string", "`data.pickup`", "string"),
("data.status", "string", "`data.status`", "string"),
("data.payments", "array", "`data.payments`", "string"),
("data.reward.checked", "boolean", "`data.reward.checked`", "boolean"),
("data._rev", "string", "`data._rev`", "string"),
("data.person", "int", "`data.person`", "int"),
("data.completedAt", "string", "`data.completedAt`", "string"),
("data.refundedFrom", "string", "`data.refundedFrom`", "string"),
("data.roundToNearest", "double", "`data.roundToNearest`", "double"),
("data.membership.birthday", "string", "`data.membership.birthday`", "string"),
("data.membership.benefits", "array", "`data.membership.benefits`", "string"),
("data.membership.mostPurchasedItems", "array", "`data.membership.mostPurchasedItems`", "string"),
("data.membership.id", "string", "`data.membership.id`", "string"),
("data.membership.userId", "string", "`data.membership.userId`", "string"),
("data.membership.name", "string", "`data.membership.name`", "string"),
("data._id", "string", "`data._id`", "string"),
("data.grossTotal.currency", "string", "`data.grossTotal.currency`", "string"),
("data.grossTotal.precision", "int", "`data.grossTotal.precision`", "int"),
("data.grossTotal.amount", "int", "`data.grossTotal.amount`", "int"),
("data.nettTaxes", "array", "`data.nettTaxes`", "string"),
("data.adjustedTotal.amount", "int", "`data.adjustedTotal.amount`", "int"),
("data.adjustedTotal.currency", "string", "`data.adjustedTotal.currency`", "string"),
("data.adjustedTotal.precision", "int", "`data.adjustedTotal.precision`", "int"),
("data.nettTotal.amount", "int", "`data.nettTotal.amount`", "int"),
("data.nettTotal.currency", "string", "`data.nettTotal.currency`", "string"),
("data.nettTotal.precision", "int", "`data.nettTotal.precision`", "int"),
("data.effects", "array", "`data.effects`", "string"),
("data.inHouseDelivery", "string", "`data.inHouseDelivery`", "string"),
("data.seqNumber", "string", "`data.seqNumber`", "string"),
("data.type_bill", "boolean", "`data.type_bill`", "boolean"),
("data.issued", "string", "`data.issued`", "string"),
("deleted", "boolean", "deleted", "boolean"),
("updatedat.$date", "string", "`updatedat.$date`", "string"),
("importantfield", "int", "importantfield", "int")
],
transformation_ctx = "applymapping1"
)
resolvechoice2 = ResolveChoice.apply(frame = applymapping1, choice = "make_cols", transformation_ctx = "resolvechoice2")
dropnullfields3 = DropNullFields.apply(frame = resolvechoice2, transformation_ctx = "dropnullfields3")
datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
frame = dropnullfields3,
catalog_connection = "default",
connection_options = {"dbtable": "bill", "database": "dpa"},
transformation_ctx = "datasink4"
)
job.commit()
from the script I know why my array being convert to string, but I have no idea which part should I change to create relational schema and create relational table from job, and I couldn't find a good guide from AWS document on how to achieve Converting Semi-Structured Schemas to Relational Schemas