Using pyspark, if you have all the json files in the same folder, you can use df = spark.read.json('folder_path')
. This instruction will load all the json files inside the folder.
For reading performance, I recommend you for providing dataframe the schema:
import pyspark.sql.types as T
billing_schema = billing_schema = T.StructType([
T.StructField('accountId', T.LongType(),True),
T.StructField('accountName',T.StringType(),True),
T.StructField('accountOwnerEmail',T.StringType(),True),
T.StructField('additionalInfo',T.StringType(),True),
T.StructField('chargesBilledSeparately',T.BooleanType(),True),
T.StructField('consumedQuantity',T.DoubleType(),True),
T.StructField('consumedService',T.StringType(),True),
T.StructField('consumedServiceId',T.LongType(),True),
T.StructField('cost',T.DoubleType(),True),
T.StructField('costCenter',T.StringType(),True),
T.StructField('date',T.StringType(),True),
T.StructField('departmentId',T.LongType(),True),
T.StructField('departmentName',T.StringType(),True),
T.StructField('instanceId',T.StringType(),True),
T.StructField('location',T.StringType(),True),
T.StructField('meterCategory',T.StringType(),True),
T.StructField('meterId',T.StringType(),True),
T.StructField('meterName',T.StringType(),True),
T.StructField('meterRegion',T.StringType(),True),
T.StructField('meterSubCategory',T.StringType(),True),
T.StructField('offerId',T.StringType(),True),
T.StructField('partNumber',T.StringType(),True),
T.StructField('product',T.StringType(),True),
T.StructField('productId',T.LongType(),True),
T.StructField('resourceGroup',T.StringType(),True),
T.StructField('resourceGuid',T.StringType(),True),
T.StructField('resourceLocation',T.StringType(),True),
T.StructField('resourceLocationId',T.LongType(),True),
T.StructField('resourceRate',T.DoubleType(),True),
T.StructField('serviceAdministratorId',T.StringType(),True),
T.StructField('serviceInfo1',T.StringType(),True),
T.StructField('serviceInfo2',T.StringType(),True),
T.StructField('serviceName',T.StringType(),True),
T.StructField('serviceTier',T.StringType(),True),
T.StructField('storeServiceIdentifier',T.StringType(),True),
T.StructField('subscriptionGuid',T.StringType(),True),
T.StructField('subscriptionId',T.LongType(),True),
T.StructField('subscriptionName',T.StringType(),True),
T.StructField('tags',T.StringType(),True),
T.StructField('unitOfMeasure',T.StringType(),True)
])
billing_df = spark.read.json('/mnt/billingsources/raw-files/202106/', schema=billing_schema)