I haven't tested this with HDFS, but I assume it is similar to reading from a local file (an untested sketch is at the end of this answer). The idea is to load the file as a dict and then parse it to build the desired schema. I took inspiration from here. It currently lacks support for nullable fields (a possible extension is also sketched at the end), and I have not tested it with deeper levels of nested structs.
import json
from datetime import datetime

from pyspark.sql import SparkSession
from pyspark.sql.types import (StructType, StructField,
                               StringType, TimestampType, IntegerType)

spark = SparkSession.builder.appName('myPython').getOrCreate()

# Load the schema file into a dict.
with open("/path/schema_file", "r") as f:
    jdata = json.loads(f.read())

def get_type(v):
    # Map a type name from the file to the corresponding Spark type.
    if v == "StringType":
        return StringType()
    if v == "TimestampType":
        return TimestampType()
    if v == "IntegerType":
        return IntegerType()
    raise ValueError("Unsupported type name: {}".format(v))

def generate_schema(jdata, derived_schema):
    for k, v in sorted(jdata.items()):
        if isinstance(v, str):
            # Leaf field: the value is a type name.
            derived_schema.add(StructField(k, get_type(v), True))
        else:
            # Nested dict: build the sub-schema recursively.
            added_schema = generate_schema(v, StructType([]))
            derived_schema.add(StructField(k, added_schema, True))
    return derived_schema

derived_schema = generate_schema(jdata, StructType([]))

data = [("first", "the", datetime.utcnow(), ("as", 1))]
input_df = spark.createDataFrame(data, derived_schema)
input_df.printSchema()
With the file being:
{
    "col1" : "StringType",
    "col2" : "StringType",
    "col3" : "TimestampType",
    "col4" : {
        "col5" : "StringType",
        "col6" : "IntegerType"
    }
}
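With that file, input_df.printSchema() should print a schema along these lines (reconstructed by hand rather than copied from a run):

root
 |-- col1: string (nullable = true)
 |-- col2: string (nullable = true)
 |-- col3: timestamp (nullable = true)
 |-- col4: struct (nullable = true)
 |    |-- col5: string (nullable = true)
 |    |-- col6: integer (nullable = true)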
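As for HDFS: I haven't tried it, but a sketch that avoids the local open() by pulling the file through Spark itself (the hdfs:/// path is a placeholder; spark and json are the same objects as above) could look like:

# Untested sketch: collect the schema file's lines from HDFS via Spark,
# then parse them exactly as in the local-file version.
rows = spark.read.text("hdfs:///path/schema_file").collect()
jdata = json.loads("\n".join(row.value for row in rows))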
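To lift the nullable limitation, one option (just a sketch, assuming you change the file format so each leaf is a two-element list such as ["StringType", false], and reusing get_type, StructType and StructField from above) would be:

def generate_schema_nullable(jdata, derived_schema):
    for k, v in sorted(jdata.items()):
        if isinstance(v, list):
            # Leaf field: [type_name, nullable_flag] taken from the JSON file.
            type_name, nullable = v
            derived_schema.add(StructField(k, get_type(type_name), nullable))
        else:
            # Nested dict: build the sub-schema recursively.
            derived_schema.add(StructField(k, generate_schema_nullable(v, StructType([])), True))
    return derived_schema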