Schema of my json is next
and field Reports_Rows_Rows_Cells
if it not null looks like next:
[Row(Attributes=[Row(Id='account', Value='bd9e85e0-0478-433d-ae9f-0b3c4f04bfe4')], Value='Business Bank Account'),
Row(Attributes=[Row(Id='account', Value='bd9e85e0-0478-433d-ae9f-0b3c4f04bfe4')], Value='10105.54'),
Row(Attributes=[Row(Id='account', Value='bd9e85e0-0478-433d-ae9f-0b3c4f04bfe4')], Value='4938.48')]
What I want is to create table which has all above columns and column Reports_Rows_Rows_Cells
should look like
-------- |-------|Reports_Rows_Rows_Cells_Value | Reports_Rows_Rows_Cells_Value | Reports_Rows_Rows_Cells_Value|
| Business Bank Account |10105.54 | 4938.48
Not after parsing json my table look like next:
-------- |-------|Reports_Rows_Rows_Cells_Value|
| Business Bank Account |
| 10105.54 |
| 4938.48 |
My code which I use to parse json
def flatten_df(nested_df):
# for ncol in nested_df.columns:
array_cols = [c[0] for c in nested_df.dtypes if c[1][:5] == 'array']
for col in array_cols:
nested_df = nested_df.withColumn(col, explode_outer(nested_df[col]))
nested_cols = [c[0] for c in nested_df.dtypes if c[1][:6] == 'struct']
if len(nested_cols) == 0:
return nested_df
flat_cols = [c[0] for c in nested_df.dtypes if c[1][:6] != 'struct']
flat_df = nested_df.select(flat_cols +
[F.col(nc+'.'+c).alias(nc+'_'+c)
for nc in nested_cols
for c in nested_df.select(nc+'.*').columns])
return flatten_df(flat_df)