I am trying to group column values based on related records:
partColumns = ["partnumber", "colVal1", "colVal2", "colVal3", "colVal4", "colVal5"]
partrelations = [
    ("part0", "part1", "", "", "", ""),
    ("part1", "", "part2", "", "part4", ""),
    ("part2", "part3", "", "part5", "part6", "part7"),
    ("part10", "part11", "", "", "", ""),
    ("part11", "part13", "part21", "", "", ""),
    ("part13", "part21", "part18", "", "part20", ""),
]
df_part_groups = spark.createDataFrame(data=partrelations, schema=partColumns)
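(As an aside, GraphFrames is not bundled with Spark, so it has to be available on the cluster. Below is a minimal sketch of pulling in the package when building the session; the coordinate graphframes:graphframes:0.8.2-spark3.2-s_2.12 is an assumption and must match the cluster's Spark/Scala build. On Databricks, installing the library on the cluster works instead.)

from pyspark.sql import SparkSession

# sketch only: the package coordinate is an assumption and must match
# the Spark/Scala versions of the cluster
spark = (SparkSession.builder
         .appName("part-grouping")
         .config("spark.jars.packages",
                 "graphframes:graphframes:0.8.2-spark3.2-s_2.12")
         .getOrCreate())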
I am trying to get output where every part that is related, directly or through other parts, ends up in the same group. Here is my attempt using GraphFrames connected components:
from graphframes import GraphFrame
from pyspark.sql import functions as F

# connectedComponents() requires a checkpoint directory (path is an example)
spark.sparkContext.setCheckpointDir("/tmp/graphframes-checkpoints")

# build edges from each part number to its first related part,
# dropping empty-string placeholders so they do not become a vertex
edges = (df_part_groups
         .withColumnRenamed("partnumber", "src")
         .withColumnRenamed("colVal1", "dst")
         .where(F.col("dst") != ""))

# every distinct part that appears as a source or a destination
vertices = (edges.select("src")
            .union(edges.select("dst"))
            .distinct()
            .withColumnRenamed("src", "id"))

# create a graph and find all connected components
g = GraphFrame(vertices, edges)
cc = g.connectedComponents()

display(df_part_groups
        .join(cc.distinct(), df_part_groups.partnumber == cc.id)
        .orderBy("component", "partnumber", "colVal1"))
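Since the edges above only follow colVal1, related parts recorded in colVal2 through colVal5 are never linked. Below is a sketch of building edges from all five relation columns instead, using Spark's stack generator to melt the wide columns into (src, dst) pairs; the names edges_all, vertices_all, and cc_all are mine:

from graphframes import GraphFrame

# melt the five relation columns into one (src, dst) row per relation,
# then drop the empty-string placeholders
edges_all = (df_part_groups
             .selectExpr("partnumber as src",
                         "stack(5, colVal1, colVal2, colVal3, colVal4, colVal5) as (dst)")
             .filter("dst != ''"))

# every distinct part seen on either side of a relation
vertices_all = (edges_all.selectExpr("src as id")
                .union(edges_all.selectExpr("dst as id"))
                .distinct())

cc_all = GraphFrame(vertices_all, edges_all).connectedComponents()

display(df_part_groups
        .join(cc_all, df_part_groups.partnumber == cc_all.id)
        .orderBy("component", "partnumber"))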
That is as far as I have gotten. Thanks for any help!!