You can try GraphFrame.connectedComponents, add a prefix to all Device-ID
s, so that they can be split from Account-ID
s in the post-processing step:
from graphframes import GraphFrame
from pyspark.sql.functions import collect_set, expr
df = spark.createDataFrame([
("d1","a1"), ("d2","a1"), ("d1","a2"), ("d1","a4"),
("d2","a3"), ("d3","a4"), ("d3","a5"), ("d4","a6")
], ["Device-Id","Account-Id"])
# set checkpoint which is required for Graphframe
spark.sparkContext.setCheckpointDir("/tmp/111")
# for testing purpose, set a small shuffle partitions
spark.conf.set("spark.sql.shuffle.partitions", 2)
# set up edges and vertices, add an underscore as prefix of Device-ID
edges = df.withColumn('Device-Id', expr('concat("_", `Device-Id`)')).toDF('src', 'dst')
vertices = edges.selectExpr('src as id').distinct().union(edges.select('dst').distinct())
# set up the graph
g = GraphFrame(vertices, edges)
# compute the connected components and group resultset by component
# and collect corresponding ids using collect_set(id)
df1 = g.connectedComponents().groupby('component').agg(collect_set('id').alias('ids'))
df1.show(truncate=False)
+------------+-----------------------------------+
|component |ids |
+------------+-----------------------------------+
|309237645312|[a6, _d4] |
|85899345920 |[_d1, a4, a1, _d3, a3, a5, a2, _d2]|
+------------+-----------------------------------+
# split the ids based on the prefix we predefined when creating edges.
df1.selectExpr(
'transform(filter(ids, x -> left(x,1) = "_"), y -> substr(y,2)) AS `Devices-Used`'
, 'filter(ids, x -> left(x,1) != "_") AS `Accounts-Used`'
, 'component AS `Unique-User-Id`'
).show()
+------------+--------------------+--------------+
|Devices-Used| Accounts-Used|Unique-User-Id|
+------------+--------------------+--------------+
|[d1, d3, d2]|[a4, a1, a3, a5, a2]| 85899345920|
| [d4]| [a6]| 309237645312|
+------------+--------------------+--------------+
Edit: The above method is less efficient in creating large list of edges/vertices which is unnecessary, using self-join to create edges list should be a better choice (inspired by this post):
edges = df.alias('d1').join(df.alias('d2'), ["Account-Id"]) \
.filter("d1.`Device-Id` > d2.`Device-Id`") \
.toDF("account", "src", "dst")
+-------+---+---+
|account|src|dst|
+-------+---+---+
| a1| d2| d1|
| a4| d3| d1|
+-------+---+---+
vertices = df.selectExpr('`Device-Id` as id', "`Account-Id` as acct_id")
g = GraphFrame(vertices, edges)
df1 = g.connectedComponents() \
.groupby('component') \
.agg(
collect_set('id').alias('Device-Ids'),
collect_set('acct_id').alias('Account-Ids')
)
+---------+------------+--------------------+
|component| Device-Ids| Account-Ids|
+---------+------------+--------------------+
| 0|[d1, d2, d3]|[a4, a1, a3, a5, a2]|
| 1| [d4]| [a6]|
+---------+------------+--------------------+