To tweak Mck's answer a little bit (drop duplicate df_A entries and select the relevant columns):
SHORT
import pyspark.sql.functions as F

df_A = df_A.withColumn("uid_A", F.monotonically_increasing_id())  # add a unique row id so duplicates can be dropped after the join
df_B = df_B.withColumn("uid_B", F.monotonically_increasing_id())  # added for symmetry; not used below
df_A = df_A.alias('df_A')
(
df_A
.join(df_B.withColumn('flag',F.lit(True)),'col1','left')
.fillna(False)
.dropDuplicates(['uid_A'])
.select('df_A.col1','flag')
).show()
'''
+----+-----+
|col1| flag|
+----+-----+
| A|false|
| B|false|
| C| true|
| B|false|
| C| true|
| D| true|
+----+-----+
'''
FULL
### Initialise dataframes ###
import pandas as pd
import pyspark.sql.functions as F

df_A = pd.DataFrame({'col1': ['A', 'B', 'C', 'B', 'C', 'D'],
                     'col2': [1, 2, 3, 4, 5, 6]})
df_A = spark.createDataFrame(df_A)
df_B = pd.DataFrame({'col1': ['C', 'E', 'D', 'C', 'F', 'G', 'H'],
                     'col2': [10, 20, 30, 40, 50, 60, 70]})
df_B = spark.createDataFrame(df_B)
### PREVIOUS ###
df_A.join(df_B.withColumn('flag',F.lit(True)),'col1','left').fillna(False).show()
'''
+----+----+----+-----+
|col1|col2|col2| flag|
+----+----+----+-----+
| A| 1|null|false|
| B| 2|null|false|
| B| 4|null|false|
| C| 3| 10| true|
| C| 3| 40| true|
| C| 5| 10| true|
| C| 5| 40| true|
| D| 6| 30| true|
+----+----+----+-----+
'''
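A quick way to see the problem (same dataframes as above): df_A has 6 rows, but the joined result has 8, because each 'C' row of df_A matches two df_B rows:
df_A.count()  # 6
df_A.join(df_B.withColumn('flag', F.lit(True)), 'col1', 'left').count()  # 8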
### BETTER ###
df_A = df_A.withColumn("uid_A", F.monotonically_increasing_id())  # add a unique row id so duplicates can be dropped after the join
df_B = df_B.withColumn("uid_B", F.monotonically_increasing_id())  # added for symmetry; not used below
df_A = df_A.alias('df_A')
(
df_A
.join(df_B.withColumn('flag',F.lit(True)),'col1','left')
.fillna(False)
.dropDuplicates(['uid_A'])
.select('df_A.col1','flag')
).show()
'''
+----+-----+
|col1| flag|
+----+-----+
| A|false|
| B|false|
| C| true|
| B|false|
| C| true|
| D| true|
+----+-----+
'''
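As an alternative sketch with the same dataframes (not from the original answer): deduplicating df_B's join keys before the join guarantees at most one match per df_A row, so the uid_A/dropDuplicates step isn't needed at all:
flags = df_B.select('col1').dropDuplicates().withColumn('flag', F.lit(True))  # one row per distinct key
(
    df_A
    .join(flags, 'col1', 'left')     # at most one match per df_A row now
    .fillna(False, subset=['flag'])  # unmatched rows get flag = False
    .select('col1', 'flag')
).show()  # same six rows as the output above; row order may vary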
Extra nuggets: To take only the column values for which the .isin result is True, it may be more straightforward to use PySpark's leftsemi join, which keeps only the left table's columns for the rows whose specified columns match the right table, as shown also in this stackoverflow post.
# Pandas (isin)
df_A["col1"].isin(df_B["col1"])
# PySpark (isin equivalent, applied to column values)
df_A.join(df_B, ["col1"], "leftsemi").show()
'''
+----+----+
|col1|col2|
+----+----+
| C| 3|
| C| 5|
| D| 6|
+----+----+
'''
# Pandas (~isin)
~df_A["col1"].isin(df_B["col1"])
# PySpark (~isin equivalent, applied to column values)
df_A.join(df_B, ["col1"], "leftanti").show()
Further join examples/info - PySpark Join Types | Join Two DataFrames