0

I have two data frames like below.

  df = spark.createDataFrame(sc.parallelize([[1,1,2],[1,2,9], [2,1,2],[2,2,1],
                [4,1,5],[4,2,6]]), ["sid","cid","Cr"])
  df.show() 
+---+---+---+
|sid|cid| Cr|
+---+---+---+
|  1|  1|  2|
|  1|  2|  9|
|  2|  1|  2|
|  2|  2|  1|
|  4|  1|  5|
|  4|  2|  6|
|  5|  1|  3|
|  5|  2|  8|
+---+---+---+

next I have created df1 like below.

df1 = spark.createDataFrame(sc.parallelize([[1,1],[1,2],[1,3], [2,1],[2,2],[2,3],[4,1],[4,2],[4,3],[5,1],[5,2],[5,3]]), ["sid","cid"])

df1.show()
+---+---+
|sid|cid|
+---+---+
|  1|  1|
|  1|  2|
|  1|  3|
|  2|  1|
|  2|  2|
|  2|  3|
|  4|  1|
|  4|  2|
|  4|  3|
|  5|  1|
|  5|  2|
|  5|  3|
+---+---+

now I want my final output should be like below i.e . if any of the data presented i.e. if (df1.sid==df.sid)&(df1.cid==df.cid) then flag value 1 else 0. and missing Cr values will be '0'

+---+---+---+----+
|sid|cid| Cr|flag|
+---+---+---+----+
|  1|  1|  2|  1 |
|  1|  2|  9|  1 |
|  1|  3|  0|  0 |
|  2|  1|  2|  1 |
|  2|  2|  1|  1 |
|  2|  3|  0|  0 |
|  4|  1|  5|  1 |
|  4|  2|  6|  1 |
|  4|  3|  0|  0 |
|  5|  1|  3|  1 |
|  5|  2|  8|  1 |
|  5|  3|  0|  0 |
+---+---+---+----+

please help me on this.

Sai
  • 1,075
  • 5
  • 31
  • 58

1 Answers1

1

With data:

from pyspark.sql.functions import col, when, lit, coalesce

df = spark.createDataFrame(
    [(1, 1, 2), (1, 2, 9), (2, 1, 2), (2, 2, 1), (4, 1, 5), (4, 2, 6), (5, 1, 3), (5, 2, 8)], 
   ("sid", "cid", "Cr"))

df1 = spark.createDataFrame(
    [[1,1],[1,2],[1,3], [2,1],[2,2],[2,3],[4,1],[4,2],[4,3],[5,1],[5,2],[5,3]],
    ["sid","cid"])

outer join:

joined = (df.alias("df")
    .join(
        df1.alias("df1"),
        (col("df.sid") == col("df1.sid")) & (col("df.cid") == col("df1.cid")),
        "rightouter"))

and select

joined.select(
    col("df1.*"),
    coalesce(col("Cr"), lit(0)).alias("Cr"), 
    col("df.sid").isNotNull().cast("integer").alias("flag")
).orderBy("sid", "cid").show()

# +---+---+---+----+                 
# |sid|cid| Cr|flag|
# +---+---+---+----+
# |  1|  1|  2|   1|
# |  1|  2|  9|   1|
# |  1|  3|  0|   0|
# |  2|  1|  2|   1|
# |  2|  2|  1|   1|
# |  2|  3|  0|   0|
# |  4|  1|  5|   1|
# |  4|  2|  6|   1|
# |  4|  3|  0|   0|
# |  5|  1|  3|   1|
# |  5|  2|  8|   1|
# |  5|  3|  0|   0|
# +---+---+---+----+
Alper t. Turker
  • 34,230
  • 9
  • 83
  • 115