
Shortened example:

vals1 = [(1, "a"),
         (2, "b"),
         ]
columns1 = ["id", "name"]
df1 = spark.createDataFrame(data=vals1, schema=columns1)

vals2 = [(1, "k"),
         ]
columns2 = ["id", "name"]
df2 = spark.createDataFrame(data=vals2, schema=columns2)

df1 = df1.alias('df1').join(df2.alias('df2'), 'id', 'full')
df1.show()

The result has one column named id and two columns named name. How do I rename the columns with duplicate names, assuming that the real dataframes have tens of such columns?
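For reference, df1.show() then prints two columns that are both named name (row order may vary):

+---+----+----+
| id|name|name|
+---+----+----+
|  1|   a|   k|
|  2|   b|null|
+---+----+----+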

user626528

4 Answers


You can rename the duplicate columns before the join, except for the columns required for the join:

import pyspark.sql.functions as F

def add_prefix(df, prefix, columns=None):
    # Prepend `prefix` to the given columns (all columns by default)
    if not columns:
        columns = df.columns
    return df.select(*[F.col(c).alias(prefix + c if c in columns else c) for c in df.columns])

def add_suffix(df, suffix, columns=None):
    # Append `suffix` to the given columns (all columns by default)
    if not columns:
        columns = df.columns
    return df.select(*[F.col(c).alias(c + suffix if c in columns else c) for c in df.columns])

join_cols = ['id']
columns_to_rename = [c for c in df1.columns if c in df2.columns and c not in join_cols]
df2 = add_suffix(df2, '_y', columns_to_rename)
df3 = df1.join(df2, join_cols, 'full')
df3.show()
+---+----+------+
| id|name|name_y|
+---+----+------+
|  1|   a|     k|
|  2|   b|  null|
+---+----+------+
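The add_prefix helper works the same way if you also want to tag the left-hand side; a small sketch reusing columns_to_rename from above (the x_ prefix is arbitrary):

# df2 already carries the _y suffix at this point
df1_x = add_prefix(df1, 'x_', columns_to_rename)
df1_x.join(df2, join_cols, 'full').show()
# columns: id, x_name, name_y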
qaziqarta

Another method is to rename only the intersecting columns:

from typing import List

from pyspark.sql import DataFrame


def join_intersect(df_left: DataFrame, df_right: DataFrame, join_cols: List[str], how: str = 'inner'):
    # Rename every column present in both frames (except the join keys)
    # so that the joined result has no duplicate names
    intersected_cols = set(df_left.columns).intersection(set(df_right.columns))
    cols_to_rename = [c for c in intersected_cols if c not in join_cols]

    for c in cols_to_rename:
        df_left = df_left.withColumnRenamed(c, f"{c}__1")
        df_right = df_right.withColumnRenamed(c, f"{c}__2")

    return df_left.join(df_right, on=join_cols, how=how)


vals1 = [(1, "a"), (2, "b")]
columns1 = ["id", "name"]
df1 = spark.createDataFrame(data=vals1, schema=columns1)
vals2 = [(1, "k")]
columns2 = ["id", "name"]
df2 = spark.createDataFrame(data=vals2, schema=columns2)

df_joined = join_intersect(df1, df2, ['id'], how='full')
df_joined.show()
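With the sample frames and a full join on id, the output should look roughly like:

+---+-------+-------+
| id|name__1|name__2|
+---+-------+-------+
|  1|      a|      k|
|  2|      b|   null|
+---+-------+-------+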
Guy Melul

@qaziqarta proposed a method to rename the columns before the join; note that you can also rename them after the join:

import pyspark.sql.functions as F

join_column = 'id'
df1.alias('df1').join(df2.alias('df2'), join_column, 'full') \
   .select(
       [join_column] +
       [F.col('df1.' + c).alias(c + "_1") for c in df1.columns if c != join_column] +
       [F.col('df2.' + c).alias(c + "_2") for c in df2.columns if c != join_column]
   ) \
   .show()

+---+------+------+
| id|name_1|name_2|
+---+------+------+
|  1|     a|     k|
|  2|     b|  null|
+---+------+------+

You only need to alias the dataframes (as you did in your example) in order to be able to specify which column you are referring to when you ask Spark for the column "name".
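A minimal sketch of that disambiguation, using the aliased join from the question:

import pyspark.sql.functions as F

# The join key collapses into a single 'id' column; the two 'name' columns
# can be told apart through the 'df1'/'df2' aliases.
joined = df1.alias('df1').join(df2.alias('df2'), 'id', 'full')
joined.select('id', F.col('df1.name')).show()   # only df1's name
joined.select('id', F.col('df2.name')).show()   # only df2's name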

leleogere

You can just use a for loop to rename every column in the second dataframe except the join column:

vals1 = [(1, "a"),
         (2, "b"),
         ]
columns1 = ["id", "name"]
df1 = spark.createDataFrame(data=vals1, schema=columns1)

vals2 = [(1, "k"),
         ]
columns2 = ["id", "name"]
df2 = spark.createDataFrame(data=vals2, schema=columns2)

# Rename every column except the join key in the second dataframe
for i in df2.columns:
    if i != 'id':
        df2 = df2.withColumnRenamed(i, i + '_1')
df1 = df1.alias('df1').join(df2.alias('df2'), 'id', 'full')
df1.show()
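With the sample data, the final show() should print something like:

+---+----+------+
| id|name|name_1|
+---+----+------+
|  1|   a|     k|
|  2|   b|  null|
+---+----+------+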
rektifyer