First, let's expand the example to show a row with all null/empty fields (to show how the algorithm handles this case).
import polars as pl
import numpy as np
df = pl.DataFrame(
{
"last_name": ["Unknown", "Mallesham", np.nan, "Bhavik", "Unknown", None],
"first_name_or_initial": ["U", np.nan, "TRUE", "yamulla", np.nan, None],
"number": ["003123490", "012457847", "100030303", "", "0023004648", None],
"date_of_birth": [np.nan, "12/09/1900", "12/09/1900", "12/09/1900", np.nan, None],
}
)
df
shape: (6, 4)
┌───────────┬───────────────────────┬────────────┬───────────────┐
│ last_name ┆ first_name_or_initial ┆ number ┆ date_of_birth │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ str ┆ str ┆ str │
╞═══════════╪═══════════════════════╪════════════╪═══════════════╡
│ Unknown ┆ U ┆ 003123490 ┆ null │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ Mallesham ┆ null ┆ 012457847 ┆ 12/09/1900 │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ null ┆ TRUE ┆ 100030303 ┆ 12/09/1900 │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ Bhavik ┆ yamulla ┆ ┆ 12/09/1900 │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ Unknown ┆ null ┆ 0023004648 ┆ null │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ null ┆ null ┆ null ┆ null │
└───────────┴───────────────────────┴────────────┴───────────────┘
The Algorithm
df = df.with_row_count()
(
df
.join(
df
.melt(id_vars="row_nr")
.filter(pl.col('value').is_not_null() & (pl.col('value') != ""))
.groupby('row_nr')
.agg(pl.col('variable').alias('conso_field'))
.with_column(pl.col('conso_field').arr.join(','))
,
on='row_nr',
how='left'
)
)
shape: (6, 6)
┌────────┬───────────┬───────────────────────┬────────────┬───────────────┬─────────────────────────────────────┐
│ row_nr ┆ last_name ┆ first_name_or_initial ┆ number ┆ date_of_birth ┆ conso_field │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ u32 ┆ str ┆ str ┆ str ┆ str ┆ str │
╞════════╪═══════════╪═══════════════════════╪════════════╪═══════════════╪═════════════════════════════════════╡
│ 0 ┆ Unknown ┆ U ┆ 003123490 ┆ null ┆ last_name,first_name_or_initial,... │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ Mallesham ┆ null ┆ 012457847 ┆ 12/09/1900 ┆ last_name,number,date_of_birth │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ null ┆ TRUE ┆ 100030303 ┆ 12/09/1900 ┆ first_name_or_initial,number,dat... │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ Bhavik ┆ yamulla ┆ ┆ 12/09/1900 ┆ last_name,first_name_or_initial,... │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 4 ┆ Unknown ┆ null ┆ 0023004648 ┆ null ┆ last_name,number │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 5 ┆ null ┆ null ┆ null ┆ null ┆ null │
└────────┴───────────┴───────────────────────┴────────────┴───────────────┴─────────────────────────────────────┘
Note that the algorithm keeps the last row with all null/empty values.
How it works
To see how it works, let's take it in steps.
First, we'll need to attach a row number to each row. (This is needed in case any row has all null/empty values.)
Then we'll use melt
to place each value in each column on a separate row, next to it's column name.
df = df.with_row_count()
(
df
.melt(id_vars="row_nr")
)
shape: (24, 3)
┌────────┬───────────────┬────────────┐
│ row_nr ┆ variable ┆ value │
│ --- ┆ --- ┆ --- │
│ u32 ┆ str ┆ str │
╞════════╪═══════════════╪════════════╡
│ 0 ┆ last_name ┆ Unknown │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ last_name ┆ Mallesham │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ last_name ┆ null │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ last_name ┆ Bhavik │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ ... ┆ ... ┆ ... │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ date_of_birth ┆ 12/09/1900 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ date_of_birth ┆ 12/09/1900 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 4 ┆ date_of_birth ┆ null │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 5 ┆ date_of_birth ┆ null │
└────────┴───────────────┴────────────┘
Note that columns values will be converted to string values in this step.
Next, we'll filter out any rows with null
or "" values.
df = df.with_row_count()
(
df
.melt(id_vars="row_nr")
.filter(pl.col('value').is_not_null() & (pl.col('value') != ""))
)
shape: (14, 3)
┌────────┬───────────────┬────────────┐
│ row_nr ┆ variable ┆ value │
│ --- ┆ --- ┆ --- │
│ u32 ┆ str ┆ str │
╞════════╪═══════════════╪════════════╡
│ 0 ┆ last_name ┆ Unknown │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ last_name ┆ Mallesham │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ last_name ┆ Bhavik │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 4 ┆ last_name ┆ Unknown │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ ... ┆ ... ┆ ... │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 4 ┆ number ┆ 0023004648 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ date_of_birth ┆ 12/09/1900 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ date_of_birth ┆ 12/09/1900 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ date_of_birth ┆ 12/09/1900 │
└────────┴───────────────┴────────────┘
In the next step, we'll aggregate up all the remaining rows by row number, keeping only the column names. These represent columns with non-null, non-empty values.
df = df.with_row_count()
(
df
.melt(id_vars="row_nr")
.filter(pl.col('value').is_not_null() & (pl.col('value') != ""))
.groupby('row_nr')
.agg(pl.col('variable').alias('conso_field'))
)
shape: (5, 2)
┌────────┬─────────────────────────────────────┐
│ row_nr ┆ conso_field │
│ --- ┆ --- │
│ u32 ┆ list[str] │
╞════════╪═════════════════════════════════════╡
│ 2 ┆ ["first_name_or_initial", "numbe... │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ ["last_name", "first_name_or_ini... │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 4 ┆ ["last_name", "number"] │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 0 ┆ ["last_name", "first_name_or_ini... │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ ["last_name", "number", "date_of... │
└────────┴─────────────────────────────────────┘
Note that we get a list of column names for each row. (Note: we don't need to worry about the order of the rows at this point. We'll use the row number and a left-join in the last step recombine the values to the original DataFrame.)
Then, it's simply a matter of joining the columns names into one string:
df = df.with_row_count()
(
df
.melt(id_vars="row_nr")
.filter(pl.col('value').is_not_null() & (pl.col('value') != ""))
.groupby('row_nr')
.agg(pl.col('variable').alias('conso_field'))
.with_column(pl.col('conso_field').arr.join(','))
)
shape: (5, 2)
┌────────┬─────────────────────────────────────┐
│ row_nr ┆ conso_field │
│ --- ┆ --- │
│ u32 ┆ str │
╞════════╪═════════════════════════════════════╡
│ 3 ┆ last_name,first_name_or_initial,... │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ first_name_or_initial,number,dat... │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 4 ┆ last_name,number │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 0 ┆ last_name,first_name_or_initial,... │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ last_name,number,date_of_birth │
└────────┴─────────────────────────────────────┘
From here, we simply use a "left join" to merge the data back to the original dataset (as shown at the beginning.)