In polars, nested when-then-otherwise gives unexpected behavior in groupby / window context

Question

I have a custom expression to wrap around the "rank" expression to ignore nulls.

def rank(_exp,method='average',reverse=False):
    #Fill nans so as not to affect ranking
    fill = -np.Inf if reverse else np.Inf
    tmp = pl.when(_exp.is_not_null()).then(_exp).otherwise(fill).rank(reverse=reverse,method=method)
    #Plug nans back in
    exp = pl.when(_exp.is_not_null()).then(tmp).otherwise(_exp)
    return exp

For simple dataframe with nulls, this gives expected results:


df_with_nan = pl.DataFrame({'X': np.where(np.isin(np.arange(10),[1,3]),np.NaN,np.arange(10)), 'G1':np.array(['A']*5 + ['B']*5), 'G2':np.array(['C','D']*5)})
df_with_null = df_with_nan.fill_nan(None)

print(df_with_null.with_columns(rank(pl.col(['X'])).suffix('_rnk')))

shape: (10, 4)
┌──────┬─────┬─────┬───────┐
│ X    ┆ G1  ┆ G2  ┆ X_rnk │
│ ---  ┆ --- ┆ --- ┆ ---   │
│ f64  ┆ str ┆ str ┆ f64   │
╞══════╪═════╪═════╪═══════╡
│ 0.0  ┆ A   ┆ C   ┆ 1.0   │
├╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ null ┆ A   ┆ D   ┆ null  │
├╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2.0  ┆ A   ┆ C   ┆ 2.0   │
├╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ null ┆ A   ┆ D   ┆ null  │
├╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ ...  ┆ ... ┆ ... ┆ ...   │
├╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 6.0  ┆ B   ┆ C   ┆ 5.0   │
├╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 7.0  ┆ B   ┆ D   ┆ 6.0   │
├╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 8.0  ┆ B   ┆ C   ┆ 7.0   │
├╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 9.0  ┆ B   ┆ D   ┆ 8.0   │
└──────┴─────┴─────┴───────┘

The window context seems to jumble order:

print(df_with_null.with_columns(rank(pl.col(['X'])).over(['G1','G2']).suffix('_rnk')))
shape: (10, 4)
┌──────┬─────┬─────┬───────┐
│ X    ┆ G1  ┆ G2  ┆ X_rnk │
│ ---  ┆ --- ┆ --- ┆ ---   │
│ f64  ┆ str ┆ str ┆ f64   │
╞══════╪═════╪═════╪═══════╡
│ 0.0  ┆ A   ┆ C   ┆ 1.0   │
├╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ null ┆ A   ┆ D   ┆ 1.5   │
├╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2.0  ┆ A   ┆ C   ┆ 2.0   │
├╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ null ┆ A   ┆ D   ┆ 1.5   │
├╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ ...  ┆ ... ┆ ... ┆ ...   │
├╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 6.0  ┆ B   ┆ C   ┆ null  │
├╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 7.0  ┆ B   ┆ D   ┆ null  │
├╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 8.0  ┆ B   ┆ C   ┆ 2.0   │
├╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 9.0  ┆ B   ┆ D   ┆ 3.0   │
└──────┴─────┴─────┴───────┘

Similarly, groupby context seems to jumble the order

print(df_with_null.groupby(['G1','G2']).agg([pl.col(['X']),rank(pl.col(['X'])).suffix('_rnk')]).explode(['X','X_rnk']))

shape: (10, 4)
┌─────┬─────┬──────┬───────┐
│ G1  ┆ G2  ┆ X    ┆ X_rnk │
│ --- ┆ --- ┆ ---  ┆ ---   │
│ str ┆ str ┆ f64  ┆ f64   │
╞═════╪═════╪══════╪═══════╡
│ A   ┆ C   ┆ 0.0  ┆ 1.0   │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ A   ┆ C   ┆ 2.0  ┆ null  │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ A   ┆ C   ┆ 4.0  ┆ 3.0   │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ A   ┆ D   ┆ null ┆ null  │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ ... ┆ ... ┆ ...  ┆ ...   │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ B   ┆ D   ┆ 7.0  ┆ 2.0   │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ B   ┆ D   ┆ 9.0  ┆ 3.0   │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ B   ┆ C   ┆ 6.0  ┆ 1.0   │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ B   ┆ C   ┆ 8.0  ┆ 2.0   │
└─────┴─────┴──────┴───────┘

Through process of elimination, it seems to be the line in the my rank function that "plugs nans back in", but unsure why, and also unsure of alternative ways to implement this functionality.

This solution may be of interest: https://stackoverflow.com/questions/73252912/how-to-set-masked-values-within-each-group-in-groupby-context-using-py-polars — , Aug 22 '22 at 02:38

score 1 · Answer 1 · answered Jun 23 '22 at 06:23

Since NaN and null values can create headaches with calculations, I recommend the following:

Partition your dataset into two: one dataset with NaN/null values, the other without.
Run your calculations on the dataset without the NaN/null values.
Concatenate the two datasets.

For example, using your df_with_nan dataset, we can use the partition_by function to split your dataset into two.

part_df_dict = (
    df_with_nan
    .with_column(pl.col('X').is_nan().alias('X_nan'))
    .partition_by('X_nan', as_dict=True)
)
part_df_dict

{False: shape: (8, 4)
┌─────┬─────┬─────┬───────┐
│ X   ┆ G1  ┆ G2  ┆ X_nan │
│ --- ┆ --- ┆ --- ┆ ---   │
│ f64 ┆ str ┆ str ┆ bool  │
╞═════╪═════╪═════╪═══════╡
│ 0.0 ┆ A   ┆ C   ┆ false │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2.0 ┆ A   ┆ C   ┆ false │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 4.0 ┆ A   ┆ C   ┆ false │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 5.0 ┆ B   ┆ D   ┆ false │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 6.0 ┆ B   ┆ C   ┆ false │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 7.0 ┆ B   ┆ D   ┆ false │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 8.0 ┆ B   ┆ C   ┆ false │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 9.0 ┆ B   ┆ D   ┆ false │
└─────┴─────┴─────┴───────┘,
True: shape: (2, 4)
┌─────┬─────┬─────┬───────┐
│ X   ┆ G1  ┆ G2  ┆ X_nan │
│ --- ┆ --- ┆ --- ┆ ---   │
│ f64 ┆ str ┆ str ┆ bool  │
╞═════╪═════╪═════╪═══════╡
│ NaN ┆ A   ┆ D   ┆ true  │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ NaN ┆ A   ┆ D   ┆ true  │
└─────┴─────┴─────┴───────┘}

Next, run your calculations on the dataset without the NaN values:

part_df_dict[False] = (
    part_df_dict[False]
    .with_column(pl.col('X').rank().over(['G1', 'G2']).alias('X_rank'))
)
part_df_dict[False]

shape: (8, 5)
┌─────┬─────┬─────┬───────┬────────┐
│ X   ┆ G1  ┆ G2  ┆ X_nan ┆ X_rank │
│ --- ┆ --- ┆ --- ┆ ---   ┆ ---    │
│ f64 ┆ str ┆ str ┆ bool  ┆ f32    │
╞═════╪═════╪═════╪═══════╪════════╡
│ 0.0 ┆ A   ┆ C   ┆ false ┆ 1.0    │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 2.0 ┆ A   ┆ C   ┆ false ┆ 2.0    │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 4.0 ┆ A   ┆ C   ┆ false ┆ 3.0    │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 5.0 ┆ B   ┆ D   ┆ false ┆ 1.0    │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 6.0 ┆ B   ┆ C   ┆ false ┆ 1.0    │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 7.0 ┆ B   ┆ D   ┆ false ┆ 2.0    │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 8.0 ┆ B   ┆ C   ┆ false ┆ 2.0    │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 9.0 ┆ B   ┆ D   ┆ false ┆ 3.0    │
└─────┴─────┴─────┴───────┴────────┘

Then use the concat function along with how="diagonal" to concatenate the results together. You can also sort if you'd like.

result = (
    pl.concat(list(part_df_dict.values()), how='diagonal')
    .sort(['G1', 'G2'])
)
result

shape: (10, 5)
┌─────┬─────┬─────┬───────┬────────┐
│ X   ┆ G1  ┆ G2  ┆ X_nan ┆ X_rank │
│ --- ┆ --- ┆ --- ┆ ---   ┆ ---    │
│ f64 ┆ str ┆ str ┆ bool  ┆ f32    │
╞═════╪═════╪═════╪═══════╪════════╡
│ 0.0 ┆ A   ┆ C   ┆ false ┆ 1.0    │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 2.0 ┆ A   ┆ C   ┆ false ┆ 2.0    │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 4.0 ┆ A   ┆ C   ┆ false ┆ 3.0    │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ NaN ┆ A   ┆ D   ┆ true  ┆ null   │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ NaN ┆ A   ┆ D   ┆ true  ┆ null   │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 6.0 ┆ B   ┆ C   ┆ false ┆ 1.0    │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 8.0 ┆ B   ┆ C   ┆ false ┆ 2.0    │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 5.0 ┆ B   ┆ D   ┆ false ┆ 1.0    │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 7.0 ┆ B   ┆ D   ┆ false ┆ 2.0    │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 9.0 ┆ B   ┆ D   ┆ false ┆ 3.0    │
└─────┴─────┴─────┴───────┴────────┘

The result is much cleaner. And you're assured that the NaN/null values will not affect your results. An added bonus is that your code is easier to follow.

In polars, nested when-then-otherwise gives unexpected behavior in groupby / window context

1 Answers1