I have a custom function that wraps the "rank" expression so that nulls are ignored.
def rank(_exp, method='average', reverse=False):
    """Rank the values of a polars expression while keeping nulls as nulls.

    Nulls are temporarily replaced by an infinite sentinel (``+inf``, or
    ``-inf`` when ``reverse`` is true) before ranking, so that they do not
    displace the ranks of the non-null entries. Afterwards the positions
    that were null in ``_exp`` are restored from ``_exp`` itself.

    Args:
        _exp: Expression whose values are ranked.
        method: Tie-handling method, forwarded unchanged to ``Expr.rank``.
        reverse: Forwarded unchanged to ``Expr.rank``; also selects the
            sign of the sentinel used to fill nulls.

    Returns:
        An expression holding the rank wherever ``_exp`` is non-null and
        the original (null) entry of ``_exp`` everywhere else.
    """
    # Fill nulls so they do not affect the ranking of real values.
    # float("inf") is used instead of ``np.Inf``: that alias was removed in
    # NumPy 2.0 and raises AttributeError there. The value is identical.
    fill = float("-inf") if reverse else float("inf")
    not_null = _exp.is_not_null()
    ranked = (
        pl.when(not_null)
        .then(_exp)
        .otherwise(fill)
        .rank(reverse=reverse, method=method)
    )
    # Plug the nulls back in.
    return pl.when(not_null).then(ranked).otherwise(_exp)
For a simple DataFrame with nulls, this gives the expected results:
# Build ten float values 0..9 with positions 1 and 3 replaced by NaN, plus
# two string grouping columns.
# ``np.nan`` is used instead of ``np.NaN``: that alias was removed in NumPy 2.0.
df_with_nan = pl.DataFrame({
    'X': np.where(np.isin(np.arange(10), [1, 3]), np.nan, np.arange(10)),
    'G1': np.array(['A'] * 5 + ['B'] * 5),
    'G2': np.array(['C', 'D'] * 5),
})
# Turn the NaNs into nulls, which is what rank() is meant to skip.
df_with_null = df_with_nan.fill_nan(None)
print(df_with_null.with_columns(rank(pl.col(['X'])).suffix('_rnk')))
shape: (10, 4)
┌──────┬─────┬─────┬───────┐
│ X ┆ G1 ┆ G2 ┆ X_rnk │
│ --- ┆ --- ┆ --- ┆ --- │
│ f64 ┆ str ┆ str ┆ f64 │
╞══════╪═════╪═════╪═══════╡
│ 0.0 ┆ A ┆ C ┆ 1.0 │
├╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ null ┆ A ┆ D ┆ null │
├╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2.0 ┆ A ┆ C ┆ 2.0 │
├╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ null ┆ A ┆ D ┆ null │
├╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ ... ┆ ... ┆ ... ┆ ... │
├╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 6.0 ┆ B ┆ C ┆ 5.0 │
├╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 7.0 ┆ B ┆ D ┆ 6.0 │
├╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 8.0 ┆ B ┆ C ┆ 7.0 │
├╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 9.0 ┆ B ┆ D ┆ 8.0 │
└──────┴─────┴─────┴───────┘
However, in a window context the order of the results seems to get jumbled:
# Same ranking expression, evaluated over the (G1, G2) windows.
print(
    df_with_null.with_columns(
        rank(pl.col(['X'])).over(['G1', 'G2']).suffix('_rnk')
    )
)
shape: (10, 4)
┌──────┬─────┬─────┬───────┐
│ X ┆ G1 ┆ G2 ┆ X_rnk │
│ --- ┆ --- ┆ --- ┆ --- │
│ f64 ┆ str ┆ str ┆ f64 │
╞══════╪═════╪═════╪═══════╡
│ 0.0 ┆ A ┆ C ┆ 1.0 │
├╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ null ┆ A ┆ D ┆ 1.5 │
├╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2.0 ┆ A ┆ C ┆ 2.0 │
├╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ null ┆ A ┆ D ┆ 1.5 │
├╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ ... ┆ ... ┆ ... ┆ ... │
├╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 6.0 ┆ B ┆ C ┆ null │
├╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 7.0 ┆ B ┆ D ┆ null │
├╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 8.0 ┆ B ┆ C ┆ 2.0 │
├╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 9.0 ┆ B ┆ D ┆ 3.0 │
└──────┴─────┴─────┴───────┘
Similarly, the groupby context seems to jumble the order:
# Same ranking expression, aggregated per (G1, G2) group and then exploded
# back to one row per value.
print(
    df_with_null
    .groupby(['G1', 'G2'])
    .agg([
        pl.col(['X']),
        rank(pl.col(['X'])).suffix('_rnk'),
    ])
    .explode(['X', 'X_rnk'])
)
shape: (10, 4)
┌─────┬─────┬──────┬───────┐
│ G1 ┆ G2 ┆ X ┆ X_rnk │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ str ┆ f64 ┆ f64 │
╞═════╪═════╪══════╪═══════╡
│ A ┆ C ┆ 0.0 ┆ 1.0 │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ A ┆ C ┆ 2.0 ┆ null │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ A ┆ C ┆ 4.0 ┆ 3.0 │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ A ┆ D ┆ null ┆ null │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ ... ┆ ... ┆ ... ┆ ... │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ B ┆ D ┆ 7.0 ┆ 2.0 │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ B ┆ D ┆ 9.0 ┆ 3.0 │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ B ┆ C ┆ 6.0 ┆ 1.0 │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ B ┆ C ┆ 8.0 ┆ 2.0 │
└─────┴─────┴──────┴───────┘
Through a process of elimination, the culprit seems to be the line in my rank function that "plugs nans back in", but I am unsure why, and I am also unsure of alternative ways to implement this functionality.