0

I would like to replace with NaNs values that are more than 0.99 quantile and less than 0.01 quantile in the whole dataframe.
For now I found a way of doing so with one column, so I can do it one-at-a-time, but maybe there is possibility to apply the function to all the columns without ugly for-loops?

I also tried numpy implementation with masking, but since the length of the result is not constant, this does not seem like a proper solution to me.

Quantile replacer for one column that works:

train_pl.select(
    pl.when(pl.col('B_14') > pl.col('B_14').quantile(0.99))
    .then(float("nan"))
    .otherwise(pl.col('B_14'))
)

And here are my numpy functions in case you need them:

def replace_high_quantile(arr, q = 0.99):
    mask = arr <= np.quantile(arr, q)
    return ma.masked_array(arr, ~mask).filled(np.nan)

def replace_low_quantile(arr, q = 0.01):
    mask = arr >= np.quantile(arr, q)
    return ma.masked_array(arr, ~mask).filled(np.nan)

def replace_both_quantiles(arr, low = 0.01, high = 0.99):
    mask = (arr >= np.quantile(arr, low)) & (arr <= np.quantile(arr, high)
    return ma.masked_array(arr, ~mask).filled(np.nan)
yk4r2
  • 1
  • 2

1 Answers1

3

Let's start with this data:

import polars as pl

train_pl = pl.DataFrame(
    {
        "B_14": pl.arange(0, 101, eager=True),
        "B_15": pl.arange(0, 101, eager=True).reverse(),
        "B_16": pl.arange(0, 101, eager=True),
        "other": pl.arange(0, 101, eager=True).reverse(),
    }
)
train_pl
>>> train_pl
shape: (101, 4)
┌──────┬──────┬──────┬───────┐
│ B_14 ┆ B_15 ┆ B_16 ┆ other │
│ ---  ┆ ---  ┆ ---  ┆ ---   │
│ i64  ┆ i64  ┆ i64  ┆ i64   │
╞══════╪══════╪══════╪═══════╡
│ 0    ┆ 100  ┆ 0    ┆ 100   │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 1    ┆ 99   ┆ 1    ┆ 99    │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2    ┆ 98   ┆ 2    ┆ 98    │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 3    ┆ 97   ┆ 3    ┆ 97    │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ ...  ┆ ...  ┆ ...  ┆ ...   │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 97   ┆ 3    ┆ 97   ┆ 3     │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 98   ┆ 2    ┆ 98   ┆ 2     │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 99   ┆ 1    ┆ 99   ┆ 1     │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 100  ┆ 0    ┆ 100  ┆ 0     │
└──────┴──────┴──────┴───────┘

All columns

To replace all values above the 0.99 quantile and below the 0.01 quantile, simultaneously, and for all columns in parallel:

(
    train_pl
    .with_column(
        pl.when(
            (pl.all() > pl.all().quantile(0.99)) |
            (pl.all() < pl.all().quantile(0.01))
        )
        .then(float("nan"))
        .otherwise(pl.all())
        .keep_name()
    )
)
shape: (101, 4)
┌──────┬──────┬──────┬───────┐
│ B_14 ┆ B_15 ┆ B_16 ┆ other │
│ ---  ┆ ---  ┆ ---  ┆ ---   │
│ f64  ┆ f64  ┆ f64  ┆ f64   │
╞══════╪══════╪══════╪═══════╡
│ NaN  ┆ NaN  ┆ NaN  ┆ NaN   │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 1.0  ┆ 99.0 ┆ 1.0  ┆ 99.0  │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2.0  ┆ 98.0 ┆ 2.0  ┆ 98.0  │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 3.0  ┆ 97.0 ┆ 3.0  ┆ 97.0  │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ ...  ┆ ...  ┆ ...  ┆ ...   │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 97.0 ┆ 3.0  ┆ 97.0 ┆ 3.0   │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 98.0 ┆ 2.0  ┆ 98.0 ┆ 2.0   │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 99.0 ┆ 1.0  ┆ 99.0 ┆ 1.0   │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ NaN  ┆ NaN  ┆ NaN  ┆ NaN   │
└──────┴──────┴──────┴───────┘

Named columns

To restrict the operation to only some columns:

only_these = ['B_14', 'B_15', 'other']
(
    train_pl
    .with_column(
        pl.when(
            (pl.col(only_these) > pl.col(only_these).quantile(0.99)) |
            (pl.col(only_these) < pl.col(only_these).quantile(0.01))
        )
        .then(float("nan"))
        .otherwise(pl.col(only_these))
        .keep_name()
    )
)
shape: (101, 4)
┌──────┬──────┬──────┬───────┐
│ B_14 ┆ B_15 ┆ B_16 ┆ other │
│ ---  ┆ ---  ┆ ---  ┆ ---   │
│ f64  ┆ f64  ┆ i64  ┆ f64   │
╞══════╪══════╪══════╪═══════╡
│ NaN  ┆ NaN  ┆ 0    ┆ NaN   │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 1.0  ┆ 99.0 ┆ 1    ┆ 99.0  │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2.0  ┆ 98.0 ┆ 2    ┆ 98.0  │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 3.0  ┆ 97.0 ┆ 3    ┆ 97.0  │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ ...  ┆ ...  ┆ ...  ┆ ...   │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 97.0 ┆ 3.0  ┆ 97   ┆ 3.0   │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 98.0 ┆ 2.0  ┆ 98   ┆ 2.0   │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 99.0 ┆ 1.0  ┆ 99   ┆ 1.0   │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ NaN  ┆ NaN  ┆ 100  ┆ NaN   │
└──────┴──────┴──────┴───────┘

Selecting by regex expression

I should also mention that you can use regex expressions in polars.col. Just remember that the regex expression must start with ^ and end with $. (These cannot be omitted.)

regex = r"^B_.*$"
(
    train_pl
    .with_column(
        pl.when(
            (pl.col(regex) > pl.col(regex).quantile(0.99)) |
            (pl.col(regex) < pl.col(regex).quantile(0.01))
        )
        .then(float("nan"))
        .otherwise(pl.col(regex))
        .keep_name()
    )
)
shape: (101, 4)
┌──────┬──────┬──────┬───────┐
│ B_14 ┆ B_15 ┆ B_16 ┆ other │
│ ---  ┆ ---  ┆ ---  ┆ ---   │
│ f64  ┆ f64  ┆ f64  ┆ i64   │
╞══════╪══════╪══════╪═══════╡
│ NaN  ┆ NaN  ┆ NaN  ┆ 100   │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 1.0  ┆ 99.0 ┆ 1.0  ┆ 99    │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2.0  ┆ 98.0 ┆ 2.0  ┆ 98    │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 3.0  ┆ 97.0 ┆ 3.0  ┆ 97    │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ ...  ┆ ...  ┆ ...  ┆ ...   │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 97.0 ┆ 3.0  ┆ 97.0 ┆ 3     │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 98.0 ┆ 2.0  ┆ 98.0 ┆ 2     │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 99.0 ┆ 1.0  ┆ 99.0 ┆ 1     │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ NaN  ┆ NaN  ┆ NaN  ┆ 0     │
└──────┴──────┴──────┴───────┘

Selecting by DataType

You can also select which columns by datatype.

these_types = [pl.Int64, pl.Float64]
(
    train_pl
    .with_column(
        pl.when(
            (pl.col(these_types) > pl.col(these_types).quantile(0.99)) |
            (pl.col(these_types) < pl.col(these_types).quantile(0.01))
        )
        .then(float("nan"))
        .otherwise(pl.col(these_types))
        .keep_name()
    )
)
shape: (101, 4)
┌──────┬──────┬──────┬───────┐
│ B_14 ┆ B_15 ┆ B_16 ┆ other │
│ ---  ┆ ---  ┆ ---  ┆ ---   │
│ f64  ┆ f64  ┆ f64  ┆ f64   │
╞══════╪══════╪══════╪═══════╡
│ NaN  ┆ NaN  ┆ NaN  ┆ NaN   │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 1.0  ┆ 99.0 ┆ 1.0  ┆ 99.0  │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2.0  ┆ 98.0 ┆ 2.0  ┆ 98.0  │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 3.0  ┆ 97.0 ┆ 3.0  ┆ 97.0  │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ ...  ┆ ...  ┆ ...  ┆ ...   │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 97.0 ┆ 3.0  ┆ 97.0 ┆ 3.0   │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 98.0 ┆ 2.0  ┆ 98.0 ┆ 2.0   │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 99.0 ┆ 1.0  ┆ 99.0 ┆ 1.0   │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ NaN  ┆ NaN  ┆ NaN  ┆ NaN   │
└──────┴──────┴──────┴───────┘

By Exclusion

Sometimes it's easier to specify the columns that you do not want. The above methods will work with polars.exclude, so that you can exclude columns that fit some criteria. For example, to select all columns except columns of strings (polars.Utf8):

these_types = [pl.Utf8]
(
    train_pl
    .with_columns(
        pl.when(
            (pl.exclude(these_types) > pl.exclude(these_types).quantile(0.99)) |
            (pl.exclude(these_types) < pl.exclude(these_types).quantile(0.01))
        )
        .then(float("nan"))
        .otherwise(pl.exclude(these_types))
        .keep_name()
    )
)
shape: (101, 4)
┌──────┬──────┬──────┬───────┐
│ B_14 ┆ B_15 ┆ B_16 ┆ other │
│ ---  ┆ ---  ┆ ---  ┆ ---   │
│ f64  ┆ f64  ┆ f64  ┆ f64   │
╞══════╪══════╪══════╪═══════╡
│ NaN  ┆ NaN  ┆ NaN  ┆ NaN   │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 1.0  ┆ 99.0 ┆ 1.0  ┆ 99.0  │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2.0  ┆ 98.0 ┆ 2.0  ┆ 98.0  │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 3.0  ┆ 97.0 ┆ 3.0  ┆ 97.0  │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ ...  ┆ ...  ┆ ...  ┆ ...   │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 97.0 ┆ 3.0  ┆ 97.0 ┆ 3.0   │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 98.0 ┆ 2.0  ┆ 98.0 ┆ 2.0   │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 99.0 ┆ 1.0  ┆ 99.0 ┆ 1.0   │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ NaN  ┆ NaN  ┆ NaN  ┆ NaN   │
└──────┴──────┴──────┴───────┘

Or for example, to exclude any column that beings with B_:

regex = r"^B_.*$"
(
    train_pl
    .with_columns(
        pl.when(
            (pl.exclude(regex) > pl.exclude(regex).quantile(0.99)) |
            (pl.exclude(regex) < pl.exclude(regex).quantile(0.01))
        )
        .then(float("nan"))
        .otherwise(pl.exclude(regex))
        .keep_name()
    )
)
shape: (101, 4)
┌──────┬──────┬──────┬───────┐
│ B_14 ┆ B_15 ┆ B_16 ┆ other │
│ ---  ┆ ---  ┆ ---  ┆ ---   │
│ i64  ┆ i64  ┆ i64  ┆ f64   │
╞══════╪══════╪══════╪═══════╡
│ 0    ┆ 100  ┆ 0    ┆ NaN   │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 1    ┆ 99   ┆ 1    ┆ 99.0  │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2    ┆ 98   ┆ 2    ┆ 98.0  │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 3    ┆ 97   ┆ 3    ┆ 97.0  │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ ...  ┆ ...  ┆ ...  ┆ ...   │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 97   ┆ 3    ┆ 97   ┆ 3.0   │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 98   ┆ 2    ┆ 98   ┆ 2.0   │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 99   ┆ 1    ┆ 99   ┆ 1.0   │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 100  ┆ 0    ┆ 100  ┆ NaN   │
└──────┴──────┴──────┴───────┘
  • 1
    Great answer. Maybe also add numeric selection via `pl.col([pl.Float32, pl.Float64])`? I can imagine wanting to exclude other types. – ritchie46 Aug 05 '22 at 04:39
  • I've edited the answer to include selections by datatype, as well as using `polars.exclude` (which can be very handy). –  Aug 05 '22 at 11:03