Let's start with this data:
import polars as pl
train_pl = pl.DataFrame(
{
"B_14": pl.arange(0, 101, eager=True),
"B_15": pl.arange(0, 101, eager=True).reverse(),
"B_16": pl.arange(0, 101, eager=True),
"other": pl.arange(0, 101, eager=True).reverse(),
}
)
train_pl
>>> train_pl
shape: (101, 4)
┌──────┬──────┬──────┬───────┐
│ B_14 ┆ B_15 ┆ B_16 ┆ other │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 ┆ i64 │
╞══════╪══════╪══════╪═══════╡
│ 0 ┆ 100 ┆ 0 ┆ 100 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 1 ┆ 99 ┆ 1 ┆ 99 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2 ┆ 98 ┆ 2 ┆ 98 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 3 ┆ 97 ┆ 3 ┆ 97 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ ... ┆ ... ┆ ... ┆ ... │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 97 ┆ 3 ┆ 97 ┆ 3 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 98 ┆ 2 ┆ 98 ┆ 2 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 99 ┆ 1 ┆ 99 ┆ 1 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 100 ┆ 0 ┆ 100 ┆ 0 │
└──────┴──────┴──────┴───────┘
All columns
To replace all values above the 0.99 quantile and below the 0.01 quantile, simultaneously, and for all columns in parallel:
(
train_pl
.with_column(
pl.when(
(pl.all() > pl.all().quantile(0.99)) |
(pl.all() < pl.all().quantile(0.01))
)
.then(float("nan"))
.otherwise(pl.all())
.keep_name()
)
)
shape: (101, 4)
┌──────┬──────┬──────┬───────┐
│ B_14 ┆ B_15 ┆ B_16 ┆ other │
│ --- ┆ --- ┆ --- ┆ --- │
│ f64 ┆ f64 ┆ f64 ┆ f64 │
╞══════╪══════╪══════╪═══════╡
│ NaN ┆ NaN ┆ NaN ┆ NaN │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 1.0 ┆ 99.0 ┆ 1.0 ┆ 99.0 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2.0 ┆ 98.0 ┆ 2.0 ┆ 98.0 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 3.0 ┆ 97.0 ┆ 3.0 ┆ 97.0 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ ... ┆ ... ┆ ... ┆ ... │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 97.0 ┆ 3.0 ┆ 97.0 ┆ 3.0 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 98.0 ┆ 2.0 ┆ 98.0 ┆ 2.0 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 99.0 ┆ 1.0 ┆ 99.0 ┆ 1.0 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ NaN ┆ NaN ┆ NaN ┆ NaN │
└──────┴──────┴──────┴───────┘
Named columns
To restrict the operation to only some columns:
only_these = ['B_14', 'B_15', 'other']
(
train_pl
.with_column(
pl.when(
(pl.col(only_these) > pl.col(only_these).quantile(0.99)) |
(pl.col(only_these) < pl.col(only_these).quantile(0.01))
)
.then(float("nan"))
.otherwise(pl.col(only_these))
.keep_name()
)
)
shape: (101, 4)
┌──────┬──────┬──────┬───────┐
│ B_14 ┆ B_15 ┆ B_16 ┆ other │
│ --- ┆ --- ┆ --- ┆ --- │
│ f64 ┆ f64 ┆ i64 ┆ f64 │
╞══════╪══════╪══════╪═══════╡
│ NaN ┆ NaN ┆ 0 ┆ NaN │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 1.0 ┆ 99.0 ┆ 1 ┆ 99.0 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2.0 ┆ 98.0 ┆ 2 ┆ 98.0 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 3.0 ┆ 97.0 ┆ 3 ┆ 97.0 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ ... ┆ ... ┆ ... ┆ ... │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 97.0 ┆ 3.0 ┆ 97 ┆ 3.0 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 98.0 ┆ 2.0 ┆ 98 ┆ 2.0 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 99.0 ┆ 1.0 ┆ 99 ┆ 1.0 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ NaN ┆ NaN ┆ 100 ┆ NaN │
└──────┴──────┴──────┴───────┘
Selecting by regex expression
I should also mention that you can use regex expressions in polars.col
. Just remember that the regex expression must start with ^
and end with $
. (These cannot be omitted.)
regex = r"^B_.*$"
(
train_pl
.with_column(
pl.when(
(pl.col(regex) > pl.col(regex).quantile(0.99)) |
(pl.col(regex) < pl.col(regex).quantile(0.01))
)
.then(float("nan"))
.otherwise(pl.col(regex))
.keep_name()
)
)
shape: (101, 4)
┌──────┬──────┬──────┬───────┐
│ B_14 ┆ B_15 ┆ B_16 ┆ other │
│ --- ┆ --- ┆ --- ┆ --- │
│ f64 ┆ f64 ┆ f64 ┆ i64 │
╞══════╪══════╪══════╪═══════╡
│ NaN ┆ NaN ┆ NaN ┆ 100 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 1.0 ┆ 99.0 ┆ 1.0 ┆ 99 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2.0 ┆ 98.0 ┆ 2.0 ┆ 98 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 3.0 ┆ 97.0 ┆ 3.0 ┆ 97 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ ... ┆ ... ┆ ... ┆ ... │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 97.0 ┆ 3.0 ┆ 97.0 ┆ 3 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 98.0 ┆ 2.0 ┆ 98.0 ┆ 2 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 99.0 ┆ 1.0 ┆ 99.0 ┆ 1 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ NaN ┆ NaN ┆ NaN ┆ 0 │
└──────┴──────┴──────┴───────┘
Selecting by DataType
You can also select which columns by datatype.
these_types = [pl.Int64, pl.Float64]
(
train_pl
.with_column(
pl.when(
(pl.col(these_types) > pl.col(these_types).quantile(0.99)) |
(pl.col(these_types) < pl.col(these_types).quantile(0.01))
)
.then(float("nan"))
.otherwise(pl.col(these_types))
.keep_name()
)
)
shape: (101, 4)
┌──────┬──────┬──────┬───────┐
│ B_14 ┆ B_15 ┆ B_16 ┆ other │
│ --- ┆ --- ┆ --- ┆ --- │
│ f64 ┆ f64 ┆ f64 ┆ f64 │
╞══════╪══════╪══════╪═══════╡
│ NaN ┆ NaN ┆ NaN ┆ NaN │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 1.0 ┆ 99.0 ┆ 1.0 ┆ 99.0 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2.0 ┆ 98.0 ┆ 2.0 ┆ 98.0 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 3.0 ┆ 97.0 ┆ 3.0 ┆ 97.0 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ ... ┆ ... ┆ ... ┆ ... │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 97.0 ┆ 3.0 ┆ 97.0 ┆ 3.0 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 98.0 ┆ 2.0 ┆ 98.0 ┆ 2.0 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 99.0 ┆ 1.0 ┆ 99.0 ┆ 1.0 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ NaN ┆ NaN ┆ NaN ┆ NaN │
└──────┴──────┴──────┴───────┘
By Exclusion
Sometimes it's easier to specify the columns that you do not want. The above methods will work with polars.exclude
, so that you can exclude columns that fit some criteria. For example, to select all columns except columns of strings (polars.Utf8
):
these_types = [pl.Utf8]
(
train_pl
.with_columns(
pl.when(
(pl.exclude(these_types) > pl.exclude(these_types).quantile(0.99)) |
(pl.exclude(these_types) < pl.exclude(these_types).quantile(0.01))
)
.then(float("nan"))
.otherwise(pl.exclude(these_types))
.keep_name()
)
)
shape: (101, 4)
┌──────┬──────┬──────┬───────┐
│ B_14 ┆ B_15 ┆ B_16 ┆ other │
│ --- ┆ --- ┆ --- ┆ --- │
│ f64 ┆ f64 ┆ f64 ┆ f64 │
╞══════╪══════╪══════╪═══════╡
│ NaN ┆ NaN ┆ NaN ┆ NaN │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 1.0 ┆ 99.0 ┆ 1.0 ┆ 99.0 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2.0 ┆ 98.0 ┆ 2.0 ┆ 98.0 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 3.0 ┆ 97.0 ┆ 3.0 ┆ 97.0 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ ... ┆ ... ┆ ... ┆ ... │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 97.0 ┆ 3.0 ┆ 97.0 ┆ 3.0 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 98.0 ┆ 2.0 ┆ 98.0 ┆ 2.0 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 99.0 ┆ 1.0 ┆ 99.0 ┆ 1.0 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ NaN ┆ NaN ┆ NaN ┆ NaN │
└──────┴──────┴──────┴───────┘
Or for example, to exclude any column that beings with B_
:
regex = r"^B_.*$"
(
train_pl
.with_columns(
pl.when(
(pl.exclude(regex) > pl.exclude(regex).quantile(0.99)) |
(pl.exclude(regex) < pl.exclude(regex).quantile(0.01))
)
.then(float("nan"))
.otherwise(pl.exclude(regex))
.keep_name()
)
)
shape: (101, 4)
┌──────┬──────┬──────┬───────┐
│ B_14 ┆ B_15 ┆ B_16 ┆ other │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 ┆ f64 │
╞══════╪══════╪══════╪═══════╡
│ 0 ┆ 100 ┆ 0 ┆ NaN │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 1 ┆ 99 ┆ 1 ┆ 99.0 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2 ┆ 98 ┆ 2 ┆ 98.0 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 3 ┆ 97 ┆ 3 ┆ 97.0 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ ... ┆ ... ┆ ... ┆ ... │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 97 ┆ 3 ┆ 97 ┆ 3.0 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 98 ┆ 2 ┆ 98 ┆ 2.0 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 99 ┆ 1 ┆ 99 ┆ 1.0 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 100 ┆ 0 ┆ 100 ┆ NaN │
└──────┴──────┴──────┴───────┘