One way that we can solve this problem is to use the arr.eval
expression. The arr.eval
expression allows us to treat a list as if it were a Series/column, so that we can apply all the same expressions we accustomed to using.
The Algorithm
(
df_groups
.filter(
pl.col("str_list")
.arr.eval(
pl.element().filter(
((pl.element() == succ).cumsum() == 0) & (pl.element() == pre)
)
)
.arr.lengths() > 0
)
.collect()
.filter(pl.col("str_list").arr.contains(succ))
)
shape: (1, 2)
┌───────┬─────────────────┐
│ group ┆ str_list │
│ --- ┆ --- │
│ i64 ┆ list[str] │
╞═══════╪═════════════════╡
│ 1 ┆ ["A", "B", "B"] │
└───────┴─────────────────┘
Note: there is currently a bug in Polars which causes an error when we use
.filter(pl.col("str_list").arr.contains(succ))
in lazy mode. (I'll file a bug report for that.)
How the algorithm works, in steps
The arr.eval
expression allows us to treat a list as a Series/column, so that we can apply our usual toolkit of expressions to our problem.
That said, using arr.eval
can seem a bit confusing at first, so we'll walk through this in steps.
As a Series
Let's first see how the algorithm works when our data is a Series/column, and then back into how we code this when the data is in lists.
Let's start with this data. We'll attempt to find any time that cat
appears before dog
.
df_groups = pl.DataFrame([
pl.Series('cat_dog', ['aardvark', 'cat', 'mouse', 'dog', 'sloth', 'zebra']),
pl.Series('dog_cat', ['aardvark', 'dog', 'mouse', 'cat', 'sloth', 'zebra']),
pl.Series('cat_dog_cat', ['aardvark', 'cat', 'mouse', 'dog', 'monkey', 'cat']),
pl.Series('dog_cat_dog', ['aardvark', 'dog', 'mouse', 'cat', 'monkey', 'dog']),
pl.Series('no_dog', ['aardvark', 'cat', 'mouse', 'cat', 'monkey', 'zebra']),
pl.Series('no_cat', ['aardvark', 'mouse', 'dog', 'monkey', 'dog', 'zebra']),
pl.Series('neither', ['aardvark', 'mouse', 'tiger', 'zebra', 'sloth', 'zebra']),
])
df_groups
shape: (6, 7)
┌──────────┬──────────┬─────────────┬─────────────┬──────────┬──────────┬──────────┐
│ cat_dog ┆ dog_cat ┆ cat_dog_cat ┆ dog_cat_dog ┆ no_dog ┆ no_cat ┆ neither │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ str ┆ str ┆ str ┆ str ┆ str ┆ str │
╞══════════╪══════════╪═════════════╪═════════════╪══════════╪══════════╪══════════╡
│ aardvark ┆ aardvark ┆ aardvark ┆ aardvark ┆ aardvark ┆ aardvark ┆ aardvark │
├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ cat ┆ dog ┆ cat ┆ dog ┆ cat ┆ mouse ┆ mouse │
├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ mouse ┆ mouse ┆ mouse ┆ mouse ┆ mouse ┆ dog ┆ tiger │
├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ dog ┆ cat ┆ dog ┆ cat ┆ cat ┆ monkey ┆ zebra │
├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ sloth ┆ sloth ┆ monkey ┆ monkey ┆ monkey ┆ dog ┆ sloth │
├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ zebra ┆ zebra ┆ cat ┆ dog ┆ zebra ┆ zebra ┆ zebra │
└──────────┴──────────┴─────────────┴─────────────┴──────────┴──────────┴──────────┘
To detect our first occurrence of a dog
, we'll use the cumsum
expression on a boolean expression.
pre = "cat"
succ = "dog"
df_groups = df_groups.with_columns(
(pl.all() == succ).cumsum().suffix('__cumsum')
)
df_groups.select(sorted(df_groups.columns))
shape: (6, 14)
┌──────────┬─────────────────┬─────────────┬─────────────────────┬──────────┬─────────────────┬─────────────┬─────────────────────┬──────────┬─────────────────┬──────────┬────────────────┬──────────┬────────────────┐
│ cat_dog ┆ cat_dog__cumsum ┆ cat_dog_cat ┆ cat_dog_cat__cumsum ┆ dog_cat ┆ dog_cat__cumsum ┆ dog_cat_dog ┆ dog_cat_dog__cumsum ┆ neither ┆ neither__cumsum ┆ no_cat ┆ no_cat__cumsum ┆ no_dog ┆ no_dog__cumsum │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ u32 ┆ str ┆ u32 ┆ str ┆ u32 ┆ str ┆ u32 ┆ str ┆ u32 ┆ str ┆ u32 ┆ str ┆ u32 │
╞══════════╪═════════════════╪═════════════╪═════════════════════╪══════════╪═════════════════╪═════════════╪═════════════════════╪══════════╪═════════════════╪══════════╪════════════════╪══════════╪════════════════╡
│ aardvark ┆ 0 ┆ aardvark ┆ 0 ┆ aardvark ┆ 0 ┆ aardvark ┆ 0 ┆ aardvark ┆ 0 ┆ aardvark ┆ 0 ┆ aardvark ┆ 0 │
├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ cat ┆ 0 ┆ cat ┆ 0 ┆ dog ┆ 1 ┆ dog ┆ 1 ┆ mouse ┆ 0 ┆ mouse ┆ 0 ┆ cat ┆ 0 │
├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ mouse ┆ 0 ┆ mouse ┆ 0 ┆ mouse ┆ 1 ┆ mouse ┆ 1 ┆ tiger ┆ 0 ┆ dog ┆ 1 ┆ mouse ┆ 0 │
├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ dog ┆ 1 ┆ dog ┆ 1 ┆ cat ┆ 1 ┆ cat ┆ 1 ┆ zebra ┆ 0 ┆ monkey ┆ 1 ┆ cat ┆ 0 │
├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ sloth ┆ 1 ┆ monkey ┆ 1 ┆ sloth ┆ 1 ┆ monkey ┆ 1 ┆ sloth ┆ 0 ┆ dog ┆ 2 ┆ monkey ┆ 0 │
├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ zebra ┆ 1 ┆ cat ┆ 1 ┆ zebra ┆ 1 ┆ dog ┆ 2 ┆ zebra ┆ 0 ┆ zebra ┆ 2 ┆ zebra ┆ 0 │
└──────────┴─────────────────┴─────────────┴─────────────────────┴──────────┴─────────────────┴─────────────┴─────────────────────┴──────────┴─────────────────┴──────────┴────────────────┴──────────┴────────────────┘
So our goal will be to see if there are any rows where the value is cat
and where the cumsum
on the dog
expression is also zero.
Using lists and arr.eval
We'll take this in steps.
First, let's create some data as lists (instead of Series). I'm also going to eliminate the lazy
mode, to reduce the clutter.
df_groups = pl.DataFrame({
'group': ['cat_dog', 'dog_cat', 'cat_dog_cat', 'dog_cat_dog', 'no_dog', 'no_cat', 'neither'],
'str_list': [
['aardvark', 'cat', 'mouse', 'dog'],
['aardvark', 'dog', 'mouse', 'cat'],
['aardvark', 'cat', 'mouse', 'dog', 'monkey', 'cat'],
['aardvark', 'dog', 'mouse', 'cat', 'monkey', 'dog'],
['aardvark', 'cat', 'mouse', 'cat', 'monkey'],
['aardvark', 'mouse', 'dog', 'monkey', 'dog'],
['aardvark', 'mouse', 'tiger', 'zebra'],
]
})
df_groups
shape: (7, 2)
┌─────────────┬─────────────────────────────────────┐
│ group ┆ str_list │
│ --- ┆ --- │
│ str ┆ list[str] │
╞═════════════╪═════════════════════════════════════╡
│ cat_dog ┆ ["aardvark", "cat", ... "dog"] │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ dog_cat ┆ ["aardvark", "dog", ... "cat"] │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ cat_dog_cat ┆ ["aardvark", "cat", ... "cat"] │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ dog_cat_dog ┆ ["aardvark", "dog", ... "dog"] │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ no_dog ┆ ["aardvark", "cat", ... "monkey"... │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ no_cat ┆ ["aardvark", "mouse", ... "dog"] │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ neither ┆ ["aardvark", "mouse", ... "zebra... │
└─────────────┴─────────────────────────────────────┘
Now, let's look for any list where cat
appears before a dog (i.e., the cumsum
value on dog
is zero).
pre = "cat"
succ = "dog"
(
df_groups
.with_columns(
pl.col("str_list")
.arr.eval(
pl.element().filter(
((pl.element() == succ).cumsum() == 0) & (pl.element() == pre)
)
)
)
)
shape: (7, 2)
┌─────────────┬────────────────┐
│ group ┆ str_list │
│ --- ┆ --- │
│ str ┆ list[str] │
╞═════════════╪════════════════╡
│ cat_dog ┆ ["cat"] │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ dog_cat ┆ [] │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ cat_dog_cat ┆ ["cat"] │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ dog_cat_dog ┆ [] │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ no_dog ┆ ["cat", "cat"] │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ no_cat ┆ [] │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ neither ┆ [] │
└─────────────┴────────────────┘
We see that only three lists have a cat
before a dog
appears.
Next, we'll change the with_columns
to a filter
to keep only those rows where we found one or more cat
before a dog
.
(
df_groups
.filter(
pl.col("str_list")
.arr.eval(
pl.element().filter(
((pl.element() == succ).cumsum() == 0) & (pl.element() == pre)
)
)
.arr.lengths() > 0
)
)
shape: (3, 2)
┌─────────────┬─────────────────────────────────────┐
│ group ┆ str_list │
│ --- ┆ --- │
│ str ┆ list[str] │
╞═════════════╪═════════════════════════════════════╡
│ cat_dog ┆ ["aardvark", "cat", ... "dog"] │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ cat_dog_cat ┆ ["aardvark", "cat", ... "cat"] │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ no_dog ┆ ["aardvark", "cat", ... "monkey"... │
└─────────────┴─────────────────────────────────────┘
And finally, we need to eliminate any rows where a dog
never appears.
(
df_groups
.filter(pl.col("str_list").arr.contains(succ))
.filter(
pl.col("str_list")
.arr.eval(
pl.element().filter(
((pl.element() == succ).cumsum() == 0) & (pl.element() == pre)
)
)
.arr.lengths() > 0
)
)
shape: (2, 2)
┌─────────────┬────────────────────────────────┐
│ group ┆ str_list │
│ --- ┆ --- │
│ str ┆ list[str] │
╞═════════════╪════════════════════════════════╡
│ cat_dog ┆ ["aardvark", "cat", ... "dog"] │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ cat_dog_cat ┆ ["aardvark", "cat", ... "cat"] │
└─────────────┴────────────────────────────────┘