2

Using a function defined as a Python context manager, I want to modify a Polars dataframe by reassignment. I then want the context manager function to print the previous and new row counts.

I tried the following:

import polars as pl


def count_rows(df: pl.DataFrame) -> int:
    """ Return the number of rows in a polars dataframe.

    Reads ``df.height`` directly instead of running a ``select(pl.count())``
    query: the value is the same row count, no intermediate one-cell frame is
    built, and the ``pl.count()`` expression (deprecated in recent Polars
    releases) is avoided.
    """
    return df.height

# define the function I want to work
@contextmanager
def log_row_count_change(df: pl.DataFrame, action_desc: str = '', df_name: str = 'df') -> None:
    """ An easy way to log how many rows were added or removed from a dataframe during filters, joins, etc.

    Used as ``with log_row_count_change(df, ...):``. The row count of ``df``
    is logged on entry, and a before/after summary is printed on exit, even
    when the body raises.

    Both counts are taken from the object passed in as ``df``; rebinding the
    caller's variable inside the ``with`` body does not change which object
    is counted here.

    Args:
        df: Dataframe whose row count is measured on entry and on exit.
        action_desc: Description of the action, used in the messages.
        df_name: Display name of the dataframe, used in the messages.
    """
    # Take the baseline before entering the try block so a failure here
    # propagates as-is instead of hitting an unbound name in ``finally``.
    row_count_before = count_rows(df)
    logger.debug(f"Before '{action_desc}' action on '{df_name}', row count: {row_count_before:,}")
    try:
        yield
    finally:
        row_count_after = count_rows(df)
        row_count_change = row_count_after - row_count_before
        # A percentage is undefined for an initially empty dataframe; report
        # "n/a" rather than raising ZeroDivisionError from the cleanup path.
        if row_count_before:
            row_count_change_pct = f"{row_count_change / row_count_before * 100:.2f}%"
        else:
            row_count_change_pct = "n/a"
        print(f"During '{action_desc}' action on '{df_name}', row count changed by {row_count_change:,} rows ({row_count_before:,} -> {row_count_after:,}) ({row_count_change_pct}).")

# define a dataframe for testing
df = pl.DataFrame({"a":[1,1,2], "b":[2,2,3], "c":[1,2,3]})

# call the main part
with log_row_count_change(df, 'drop duplicates on column a', 'df'):
    df = df.unique(subset=['a'])

When you run the above, it shows the row count equal to 3 both before and after. I want it to show a row count of 3 before and 2 after.

  • 1
    [`df.height`](https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.height.html) – jqurious Aug 01 '23 at 17:53

2 Answers

0

A quick-and-dirty solution would be to pass the context manager a lambda that returns the dataframe:

from contextlib import contextmanager
import polars as pl


def count_rows(df: pl.DataFrame) -> int:
    """Return the number of rows in a polars dataframe.

    Reads ``df.height`` directly instead of running a ``select(pl.count())``
    query: the value is the same row count, no intermediate one-cell frame is
    built, and the ``pl.count()`` expression (deprecated in recent Polars
    releases) is avoided.
    """
    return df.height


@contextmanager
def log_row_count_change(get_df, action_desc: str = "", df_name: str = "df") -> None:
    """Print how many rows a dataframe gained or lost inside a ``with`` block.

    Args:
        get_df: Zero-argument callable returning the dataframe to measure. It
            is called once on entry and once on exit, so it can pick up a
            variable that the ``with`` body has rebound to a new dataframe.
        action_desc: Description of the action, used in the messages.
        df_name: Display name of the dataframe, used in the messages.
    """
    # Take the baseline before entering the try block so a failure here
    # propagates as-is instead of hitting an unbound name in ``finally``.
    row_count_before = count_rows(get_df())
    print(
        f"Before '{action_desc}' action on '{df_name}', row count: {row_count_before:,}"
    )
    try:
        yield
    finally:
        # Call get_df() again: the body may have rebound the variable the
        # callable reads, and the new object is the one to count.
        row_count_after = count_rows(get_df())
        row_count_change = row_count_after - row_count_before
        # A percentage is undefined for an initially empty dataframe; report
        # "n/a" rather than raising ZeroDivisionError from the cleanup path.
        if row_count_before:
            row_count_change_pct = f"{row_count_change / row_count_before * 100:.2f}%"
        else:
            row_count_change_pct = "n/a"
        print(
            f"During '{action_desc}' action on '{df_name}', row count changed by {row_count_change:,} rows ({row_count_before:,} -> {row_count_after:,}) ({row_count_change_pct})."
        )


# Sample frame with 3 rows; column "a" contains the value 1 twice.
df = pl.DataFrame({"a": [1, 1, 2], "b": [2, 2, 3], "c": [1, 2, 3]})

# The lambda is not evaluated here; it looks up the name `df` each time the
# context manager calls it, so on exit it returns the frame assigned below.
with log_row_count_change(lambda: df, "drop duplicates on column a", "df"): # <-- note the lambda:
    df = df.unique(subset=["a"])

Prints:

Before 'drop duplicates on column a' action on 'df', row count: 3
During 'drop duplicates on column a' action on 'df', row count changed by -1 rows (3 -> 2) (-33.33%).
Andrej Kesely
  • 168,389
  • 15
  • 48
  • 91
0

I'd prefer to explicitly nest functions to express variable scopes & state:

def main(df):
    """ Drop duplicate values of column ``a`` from ``df`` and report the row-count change.

    The context manager is defined inside this function so that it reads
    ``df`` from the enclosing scope rather than receiving it as an argument;
    the reassignment of ``df`` in the ``with`` body is therefore visible to
    it when the block exits.

    Args:
        df: Dataframe to deduplicate on column ``a``.
    """
    @contextmanager
    def log_row_count_change(action_desc: str = '', df_name: str = 'df') -> None:
        """ An easy way to log how many rows were added or removed from a dataframe during filters, joins, etc. """
        # Take the baseline before entering the try block so a failure here
        # propagates as-is instead of hitting an unbound name in ``finally``.
        row_count_before = count_rows(df)
        logger.debug(f"Before '{action_desc}' action on '{df_name}', row count: {row_count_before:,}")
        try:
            yield
        finally:
            # `df` is looked up in main's scope at this point, i.e. after the
            # with body has rebound it.
            row_count_after = count_rows(df)
            row_count_change = row_count_after - row_count_before
            # A percentage is undefined for an initially empty dataframe;
            # report "n/a" rather than raising ZeroDivisionError on exit.
            if row_count_before:
                row_count_change_pct = f"{row_count_change / row_count_before * 100:.2f}%"
            else:
                row_count_change_pct = "n/a"
            print(f"During '{action_desc}' action on '{df_name}', row count changed by {row_count_change:,} rows ({row_count_before:,} -> {row_count_after:,}) ({row_count_change_pct}).")

    with log_row_count_change('drop duplicates on column a', 'df'):
        df = df.unique(subset=['a'])


# Run the example on a 3-row frame in which column "a" contains the value 1 twice.
main(pl.DataFrame({"a":[1,1,2], "b":[2,2,3], "c":[1,2,3]}))
# => During 'drop duplicates on column a' action on 'df', row count changed by -1 rows (3 -> 2) (-33.33%).

But if you prefer a more object-oriented style, the equivalent is:

class DfCounter:
    """ An easy way to log how many rows were added or removed from a dataframe during filters, joins, etc.

    Context manager that counts the rows of its ``df`` attribute on entry and
    again on exit. ``__enter__`` returns the instance itself, so the ``with``
    body can assign a new dataframe to ``df`` and have that one counted on
    exit.
    """
    def __init__(self, df, action_desc: str = '', df_name: str = 'df'):
        """ Store the dataframe to monitor and the labels used in the messages. """
        self.df = df
        self.action_desc = action_desc
        self.df_name = df_name

    def __enter__(self):
        """ Record and log the starting row count, then return this instance. """
        self.row_count_before = count_rows(self.df)
        logger.debug(f"Before '{self.action_desc}' action on '{self.df_name}', row count: {self.row_count_before:,}")
        return self

    def __exit__(self, ex_type, ex_val, ex_tb):
        """ Print the before/after row counts; exceptions from the body are not suppressed. """
        row_count_after = count_rows(self.df)
        row_count_change = row_count_after - self.row_count_before
        # A percentage is undefined for an initially empty dataframe; report
        # "n/a" rather than raising ZeroDivisionError while leaving the block.
        if self.row_count_before:
            row_count_change_pct = f"{row_count_change / self.row_count_before * 100:.2f}%"
        else:
            row_count_change_pct = "n/a"
        print(f"During '{self.action_desc}' action on '{self.df_name}', row count changed by {row_count_change:,} rows ({self.row_count_before:,} -> {row_count_after:,}) ({row_count_change_pct}).")


# Sample frame with 3 rows; column "a" contains the value 1 twice.
df = pl.DataFrame({"a":[1,1,2], "b":[2,2,3], "c":[1,2,3]})

# The deduplicated frame is stored on the counter (`counter.df`), which is the
# attribute DfCounter counts on exit; the module-level name `df` is not rebound.
with DfCounter(df, 'drop duplicates on column a', 'df') as counter:
    counter.df = df.unique(subset=['a'])
Kache
  • 15,647
  • 12
  • 51
  • 79
  • I want a solution like the second one (where the df to monitor is named explicitly to the contextmanager), but I don't want to have to assign the df back to an attribute of the contextmanager object – HumpbackWhale194 Aug 01 '23 at 20:35
  • If you want to use a "return modified copy" method and don't want to modify the object in place, it means you're comparing two different objects, so you need a way to reference two objects. It doesn't make sense to want to compare two objects by only giving one (direct) name. You must either use a parent (scope or object) to reference the two objects, use two names (for before/after), or modify one object in-place. – Kache Aug 01 '23 at 21:47