How to accumulate a DataFrame in RxPy?

Question

I'm simulating a calculation over a number of inputs, each of which takes a long while. Whenever a calculation is done (i.e. on_next is emitted), I want to reactively append the result to a results DataFrame and print the final DataFrame when on_completed is emitted. However, the DataFrame is empty. Why are no values accumulating?

This is with Python 3.9.9 and rxpy 3.2.0.

import time
from random import random, randint
from rx import create
import pandas as pd
import rx
print(rx.__version__)

def accumulate(result, i) -> None:
    """Build a new frame from ``result`` plus a one-row frame holding ``i``.

    The frame returned by ``append`` is bound only to the local name
    ``result``; the function contains no ``return`` statement.
    """
    # Here the values should accumulate !?
    # NOTE(review): DataFrame.append was removed in pandas 2.0 -- confirm the
    # pandas version in use.
    result = result.append(pd.DataFrame({'a': [i]}))

def average_df(observer, scheduler):
    """Subscription function: emit one random integer per entry of ``pids``.

    Before each emission the function sleeps for a random duration below
    0.8 seconds, then it signals completion once every entry is handled.
    ``scheduler`` is accepted but not used, and ``pids`` is read from
    module scope.
    """
    for _ in pids:
        delay = random() * 0.8
        time.sleep(delay)
        value = randint(0, 100)
        observer.on_next(value)
    observer.on_completed()

def print_result(result):
    """Write ``result`` to standard output via ``print``."""
    print(result)

# Client
if __name__ == "__main__":
    # Empty frame with a single column 'a'; the callbacks below close over
    # this name.
    result = pd.DataFrame({'a': []})

    # Observable
    # `pids` is read as a module-level global by average_df.
    pids = [1, 2, 3, 4]
    source = create(average_df)
    
    # accumulate() only rebinds its own local `result`, so the `result`
    # handed to print_result on completion is still the empty frame above.
    source.subscribe(
        on_next = lambda i: accumulate(result, i),
        on_error = lambda e: print("Error: {0}".format(e)),
        on_completed = lambda: print_result(result)
    )

score 0 · Answer 1 · answered Jan 24 '22 at 11:35

The pandas append function returns a new object. In accumulate, this new object is assigned to the local variable result, which does not update the result variable in the main block.

You can get the final dataframe by returning the accumulated values at each step, thanks to the scan operator:

import time
from random import random, randint
import rx
import rx.operators as ops
import pandas as pd
import rx
print(rx.__version__)

def accumulate():
    """Return an operator that folds emitted values into a growing DataFrame.

    The operator is ``ops.scan`` seeded with an empty frame holding a single
    column ``'a'``. For every source item it emits the accumulated frame
    extended by one row containing that item.
    """
    def _accumulate(acc, i):
        # pd.concat is used instead of DataFrame.append, which was removed
        # in pandas 2.0; like append, it returns a new frame.
        return pd.concat([acc, pd.DataFrame({'a': [i]})])

    return ops.scan(_accumulate, seed=pd.DataFrame({'a': []}))

def average_df(observer, scheduler):
    """Push one random integer to ``observer`` for each entry of ``pids``.

    A random pause below 0.8 seconds precedes every ``on_next`` call, and
    ``on_completed`` is called after the loop. ``scheduler`` is unused;
    ``pids`` is looked up in module scope.
    """
    for _pid in pids:
        pause = random() * 0.8
        time.sleep(pause)
        observer.on_next(randint(0, 100))

    observer.on_completed()


# Client
if __name__ == "__main__":
    # Observable
    # `pids` is read as a module-level global by average_df.
    pids = [1, 2, 3, 4]
    source = rx.create(average_df)

    # scan emits every intermediate frame; last() keeps only the final one,
    # and run() subscribes, blocks until completion and returns that frame.
    df = source.pipe(
        accumulate(),
        ops.last(),
    ).run()

    # Kept inside the guard: `df` only exists when this runs as a script.
    print(df)

For further simplification, you can also use the to_pandas operator of the rxsci library (disclaimer: I am the author). Here is a solution with rxsci, along with a more reactive way to create the source observable:

from typing import NamedTuple
import time
from random import random, randint
import rx
import rx.operators as ops
import pandas as pd
import rxsci as rs
print(rx.__version__)


class Item(NamedTuple):
    """Single-field record emitted by the source observable."""

    # Integer payload of the record.
    pid: int
        

if __name__ == "__main__":
    # Observable
    pids = [1, 2, 3, 4]
    source = rx.from_(pids).pipe(
        # Pause for a random time below 0.8 s per item.
        ops.do_action(lambda i: time.sleep(random()*0.8)),
        # The incoming pid is discarded; each Item carries a random integer.
        ops.map(lambda i: Item(pid=randint(0, 100))),
    )

    # NOTE(review): to_pandas presumably gathers the emitted Items into one
    # DataFrame -- confirm against the rxsci documentation.
    df = source.pipe(
        rs.ops.to_pandas(),
    ).run()

    # Kept inside the guard: `df` only exists when this runs as a script.
    print(df)

How to accumulate a DataFrame in RxPy?

1 Answer