For some dataset `group_1`,
I need to iterate over all rows `k`
times for robustness and, for each row, find a matching random sample from another data frame, `group_0`,
according to some criteria expressed as data frame columns.
Unfortunately, this is fairly slow.
How can I improve performance?
The bottleneck is the `apply`-ed function, i.e. `randomMatchingCondition`.
import tqdm
import numpy as np
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
# Fix the global NumPy seed so the dummy data below is reproducible.
seed = 47
np.random.seed(seed)

###################################################################
# generate dummy data
size = 10000

# The order of the random draws (metric, label, group_1, group_2) is kept
# fixed so the seeded data stays the same.
df = pd.DataFrame({'metric': np.random.randint(1, 100, size=size)})
df['label'] = np.random.randint(0, 2, size=size)
df['group_1'] = pd.Series(np.random.randint(1, 12, size=size)).astype(object)
df['group_2'] = pd.Series(np.random.randint(1, 10, size=size)).astype(object)

# Columns used to match rows between the two label groups.
join_columns_enrich = ['group_1', 'group_2']
join_real = ['metric_group_0', *join_columns_enrich]

# Rows labelled 0: re-indexed from zero, index labels converted to strings,
# metric renamed, and reduced to the metric plus the join columns.
group_0 = (
    df[df['label'] == 0]
    .reset_index(drop=True)
    .rename(index=str, columns={"metric": "metric_group_0"})
)
group_0 = group_0[join_real]
display(group_0.head())

# Rows labelled 1: re-indexed from zero, all columns kept.
group_1 = df[df['label'] == 1].reset_index(drop=True)
display(group_1.head())
###################################################################
# naive find random element matching condition
def randomMatchingCondition(original_element, group_0, join_columns, random_state):
    """Return a random ``metric_group_0`` value from the rows of *group_0* matching *original_element*.

    A row of *group_0* matches when, for every column in *join_columns*,
    its value equals the value of that column in *original_element*.

    Args:
        original_element: A row (e.g. a ``pd.Series``) indexable by the
            labels in *join_columns*.
        group_0: DataFrame containing the *join_columns* and a
            ``'metric_group_0'`` column to sample from.
        join_columns: Iterable of column labels that must be equal.
        random_state: Forwarded to ``Series.sample`` (``None`` for a
            non-deterministic draw).

    Returns:
        The sampled ``'metric_group_0'`` value, or ``np.nan`` when no row
        of *group_0* matches.
    """
    # Build the filter as a boolean mask instead of a query string: this
    # honours the ``join_columns`` argument and works for non-numeric values.
    mask = np.ones(len(group_0), dtype=bool)
    for column in join_columns:
        mask &= (group_0[column] == original_element[column]).to_numpy()

    candidates = group_0.loc[mask, 'metric_group_0']
    if candidates.empty:
        return np.nan
    return candidates.sample(n=1, random_state=random_state).iloc[0]
###################################################################
# Repeat the random matching k times for more robust sampling; every pass is
# tagged with its run number and all passes are stacked into resulting_df.
k = 3
runs = []
for i in range(1, k + 1):
    # Each pass overwrites the sampled column on group_1 with a fresh draw.
    group_1['metric_group_0'] = group_1.progress_apply(
        randomMatchingCondition,
        args=(group_0, join_columns_enrich, None),
        axis=1,
    )
    group_1['run'] = i
    # Snapshot this pass, because group_1 is mutated again on the next one.
    runs.append(group_1.copy())

# Concatenate once at the end instead of growing the frame inside the loop.
resulting_df = pd.concat(runs) if runs else None
resulting_df.head()
Experimenting with pre-sorting the data:
# Pre-sort both frames by the join columns before running the matching.
group_0 = group_0.sort_values(by=join_columns_enrich)
group_1 = group_1.sort_values(by=join_columns_enrich)
does not show any difference.