I have a dataframe which is similar to the one generated by the code below.
import random
import string

import numpy as np
import pandas as pd
def generate_example_dataframe(
    *, num: int = 20, subjects_num: int = 10, seed: int = 1
) -> pd.DataFrame:
    """Generate a reproducible example dataframe in long format.

    For every subject, region, condition ("open"/"closed") and group
    ("old"/"young") a block of ``num`` rows is produced. Each value is the
    cell mean plus uniform noise in [0, 1) plus one per-block offset in
    [0, 0.2).

    Args:
        num: Number of regions, and also the number of rows generated per
            (subject, region, condition, group) block. Must be between 1
            and ``len(string.ascii_letters)`` because regions are named
            after ASCII letters.
        subjects_num: Number of subjects (``subject_1`` .. ``subject_N``).
        seed: Seed used for both random sources, so repeated calls with
            the same arguments return identical values.

    Returns:
        A dataframe with the columns ``region``, ``group``, ``condition``,
        ``subject`` and ``values``. The index restarts at 0 for every
        block (the blocks are concatenated without re-indexing).

    Raises:
        ValueError: If ``num`` or ``subjects_num`` is out of range.
    """
    if not 1 <= num <= len(string.ascii_letters):
        raise ValueError(
            f"num must be between 1 and {len(string.ascii_letters)}, got {num}"
        )
    if subjects_num < 1:
        raise ValueError(f"subjects_num must be at least 1, got {subjects_num}")

    # The stdlib generator drives the per-block offset; seeding it is kept
    # from the original implementation. The NumPy noise gets its own seeded
    # generator -- seeding `random` alone does not affect NumPy.
    random.seed(seed)
    rng = np.random.default_rng(seed)

    conditions = ["open", "closed"]
    groups = ["old", "young"]
    # One mean per (condition, group) cell, in the order the cells are
    # enumerated below: open/old, open/young, closed/old, closed/young.
    means = [1, 1.5, 1.25, 1.75]
    cells = [(condition, group) for condition in conditions for group in groups]

    regions = [f"region_{letter}" for letter in string.ascii_letters[:num]]
    # range() excludes its stop value, hence the +1 to get subjects_num subjects.
    subjects = [f"subject_{i}" for i in range(1, subjects_num + 1)]

    list_of_dataframes = []
    for subject in subjects:
        for region in regions:
            for (condition, group), mean in zip(cells, means):
                values = mean + rng.random(num) + 0.2 * random.random()
                # Scalars are broadcast to the length of `values`.
                list_of_dataframes.append(
                    pd.DataFrame(
                        {
                            'region': region,
                            'group': group,
                            'condition': condition,
                            'subject': subject,
                            'values': values,
                        }
                    )
                )
    return pd.concat(list_of_dataframes)
# %% [markdown]
# Generating a sample dataframe, presented in the long format - one observation per row
# %%
# Build the example data, then copy its first ten rows (with the index)
# to the clipboard as comma-separated text.
df = generate_example_dataframe()
df.iloc[:10].to_clipboard(index=True, sep=',')
This gives output like this:
,region,group,condition,subject,values
0,region_a,old,open,subject_1,1.4914914311214753
1,region_a,old,open,subject_1,1.9742822483723783
2,region_a,old,open,subject_1,1.0461147549953116
3,region_a,old,open,subject_1,1.9369465073938947
4,region_a,old,open,subject_1,1.817792271839675
5,region_a,old,open,subject_1,1.4272522367426221
6,region_a,old,open,subject_1,1.129423554333859
7,region_a,old,open,subject_1,1.9021298911486018
8,region_a,old,open,subject_1,1.950500304961099
9,region_a,old,open,subject_1,1.6832358513116206
I want to run a simple t-test on `values`, separately for each combination of region, group and condition (number of tests = regions × groups × conditions). What is the most Pythonic way to do this? The only approach I can think of right now is to loop over the values of these variables and subset the big dataframe on each iteration.