I have a List of DataFrames. Each DataFrame has a parent index field and a child index field, both indexes contain UUIDs.
The first df has no parent and the index is 0 but all the other dfs have a parent. Some dfs with multiple rows might not have a match in a child df, but there will always be at least one row that matches.
for example:
df_list[0] = pd.DataFrame({'a':[1,2,3,4], 'b':[4,5,6,7], 'parent_index':[0,0,0,0],'child_index':['index1','index2','index3','index4']})
df_list[1] = pd.DataFrame({'c':[11,12,13,14],'d':[4,5,6,7], 'parent_index':['index1','index1','index1','index1'],'child_index':['index11','index12','index13','index14']})
df_list[2] = pd.DataFrame({'e':[21,22,23,24],'f':[4,5,6,7], 'parent_index':['index14','index14','index14','index14'],'child_index':['index21','index22','index23','index24']})
df_list[3] = pd.DataFrame({'g':[31,32,33,34],'h':[4,5,6,7], 'parent_index':['index22','index22','index22','index22'],'child_index':['index31','index32','index33','index34']})
Is there a function that accepts a list of dfs and returns one or multiple dfs after trying to merge each one with each other one?
If there is a function, the rest of the problem will be solved easily, if not, here are the other details. The other main problem is that I want to merge the first df until it get to a child df that has multiple rows. Then, instead of repeating rows, I want it to continue by using the df with the multiple rows as if it is the parent df. I was able to merge the first df correctly with the following code:
# df = None
# last_merged_df_i = 0
list_of_dfs_w_many_rows = []
# current_index = '0_1'
df = df_list[0]
merges_count = 0
df.rename(columns={'parent_index':f'parent_index_{merges_count}','index_row':f'index_row_{merges_count}'}, inplace=True)
df_for_next_round = df_list[1:].copy()
at_least_one_merge = True
for j in range(1,15):
if not at_least_one_merge:
break
at_least_one_merge = False
# current_index = current_index + '_' + str(j+1)
dfs_left = df_for_next_round.copy()
df_for_next_round = []
for temp_df in dfs_left:
run_for_this_temp_df = False
for c in temp_df.columns:
if 'index' not in c:
run_for_this_temp_df = True
if not run_for_this_temp_df:
df_for_next_round.append(temp_df)
continue
# print(current_index)
# print(at_least_one_merge)
# display(temp_df)
if (temp_df.shape[0] == 1):
display(temp_df)
if (temp_df.iloc[0]['parent_index'] == df.iloc[0][f'index_row_{merges_count}']):
temp_df.rename(columns={'parent_index':f'parent_index_{merges_count+1}','index_row':f'index_row_{merges_count+1}'}, inplace=True)
# print('df: ')
# display(df)
# print('temp_df: ')
# display(temp_df)
df = df.merge(temp_df, how='left', left_on=f'index_row_{merges_count}', right_on=f'parent_index_{merges_count+1}')
merges_count += 1
at_least_one_merge = True
else:
df_for_next_round.append(temp_df)
else:
if j == 1:
list_of_dfs_w_many_rows.append(temp_df)
but the second part does not seem to work:
# at_least_one_merge = True
for i in range(5):
for temp_df2 in list_of_dfs_w_many_rows:
merges_count = 0
temp_df2.rename(columns={'parent_index':f'parent_index_{merges_count}','index_row':f'index_row_{merges_count}'}, inplace=True)
# if not at_least_one_merge:
# break
# at_least_one_merge = False
# current_index = temp_df2.iloc[0]['index_row'] + '_' + str(len(str(temp_df2.iloc[0]['index_row']).replace('_','')))
dfs_left = df_for_next_round.copy()
df_for_next_round = []
for temp_df in dfs_left:
# run_for_this_temp_df = False
# for c in temp_df.columns:
# if 'index' not in c:
# run_for_this_temp_df = True
# if not run_for_this_temp_df: continue
# print(current_index)
# print(at_least_one_merge)
# display(temp_df)
if (temp_df.shape[0] == 1):
# if (temp_df.iloc[0]['index_row'] == current_index):
# temp_df2 = temp_df2.merge(temp_df, how='left', on='index')
# print(temp_df2.iloc[0][f'index_row_{merges_count}'])
# print(temp_df.iloc[0]['parent_index'])
if (temp_df.iloc[0]['parent_index'] == temp_df2.iloc[0][f'index_row_{merges_count}']):
display(temp_df)
display(temp_df2)
temp_df.rename(columns={'parent_index':f'parent_index_{merges_count+1}','index_row':f'index_row_{merges_count+1}'}, inplace=True)
# print('temp_df2: ')
# display(temp_df2)
# print('temp_df: ')
# display(temp_df)
temp_df2 = dtemp_df2f.merge(temp_df, how='left', left_on=f'index_row_{merges_count}', right_on=f'parent_index_{merges_count+1}')
merges_count += 1
# at_least_one_merge = True
else:
df_for_next_round.append(temp_df)
Any ideas or suggestion would be most appreciated.