0

I have a List of DataFrames. Each DataFrame has a parent index field and a child index field, both indexes contain UUIDs.

The first df has no parent and the index is 0 but all the other dfs have a parent. Some dfs with multiple rows might not have a match in a child df, but there will always be at least one row that matches.

for example:

df_list[0] = pd.DataFrame({'a':[1,2,3,4],    'b':[4,5,6,7], 'parent_index':[0,0,0,0],'child_index':['index1','index2','index3','index4']})
df_list[1] = pd.DataFrame({'c':[11,12,13,14],'d':[4,5,6,7], 'parent_index':['index1','index1','index1','index1'],'child_index':['index11','index12','index13','index14']})
df_list[2] = pd.DataFrame({'e':[21,22,23,24],'f':[4,5,6,7], 'parent_index':['index14','index14','index14','index14'],'child_index':['index21','index22','index23','index24']})
df_list[3] = pd.DataFrame({'g':[31,32,33,34],'h':[4,5,6,7], 'parent_index':['index22','index22','index22','index22'],'child_index':['index31','index32','index33','index34']})

Is there a function that accepts a list of dfs and returns one or multiple dfs after trying to merge each one with each other one?


If there is a function, the rest of the problem will be solved easily, if not, here are the other details. The other main problem is that I want to merge the first df until it get to a child df that has multiple rows. Then, instead of repeating rows, I want it to continue by using the df with the multiple rows as if it is the parent df. I was able to merge the first df correctly with the following code:

# df = None
# last_merged_df_i = 0

list_of_dfs_w_many_rows = []

# current_index = '0_1'

df = df_list[0]

merges_count = 0

df.rename(columns={'parent_index':f'parent_index_{merges_count}','index_row':f'index_row_{merges_count}'}, inplace=True)

df_for_next_round = df_list[1:].copy()

at_least_one_merge = True



for j in range(1,15):
    if not at_least_one_merge:
        break
    
    at_least_one_merge = False
    
    # current_index = current_index + '_' + str(j+1)
    
    dfs_left = df_for_next_round.copy()
    df_for_next_round = []
    
    for temp_df in dfs_left:

        run_for_this_temp_df = False
        for c in temp_df.columns:
            if 'index' not in c:
                run_for_this_temp_df = True
        if not run_for_this_temp_df: 
            df_for_next_round.append(temp_df)
            continue
        
#         print(current_index)
#         print(at_least_one_merge)
#         display(temp_df)
        if (temp_df.shape[0] == 1):
            
            display(temp_df)
            if (temp_df.iloc[0]['parent_index'] == df.iloc[0][f'index_row_{merges_count}']):
                
                temp_df.rename(columns={'parent_index':f'parent_index_{merges_count+1}','index_row':f'index_row_{merges_count+1}'}, inplace=True)
                
#                 print('df: ')
#                 display(df)
#                 print('temp_df: ')
#                 display(temp_df)
                
                df = df.merge(temp_df, how='left', left_on=f'index_row_{merges_count}', right_on=f'parent_index_{merges_count+1}')
                
                merges_count += 1
                
                at_least_one_merge = True
            else:
                df_for_next_round.append(temp_df)
        else: 
            if j == 1:
                list_of_dfs_w_many_rows.append(temp_df)

but the second part does not seem to work:

# at_least_one_merge = True
for i in range(5):
    for temp_df2 in list_of_dfs_w_many_rows:
        
        
        merges_count = 0

        temp_df2.rename(columns={'parent_index':f'parent_index_{merges_count}','index_row':f'index_row_{merges_count}'}, inplace=True)

#         if not at_least_one_merge:
#             break

#         at_least_one_merge = False

#         current_index = temp_df2.iloc[0]['index_row'] + '_' + str(len(str(temp_df2.iloc[0]['index_row']).replace('_','')))

        dfs_left = df_for_next_round.copy()
        df_for_next_round = []

        for temp_df in dfs_left:

    #         run_for_this_temp_df = False
    #         for c in temp_df.columns:
    #             if 'index' not in c:
    #                 run_for_this_temp_df = True
    #         if not run_for_this_temp_df: continue

#             print(current_index)
#             print(at_least_one_merge)
#             display(temp_df)
            if (temp_df.shape[0] == 1):
#                 if (temp_df.iloc[0]['index_row'] == current_index):
#                     temp_df2 = temp_df2.merge(temp_df, how='left', on='index')
                
                
#                 print(temp_df2.iloc[0][f'index_row_{merges_count}'])
#                 print(temp_df.iloc[0]['parent_index'])
                
                if (temp_df.iloc[0]['parent_index'] == temp_df2.iloc[0][f'index_row_{merges_count}']):
                    
                    display(temp_df)
                    display(temp_df2)
                    
                    temp_df.rename(columns={'parent_index':f'parent_index_{merges_count+1}','index_row':f'index_row_{merges_count+1}'}, inplace=True)

#                     print('temp_df2: ')
#                     display(temp_df2)
#                     print('temp_df: ')
#                     display(temp_df)

                    temp_df2 = dtemp_df2f.merge(temp_df, how='left', left_on=f'index_row_{merges_count}', right_on=f'parent_index_{merges_count+1}')

                    merges_count += 1
#                     at_least_one_merge = True
                else:
                    df_for_next_round.append(temp_df)

Any ideas or suggestion would be most appreciated.

Michael
  • 161
  • 1
  • 9

0 Answers0