Here's a double recursive function, rather than an explicit loop.
def hierarchy(df, joinlevel=1, stacklevel=1, sofar=None, sublevel={}):
dfs=(df
#make a column denoting the employee isn't anyone's manager
.with_columns(emponly=pl.col('employee_id').is_in(pl.col('manager_1_id')))
#sort by that bool column so that the 0 and 1 index are well defined
.sort('emponly')
.partition_by('emponly'))
# when called from scratch sofar is blank otherwise it's carried forward
if sofar is None:
sofar=dfs[0].drop('emponly')
# when the partition doesn't have 2 dataframes that means there are no managers
if len(dfs)<2:
if stacklevel+1 in sublevel.keys():
return pl.concat([sofar,sublevel[stacklevel+1]], how='diagonal') #and sublevels
else:
return sofar
#This is the second level of recursion treating each manager rung of management separately
if stacklevel+1 not in sublevel.keys():
sublevel[stacklevel+1]=hierarchy(dfs[1].drop('emponly'), stacklevel=stacklevel+1)
#This is the first level of recursion pulling each next level of management into more columns
sofar=sofar.join(
dfs[1].drop('emponly').rename({'manager_1_id':f'manager_{joinlevel+1}_id'}),
left_on=f'manager_{joinlevel}_id', right_on='employee_id', how='left')
#This starts the recursion over in a quasi-loop
return hierarchy(dfs[1].drop('emponly'),
stacklevel=stacklevel,
joinlevel=joinlevel+1,
sofar=sofar,
sublevel=sublevel)
This logic of this is to partition the df
into one group of employees who aren't anyone's manager and another group who are managers. It then joins the first group with the second group in order to get the initial employee's manager's manager. It recalls itself in order to join another level of management. It keeps doing that until there are no more managers. There's a separate recursion fork where it treats each mid level manager as a low level employee and repeats the above process.
df=pl.DataFrame({'employee_id':[1,2,3,4,5], 'manager_1_id':[3,5,4,5,None]})
hierarchy(df)
shape: (5, 5)
┌─────────────┬──────────────┬──────────────┬──────────────┬──────────────┐
│ employee_id ┆ manager_1_id ┆ manager_2_id ┆ manager_3_id ┆ manager_4_id │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
╞═════════════╪══════════════╪══════════════╪══════════════╪══════════════╡
│ 1 ┆ 3 ┆ 4 ┆ 5 ┆ null │
│ 2 ┆ 5 ┆ null ┆ null ┆ null │
│ 3 ┆ 4 ┆ 5 ┆ null ┆ null │
│ 4 ┆ 5 ┆ null ┆ null ┆ null │
│ 5 ┆ null ┆ null ┆ null ┆ null │
└─────────────┴──────────────┴──────────────┴──────────────┴──────────────┘
The joinlevel
keeps track of the management chain recursion and the stacklevel
keeps track of all the times it starts over with a new level of middle management. sofar
keeps the joinlevel
progress and the sublevel
dict keeps the mid level management.
A non-recursive version
df_ans=df
i=1
while df_ans.height>df_ans.select(pl.col(f"manager_{i}_id").null_count()).item():
df_ans=df_ans.join(df.rename({'manager_1_id':f'manager_{i+1}_id'}), left_on=f"manager_{i}_id", right_on="employee_id", how='left')
i+=1
df_ans