I have a dataframe that looks like this
import numpy as np
import pandas as pd
df = pd.DataFrame({"ph_number" : ['1234','2345','1234','1234','2345','1234','2345'],
"year": [2022,2022,2023,2022,2022,2022,2022],
"month": [9,10,1,10,8,11,12],
"device_set": ['vivo 1915:vivo','SM-A510F:samsung','1718:vivo vivo~^!vivo 1718:vivo','vivo 1915:vivo','SM-A510F:samsung~^!vivo 1718:vivo','vivo 1915:vivo','SM-A510F:samsung']
})
I want the output to be like this
in python i can do it by
df['device_set'] = df['device_set'].astype(str).apply(lambda x: x.split('~^!'))
pd.DataFrame({col:np.repeat(np.array(df[col].values), np.array(df["device_set"].str.len())) for col in df.columns.drop("device_set")}).assign(**{"device_set":np.concatenate(df["device_set"].values)})[df.columns]
i want to do it in DASK, i have written the code for dask as this, but its not working
def dask_explode(df, lst_cols):
with dask_session(time_to_use=2*60*60) as dask_client:
mdd = dd.from_pandas(df, npartitions=54)
mdd = pd.DataFrame({col:np.repeat(np.array(mdd[col].values), np.array(mdd[lst_cols].str.len())) for col in mdd.columns.drop(lst_cols)}).assign(**{lst_cols:np.concatenate(mdd[lst_cols].values)})[mdd.columns]
return mdd.compute()
df['device_set'] = df['device_set'].astype(str).apply(lambda x: x.split('~^!'))
output = dask_explode(df, "device_set")
Note: i cant use pd.explode my prod env version is pandas==0.23.4
Reference : Python PANDAS: Converting from pandas/numpy to dask dataframe/array