Starting the Dask scheduler on Node1 (4 CPU, 8 GB):

dask-scheduler --host 0.0.0.0 --port 8786

Starting the Dask workers on Node2 (8 CPU, 32 GB) and Node3 (8 CPU, 32 GB):

dask-worker tcp://xxx.xxx.xxx.xxx:8786 --nanny-port 3000:3004 --worker-port 3100:3104 --dashboard-address :8789
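For reference, worker registration can be checked from a client like this (a minimal sketch; it assumes the client runs on a machine that can reach the scheduler address above):

from dask.distributed import Client

client = Client('tcp://xxx.xxx.xxx.xxx:8786')  # same scheduler address the workers connect to
# scheduler_info() lists the registered workers; entries for both Node2 and Node3 should appear here
print(client.scheduler_info()['workers'])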
Here is my prototype (I have redacted the some_private_processing and some_processing methods):
import glob
import pandas as pd
from dask.distributed import Client
import dask.dataframe as dd
N_CORES = 16
THREADS_PER_WORKER = 2
dask_cluster = Client(
    '127.0.0.1:8786'
)
def get_clean_str1(str1):
    # Default result: (clean_str1, match_flag, bad_str1_flag, private_str1, private_match_flag)
    ret_tuple = None, False, True, None, False
    if not str1:
        return ret_tuple
    if string_validators(str1) is not True:
        return ret_tuple
    data = some_processing(str1)
    match_flag = False
    if str1 == data.get('formated_str1'):
        match_flag = True
    private_data = some_private_processing(str1)
    private_str1 = private_data.get('formated_private_str1')
    private_match_flag = False
    if str1 == private_str1:
        private_match_flag = True
    ret_tuple = str1, match_flag, False, private_str1, private_match_flag
    return ret_tuple
files = [
    'part-00000-abcd.gz.parquet',
    'part-00001-abcd.gz.parquet',
    'part-00002-abcd.gz.parquet',
]
print('Starting...')
for idx, each_file in enumerate(files):
    dask_cluster.restart()
    print(f'Processing file {idx}: {each_file}')
    # Read the whole parquet file into a local pandas DataFrame first
    all_str1s_df = pd.read_parquet(
        each_file,
        engine='pyarrow'
    )
    print(f'Read file {idx}: {each_file}')
    # Convert the pandas DataFrame into a Dask DataFrame with 16000 partitions
    all_str1s_df = dd.from_pandas(all_str1s_df, npartitions=16000)
    print(f'Starting file processing {idx}: {each_file}')
    str1_res_tuple = all_str1s_df.map_partitions(
        lambda part: part.apply(
            lambda x: get_clean_str1(x['str1']),
            axis=1
        ),
        meta=tuple
    )
    # Unpack the per-row result tuples into separate column sequences
    (clean_str1,
     match_flag,
     bad_str1_flag,
     private_str1,
     private_match_flag) = zip(*str1_res_tuple)
    all_str1s_df = all_str1s_df.assign(
        clean_str1=pd.Series(clean_str1)
    )
    all_str1s_df = all_str1s_df.assign(
        match_flag=pd.Series(match_flag)
    )
    all_str1s_df = all_str1s_df.assign(
        bad_str1_flag=pd.Series(bad_str1_flag)
    )
    all_str1s_df = all_str1s_df.assign(
        private_str1=pd.Series(private_str1)
    )
    all_str1s_df = all_str1s_df.assign(
        private_match_flag=pd.Series(private_match_flag)
    )
    # Keep only the rows where the formatted string did not match
    all_str1s_df = all_str1s_df[
        all_str1s_df['match_flag'] == False
    ]
    all_str1s_df = all_str1s_df.repartition(npartitions=200)
    all_str1s_df.to_csv(
        f'results-str1s-{idx}-*.csv'
    )
    print(f'Finished file {idx}: {each_file}')
This processing takes more than 8 hours, and I can see that all the data is being processed on only one node, either Node2 or Node3, but never on both at the same time.
I need help understanding what I am doing wrong that makes this simple data transformation run for more than 8 hours without completing.