The aim of below code is to find the number of hours a customer id spends online on a #particular date. I am unable to get the online hours for all of the #dates(i.e. 01/06/2017 to 21/06/2017). I have attached the images of o/p of current code, csv file of customer data and pings data:
import datetime
import math
final_train_df = pd.DataFrame(columns=['id', 'date', 'hours'])
excluded_cust_ids = []
final_set = {}
train__df = pd.DataFrame()
for index, i in enumerate(customer_data['id'][:10].values.tolist()):
print('Started for CustID '+ str(i) + " count is "+ str(index))
train__df['timestamp'] = pings_data.loc[pings_data['id']==i]["timestamp"]
train__df['date'] = pings_data.loc[pings_data['id']==i]["timestamp"].apply(lambda arr:
datetime.datetime.fromtimestamp(arr).strftime('%Y-%m-%d'))
train__df['time'] = pings_data.loc[pings_data['id']==i]["timestamp"].apply(lambda arr:
datetime.datetime.fromtimestamp(arr).strftime('%H:%M:%S'))
unique_dates = pd.unique(train__df['date']).tolist()
print(unique_dates)
if math.isnan(train__df['timestamp'].values.tolist()[0]):
excluded_cust_ids.append(i)
else:
for j in unique_dates:
time_stamp = train__df.loc[train__df['date']==j].sort_values(by='time')
end_time = time_stamp['time'].iloc[len(time_stamp)-1]
start_time = time_stamp['time'].iloc[0]
t1 = pd.to_datetime(start_time)
t2 = pd.to_datetime(end_time)
diff_time = round((pd.Timedelta(t2 - t1).seconds/(3600)),1)
val = {'id':i, 'date':j, 'hours':diff_time}
final_train_df = final_train_df.append({'id':i, 'date':j, 'hours':diff_time}, True)
print(final_train_df)
print(excluded_cust_ids)
final_train_df.to_excel("Final_train_df_excluding_cust_ids.xlsx")
print("completed for all customers except for non-active customers")