This is the data:
data5 = requests.get('https://data.ny.gov/resource/8mkn-d32t.json?$limit=1000000').text #pulling the data using API
json5 = json.loads(data5) #converting the data to json
cjtp_df = pd.DataFrame(json5) #storing json to dataframe for easier operations`
cjtp_df['month'] = pd.to_datetime(cjtp_df['month']) #converting the month column to datatime
cjtp_df['month'] = cjtp_df['month'].dt.strftime('%Y-%m') #formatting the datetime column as per the requirements
cjtp_df['number_of_customers'] = cjtp_df['number_of_customers'].astype(float)
cjtp_df['additional_bus_stop_time'] = cjtp_df['additional_bus_stop_time'].astype(float)
cjtp_df['additional_travel_time'] = cjtp_df['additional_travel_time'].astype(float)
cjtp_df['customer_journey_time'] = cjtp_df['customer_journey_time'].astype(float)
This is what the data looks like
This is the systemwide code for what I have so far:
# create a systemwide set of rows
cjtp_systemwide = cjtp_df.groupby(["month"]).agg(
{ "additional_bus_stop_time" : "sum" , "additional_travel_time" : "sum", "customer_journey_time": "sum" , "number_of_customers": "sum"}
)
cjtp_systemwide["borough"] = 'Systemwide' #operation for system wide row
cjtp_systemwide['trip_type'] = 'Systemwide' #operation for system wide row
cjtp_systemwide['route_id'] = 'Systemwide' #operation for system wide row
cjtp_systemwide['period'] = 'Systemwide'#operation for system wide row
cjtp_systemwide["additional_bus_stop_time"] = (cjtp_systemwide["additional_bus_stop_time"]* cjtp_systemwide[ "number_of_customers"]) / cjtp_systemwide[ "number_of_customers"]
cjtp_systemwide["additional_travel_time"] = (cjtp_systemwide["additional_travel_time"]* cjtp_systemwide[ "number_of_customers"]) / cjtp_systemwide[ "number_of_customers"]
cjtp_systemwide["customer_journey_time"] = (cjtp_systemwide["customer_journey_time"]* cjtp_systemwide[ "number_of_customers"]) / cjtp_systemwide[ "number_of_customers"]
cjtp_systemwide.reset_index(inplace=True)
cjtp_df = pd.concat([cjtp_df, cjtp_systemwide])
cjtp_df.sort_values(by=["month", "borough", "trip_type"], ascending=[True, True, True], inplace=True)
cjtp = cjtp_df.reset_index(drop=True) #dropping the index
cjtp.tail(60)
However the numbers are too large I know for a fact customer_journey_time (the last column) systemwide needs to be 0.698 or close to this