Uploading Divvy datasets (csv files) here
Oct_2020_tripdata <- read_csv("Oct 2020.csv")
Nov_2020_tripdata <- read_csv("Nov 2020.csv")
Dec_2020_tripdata <- read_csv("Dec 2020.csv")
Jan_2021_tripdata <- read_csv("Jan 2021.csv")
Feb_2021_tripdata <- read_csv("Feb 2021.csv")
Mar_2021_tripdata <- read_csv("Mar 2021.csv")
Converting relevant columns to character so that they can stack correctly
Oct_2020_tripdata <- mutate(Oct_2020_tripdata, start_station_id = as.character(start_station_id)
,end_station_id = as.character(end_station_id))
Nov_2020_tripdata <- mutate(Nov_2020_tripdata, start_station_id = as.character(start_station_id)
,end_station_id = as.character(end_station_id))
Dec_2020_tripdata <- mutate(Dec_2020_tripdata, start_station_id = as.character(start_station_id)
,end_station_id = as.character(end_station_id))
Jan_2021_tripdata <- mutate(Jan_2021_tripdata, start_station_id = as.character(start_station_id)
,end_station_id = as.character(end_station_id))
Feb_2021_tripdata <- mutate(Feb_2021_tripdata, start_station_id = as.character(start_station_id)
,end_station_id = as.character(end_station_id))
Mar_2021_tripdata <- mutate(Mar_2021_tripdata, start_station_id = as.character(start_station_id)
,end_station_id = as.character(end_station_id))
Binding data frames
all_trips <- bind_rows(Oct_2020_tripdata, Nov_2020_tripdata, Dec_2020_tripdata,
Jan_2021_tripdata, Feb_2021_tripdata, Mar_2021_tripdata)
Removing rows with missing values
colSums(is.na(all_trips))
all_trips_cleaned <- all_trips[complete.cases(all_trips), ]
Flitering started_at data that is greater than ended_at
all_trips_cleaned <- all_trips_cleaned %>%
filter(all_trips_cleaned$started_at < all_trips_cleaned$ended_at)
Creating new columns to list the date, month, day, and year of each ride
all_trips_cleaned$date <- as.Date(all_trips_cleaned$started_at, format= "%m/%d/%Y")
all_trips_cleaned$month <- format(as.Date(all_trips_cleaned$date), "%m")
all_trips_cleaned$day <- format(as.Date(all_trips_cleaned$date), "%d")
all_trips_cleaned$year <- format(as.Date(all_trips_cleaned$date), "%Y")
all_trips_cleaned$day_of_week <- format(as.Date(all_trips_cleaned$date), "%A")
Trying to add a new column to calculate each ride length in seconds as numeric using R
all_trips_cleaned$ride_length <- as.numeric(difftime(all_trips_cleaned$ended_at, all_trips_cleaned$started_at))
This is the error message I'm getting:
Error in as.POSIXlt.character(x, tz, ...) :
character string is not in a standard unambiguous format