I am trying to execute logistic regression in order to predict the probability a user_type : 'customer' will convert to 'subscriber' based upon user_type attributes 'gender', 'birth year', &'day of week'.
Here I have a sample of my dataset:
dput(head(dataset2,10))
structure(list(trip_id = 21742443:21742452, start_time = c("2019-01-01 0:04:37",
"2019-01-01 0:08:13", "2019-01-01 0:13:23", "2019-01-01 0:13:45",
"2019-01-01 0:14:52", "2019-01-01 0:15:33", "2019-01-01 0:16:06",
"2019-01-01 0:18:41", "2019-01-01 0:18:43", "2019-01-01 0:19:18"
), end_time = c("2019-01-01 0:11:07", "2019-01-01 0:15:34", "2019-01-01 0:27:12",
"2019-01-01 0:43:28", "2019-01-01 0:20:56", "2019-01-01 0:19:09",
"2019-01-01 0:19:03", "2019-01-01 0:20:21", "2019-01-01 0:47:30",
"2019-01-01 0:24:54"), bikeid = c(2167L, 4386L, 1524L, 252L,
1170L, 2437L, 2708L, 2796L, 6205L, 3939L), tripduration = c("390",
"441", "829", "1,783.00", "364", "216", "177", "100", "1,727.00",
"336"), from_station_id = c(199L, 44L, 15L, 123L, 173L, 98L,
98L, 211L, 150L, 268L), from_station_name = c("Wabash Ave & Grand Ave",
"State St & Randolph St", "Racine Ave & 18th St", "California Ave & Milwaukee Ave",
"Mies van der Rohe Way & Chicago Ave", "LaSalle St & Washington St",
"LaSalle St & Washington St", "St. Clair St & Erie St", "Fort Dearborn Dr & 31st St",
"Lake Shore Dr & North Blvd"), to_station_id = c(84L, 624L, 644L,
176L, 35L, 49L, 49L, 142L, 148L, 141L), to_station_name = c("Milwaukee Ave & Grand Ave",
"Dearborn St & Van Buren St (*)", "Western Ave & Fillmore St (*)",
"Clark St & Elm St", "Streeter Dr & Grand Ave", "Dearborn St & Monroe St",
"Dearborn St & Monroe St", "McClurg Ct & Erie St", "State St & 33rd St",
"Clark St & Lincoln Ave"), user_type = c("Subscriber", "Subscriber",
"Subscriber", "Subscriber", "Subscriber", "Subscriber", "Subscriber",
"Subscriber", "Subscriber", "Subscriber"), gender = c("Male",
"Female", "Female", "Male", "Male", "Female", "Male", "Male",
"Male", "Male"), birthyear = c(1989L, 1990L, 1994L, 1993L, 1994L,
1983L, 1984L, 1990L, 1995L, 1996L), ride_length = c("0:06:30",
"0:07:21", "0:13:49", "0:29:43", "0:06:04", "0:03:36", "0:02:57",
"0:01:40", "0:28:47", "0:05:36"), day_of_week = c(3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L)), row.names = c(NA, 10L), class = "data.frame")`
I am trying to execute the split test, but before I complete that I need to convert the data format of a few columns, as they are erroneously listed as characters as opposed to integers and specifically speaking for the ride_length column, it needs to be converted to hms.
To convert the columns I executed this code:
library(tidyverse)
library(tidymodels) library(hms)
dataset2 <- read.csv("Bike_Trips_2019.csv")
dataset2$user_type <- factor(dataset2$user_type, levels = c("Customer", "Subscriber"), labels = c("no","yes"))
dataset2$trip_id <- as.character(dataset2$trip_id)
dataset2$start_time <- as.POSIXct(dataset2$start_time)
dataset2$end_time <- as.POSIXct(dataset2$end_time)
dataset2$tripduration <- parse_number(dataset2$tripduration)
dataset2$ride_length <- as_hms(dataset2$ride_length) set.seed(421) split <- initial_split(dataset2, prop = 0.8, strata = user_type) train <-split %>% training() test <- split %>% testing() `
Error Message: Error inabort_lossy_cast(): ! Lossy cast from <character> to <hms> at position(s) 101, 146, 854, 1405, 7935, ... (and 187 more)
What's wrong with
periodtype from {lubridate}? Or, if it has to behmsthen something like should work:Created on 2024-03-17 with reprex v2.1.0