I have developed a V0 time series model for forecasting Citi Bike rental usage using PyCaret. The model:
- builds data from an existing database
- sorts training, validation and forecast datasets
- trains, evaluates and forecasts using pycaret
- visualises the results in Plotly
Whilst the model itself is working well, I would like to harden the quality of the code, and it would be great if anyone has advice on how to improve it when considering:
- Using standard patterns / naming conventions
- Ensuring the code is reusable
- Avoiding hard-coded elements and parameterisation conflicts
- How the model is trained / deployed in production
The code
class citibike_rental_forecast():
    """Train, evaluate and apply a PyCaret regression model for hourly rentals.

    Typical usage::

        cbr = citibike_rental_forecast()
        cbr.train_model()        # fit, tune, finalise and save the model

        cbr = citibike_rental_forecast()
        cbr.predict_forecast()   # load the saved model, score and plot

    NOTE: constructing an instance immediately runs the Spark SQL query in
    ``prep_timeseries`` (this matches the original behaviour). The snake_case
    class name is kept so existing callers continue to work.
    """

    # Calendar features fed to the model. These are the columns the SQL in
    # ``_build_query`` emits between ``station_id`` and ``y``; naming them
    # explicitly avoids depending on the column order of the result set.
    FEATURE_COLUMNS = ("year", "month", "day", "dayofweek", "hour", "is_holiday")
    # Columns present in the training frame that must not be used as features.
    IGNORED_COLUMNS = ("station_id", "forecast_period")
    # PyCaret regression model ids compared during training.
    CANDIDATE_MODELS = ("lr", "rf", "gbr", "knn", "dt", "lasso", "ridge")

    def __init__(
        self,
        hours_to_forecast: int = 168,
        example_station_id: int = 127,
        *,
        train_period: tuple = ("2016-10-31", "2017-10-31"),
        val_period: tuple = ("2017-11-01", "2017-11-30"),
        plot_period: tuple = ("2016-08", "2018-02"),
        plot_axis_range: tuple = ("2017-10-01", "2017-12-17"),
        model_path: str = "model_objects/pycaret_final_model",
        forecast_image_path: str = "images/pycaret_forecast.png",
        session_id=None,
    ):
        """Store the configuration and load the time series from the database.

        Args:
            hours_to_forecast: Number of hours after each station's
                ``end_date`` to include as forecast rows.
            example_station_id: ``station_id`` to train and forecast for.
            train_period: ``(start, end)`` date strings (inclusive) selecting
                the training rows.
            val_period: ``(start, end)`` date strings (inclusive) selecting
                the validation rows.
            plot_period: ``(start, end)`` date strings selecting the rows
                passed to the forecast chart.
            plot_axis_range: ``(start, end)`` ``YYYY-MM-DD`` strings used as
                the initial x-axis range of the forecast chart.
            model_path: Path (without extension) used to save and load the
                finalised PyCaret model.
            forecast_image_path: File the forecast chart is written to.
            session_id: Optional seed passed to PyCaret ``setup``; ``None``
                keeps PyCaret's default behaviour.
        """
        self.hours_to_forecast = hours_to_forecast
        self.example_station_id = example_station_id
        self.train_period = tuple(train_period)
        self.val_period = tuple(val_period)
        self.plot_period = tuple(plot_period)
        self.plot_axis_range = tuple(plot_axis_range)
        self.model_path = model_path
        self.forecast_image_path = forecast_image_path
        self.session_id = session_id
        # Load the raw data up front, as the original constructor did.
        self.prep_timeseries()

    def train_model(self) -> None:
        """Split the data, train and save the model, then score train/val."""
        self.prep_train_val()
        self.train_pycaret_model()
        self.eval_pycaret_model()

    def predict_forecast(self) -> None:
        """Load the saved model, score the forecast rows and plot the result."""
        station = self._select_station()
        self.model = load_model(self.model_path)
        # Forecast rows are flagged with forecast_period == 1 by the query.
        forecast_rows = station.loc[station["forecast_period"] == 1]
        self.x_forecast = forecast_rows[self.model_feature_list]
        forecast = predict_model(estimator=self.model, data=self.x_forecast)
        self.forecast = forecast.sort_index()
        self.visualise_forecast()

    def prep_timeseries(self) -> None:
        """Query the database and store the result indexed by timestamp.

        Sets ``self.model_data_pd`` (indexed by ``ds``) and
        ``self.model_feature_list``.
        """
        frame = execute_spark_sql_query(self._build_query()).toPandas()
        # ``ds`` is returned as a string by the query; convert it before
        # using it as the index so date-string slicing works later on.
        frame["ds"] = pd.to_datetime(frame["ds"])
        frame = frame.set_index("ds")
        self.model_data_pd = frame
        self.model_feature_list = list(self.FEATURE_COLUMNS)

    def _build_query(self) -> str:
        """Return the SQL that assembles history and forecast rows."""
        # Coerce to int so only a plain integer is ever interpolated.
        hours = int(self.hours_to_forecast)
        return f"""
            SELECT
                a.station_id,
                cast(a.hour as string) as ds,
                EXTRACT(year from a.hour) as year,
                EXTRACT(month from a.hour) as month,
                EXTRACT(day from a.hour) as day,
                EXTRACT(dayofweek from a.hour) as dayofweek,
                EXTRACT(hour from a.hour) as hour,
                CASE WHEN d.date IS NULL THEN 0 ELSE 1 END as is_holiday,
                COALESCE(b.rentals,0) as y,
                a.forecast_period
            FROM ( -- all rental hours by currently active stations
                SELECT
                    y.station_id,
                    x.hour,
                    CASE WHEN x.hour <= y.end_date THEN 0 ELSE 1 END as forecast_period
                FROM citibike.periods x
                INNER JOIN citibike.stations_most_active y
                    ON x.hour BETWEEN y.start_date AND (y.end_date + INTERVAL {hours} HOURS)
            ) a
            LEFT OUTER JOIN citibike.rentals b
                ON a.station_id=b.station_id AND a.hour=b.hour
            LEFT OUTER JOIN citibike.holidays d
                ON TO_DATE(a.hour)=d.date
        """

    def _select_station(self):
        """Store and return the rows for ``example_station_id``, sorted by time."""
        mask = self.model_data_pd["station_id"] == self.example_station_id
        # sort_index() returns a new frame, so the filtered slice is never
        # mutated in place.
        self.station_data_pd = self.model_data_pd.loc[mask].sort_index()
        return self.station_data_pd

    def prep_train_val(self) -> None:
        """Build the training and validation frames for the selected station."""
        station = self._select_station()
        # History rows are flagged with forecast_period == 0 by the query.
        history = station.loc[station["forecast_period"] == 0]

        train_start, train_end = self.train_period
        self.train_pd = history.loc[train_start:train_end]
        self.x_train = self.train_pd[self.model_feature_list]
        self.y_train = self.train_pd["y"].values

        val_start, val_end = self.val_period
        self.val_pd = history.loc[val_start:val_end]
        self.x_val = self.val_pd[self.model_feature_list]
        self.y_val = self.val_pd["y"].values

    def train_pycaret_model(self) -> None:
        """Compare candidate models, tune the best one, finalise and save it."""
        setup(
            self.train_pd,
            target="y",
            ignore_features=list(self.IGNORED_COLUMNS),
            numeric_features=self.model_feature_list,
            session_id=self.session_id,
        )
        # Naive baselines to benchmark against. The "constant" and "quantile"
        # strategies need an explicit value or they fail when fitted:
        # zero rentals and the median are used here.
        benchmark_models = [
            DummyRegressor(strategy="mean"),
            DummyRegressor(strategy="constant", constant=0),
            DummyRegressor(strategy="quantile", quantile=0.5),
        ]
        best = compare_models(
            include=list(self.CANDIDATE_MODELS) + benchmark_models,
            verbose=True,
            round=5,
            fold=3,
        )
        self.model = tune_model(best)
        self.final_v = finalize_model(self.model)
        # Make sure the target directory exists before saving.
        self._ensure_parent_dir(self.model_path)
        save_model(self.final_v, self.model_path)

    def eval_pycaret_model(self) -> None:
        """Score the training and validation frames with ``self.model``."""
        print('Train data')
        self.train_predictions = predict_model(estimator=self.model, data=self.train_pd)
        print('Validation data')
        self.val_predictions = predict_model(estimator=self.model, data=self.val_pd)

    def visualise_model(self) -> None:
        """Save PyCaret's error and feature plots for the finalised model.

        Requires ``train_pycaret_model`` to have run (it sets ``self.final_v``).
        """
        for plot_type in ("error", "feature"):
            plot_model(self.final_v, plot=plot_type, save=True)

    def visualise_forecast(self) -> None:
        """Plot fitted, validation and forecast values, then save the figure.

        Expects ``self.forecast`` to be set (see ``predict_forecast``).
        """
        # Re-score train/validation data so the chart can show the fit.
        self.prep_train_val()
        self.eval_pycaret_model()

        train = self._plot_window(self.train_predictions)
        val = self._plot_window(self.val_predictions)
        forecast = self._plot_window(self.forecast)

        fig = go.Figure()
        # go.Scatter(mode="lines") replaces the deprecated go.Line alias.
        fig.add_trace(go.Scatter(
            x=train.ds,
            y=train.prediction_label,
            mode="lines",
            line_color="black",
            name="train_pred",
        ))
        fig.add_trace(go.Scatter(
            x=train.ds,
            y=train.y,
            mode="markers",
            marker_color="black",
            name="train_actual",
        ))
        # No explicit colour: val_pred uses the figure's default colourway.
        fig.add_trace(go.Scatter(
            x=val.ds,
            y=val.prediction_label,
            mode="lines",
            name="val_pred",
        ))
        fig.add_trace(go.Scatter(
            x=val.ds,
            y=val.y,
            mode="markers",
            marker_color="black",
            name="val_actual",
        ))
        fig.add_trace(go.Scatter(
            x=forecast.ds,
            y=forecast.prediction_label,
            mode="lines",
            name="forecast",
        ))

        fig.update_layout(
            xaxis_range=[
                datetime.strptime(day, "%Y-%m-%d") for day in self.plot_axis_range
            ],
            xaxis_title="Date",
            yaxis_title="rentals",
        )
        fig.show()
        # Make sure the target directory exists before writing the image.
        self._ensure_parent_dir(self.forecast_image_path)
        fig.write_image(self.forecast_image_path)

    def _plot_window(self, frame):
        """Return the rows of *frame* inside ``plot_period`` with ``ds`` as a column."""
        start, end = self.plot_period
        return frame.loc[start:end].reset_index().sort_values("ds")

    @staticmethod
    def _ensure_parent_dir(path) -> None:
        """Create the parent directory of *path* if it does not exist yet."""
        from pathlib import Path

        Path(path).parent.mkdir(parents=True, exist_ok=True)
Run using ...
```
cbr = citibike_rental_forecast()
cbr.train_model()
```
and
```
cbr = citibike_rental_forecast()
cbr.predict_forecast()
```
The model runs, but I am looking for advice on improving the quality of the code and implementing standard production practice.