How can I improve the production quality of this time series model

21 Views Asked by At

I have developed a V0 time series forecasting model for forecasting citibike rental usage into the future using PyCaret. The model

  • builds data from an existing database
  • sorts training, validation and forecast datasets
  • trains, evaluates and forecasts using pycaret
  • visualises the results in plotly

Whilst the model itself is working well, I would like to harden the quality of the code, and it would be great if anyone has advice on how to improve it when considering:

  • Using standard patterns / naming conventions
  • Ensuring the code is reusable
  • Avoiding hard-coded elements and parameterisation conflicts
  • How the model is trained / deployed in production

The code

class citibike_rental_forecast():
    """Train and serve an hourly rental forecast for a single Citi Bike station.

    Construction runs the Spark SQL query and caches the resulting frame, so an
    instance is ready for either of the two entry points:

    * ``train_model()``      - split, train/tune/save with PyCaret, evaluate.
    * ``predict_forecast()`` - load the saved model, score the forecast
      period and plot it.

    The class-level constants below hold every value that used to be
    hard-coded inside the methods; override them on a subclass or on an
    instance to retarget the pipeline without editing method bodies.

    Args:
        hours_to_forecast: Number of hours past each station's ``end_date``
            to include as the forecast period. Must be convertible to ``int``.
        example_station_id: ``station_id`` the model is trained/scored for.
    """

    # Name passed to PyCaret's save_model / load_model.
    MODEL_PATH = 'model_objects/pycaret_final_model'
    # Static image written by visualise_forecast().
    FORECAST_IMAGE_PATH = "images/pycaret_forecast.png"
    # Inclusive (start, end) date labels used to slice the DatetimeIndex.
    TRAIN_PERIOD = ('2016-10-31', '2017-10-31')
    VAL_PERIOD = ('2017-11-01', '2017-11-30')
    # Rows kept for plotting, and the x-axis range initially shown.
    PLOT_DATA_PERIOD = ('2016-08', '2018-02')
    PLOT_X_RANGE = ('2017-10-01', '2017-12-17')
    # Model inputs, named explicitly rather than picked by column position.
    FEATURE_COLUMNS = ('year', 'month', 'day', 'dayofweek', 'hour', 'is_holiday')

    def __init__(self, hours_to_forecast=168, example_station_id=127):
        self.hours_to_forecast = hours_to_forecast
        self.example_station_id = example_station_id

        # Load the full time series up front; both entry points need it.
        self.prep_timeseries()

    def train_model(self):
        """Prepare the train/validation split, then train and evaluate."""
        self.prep_train_val()
        self.train_pycaret_model()
        self.eval_pycaret_model()

    def predict_forecast(self):
        """Score the forecast period with the saved model and plot the result.

        Sets ``self.model``, ``self.x_forecast`` and ``self.forecast``.
        """
        station_pd = self._select_station()

        # Load the pre-trained model persisted by train_pycaret_model().
        self.model = load_model(self.MODEL_PATH)

        # Rows flagged forecast_period == 1 are the ones to score.
        forecast_pd = station_pd.loc[station_pd['forecast_period'] == 1]
        self.x_forecast = forecast_pd[self.model_feature_list]

        forecast = predict_model(estimator=self.model, data=self.x_forecast)
        self.forecast = forecast.sort_index()

        self.visualise_forecast()

    def _build_timeseries_query(self):
        """Return the Spark SQL text that assembles the modelling dataset.

        Raises:
            ValueError / TypeError: if ``hours_to_forecast`` is not
                convertible to ``int``.
        """
        # Coerce to int so only a plain number can reach the SQL text.
        horizon_hours = int(self.hours_to_forecast)
        return f'''
           SELECT
            a.station_id,
            cast(a.hour as string) as ds,
            EXTRACT(year from a.hour) as year,
            EXTRACT(month from a.hour) as month,
            EXTRACT(day from a.hour) as day,    
            EXTRACT(dayofweek from a.hour) as dayofweek,
            EXTRACT(hour from a.hour) as hour,
            CASE WHEN d.date IS NULL THEN 0 ELSE 1 END as is_holiday,
            COALESCE(b.rentals,0) as y,
            a.forecast_period
          FROM ( -- all rental hours by currently active stations
            SELECT 
              y.station_id,
              x.hour,
              CASE WHEN x.hour <= y.end_date THEN 0 ELSE 1 END as forecast_period
            FROM citibike.periods x
            INNER JOIN citibike.stations_most_active y
             ON x.hour BETWEEN y.start_date AND (y.end_date + INTERVAL {horizon_hours} HOURS)
            ) a
          LEFT OUTER JOIN citibike.rentals b
            ON a.station_id=b.station_id AND a.hour=b.hour
          LEFT OUTER JOIN citibike.holidays d
            ON TO_DATE(a.hour)=d.date
          '''

    def prep_timeseries(self):
        """Run the query and cache the result indexed by timestamp.

        Sets ``self.model_data_pd`` (indexed by ``ds``) and
        ``self.model_feature_list``.
        """
        model_data_pd = execute_spark_sql_query(self._build_timeseries_query()).toPandas()

        # ds arrives as a string; convert it and use it as the index.
        model_data_pd['ds'] = pd.to_datetime(model_data_pd['ds'])
        self.model_data_pd = model_data_pd.set_index('ds')

        self.model_feature_list = list(self.FEATURE_COLUMNS)

    def _select_station(self):
        """Filter the cached data to ``example_station_id``, sorted by time.

        Stores the result on ``self.station_data_pd`` and returns it. Uses a
        non-mutating sort so the filtered slice is never modified in place.
        """
        is_station = self.model_data_pd['station_id'] == self.example_station_id
        self.station_data_pd = self.model_data_pd.loc[is_station].sort_index()
        return self.station_data_pd

    def prep_train_val(self):
        """Build the training and validation frames for the example station.

        Sets ``train_pd``/``x_train``/``y_train`` and
        ``val_pd``/``x_val``/``y_val``.
        """
        station_pd = self._select_station()

        # Only history (forecast_period == 0) has observed targets.
        ts_pd = station_pd.loc[station_pd['forecast_period'] == 0]

        train_start, train_end = self.TRAIN_PERIOD
        self.train_pd = ts_pd.loc[train_start:train_end]
        self.x_train = self.train_pd[self.model_feature_list]
        self.y_train = self.train_pd['y'].values

        val_start, val_end = self.VAL_PERIOD
        self.val_pd = ts_pd.loc[val_start:val_end]
        self.x_val = self.val_pd[self.model_feature_list]
        self.y_val = self.val_pd['y'].values

    @staticmethod
    def _ensure_parent_dir(path):
        """Create the directory that will contain ``path`` if it is missing."""
        import os

        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)

    def train_pycaret_model(self):
        """Compare candidate regressors, tune the best one and persist it.

        Sets ``self.model`` (tuned model) and ``self.final_v`` (finalized
        model), and saves the latter under ``MODEL_PATH``.
        """
        discard_columns = ['station_id', 'forecast_period']

        # NOTE: pass session_id=<int> here if reproducible runs are required.
        setup(self.train_pd,
              target='y',
              ignore_features=discard_columns,
              numeric_features=self.model_feature_list,
              )

        # Dummy models act as a benchmark. scikit-learn requires an explicit
        # `constant` / `quantile` for those strategies; without them the
        # estimators raise at fit time and never reach the comparison.
        # NOTE(review): 0 rentals and the median are assumed baselines - confirm.
        dummy_mean = DummyRegressor(strategy="mean")
        dummy_constant = DummyRegressor(strategy="constant", constant=0)
        dummy_quantile = DummyRegressor(strategy="quantile", quantile=0.5)

        bhmk_models = [dummy_mean, dummy_constant, dummy_quantile]
        model_selection = ["lr", "rf", "gbr", "knn", "dt", "lasso", "ridge"]

        # Train all candidates and keep the best.
        best = compare_models(include=model_selection + bhmk_models, verbose=True, round=5, fold=3)

        # Fine tune the winner, then finalize it before saving.
        self.model = tune_model(best)
        self.final_v = finalize_model(self.model)

        # Make sure the target directory exists before a long run ends in
        # a failed save.
        self._ensure_parent_dir(self.MODEL_PATH)
        save_model(self.final_v, self.MODEL_PATH)

    def eval_pycaret_model(self):
        """Score the training and validation frames with ``self.model``.

        Requires ``prep_train_val()`` to have run. Sets
        ``self.train_predictions`` and ``self.val_predictions``.
        """
        print('Train data')
        self.train_predictions = predict_model(estimator=self.model, data=self.train_pd)

        print('Validation data')
        self.val_predictions = predict_model(estimator=self.model, data=self.val_pd)

    def visualise_model(self):
        """Save PyCaret's error and feature plots for the finalized model.

        Requires ``train_pycaret_model()`` to have run (reads ``self.final_v``).
        """
        for plot_type in ('error', 'feature'):
            plot_model(self.final_v, plot=plot_type, save=True)

    def _plot_frame(self, predictions):
        """Return ``predictions`` limited to the plot window, ordered by ``ds``."""
        window_start, window_end = self.PLOT_DATA_PERIOD
        return predictions.loc[window_start:window_end].reset_index().sort_values('ds')

    def visualise_forecast(self):
        """Plot train/validation fit and the forecast, show it and save a PNG.

        Requires ``self.model`` and ``self.forecast`` (set by
        ``predict_forecast()``).
        """
        # Re-score history so fitted values can be drawn next to the forecast.
        self.prep_train_val()
        self.eval_pycaret_model()

        train_predictions_plt = self._plot_frame(self.train_predictions)
        val_predictions_plt = self._plot_frame(self.val_predictions)
        forecast_plt = self._plot_frame(self.forecast)

        fig = go.Figure()

        # NOTE(review): go.Line appears to be a deprecated plotly alias;
        # consider go.Scatter(mode='lines') - confirm against plotly docs.

        # Training data
        fig.add_trace(go.Line(x=train_predictions_plt.ds,
                              y=train_predictions_plt.prediction_label,
                              marker_color='black',
                              name='train_pred'
                             ))

        fig.add_trace(go.Scatter(x=train_predictions_plt.ds,
                                 y=train_predictions_plt.y,
                                 mode='markers',
                                 marker_color='black',
                                 name='train_actual'
                                ))

        # Validation data
        fig.add_trace(go.Line(x=val_predictions_plt.ds,
                              y=val_predictions_plt.prediction_label,
                              marker_line_color='black',
                              name='val_pred'
                             ))
        fig.add_trace(go.Scatter(x=val_predictions_plt.ds,
                                 y=val_predictions_plt.y,
                                 mode='markers',
                                 marker_color='black',
                                 name='val_actual'
                                ))

        # Forecast
        fig.add_trace(go.Line(x=forecast_plt.ds,
                              y=forecast_plt.prediction_label,
                              name='forecast'
                             ))

        range_start, range_end = self.PLOT_X_RANGE
        fig.update_layout(xaxis_range=[datetime.strptime(range_start, '%Y-%m-%d'),
                                       datetime.strptime(range_end, '%Y-%m-%d')])
        fig.update_layout(xaxis_title="Date", yaxis_title="rentals")

        fig.show()

        self._ensure_parent_dir(self.FORECAST_IMAGE_PATH)
        fig.write_image(self.FORECAST_IMAGE_PATH)

Run using ...

    cbr = citibike_rental_forecast()
    cbr.train_model()

and

    cbr = citibike_rental_forecast()
    cbr.predict_forecast()

The model runs, but I am looking for advice on improving the quality of the code and implementing standard production practice.

0

There are 0 best solutions below