Create a Arima model Grouped by a column

33 Views Asked by At

I'm trying to create a Arima model to forecast impressions(numerical Y variable) based on a keyword.

My table :IOS

Date Keyword Impressions
2019-01-01 Cococola 51354
2019-01-01 Apple 31231
2019-01-01 Samsung 12412
2019-01-01 Mango
2019-01-01 Grapes
2019-01-02 Cococola 3124312

​Can anyone please help me

I have written a code to get it

<groups_iter = np.unique(IOS['Keyword'])
len(groups_iter)
dict_org = {}
dict_pred = {}
group_accuracy = {}
predictions = list()
# Iterate over all groups and get data 
# from Dataframe by filtering for specific group
for i in range(len(groups_iter)):
    X = IOS[IOS['Keyword'] == groups_iter[i]]['Impressions'].values
    size = int(len(X) * 0.70)
    train, test = X[0:size], X[size:len(X)]
    history = [x for x in train]

    # Using ARIMA model here you can also do grid search for best parameters
    for t in range(len(test)):
        model = ARIMA(history, order = (5, 1, 0))
        model_fit = model.fit()
        output = model_fit.forecast()
        yhat = output[0]
        predictions.append(yhat)
        obs = test[t]
        history.append(obs)
        print(groups_iter[i],"Predicted:%f, expected:%f" %(yhat, obs))
    error = mean_squared_log_error(test, predictions)
    dict_org.update({groups_iter[i]: test})
    dict_pred.update({groups_iter[i]: test})

    print("Group: ", groups_iter[i], "Test MSE:%f"% error)
    group_accuracy.update({groups_iter[i]: error})
    plt.plot(test)
    plt.plot(predictions, color = 'red')
    plt.show()

When I'm trying to run the code, I'm getting following error when it runs the 2nd loop:

ValueError                                Traceback (most recent call last)
Cell In[10], line 19
     17     history.append(obs)
     18     print(groups_iter[i],"Predicted:%f, expected:%f" %(yhat, obs))
---> 19 error = mean_squared_log_error(test, predictions)
     20 dict_org.update({groups_iter[i]: test})
     21 dict_pred.update({groups_iter[i]: test})

File ~/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_regression.py:519, in mean_squared_log_error(y_true, y_pred, sample_weight, multioutput, squared)
    461 def mean_squared_log_error(
    462     y_true, y_pred, *, sample_weight=None, multioutput="uniform_average", squared=True
    463 ):
    464     """Mean squared logarithmic error regression loss.
    465 
    466     Read more in the :ref:`User Guide <mean_squared_log_error>`.
   (...)
    517     0.060...
    518     """
--> 519     y_type, y_true, y_pred, multioutput = _check_reg_targets(
    520         y_true, y_pred, multioutput
    521     )
    522     check_consistent_length(y_true, y_pred, sample_weight)
    524     if (y_true < 0).any() or (y_pred < 0).any():

File ~/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_regression.py:100, in _check_reg_targets(y_true, y_pred, multioutput, dtype)
     66 def _check_reg_targets(y_true, y_pred, multioutput, dtype="numeric"):
     67     """Check that y_true and y_pred belong to the same regression task.
     68 
     69     Parameters
   (...)
     98         correct keyword.
     99     """
--> 100     check_consistent_length(y_true, y_pred)
    101     y_true = check_array(y_true, ensure_2d=False, dtype=dtype)
    102     y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)

File ~/anaconda3/lib/python3.11/site-packages/sklearn/utils/validation.py:397, in check_consistent_length(*arrays)
    395 uniques = np.unique(lengths)
    396 if len(uniques) > 1:
--> 397     raise ValueError(
    398         "Found input variables with inconsistent numbers of samples: %r"
    399         % [int(l) for l in lengths]
    400     )

ValueError: Found input variables with inconsistent numbers of samples: [104, 520]

0

There are 0 best solutions below