I'm trying to create a Arima model to forecast impressions(numerical Y variable) based on a keyword.
My table :IOS
| Date | Keyword | Impressions |
|---|---|---|
| 2019-01-01 | Cococola | 51354 |
| 2019-01-01 | Apple | 31231 |
| 2019-01-01 | Samsung | 12412 |
| 2019-01-01 | Mango | |
| 2019-01-01 | Grapes | |
| 2019-01-02 | Cococola | 3124312 |
Can anyone please help me
I have written a code to get it
<groups_iter = np.unique(IOS['Keyword'])
len(groups_iter)
dict_org = {}
dict_pred = {}
group_accuracy = {}
predictions = list()
# Iterate over all groups and get data
# from Dataframe by filtering for specific group
for i in range(len(groups_iter)):
X = IOS[IOS['Keyword'] == groups_iter[i]]['Impressions'].values
size = int(len(X) * 0.70)
train, test = X[0:size], X[size:len(X)]
history = [x for x in train]
# Using ARIMA model here you can also do grid search for best parameters
for t in range(len(test)):
model = ARIMA(history, order = (5, 1, 0))
model_fit = model.fit()
output = model_fit.forecast()
yhat = output[0]
predictions.append(yhat)
obs = test[t]
history.append(obs)
print(groups_iter[i],"Predicted:%f, expected:%f" %(yhat, obs))
error = mean_squared_log_error(test, predictions)
dict_org.update({groups_iter[i]: test})
dict_pred.update({groups_iter[i]: test})
print("Group: ", groups_iter[i], "Test MSE:%f"% error)
group_accuracy.update({groups_iter[i]: error})
plt.plot(test)
plt.plot(predictions, color = 'red')
plt.show()
When I'm trying to run the code, I'm getting following error when it runs the 2nd loop:
ValueError Traceback (most recent call last)
Cell In[10], line 19
17 history.append(obs)
18 print(groups_iter[i],"Predicted:%f, expected:%f" %(yhat, obs))
---> 19 error = mean_squared_log_error(test, predictions)
20 dict_org.update({groups_iter[i]: test})
21 dict_pred.update({groups_iter[i]: test})
File ~/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_regression.py:519, in mean_squared_log_error(y_true, y_pred, sample_weight, multioutput, squared)
461 def mean_squared_log_error(
462 y_true, y_pred, *, sample_weight=None, multioutput="uniform_average", squared=True
463 ):
464 """Mean squared logarithmic error regression loss.
465
466 Read more in the :ref:`User Guide <mean_squared_log_error>`.
(...)
517 0.060...
518 """
--> 519 y_type, y_true, y_pred, multioutput = _check_reg_targets(
520 y_true, y_pred, multioutput
521 )
522 check_consistent_length(y_true, y_pred, sample_weight)
524 if (y_true < 0).any() or (y_pred < 0).any():
File ~/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_regression.py:100, in _check_reg_targets(y_true, y_pred, multioutput, dtype)
66 def _check_reg_targets(y_true, y_pred, multioutput, dtype="numeric"):
67 """Check that y_true and y_pred belong to the same regression task.
68
69 Parameters
(...)
98 correct keyword.
99 """
--> 100 check_consistent_length(y_true, y_pred)
101 y_true = check_array(y_true, ensure_2d=False, dtype=dtype)
102 y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)
File ~/anaconda3/lib/python3.11/site-packages/sklearn/utils/validation.py:397, in check_consistent_length(*arrays)
395 uniques = np.unique(lengths)
396 if len(uniques) > 1:
--> 397 raise ValueError(
398 "Found input variables with inconsistent numbers of samples: %r"
399 % [int(l) for l in lengths]
400 )
ValueError: Found input variables with inconsistent numbers of samples: [104, 520]