import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pandas_datareader as web
import datetime as dt
import yfinance as yf
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM
# Load Data
company = 'TSLA'
start = dt.datetime(2012, 1, 1)
end = dt.datetime(2020, 1, 1)
data = yf.download(company, start=start, end=end)
# Add volume moving-average features
data['Volume_MA10'] = data['Volume'].rolling(window=10).mean()
data['Volume_MA5'] = data['Volume'].rolling(window=5).mean()
# Drop rows with NaN values after adding the moving average column
data.dropna(inplace=True)
# Scale the data
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data = scaler.fit_transform(data[['Close', 'Volume', 'Volume_MA10', 'Volume_MA5']])
prediction_days = 60
x_train = []
y_train = []
for x in range(prediction_days, len(scaled_data)):
    x_train.append(scaled_data[x - prediction_days:x, :])
    y_train.append(scaled_data[x, 0])
x_train, y_train = np.array(x_train), np.array(y_train)
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], x_train.shape[2]))
model = Sequential()
model.add(LSTM(units=50, return_sequences=True, input_shape=(x_train.shape[1], x_train.shape[2])))
model.add(Dropout(0.2))
model.add(LSTM(units=50, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(units=50))
model.add(Dropout(0.2))
model.add(Dense(units=1))
model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(x_train, y_train, epochs=25, batch_size=32)
This all seems to work fine, but when I run the following test code:
# Test the model accuracy on existing data
# Load Test Data
test_start = dt.datetime(2020, 1, 1)
test_end = dt.datetime(2023, 10, 30)
test_data = yf.download(company, start=test_start, end=test_end)
actual_prices = test_data['Close'].values
total_dataset = pd.concat((data['Close'], test_data['Close']), axis=0)
total_dataset = pd.concat((total_dataset, data['Volume'], data['Volume_MA10'], data['Volume_MA5']), axis=1)
model_inputs = total_dataset[len(total_dataset) - len(test_data) - prediction_days:].values
model_inputs = model_inputs.reshape(-1, 4)  # Reshape to include all four features
# Fit scaler with all four features
scaler = MinMaxScaler(feature_range=(0, 1))
model_inputs = scaler.fit_transform(model_inputs)
# Make Predictions on Test Data
x_test = []
for x in range(prediction_days, len(model_inputs)):
    x_test.append(model_inputs[x - prediction_days:x, :])
x_test = np.array(x_test)
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], x_test.shape[2]))
predicted_prices = model.predict(x_test)
# Perform inverse scaling on the predicted prices
predicted_prices = scaler.inverse_transform(predicted_prices)
# Extract the 'Close' column from the predicted prices
predicted_prices = predicted_prices[:, 0]
# Plot the Test Predictions
plt.plot(actual_prices, color='black', label='Actual Prices')
plt.plot(range(prediction_days, prediction_days + len(predicted_prices)), predicted_prices, color='green', label='Predicted Prices')
plt.xlabel('Time')
plt.ylabel('Price')
plt.legend()
plt.show()
I get the following error:
ValueError                                Traceback (most recent call last)
in <cell line: 33>()
     31
     32 # Perform inverse scaling on the predicted prices
---> 33 predicted_prices = scaler.inverse_transform(predicted_prices)
     34
     35 # Extract the 'Close' column from the predicted prices

/usr/local/lib/python3.10/dist-packages/sklearn/preprocessing/_data.py in inverse_transform(self, X)
    539         )
    540
--> 541         X -= self.min_
    542         X /= self.scale_
    543         return X

ValueError: non-broadcastable output operand with shape (963,1) doesn't match the broadcast shape (963,4)
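For reference on the shapes involved: the scaler was fit on four features, so scaler.inverse_transform expects an (n, 4) array, while model.predict returns (n, 1). A minimal sketch of one workaround, assuming the same ['Close', 'Volume', 'Volume_MA10', 'Volume_MA5'] column order used above (padded_predictions is only an illustrative name), would be to pad the predictions back out to four columns before inverting and then keep column 0:
# Sketch: pad the (n, 1) predictions to the scaler's four-feature width,
# invert, then keep only column 0 ('Close'). The zero padding in the other
# columns is discarded, so it does not affect the recovered prices.
padded_predictions = np.zeros((predicted_prices.shape[0], 4))
padded_predictions[:, 0] = predicted_prices[:, 0]
predicted_prices = scaler.inverse_transform(padded_predictions)[:, 0]
An alternative with the same effect is to fit a separate MinMaxScaler on the 'Close' column alone and use that scaler to invert the (n, 1) predictions directly.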