I'm using the following code to construct an iterable dataset to work with multiple datafiles:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import IterableDataset

# LOOKBACK, BATCH_SIZE, all_data_files and target_file are defined elsewhere in my script.

class LargeScaleDataset(IterableDataset):
    def __init__(self, csv_files, target_file, chunksize=10 ** 6):  # 1e6 rows per chunk
        self.csv_files = [f for f in csv_files if f != target_file]
        self.target_file = target_file
        self.chunksize = chunksize
        # Infer n_features from a sample file
        sample_df = pd.read_csv(self.csv_files[0], nrows=1)
        self.n_features = len(sample_df.columns)
    def create_sequences(self, data, labels):
        seq_data, seq_labels = [], []
        for i in range(len(data) - LOOKBACK):
            seq_data.append(data[i:i + LOOKBACK])
            seq_labels.append(labels[i + LOOKBACK])
        return np.array(seq_data), np.array(seq_labels)
    def preprocess_data(self, df):
        # Parse the date-time and set it as index
        df['datetime'] = pd.to_datetime(df.iloc[:, 0])
        df.set_index('datetime', inplace=True)
        # Read the target file's date-time index
        target_df = pd.read_csv(self.target_file, usecols=[0], parse_dates=[0], index_col=0)
        # Reindex the current dataframe against the target's date-time index
        df = df.reindex(target_df.index)
        # Fill missing values
        df.fillna(method="ffill", inplace=True)
        df.fillna(method="bfill", inplace=True)
        # Convert the date-time index to Unix timestamp and add as a new column
        # df["time_in_epoch_seconds"] = (df.index - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')
        df.drop(columns=df.columns[0], axis=1, inplace=True)
        df.reset_index()
        # df.set_index('time_in_epoch_seconds')
        if 'datetime' in df.columns:
            df.drop('datetime', axis=1)
        print(df.head())
        df.drop(columns=df.columns[0], axis=1, inplace=True)
        df.drop(columns=df.columns[0], axis=1, inplace=True)
        # Convert DataFrame to NumPy array, excluding the datetime index
        timeseries = df.values.astype('float32')
        target = timeseries[:, -2]  # Adjusted because we added the timestamp column, and it's the second-last column
        seq_data, seq_target = self.create_sequences(timeseries, target)
        return seq_data, seq_target
    def __iter__(self):
        for file in self.csv_files:
            chunk_iter = pd.read_csv(file, chunksize=self.chunksize)
            for df in chunk_iter:
                timeseries, target = self.preprocess_data(df)
                # Slide a LOOKBACK window over the (already sequenced) data and yield one sample at a time
                for i in range(0, len(timeseries) - LOOKBACK + 1):
                    yield timeseries[i:i + LOOKBACK, :], target[i + LOOKBACK - 1]
    def get_test_train_data(self):
        # Split based on files rather than on the target data
        train_files, test_files = np.split(self.csv_files, [int(0.8 * len(self.csv_files))])  # 80-20 split
        return train_files, test_files
train_data, test_data = LargeScaleDataset(all_data_files, target_file).get_test_train_data()
train_dataset = LargeScaleDataset(train_data, target_file)
test_dataset = LargeScaleDataset(test_data, target_file)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)
However, when I iterate over the dataloader I've constructed, I find that each batch has the following shape:
torch.Size([64, 10, 10, 3])
Both my training and test batches should be 3-D tensors, not 4-D. The problem only appeared after I added BATCH_SIZE to my implementation, so I believe the batching is adding an extra dimension to my tensor, or duplicating an existing one. I have tried squeeze() to remove the extra dimension, with no luck. How can I make sure that the batches coming out of my dataloader are 3-D tensors rather than 4-D?
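For reference, here is roughly how I am checking where the extra dimension comes from. It is only a sketch that assumes the train_dataset and train_loader objects defined above are in scope; the sample_x/batch_x names are just for illustration. It pulls one raw sample straight from the dataset (before batching) and one batch from the loader, and prints both shapes:

# Sketch: compare a single raw sample from the dataset with one collated batch.
# If the single sample already has three dimensions, the DataLoader's batch
# dimension is what produces the 4-D shape I am seeing.
sample_x, sample_y = next(iter(train_dataset))
print("single sample shape:", sample_x.shape)

batch_x, batch_y = next(iter(train_loader))
print("batch shape:", batch_x.shape)  # this is where I see torch.Size([64, 10, 10, 3])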