PyTorch DataLoader returning incorrect dimensions

I'm using the following code to construct an iterable dataset that works with multiple data files:

import numpy as np
import pandas as pd
import torch
from torch.utils.data import IterableDataset

# LOOKBACK, BATCH_SIZE, all_data_files and target_file are defined elsewhere (omitted here)

class LargeScaleDataset(IterableDataset):
    def __init__(self, csv_files, target_file, chunksize=10 ** 6):  # 1 GB chunks
        self.csv_files = [f for f in csv_files if f != target_file]
        self.target_file = target_file
        self.chunksize = chunksize
        # Inferring n_features from a sample file
        sample_df = pd.read_csv(self.csv_files[0], nrows=1)
        self.n_features = len(sample_df.columns)

    def create_sequences(self, data, labels):
        seq_data, seq_labels = [], []
        for i in range(len(data) - LOOKBACK):
            seq_data.append(data[i:i + LOOKBACK])
            seq_labels.append(
                labels[i + LOOKBACK])
        return np.array(seq_data), np.array(seq_labels)

    def preprocess_data(self, df):
        # Parse the date-time and set it as index
        df['datetime'] = pd.to_datetime(df.iloc[:, 0])
        df.set_index('datetime', inplace=True)

        # Read the target file date-time index
        target_df = pd.read_csv(self.target_file, usecols=[0], parse_dates=[0], index_col=0)
        # Reindex current dataframe with respect to the target's date-time index
        df = df.reindex(target_df.index)

        # Fill missing values
        df.fillna(method="ffill", inplace=True)
        df.fillna(method="bfill", inplace=True)

        # Convert the date-time index to Unix timestamp and add as a new column
        # df["time_in_epoch_seconds"] = (df.index - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')

        df.drop(columns=df.columns[0], axis=1, inplace=True)
        df.reset_index()
        # df.set_index('time_in_epoch_seconds')
        if 'datetime' in df.columns:
            df.drop('datetime', axis=1)
        print(df.head())
        df.drop(columns=df.columns[0], axis=1, inplace=True)
        df.drop(columns=df.columns[0], axis=1, inplace=True)
        # Convert DataFrame to NumPy array excluding datetime index.
        timeseries = df.values.astype('float32')
        target = timeseries[:, -2]  # Adjusted because we added the timestamp column, and it's the second-last column

        seq_data, seq_target = self.create_sequences(timeseries, target)
        return seq_data, seq_target


    def __iter__(self):
        for file in self.csv_files:
            chunk_iter = pd.read_csv(file, chunksize=self.chunksize)
            for df in chunk_iter:
                timeseries, target = self.preprocess_data(df)
                # Group sequences into mini-batches for LSTM
                for i in range(0, len(timeseries) - LOOKBACK + 1):
                    yield timeseries[i:i + LOOKBACK, :], target[i + LOOKBACK - 1]

    def get_test_train_data(self):
        # Split based on files rather than on target data.
        train_files, test_files = np.split(self.csv_files, [int(0.8 * len(self.csv_files))]) # 80-20 split
        return train_files, test_files

train_data, test_data = LargeScaleDataset(all_data_files, target_file).get_test_train_data()
train_dataset = LargeScaleDataset(train_data, target_file)
test_dataset = LargeScaleDataset(test_data, target_file)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)
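
A quick check (not part of my pipeline) that narrows this down is to pull one raw sample straight from the dataset constructed above, since the DataLoader only batches whatever the dataset yields:

# Inspect a single un-batched sample yielded by the dataset itself
sample_x, sample_y = next(iter(train_dataset))
print(np.asarray(sample_x).shape)  # I would expect (LOOKBACK, n_features) here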

However, when I iterate through the batches from the DataLoader I've constructed, I find that each batch has the following shape:

torch.Size([64, 10, 10, 3])

Both my train and test batches should be loading as 3-D tensors, not 4-D. The problem only appeared after I added BATCH_SIZE to my implementation, which I believe is adding an extra dimension to the tensors or duplicating an existing one. I have tried squeeze() to remove the extra dimension, with no luck. How can I make sure that the batches coming out of my DataLoader are 3-D tensors rather than 4-D?
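
For reference, here is a minimal, self-contained sketch (with toy shapes, not my real data) of how I understand the default collation: the DataLoader stacks whatever the dataset yields along a new leading batch dimension, so batches only come out 3-D if each yielded sample is 2-D:

import torch
from torch.utils.data import DataLoader, IterableDataset

LOOKBACK, N_FEATURES = 10, 3  # toy values, just for illustration

class ToyDataset(IterableDataset):
    def __iter__(self):
        for _ in range(200):
            # each yielded sample is 2-D: (LOOKBACK, N_FEATURES)
            yield torch.zeros(LOOKBACK, N_FEATURES), torch.zeros(())

x, y = next(iter(DataLoader(ToyDataset(), batch_size=64)))
print(x.shape)  # torch.Size([64, 10, 3]): 3-D because each sample is 2-D

If that understanding is correct, then for my real batches to be 3-D, each item my __iter__ yields would itself need to be 2-D, i.e. (LOOKBACK, n_features).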
