I am trying to convert my code to use tf.data.Dataset. I am not far along, but I still have problems with my features and the model's input layer that I never had before switching to tf.data.Dataset.
I load a lot of .csv files with many feature columns; each CSV has a header row with the column names as strings.
My simple test code is:
import tensorflow as tf
import pandas as pd
bd_path = 'C:/Users/my doc/Python/mini_test/'
keep_columns = ['precipitation', 'temperature_min', 'temperature_max',
                'snow_depth_water_equivalent_max', 'streamflow']
name_columns = pd.read_csv(bd_path + 'camels_01022500+attributs_mensuels.csv').columns
# Enable eager execution
tf.config.run_functions_eagerly(True)
# Load a single CSV file and preprocess it
def load_and_preprocess_csv(filename):
    columns = name_columns
    dataset = tf.data.experimental.make_csv_dataset(
        file_pattern=filename,
        num_parallel_reads=2,
        batch_size=32,
        num_epochs=1,
        label_name='streamflow',
        column_names=columns,
        select_columns=keep_columns,
        shuffle_buffer_size=10000,
        header=True,
        field_delim=','
    )

    # Apply preprocessing to the dataset
    def preprocess_fn(features, label):
        # Normalize the features (example: scaling to [0, 1])
        features['precipitation'] /= 100.0
        features['temperature_min'] /= 100.0
        features['temperature_max'] /= 100.0
        features['snow_depth_water_equivalent_max'] /= 100.0

        # My latest attempt:
        # create a 'main_inputs' feature by stacking the selected columns
        features['main_inputs'] = tf.stack([
            features['precipitation'],
            features['temperature_min'],
            features['temperature_max'],
            features['snow_depth_water_equivalent_max']
        ], axis=-1)

        # Here is another attempt, without success...
        # Rename the columns to match the model's input layer
        #features['main_inputs'] = tf.cast(features['main_inputs'], tf.float32)  # Ensure the dtype is correct
        #features['main_inputs'] = tf.identity(features['main_inputs'], name='main_inputs')  # Rename the feature
        return features, label

    dataset = dataset.map(preprocess_fn)
    return dataset
# Create a list of file paths matching the pattern
file_paths = tf.io.gfile.glob(bd_path + '*.csv')
# Load and preprocess CSV files in parallel
building_datasets = []
for file_path in file_paths:
    dataset = load_and_preprocess_csv(file_path)
    building_datasets.append(dataset)
# Combine the individual datasets into a single dataset
combined_dataset = tf.data.Dataset.sample_from_datasets(building_datasets)
# Optionally, further transform, shuffle, and batch the dataset as needed
# For example:
combined_dataset = combined_dataset.shuffle(buffer_size=10000)
#combined_dataset = combined_dataset.batch(64)
# model
tensor_input = tf.keras.layers.Input(shape=(4,), name='main_inputs')
xy = tf.keras.layers.Dense(10, activation='linear')(tensor_input)
xy = tf.keras.layers.Dropout(rate=0.2)(xy)
out = tf.keras.layers.Dense(1, activation='linear')(xy)
model = tf.keras.Model(inputs=tensor_input, outputs=out)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='mse')
# Train the model
history = model.fit(combined_dataset, epochs=1)
The warning I get is:
... \keras\engine\functional.py:637: UserWarning: Input dict contained keys ['temperature_min', 'snow_depth_water_equivalent_max', 'temperature_max', 'precipitation'] which did not match any model input. They will be ignored by the model.
My experience so far is passing arrays directly to the model. Is it the input layer that must be modified, or is it my dataset that needs more changes?
In preprocess_fn, make sure you use the same string to refer to the feature keys -
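Here is a minimal sketch of what that means for your code (assuming you keep the stacking and the /100 scaling from the question): have preprocess_fn return a dict whose only key is 'main_inputs', exactly matching the name of the model's Input layer. Keras matches dict keys to input layer names, and the leftover per-column keys are what trigger the "did not match any model input" warning.

    def preprocess_fn(features, label):
        # Scale the selected columns (same example scaling as in the question)
        precipitation = features['precipitation'] / 100.0
        temperature_min = features['temperature_min'] / 100.0
        temperature_max = features['temperature_max'] / 100.0
        snow = features['snow_depth_water_equivalent_max'] / 100.0

        # Stack the four columns into a single (batch, 4) tensor and return it
        # under the exact name of the model's Input layer, dropping the
        # original per-column keys so there is nothing left for Keras to ignore.
        main_inputs = tf.stack(
            [precipitation, temperature_min, temperature_max, snow], axis=-1)
        return {'main_inputs': main_inputs}, label

With only 'main_inputs' left in the features dict, every key maps to an input by name and the warning should disappear; the model itself does not need to change. You can confirm what the pipeline yields with print(combined_dataset.element_spec) before calling model.fit.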