I need to automatically predict the machine costs based upon a csv file with header of which the number of columns and the names of the columns are only known once the file is read. Below is a small example of the dataset, but the number of columns can be huge! (sometimes 100 or It can not be hardcoded into the program (different names and unknow column count).
/*
Trainfile.csv
Costs,Mach1,Mach2,Mach3,Maint1,Maint2,Maint31,Inst1,LM
12.3, 3.2, 5.5, 5.3, 12, 53, 72, 16,2
17.4, 1.2, 6.2, 5.6, 12, 64, 75, 19,7
13.9, 3.0, 6.6, 5.2, 11, 44, 42, 12,4
9.4, 2.2, 4.5, 4.0, 12, 54, 52, 13,0
Trainfile.csv other time could be
Costs,Mach13,Mach2,Mach3,Maint1,Maint12,Maint5,Inst1,LM,XM
12.3, 3.2, 5.5, 5.3, 12, 53, 72, 16,2, 11,2
17.4, 1.2, 6.2, 5.6, 12, 64, 75, 19,7, 16,1
13.9, 3.0, 6.6, 5.2, 11, 44, 42, 12,4, 12,2
9.4, 2.2, 4.5, 4.0, 12, 54, 52, 13,0, 16,3
Trainfile.csv other time could be
Costs,Mach6,Mach3,Maint11,Maint12,Maint3,Inst1,LM,XM
12.3, 3.2, 5.3, 12, 53, 72, 16,2, 11,2
17.4, 1.2, 5.6, 12, 64, 75, 19,7, 16,1
13.9, 3.0, 5.2, 11, 44, 42, 12,4, 12,2
9.4, 2.2, 4.0, 12, 54, 52, 13,0, 16,3
I need to train, evaluate and predict.I store all Feature importances in a database to see wheter the data should still be collected in the factory) Also i should be able to save the Best model and later on load it again to use it for predictions. I can ofcourse choose to rename the features and put a 1000 features in the class an fill the ones not in the csv with 0; But that's a ugly workaround in my opinion.
the code goes haywire on creating the Prediction engine; not finding a column name. like Mach1
running VS2022, .Net6, The nuget Libs: Microsoft.ML 2.0.1 and Microsoft.ML.AutoML 0.20.0 ML tried many solutions given but the all die during the process. Any one that can make a prediction engine here ??
I've been googling, edging, binging, Chatgpting for a few days now but no avail.
I tried many solutions given but they all die during the process.
So any suggestions/improvements for me?
Code so far:
private async Task TrainModelAsync()
{
try
{
var context = new MLContext(seed: 0);
// Load the data
var Options = new TextLoader.Options();
var TColumns = new List<TextLoader.Column>();
Options.Separators = new char[] { FieldSeperator };
Options.HasHeader = true;
foreach (var column in HeaderRow)
{
TColumns.Add(new(column, DataKind.Single, TColumns.Count));
}
Options.Columns = TColumns.ToArray();
var textLoader = context.Data.CreateTextLoader(Options);
IDataView dataView = textLoader.Load(DatasetFile);
var dataSchema = dataView.Schema;
// Split the data into a training set and a test set
var trainTestData = context.Data.TrainTestSplit(dataView, testFraction: 0.2, seed: 0);
var trainData = trainTestData.TrainSet;
var testData = trainTestData.TestSet;
ColumnInferenceResults columnInference = context.Auto().InferColumns(DatasetFile, labelColumnName: LabelName, groupColumns: true);
var pipeline = context.Auto().Featurizer(dataView, columnInformation: columnInference.ColumnInformation)
.Append(context.Auto().Regression(labelColumnName: columnInference.ColumnInformation.LabelColumnName)) ;
AutoMLExperiment experiment = context.Auto().CreateExperiment();
experiment
.SetPipeline(pipeline)
.SetRegressionMetric(RegressionMetric.RSquared, labelColumn: columnInference.ColumnInformation.LabelColumnName)
.SetTrainingTimeInSeconds(10)
.SetDataset(trainData);
TrialResult experimentResults = await experiment.RunAsync();
var BestModel = experimentResults.Model;
var Rsquared = experimentResults.Metric;
context.Model.Save(BestModel, dataSchema, ModelFile); // save for later test
var transformedData = BestModel.Transform(trainData);
var pfiResults = context.Regression.PermutationFeatureImportance(BestModel, transformedData, permutationCount: 3);
List<(string Key, double RSquared)> Importances = new();
foreach (var item in pfiResults)
{
Importances.Add((item.Key, item.Value.RSquared.Mean));
}
foreach (var pair in Importances.OrderByDescending(x => x.RSquared))
{
System.Diagnostics.Debug.WriteLine($"{pair.Item1}\t{pair.Item2}");
}
// Load Trained Model
DataViewSchema predictionPipelineSchema;
ITransformer LoadedModel = context.Model.Load(ModelFile, out predictionPipelineSchema);
var predictionEngine = context.Model.CreatePredictionEngine<InputFX, OutputFX>(BestModel);
foreach (var checkRow in context.Data.CreateEnumerable<InputFX>(testData, false))
{
var prediction = predictionEngine.Predict(checkRow);
System.Diagnostics.Debug.WriteLine($"Predict1: ${prediction.Prediction:F3}; vs {checkRow.Label:F3}");
}
// try if we can Create a PredictEngine form the saced file ;
var Context2 = new MLContext();
ITransformer Model2 = Context2.Model.Load(ModelFile, out var _);
predictionEngine = Context2.Model.CreatePredictionEngine<InputFX, OutputFX>( Model2);
foreach (var checkRow in context.Data.CreateEnumerable<InputFX>(testData, false))
{
var prediction = predictionEngine.Predict(checkRow);
System.Diagnostics.Debug.WriteLine($"Predict2: ${prediction.Prediction:F3}; vs {checkRow.Label:F3}");
}
}
catch (Exception ex) { System.Diagnostics.Debug.WriteLine(ex.Message + " " + ex.StackTrace.ToString()); }
}