I am working on a project tracking poverty across the US between 1995 and 2020.
As I am working on a linear regression scatterplot, I have this:
# Create a regression object.
regression = LinearRegression()
# This is the regression object, which will be fit onto the training set.
# Fit the regression object onto the training set.
regression.fit(X_train, y_train)
With the following error message:
ValueError: could not convert string to float: 'New Hampshire'
I assume this is because "New Hampshire" is two words instead of one. Is this correct? If so, how can I insert a _ between the words of the state names that have this issue:
New Hampshire, New Jersey, New Mexico, New York, North Carolina, North Dakota, Rhode Island, South Carolina, South Dakota, West Virginia
Pre-error code and error in full below.
# Create a regression object.
regression = LinearRegression()
# This is the regression object, which will be fit onto the training set.
# Fit the regression object onto the training set.
regression.fit(X_train, y_train)
With the following error message:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[55], line 3
1 # Fit the regression object onto the training set.
----> 3 regression.fit(X_train, y_train)
File ~/anaconda3/lib/python3.11/site-packages/sklearn/linear_model/_base.py:648, in LinearRegression.fit(self, X, y, sample_weight)
644 n_jobs_ = self.n_jobs
646 accept_sparse = False if self.positive else ["csr", "csc", "coo"]
--> 648 X, y = self._validate_data(
649 X, y, accept_sparse=accept_sparse, y_numeric=True, multi_output=True
650 )
652 sample_weight = _check_sample_weight(
653 sample_weight, X, dtype=X.dtype, only_non_negative=True
654 )
656 X, y, X_offset, y_offset, X_scale = _preprocess_data(
657 X,
658 y,
(...)
661 sample_weight=sample_weight,
662 )
File ~/anaconda3/lib/python3.11/site-packages/sklearn/base.py:584, in BaseEstimator._validate_data(self, X, y, reset, validate_separately, **check_params)
582 y = check_array(y, input_name="y", **check_y_params)
583 else:
--> 584 X, y = check_X_y(X, y, **check_params)
585 out = X, y
587 if not no_val_X and check_params.get("ensure_2d", True):
File ~/anaconda3/lib/python3.11/site-packages/sklearn/utils/validation.py:1106, in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
1101 estimator_name = _check_estimator_name(estimator)
1102 raise ValueError(
1103 f"{estimator_name} requires y to be passed, but the target y is None"
1104 )
-> 1106 X = check_array(
1107 X,
1108 accept_sparse=accept_sparse,
1109 accept_large_sparse=accept_large_sparse,
1110 dtype=dtype,
1111 order=order,
1112 copy=copy,
1113 force_all_finite=force_all_finite,
1114 ensure_2d=ensure_2d,
1115 allow_nd=allow_nd,
1116 ensure_min_samples=ensure_min_samples,
1117 ensure_min_features=ensure_min_features,
1118 estimator=estimator,
1119 input_name="X",
1120 )
1122 y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator)
1124 check_consistent_length(X, y)
File ~/anaconda3/lib/python3.11/site-packages/sklearn/utils/validation.py:879, in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)
877 array = xp.astype(array, dtype, copy=False)
878 else:
--> 879 array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
880 except ComplexWarning as complex_warning:
881 raise ValueError(
882 "Complex data not supported\n{}\n".format(array)
883 ) from complex_warning
File ~/anaconda3/lib/python3.11/site-packages/sklearn/utils/_array_api.py:185, in _asarray_with_order(array, dtype, order, copy, xp)
182 xp, _ = get_namespace(array)
183 if xp.__name__ in {"numpy", "numpy.array_api"}:
184 # Use NumPy API to support order
--> 185 array = numpy.asarray(array, order=order, dtype=dtype)
186 return xp.asarray(array, copy=copy)
187 else:
ValueError: could not convert string to float: 'New Hampshire'
# Predict the values of y using X.
I assume this is because "New Hampshire" is two words instead of one. Is this correct? If so, how can I insert a _ between the words of the state names that have this issue?
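It looks like there are strings in the dataframe. The space in the name is not the problem: any string value, including "New_Hampshire" or a one-word state name, cannot be converted to a float, so adding underscores would not fix the error.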
LinearRegression is expecting numbers only. You'll need to convert the strings to numbers using OneHotEncoder or similar. That will look at how many unique strings (unique categories) there are, and then each string (category) will be represented numerically using one-hot encoding. Regression can then work with that numerical representation of the categories.