I've read that the random forest algorithm in scikit-learn >= 1.4 should be able to handle NaN values. I've checked that I have the latest version of scikit-learn installed.
! pip install --upgrade scikit-learn
import sklearn
print(sklearn.__version__)
1.4.1
However, I still get the error:
ValueError: Input X contains NaN.
RandomForestClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values
Why? Should I import something else? I'm confused.
Edit:
This is a minimal example that should reproduce the error I mentioned:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
# Toy dataset: one categorical column with a missing entry, plus two binary columns
data = dict(
    tipo_locazione=["A", "B", None, "A"],
    flg_polizza_caa=[1, 0, 1, 0],
    cl_bisogni_3=[0, 1, 1, 0],
)
df = pd.DataFrame(data)
def random_forest_model(variabili):
    """Fit a one-hot-encoding + random-forest pipeline on columns of ``df``.

    Parameters
    ----------
    variabili : list of str
        Names of the columns of the module-level ``df`` to use as features.

    Notes
    -----
    The target is ``df['cl_bisogni_3']`` cast to ``str``. Object-dtype
    feature columns are one-hot encoded; every other column is passed
    through unchanged. The data is split 80/20 with a fixed seed and only
    the training part is used for fitting. Prints a confirmation message
    after fitting and returns nothing.
    """
    X = df[variabili]
    y = df['cl_bisogni_3'].astype(str)

    # Identifying categorical features
    categorical_features = X.select_dtypes(include=['object']).columns

    # Transformer for categorical features.
    # sparse_output=False is required here: RandomForestClassifier only
    # handles NaN natively for *dense* input. With the encoder's default
    # sparse output the ColumnTransformer may hand a sparse matrix to the
    # forest, and any NaN in the passthrough columns is then rejected with
    # "Input X contains NaN".
    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    # Preprocessor to apply transformations
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', categorical_transformer, categorical_features)
        ],
        remainder='passthrough'
    )

    # Model pipeline
    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42))
    ])

    # Splitting the dataset; the held-out part is not used in this function
    X_train, _, y_train, _ = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Fitting the model
    model.fit(X_train, y_train)
    print("Model trained successfully")
# Fit on one categorical feature (which contains a missing entry) and one numeric feature
variables = [
    'tipo_locazione',
    'flg_polizza_caa',
]
random_forest_model(variables)
Edit 2: my full traceback:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-75-1a77bdc05207> in <cell line: 65>()
66 selected_variables = sample(variabili_ab, 2) # Adjust number to be <= length of variabili_ab
67
---> 68 metrics = random_forest_model(selected_variables)
69 results[tuple(selected_variables)] = metrics
70 count_cicli -= 1
8 frames
<ipython-input-75-1a77bdc05207> in random_forest_model(variabili)
47
48 # Fitting the model
---> 49 model.fit(X_train, y_train)
50 y_pred = model.predict(X_test)
51
/usr/local/lib/python3.10/dist-packages/sklearn/base.py in wrapper(estimator, *args, **kwargs)
1472 )
1473 ):
-> 1474 return fit_method(estimator, *args, **kwargs)
1475
1476 return wrapper
/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py in fit(self, X, y, **params)
473 if self._final_estimator != "passthrough":
474 last_step_params = routed_params[self.steps[-1][0]]
--> 475 self._final_estimator.fit(Xt, y, **last_step_params["fit"])
476
477 return self
/usr/local/lib/python3.10/dist-packages/sklearn/base.py in wrapper(estimator, *args, **kwargs)
1472 )
1473 ):
-> 1474 return fit_method(estimator, *args, **kwargs)
1475
1476 return wrapper
/usr/local/lib/python3.10/dist-packages/sklearn/ensemble/_forest.py in fit(self, X, y, sample_weight)
375 estimator = type(self.estimator)(criterion=self.criterion)
376 missing_values_in_feature_mask = (
--> 377 estimator._compute_missing_values_in_feature_mask(
378 X, estimator_name=self.__class__.__name__
379 )
/usr/local/lib/python3.10/dist-packages/sklearn/tree/_classes.py in _compute_missing_values_in_feature_mask(self, X, estimator_name)
212
213 if not self._support_missing_values(X):
--> 214 assert_all_finite(X, **common_kwargs)
215 return None
216
/usr/local/lib/python3.10/dist-packages/sklearn/utils/validation.py in assert_all_finite(X, allow_nan, estimator_name, input_name)
214 Test failed: Array contains non-finite values.
215 """
--> 216 _assert_all_finite(
217 X.data if sp.issparse(X) else X,
218 allow_nan=allow_nan,
/usr/local/lib/python3.10/dist-packages/sklearn/utils/validation.py in _assert_all_finite(X, allow_nan, msg_dtype, estimator_name, input_name)
124 return
125
--> 126 _assert_all_finite_element_wise(
127 X,
128 xp=xp,
/usr/local/lib/python3.10/dist-packages/sklearn/utils/validation.py in _assert_all_finite_element_wise(X, xp, allow_nan, msg_dtype, estimator_name, input_name)
173 "#estimators-that-handle-nan-values"
174 )
--> 175 raise ValueError(msg_err)
176
177
ValueError: Input X contains NaN.
RandomForestClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values
print(pd.__version__)
print(np.__version__)
2.0.3
1.25.2