Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion python-package/lightgbm/basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -3115,7 +3115,7 @@ def set_weight(
# Check if the weight contains values other than one
if weight is not None:
if _is_pyarrow_array(weight):
if pa_compute.all(pa_compute.equal(weight, 1)).as_py():
if pa_compute.all(pa_compute.equal(weight, 1)).as_py(): # type: ignore[attr-defined]
weight = None
elif np.all(weight == 1):
weight = None
Expand Down
22 changes: 14 additions & 8 deletions python-package/lightgbm/sklearn.py
Original file line number Diff line number Diff line change
Expand Up @@ -1013,6 +1013,8 @@ def fit(
params["metric"] = [e for e in eval_metrics_builtin if e not in params["metric"]] + params["metric"]
params["metric"] = [metric for metric in params["metric"] if metric is not None]

self._fitted_with_feature_names = isinstance(X, (pd_DataFrame, pa_Table))

if not isinstance(X, (pd_DataFrame, pa_Table)):
_X, _y = _LGBMValidateData(
self,
Expand Down Expand Up @@ -1359,25 +1361,29 @@ def feature_name_(self) -> List[str]:
def feature_names_in_(self) -> np.ndarray:
    """:obj:`array` of shape = [n_features]: scikit-learn compatible version of ``.feature_name_``.

    Only available when training data had feature names (e.g. a pandas DataFrame).
    When training was done with data without feature names (e.g. a numpy array),
    accessing this attribute raises ``AttributeError``.

    .. versionadded:: 4.5.0
    """
    # Raise the sklearn-style "not fitted" error first, before touching any
    # fit-time attributes.
    if not self.__sklearn_is_fitted__():
        raise LGBMNotFittedError("No feature_names_in_ found. Need to call fit beforehand.")
    # _fitted_with_feature_names is recorded in fit() (True only when X was a
    # pandas DataFrame or pyarrow Table).  Raising AttributeError here makes
    # hasattr(est, "feature_names_in_") mirror scikit-learn's behavior for
    # estimators trained on data without feature names.
    if not self._fitted_with_feature_names:
        raise AttributeError(
            f"'{type(self).__name__}' object has no attribute 'feature_names_in_'. "
            "The training data did not have feature names "
            "(e.g. was a numpy array rather than a pandas DataFrame)."
        )
    return np.array(self.feature_name_)

@feature_names_in_.deleter
def feature_names_in_(self) -> None:
"""Intercept calls to delete ``feature_names_in_``.

Some code paths in ``scikit-learn`` try to delete the ``feature_names_in_`` attribute
on estimators when a new training dataset that doesn't have features is passed.
LightGBM automatically assigns feature names to such datasets
(like ``Column_0``, ``Column_1``, etc.) and so does not want that behavior.

However, that behavior is coupled to ``scikit-learn`` automatically updating
``n_features_in_`` in those same code paths, which is necessary for compliance
with its API (via argument ``reset`` to functions like ``validate_data()`` and
``check_array()``).
on estimators when a new training dataset that doesn't have feature names is passed.
This is handled via ``_fitted_with_feature_names``, so deletion is a no-op here.

.. note::

Expand Down
24 changes: 21 additions & 3 deletions tests/python_package_test/test_sklearn.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import itertools
import math
import re
import warnings
from functools import partial
from os import getenv
from pathlib import Path
Expand Down Expand Up @@ -1688,8 +1689,8 @@ def test_fit_only_raises_num_rounds_warning_when_expected(capsys):

@pytest.mark.parametrize("estimator_class", estimator_classes)
def test_getting_feature_names_in_np_input(estimator_class):
# input is a numpy array, which doesn't have feature names. LightGBM adds
# feature names to the fitted model, which is inconsistent with sklearn's behavior
# Input is a numpy array, which doesn't have feature names.
# feature_names_in_ should not be set (raises AttributeError), consistent with sklearn's behavior.
X, y = load_digits(n_class=2, return_X_y=True)
params = {"n_estimators": 2, "num_leaves": 7}
if estimator_class is lgb.LGBMModel:
Expand All @@ -1703,7 +1704,24 @@ def test_getting_feature_names_in_np_input(estimator_class):
model.fit(X, y, group=[X.shape[0]])
else:
model.fit(X, y)
np_assert_array_equal(model.feature_names_in_, np.array([f"Column_{i}" for i in range(X.shape[1])]), strict=True)
assert not hasattr(model, "feature_names_in_"), (
"feature_names_in_ should not be set when training data had no feature names"
)
# auto-generated names should still be accessible via the LightGBM-specific feature_name_ property
assert model.feature_name_ == [f"Column_{i}" for i in range(X.shape[1])]


@pytest.mark.parametrize("estimator_class", [lgb.LGBMClassifier, lgb.LGBMRegressor])
def test_no_spurious_feature_name_warning_on_np_predict(estimator_class):
    # Regression test for https://github.com/microsoft/LightGBM/issues/6798
    # sklearn 1.6+ warns "X does not have valid feature names, but ... was fitted with feature names"
    # when predict() is called with a numpy array after fit() on a numpy array, because LightGBM
    # auto-generates feature names. This should not produce any warning.
    X, y = load_digits(n_class=2, return_X_y=True)
    model = estimator_class(n_estimators=2, num_leaves=7).fit(X, y)
    # Escalate every warning to an error so any spurious feature-name warning
    # emitted during predict() fails the test.
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        model.predict(X[:5])


@pytest.mark.parametrize("estimator_class", estimator_classes)
Expand Down