-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbench_polars_null_in_cat_xgb_hgb.py
More file actions
100 lines (83 loc) · 3.24 KB
/
bench_polars_null_in_cat_xgb_hgb.py
File metadata and controls
100 lines (83 loc) · 3.24 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
"""Verify null-in-Categorical behavior for XGBoost 3.x and sklearn HGB
(HistGradientBoostingClassifier) -- the other two polars-capable
strategies in mlframe (alongside CatBoost).
Upstream fill in core.py applies to the base Polars DFs BEFORE tier
filtering, so XGB and HGB get the same pre-filled frame as CB. But
are they actually affected by null-in-Categorical? If yes, the fill
is load-bearing for them too; if not, it's a harmless no-op for
their path.
Bench matrix:
- XGB 3.x: null-free Categorical, null-in-Categorical
- HGB: null-free Categorical, null-in-Categorical
"""
from __future__ import annotations
import sys
import numpy as np
import polars as pl
sys.stdout.reconfigure(encoding="utf-8", errors="replace")
sys.stderr.reconfigure(encoding="utf-8", errors="replace")
def build_df(with_nulls: bool, n: int = 500) -> pl.DataFrame:
"""Float32 + Categorical column. with_nulls=True salts the cat
column with 10% None values."""
vals = list(np.random.choice(["a", "b", "c"], size=n))
if with_nulls:
for i in range(0, n, 10):
vals[i] = None
return pl.DataFrame({
"num": np.random.randn(n).astype(np.float32),
"cat": pl.Series("cat", vals, dtype=pl.String).cast(pl.Categorical),
})
def fit_xgb(df: pl.DataFrame):
from xgboost import XGBClassifier
y = np.random.randint(0, 2, size=df.height)
m = XGBClassifier(
enable_categorical=True,
tree_method="hist",
n_estimators=2,
max_depth=2,
verbosity=0,
)
try:
m.fit(df, y)
return True, None
except Exception as e:
return False, f"{type(e).__name__}: {str(e)[:200]}"
def fit_hgb(df):
"""sklearn HGB requires a pandas DataFrame (even when we call it
'polars-capable' -- mlframe's HGBStrategy casts to pandas internally
before fit). Try direct polars -> see what happens, then try
via pandas (the actual mlframe path)."""
from sklearn.ensemble import HistGradientBoostingClassifier
y = np.random.randint(0, 2, size=df.height)
m = HistGradientBoostingClassifier(
max_iter=2, max_depth=2, categorical_features="from_dtype"
)
# Direct polars: sklearn 1.8 has some polars support via Arrow.
try:
m.fit(df, y)
return True, None
except Exception as e:
return False, f"{type(e).__name__}: {str(e)[:200]}"
def main():
np.random.seed(42)
import xgboost, sklearn
print(f"xgboost: {xgboost.__version__}, sklearn: {sklearn.__version__}, polars: {pl.__version__}")
print()
df_clean = build_df(with_nulls=False)
df_dirty = build_df(with_nulls=True)
print(f"df_clean: cat null_count = {df_clean['cat'].null_count()}")
print(f"df_dirty: cat null_count = {df_dirty['cat'].null_count()}")
print()
print("XGB 3.x:")
ok, err = fit_xgb(df_clean)
print(f" clean cat: {'OK' if ok else 'FAIL'} {err or ''}")
ok, err = fit_xgb(df_dirty)
print(f" dirty cat: {'OK' if ok else 'FAIL'} {err or ''}")
print()
print("sklearn HGB (direct polars -> fit):")
ok, err = fit_hgb(df_clean)
print(f" clean cat: {'OK' if ok else 'FAIL'} {err or ''}")
ok, err = fit_hgb(df_dirty)
print(f" dirty cat: {'OK' if ok else 'FAIL'} {err or ''}")
if __name__ == "__main__":
main()