Thomas J. Fan
@thomasjpfan
This talk on Github: thomasjpfan/2020-richmond-ds-meetup-gradient-boosting
$$ y = f(X) $$
X: (n_samples, n_features)
y: (n_samples,)
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
clf = HistGradientBoostingClassifier()
clf.fit(X, y)
clf.predict(X)
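As a runnable sketch of the snippet above, assuming a synthetic dataset from make_classification (not part of the talk):

from sklearn.datasets import make_classification
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier

# X: (n_samples, n_features), y: (n_samples,)
X, y = make_classification(n_samples=1_000, n_features=20, random_state=0)

clf = HistGradientBoostingClassifier()
clf.fit(X, y)
clf.predict(X)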
$$ f(X) = h_0(X) + h_1(X) + h_2(X) + … $$
$$ f(X) = \sum_i h_i(X) $$
least_squares
least_absolute_deviation
poisson
binary_crossentropy
categorical_crossentropy
auto
least_squares
$$ L(y, f(X)) = \frac{1}{2}||y - f(X)||^2 $$
$$ \nabla L(y, f(X)) = -(y - f(X)) $$
$$ \nabla^2 L(y, f(X)) = 1 $$
$$ f_0(X) = C $$
$$ f_{m+1}(X) = f_{m}(X) - \eta \nabla L(y, f_{m}(X)) $$
where $\eta$ is the learning rate
least_squares
$$ f_{m+1}(X) = f_{m}(X) + \eta(y - f_{m}(X)) $$
$$ f_{m+1}(X) = f_{m}(X) + \eta h_{m}(X) $$
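A minimal NumPy sketch of this update for least_squares; here the tree $h_m$ is replaced by the raw residuals to show the function-space gradient descent step (illustrative, not the library's code):

import numpy as np

y = np.array([10.0, 20.0, 30.0, 40.0])
eta = 0.1                       # learning rate
f = np.full_like(y, y.mean())   # f_0(X) = C

for m in range(100):
    residual = y - f            # negative gradient of the least_squares loss
    f = f + eta * residual      # f_{m+1}(X) = f_m(X) + eta * (y - f_m(X))

# f now approaches y; a real booster fits a tree h_m to the residuals instead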
$$ f_0(X) = C $$
$$ f_{m+1}(X) = f_{m}(X) + h_{m}(X) $$
With two iterations of boosting:
$$ f(X) = C + h_0(X) + h_1(X) $$
For example, with $X=40$
$$ f(40) = 78 + h_0(40) + h_1(40) $$
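A toy sketch of those two boosting iterations using decision stumps; the data below are made up so that the mean comes out to 78, matching the constant on the slide:

import numpy as np
from sklearn.tree import DecisionTreeRegressor

X = np.array([[10], [20], [30], [40], [50]])
y = np.array([60.0, 70.0, 75.0, 85.0, 100.0])

C = y.mean()                                             # f_0(X) = C = 78
h_0 = DecisionTreeRegressor(max_depth=1).fit(X, y - C)
h_1 = DecisionTreeRegressor(max_depth=1).fit(X, y - C - h_0.predict(X))

x = np.array([[40]])
f_x = C + h_0.predict(x) + h_1.predict(x)                # f(40) = 78 + h_0(40) + h_1(40)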
least_squares
$$ L(y, f(X)) = \frac{1}{2}||y - f(X)||^2 $$
$$ G = \nabla L(y, f(X)) = -(y - f(X)) $$
$$ H = \nabla^2 L(y, f(X)) = 1 $$
Maximize the Gain!
$$ Gain = \dfrac{1}{2}\left[\dfrac{G_L^2}{H_L+\lambda} + \dfrac{G_R^2}{H_R + \lambda} - \dfrac{(G_L+G_R)^2}{H_L+H_R+\lambda}\right] $$
default $\lambda$: l2_regularization=0
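A direct transcription of the gain formula (function and argument names are illustrative):

def split_gain(G_L, H_L, G_R, H_R, lam=0.0):
    # G_* and H_* are the sums of gradients and hessians in each child;
    # lam is the l2_regularization term (default 0)
    def term(G, H):
        return G * G / (H + lam)
    return 0.5 * (term(G_L, H_L) + term(G_R, H_R) - term(G_L + G_R, H_L + H_R))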
# Original data
[-0.752, 2.7042, 1.3919, 0.5091, -2.0636,
-2.064, -2.6514, 2.1977, 0.6007, 1.2487, ...]
# Binned data
[4, 9, 7, 6, 2, 1, 0, 8, 6, 7, ...]
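One way to reproduce this mapping with NumPy; the thresholds here are quantiles of the ten values shown, while the library computes them on a subsample of the full feature, so the bin ids differ from the slide's:

import numpy as np

data = np.array([-0.752, 2.7042, 1.3919, 0.5091, -2.0636,
                 -2.064, -2.6514, 2.1977, 0.6007, 1.2487])

# Bin edges taken at evenly spaced quantiles of the feature
thresholds = np.quantile(data, np.linspace(0, 1, 11)[1:-1])

# Index of the first threshold >= each value, matching the binary search shown next
binned = np.searchsorted(thresholds, data, side='left').astype(np.uint8)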
$$ f(X) = C + \sum h_{m}(X) $$
# Binary search: find the first bin threshold that is >= data[i]
for i in range(n_samples):
    left, right = 0, binning_thresholds.shape[0]
    while left < right:
        middle = left + (right - left - 1) // 2
        if data[i] <= binning_thresholds[middle]:
            right = middle
        else:
            left = middle + 1
    binned[i] = left
# sklearn/ensemble/_hist_gradient_boosting/_binning.pyx
for i in prange(n_samples, schedule='static', nogil=True):
    left, right = 0, binning_thresholds.shape[0]
    while left < right:
        middle = left + (right - left - 1) // 2
        if data[i] <= binning_thresholds[middle]:
            right = middle
        else:
            left = middle + 1
    binned[i] = left
# sklearn/ensemble/_hist_gradient_boosting/histogram.pyx
with nogil:
    for feature_idx in prange(n_features, schedule='static'):
        self._compute_histogram_brute_single_feature(...)
for feature_idx in prange(n_features, schedule='static',
                          nogil=True):
    _subtract_histograms(feature_idx, ...)
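The subtraction works because a node's histogram is the sum of its two children's histograms, so the sibling can be derived from the parent and one child without rescanning the samples. A NumPy sketch of the idea (array names are illustrative):

import numpy as np

n_bins = 256
hist_parent = np.random.rand(n_bins)               # per-bin gradient sums of the parent node
hist_left = hist_parent * np.random.rand(n_bins)   # histogram of the smaller child

# Sibling histogram by subtraction, no second pass over the data
hist_right = hist_parent - hist_left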
# sklearn/ensemble/_hist_gradient_boosting/splitting.pyx
for feature_idx in prange(n_features, schedule='static'):
    # For each feature, find best bin to split on
# sklearn/ensemble/_hist_gradient_boosting/splitting.pyx
for thread_idx in prange(n_threads, schedule='static',
                         chunksize=1):
    # splits a partition of node
least_squares
# sklearn/ensemble/_hist_gradient_boosting/_loss.pyx
for i in prange(n_samples, schedule='static', nogil=True):
    gradients[i] = raw_predictions[i] - y_true[i]
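The vectorized NumPy equivalent of that loop, for reference:

import numpy as np

y_true = np.array([3.0, 1.0, 4.0])
raw_predictions = np.array([2.5, 1.5, 4.0])

# least_squares gradient: prediction minus target, one value per sample
gradients = raw_predictions - y_true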
max_bins=255
HistGradientBoostingRegressor
loss=least_squares (default)
least_absolute_deviation
poisson
HistGradientBoostingClassifier
loss=auto (default)
binary_crossentropy
categorical_crossentropy
l2_regularization=0
learning_rate=0.1 ($\eta$)
max_iter=100
max_leaf_nodes=31
max_depth=None
min_samples_leaf=20
early_stopping='auto' (enabled if n_samples>10_000)
scoring='loss'
validation_fraction=0.1
n_iter_no_change=10
tol=1e-7
verbose=0
random_state=None
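Spelled out as a constructor call with the defaults above (matching the 0.24 API; the classifier takes the same parameters with loss='auto'):

from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor

reg = HistGradientBoostingRegressor(
    loss='least_squares',
    learning_rate=0.1,
    max_iter=100,
    max_leaf_nodes=31,
    max_depth=None,
    min_samples_leaf=20,
    l2_regularization=0.0,
    max_bins=255,
    early_stopping='auto',
    scoring='loss',
    validation_fraction=0.1,
    n_iter_no_change=10,
    tol=1e-7,
    verbose=0,
    random_state=None,
)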
export OMP_NUM_THREADS=8
from sklearn.experimental import enable_hist_gradient_boosting # noqa
from sklearn.ensemble import HistGradientBoostingClassifier
import numpy as np
X = np.array([0, 1, 2, np.nan]).reshape(-1, 1)
y = [0, 0, 1, 1]
gbdt = HistGradientBoostingClassifier(min_samples_leaf=1).fit(X, y)
gbdt.predict(X)
# [0 0 1 1]
from sklearn.experimental import enable_hist_gradient_boosting # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
X, y = ...
gbdt_no_cst = HistGradientBoostingRegressor().fit(X, y)
gbdt_cst = HistGradientBoostingRegressor(monotonic_cst=[1, 0]).fit(X, y)
from sklearn.inspection import plot_partial_dependence
disp = plot_partial_dependence(
    gbdt_no_cst, X, features=[0], feature_names=['feature 0'], line_kw={...})
plot_partial_dependence(gbdt_cst, X, features=[0], line_kw={...}, ax=disp.axes_)
hist_poisson = HistGradientBoostingRegressor(loss='poisson')
From the categorical features example:
categorical_mask = ([True] * n_categorical_features +
                    [False] * n_numerical_features)
hist = HistGradientBoostingRegressor(categorical_features=categorical_mask)
conda install -c conda-forge xgboost
from xgboost import XGBClassifier
xgb = XGBClassifier()
conda install -c conda-forge lightgbm
from lightgbm.sklearn import LGBMClassifier
lgbm = LGBMClassifier()
conda install -c conda-forge catboost
from catboost.sklearn import CatBoostClassifier
catb = CatBoostClassifier()
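A sketch of how such a comparison can be run through the scikit-learn API that all four libraries expose; the dataset here is synthetic and is not the benchmark behind the numbers below:

from time import perf_counter

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm.sklearn import LGBMClassifier
from catboost.sklearn import CatBoostClassifier

X, y = make_classification(n_samples=100_000, n_features=20, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

models = {
    'sklearn': HistGradientBoostingClassifier(),
    'xgboost': XGBClassifier(tree_method='hist'),
    'lightgbm': LGBMClassifier(),
    'catboost': CatBoostClassifier(verbose=0),
}

for name, model in models.items():
    tic = perf_counter()
    model.fit(X_train, y_train)
    fit_time = perf_counter() - tic
    proba = model.predict_proba(X_test)[:, 1]
    print(name, f'{fit_time:.0f}s',
          roc_auc_score(y_test, proba),
          accuracy_score(y_test, model.predict(X_test)))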
| library  | time | roc auc | accuracy |
|----------|------|---------|----------|
| sklearn  | 66s  | 0.8126  | 0.7325   |
| lightgbm | 42s  | 0.8125  | 0.7323   |
| xgboost  | 45s  | 0.8124  | 0.7325   |
| catboost | 90s  | 0.8008  | 0.7223   |
xgboost=1.3.0.post0
lightgbm=3.1.1
catboost=0.24.3
Learn more about Histogram-Based Gradient Boosting
pip install scikit-learn==0.24.0rc1