import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

X, y = fetch_openml(data_id=43938, as_frame=True, return_X_y=True, parser="auto")
df_original = pd.concat([X, y], axis=1)

df_original["class"].value_counts()

class
not_recom     4320
priority      4266
spec_prior    4044
very_recom     328
recommend        2
Name: count, dtype: int64

df = df_original.copy()
df["binary_response"] = df["class"].isin(["priority", "spec_prior"])
y_var = "binary_response"
x_vars = X.columns
print(f"The prepared dataset contains {df.shape[0]} rows.")

The prepared dataset contains 12960 rows.

df.head()

contingency_table = pd.crosstab(df["health"], df['binary_response'])    
print("Contingency Table:")
contingency_table

Contingency Table:

df_train, df_test = train_test_split(df, train_size=0.75, random_state=1234321)
df = (
    pd.concat((df_train, df_test), axis=0, keys=("train", "test"))
    .reset_index(level=0)
    .rename(columns={"level_0": "split"})
)

y_train, y_test = df_train[y_var].astype(float), df_test[y_var].astype(float)
X_train, X_test = df_train[x_vars], df_test[x_vars]

df.groupby("split")[["binary_response"]].mean()

from sklearn.dummy import DummyClassifier


m_trivial = DummyClassifier(strategy="prior").fit(X_train, y_train)

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression


# All variables are discrete.
x_discrete = x_vars
# ColumnTransformer for linear models
col_trans_cat = ColumnTransformer(
    [
        (
            "cat_features",
            OneHotEncoder(drop="first", sparse_output=False),
            x_discrete,
        ),
    ]
)
m_logistic = Pipeline(
    [
        ("column_transformer", col_trans_cat),
        (
            "model",
            LogisticRegression(
                penalty=None,
                solver="newton-cholesky",
            )
        ),
    ]
).fit(X_train, y_train)

feature_names = m_logistic[:-1].get_feature_names_out()
model_coefs = m_logistic.steps[-1][1].coef_.squeeze()
model_coefs = pd.DataFrame(
    {
        "coefficient": [x.removeprefix("cat_features__") for x in feature_names],
         "value": model_coefs,
    }
).sort_values(by="value")

with pd.option_context("display.max_rows", 20):
    display(model_coefs)

from sklearn.tree import DecisionTreeClassifier, plot_tree


# We can reuse the categorical one-hot encoding of the linear model.
m_tree = Pipeline(
    [
        ("column_transformer", col_trans_cat),
        (
            "model", DecisionTreeClassifier(
                criterion="log_loss", min_samples_leaf=2,
            )
        ),
    ]
).fit(X_train, y_train)

plot_tree(
    m_tree.steps[-1][1],
    feature_names=list(m_tree[:-1].get_feature_names_out()),
    max_depth=3,
);

m_tree.steps[-1][1].get_depth()

17

# Get the leaf node assignments for each sample
leaf_ids = m_tree.steps[-1][1].apply(col_trans_cat.fit_transform(X_train))

# Count the samples in each leaf node
unique_leaf_ids = np.unique(leaf_ids)
leaf_samples = [np.sum(leaf_ids == node_id) for node_id in unique_leaf_ids]

np.sort(leaf_samples)[:10]

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

%%time
# Note that this cell might take a few seconds.
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.preprocessing import OrdinalEncoder


# ColumnTransformer for boosted trees
col_trans_bt = ColumnTransformer(
    [
        ("categorical_features", OrdinalEncoder(), x_discrete),
    ]
)

m_hgbt_base = Pipeline(
    [
        ("column_transformer", col_trans_bt),
        (
            "model", HistGradientBoostingClassifier(
                loss="log_loss",
                categorical_features=list(range(len(x_discrete))),
                max_iter=100,
                early_stopping=True,
                random_state=33,
                validation_fraction=0.2,
            )
        ),
    ]
)

param_grid = {
    "model__learning_rate": [0.6, 0.7, 0.8],  # quite large
    "model__l2_regularization": [0, 1e-6, 1e-3],
    "model__max_depth": [10, 15, 20],
    "model__min_samples_leaf": [2, 3, 4],
}


# successive halfing grid search (CV) on the training data
shgs = HalvingGridSearchCV(
    m_hgbt_base,
    param_grid=param_grid,
    cv=2,
    scoring="neg_log_loss",
    random_state=321,
).fit(X_train, y_train)

m_hgbt = shgs.best_estimator_

CPU times: user 33.2 s, sys: 569 ms, total: 33.8 s
Wall time: 8.81 s

shgs.best_params_, shgs.best_score_

({'model__l2_regularization': 1e-06,
  'model__learning_rate': 0.7,
  'model__max_depth': 15,
  'model__min_samples_leaf': 2},
 -6.40295807008405e-05)

m_hgbt[-1].n_iter_

27

models = [m_trivial, m_logistic, m_tree, m_hgbt]
model_names = ["Trivial", "Logistic", "Tree", "HGBT"]

df_pred_test = pd.DataFrame(
    {name: m.predict_proba(X_test)[:, 1] for m, name in zip(models, model_names)}
)

import matplotlib.pyplot as plt
from model_diagnostics.calibration import compute_bias, plot_bias, plot_reliability_diagram

fig, ax = plt.subplots(figsize=(8, 8))
plot_reliability_diagram(
    y_obs=y_test,
    y_pred=df_pred_test[["Logistic", "Tree", "HGBT"]],
    n_bootstrap=100,
    ax=ax,
);

compute_bias(
    y_obs=y_test,
    y_pred=df_pred_test,
    feature=None,
)

compute_bias(
    y_obs=y_test,
    y_pred=df_pred_test,
    feature=X_test["parents"],
)

fig, ax = plt.subplots(figsize=(8, 8))
plot_bias(
    y_obs=y_test,
    y_pred=df_pred_test,
    feature=X_test["parents"],
    confidence_level=0.8
);

from model_diagnostics.scoring import LogLoss, decompose


log_loss = LogLoss()

df = decompose(
    y_obs=y_test,
    y_pred=df_pred_test,
    scoring_function=log_loss,
)
df.sort(["score"])

np.sum((df_pred_test["Tree"] == 1) & (y_test.values == 0))

4

np.sum((df_pred_test["Tree"] == 0) & (y_test.values == 1))

1

from model_diagnostics.scoring import SquaredError, plot_murphy_diagram


plot_murphy_diagram(
    y_obs=y_test,
    y_pred=df_pred_test.loc[:, ["Logistic", "Tree", "HGBT"]],
    etas=200,
);

decompose(
    y_obs=y_test,
    y_pred=df_pred_test,
    scoring_function=SquaredError(),
).sort(["score"])

	parents	has_nurs	form	children	housing	finance	social	health	class	binary_response
0	usual	proper	complete	1	convenient	convenient	nonprob	recommended	recommend	False
1	usual	proper	complete	1	convenient	convenient	nonprob	priority	priority	True
2	usual	proper	complete	1	convenient	convenient	nonprob	not_recom	not_recom	False
3	usual	proper	complete	1	convenient	convenient	slightly_prob	recommended	recommend	False
4	usual	proper	complete	1	convenient	convenient	slightly_prob	priority	priority	True

	coefficient	value
3	has_nurs_less_proper	-11.857906
4	has_nurs_proper	-11.849024
1	parents_usual	-11.532055
0	parents_pretentious	-10.116730
2	has_nurs_improper	-9.870463
5	has_nurs_very_crit	-0.097957
16	social_slightly_prob	0.000125
6	form_completed	0.829653
8	form_incomplete	1.767603
9	children_2	1.860888
14	finance_inconv	2.152042
13	housing_less_conv	2.414219
7	form_foster	3.181892
10	children_3	3.955277
11	children_more	4.120487
12	housing_critical	5.707804
15	social_problematic	11.130701
18	health_recommended	47.231852
17	health_priority	59.699965

model	bias_mean	bias_count	bias_weights	bias_stderr	p_value
str	f64	u32	f64	f64	f64
"Trivial"	0.006379	3240	3240.0	0.008452	0.4505
"Logistic"	-0.001374	3240	3240.0	0.001493	0.357317
"Tree"	0.000772	3240	3240.0	0.000878	0.379312
"HGBT"	0.000008	3240	3240.0	0.000007	0.298421

model	parents	bias_mean	bias_count	bias_weights	bias_stderr	p_value
str	cat	f64	u32	f64	f64	f64
"Trivial"	"great_pret"	-0.009376	1104	1104.0	0.014341	0.513399
"Trivial"	"pretentious"	-0.00183	1089	1089.0	0.01451	0.899677
"Trivial"	"usual"	0.031528	1047	1047.0	0.015072	0.036696
"Logistic"	"great_pret"	-0.000006	1104	1104.0	0.000017	0.726384
"Logistic"	"pretentious"	0.000139	1089	1089.0	0.003114	0.964296
"Logistic"	"usual"	-0.004392	1047	1047.0	0.003295	0.1828
"Tree"	"great_pret"	0.0	1104	1104.0	0.0	0.0
"Tree"	"pretentious"	0.002296	1089	1089.0	0.002415	0.342047
"Tree"	"usual"	-1.0604e-19	1047	1047.0	0.001032	1.0
"HGBT"	"great_pret"	-7.7474e-8	1104	1104.0	3.6859e-8	0.035785
"HGBT"	"pretentious"	7.5675e-7	1089	1089.0	4.5198e-7	0.094364
"HGBT"	"usual"	0.000023	1047	1047.0	0.000023	0.312915

model	miscalibration	discrimination	uncertainty	score
str	f64	f64	f64	f64
"HGBT"	0.000009	0.65545	0.65545	0.000009
"Logistic"	0.003509	0.635995	0.65545	0.022964
"Trivial"	0.000088	0.0	0.65545	0.655539
"Tree"	inf	0.641366	0.65545	inf

Classification on Nursery Dataset¶

1. Data¶

1.1 Data Load and Preprocessing¶

1.2 Contingency tables¶

1.3 Data Split¶

2. Models¶

2.1 The trivial model¶

2.2 Logistic¶

2.3 Decision Tree¶

2.4 Gradient Boosted Decision Trees¶

3. Calibration Assessment¶

3.1. Reliability Diagrams¶

3.2 Conditional Bias¶

Unconditional Bias¶

Bias Conditional on parents¶

4. Predictive Model Performance¶

4.1 Score Decomposition of the Log Loss¶

4.2 Murphy Diagram¶

model	miscalibration	discrimination	uncertainty	score
str	f64	f64	f64	f64
"HGBT"	1.7994e-7	0.23139	0.23139	1.7994e-7
"Tree"	0.000014	0.228908	0.23139	0.002495
"Logistic"	0.000862	0.22503	0.23139	0.007221
"Trivial"	0.000041	0.0	0.23139	0.23143

	binary_response
split
test	0.636420
train	0.642798