# === Setup and Load Model-Ready Data ===

import warnings

# --- Ignore only relevant harmless warnings ---
warnings.filterwarnings("ignore", message="An input array is constant")  # from SHAP/Spearman correlation

# --- Core libraries ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import shap
import joblib
from sklearn.inspection import permutation_importance
from pathlib import Path
import random

# --- Visualization style ---
sns.set_theme(style="whitegrid")
plt.rcParams.update({"figure.facecolor": "white", "axes.facecolor": "white"})

# --- Paths ---
DATA_DIR = Path("../data/processed")
MODEL_DIR = Path("../models")

# --- Load contextual metadata for later grouping ---
df_context = pd.read_csv(DATA_DIR / "countries_clean.csv")[["Country Name", "year", "Region", "IncomeGroup"]]

# --- Load model-ready dataset and trained model ---
df_model = pd.read_csv(DATA_DIR / "model_ready.csv")
model = joblib.load(MODEL_DIR / "random_forest_model.pkl")
X_columns = joblib.load(MODEL_DIR / "X_columns.pkl")

# --- Ensure feature alignment ---
missing_cols = [c for c in X_columns if c not in df_model.columns]
for c in missing_cols:
    df_model[c] = 0

X = df_model.reindex(columns=X_columns, fill_value=0).copy()
y = df_model["net_migration_per_1000_capped"].copy()

print(f"✅ Model-ready data loaded and aligned — {len(X_columns)} predictors")
print(f"Feature matrix shape: {X.shape}")

✅ Model-ready data loaded and aligned — 25 predictors
Feature matrix shape: (5712, 25)

# === Global Feature Importance — Random Forest ===

# --- Compute permutation importance ---
perm = permutation_importance(
    model,
    X,
    y,
    n_repeats=10,
    random_state=42
)

# --- Sort and convert to Series for easier visualization ---
imp = pd.Series(perm.importances_mean, index=X.columns).sort_values(ascending=True)

# --- Plot top 15 most important features ---
plt.figure(figsize=(8, 6))
imp.tail(15).plot(kind="barh", color="teal")

plt.title("Global Feature Importance (Permutation, Random Forest)", pad=10)
plt.xlabel("Mean Importance (Δ in Model Score)")
plt.ylabel("")
plt.tight_layout()
plt.show()

# --- Display top-ranked features as a table ---
top_features = imp.sort_values(ascending=False).head(15).round(4).to_frame("Mean Importance")
display(top_features)

# === SHAP Explainability Setup (TreeExplainer) ===

# --- Sample manageable subset for visualization ---
X_sample = X.sample(n=500, random_state=42).astype(np.float32)

# --- Initialize SHAP TreeExplainer for Random Forest ---
# (check_additivity=False → faster and safe for tree-based models)
explainer = shap.TreeExplainer(model)

# --- Compute SHAP values ---
shap_values = explainer.shap_values(X_sample, check_additivity=False)

# --- Extract metadata for downstream use ---
feature_names = X_sample.columns.tolist()
expected_value = (
    explainer.expected_value[0]
    if isinstance(explainer.expected_value, (list, np.ndarray))
    else explainer.expected_value
)

print(f"✅ SHAP values computed for {len(X_sample)} samples and {len(feature_names)} features.")

✅ SHAP values computed for 500 samples and 25 features.

# === Global SHAP Feature Importance ===

# --- Mean absolute SHAP values (global impact ranking) ---
plt.figure(figsize=(8, 6))
shap.summary_plot(
    shap_values,
    X_sample,
    plot_type="bar",
    max_display=15,
    show=False
)
plt.title("Global Feature Importance — Mean |SHAP| (Top 15)", pad=10)
plt.xlabel("Mean |SHAP value| (Impact on model output)")
plt.tight_layout()
plt.show()

# --- Beeswarm plot (distribution of SHAP effects across samples) ---
plt.figure(figsize=(8, 6))
shap.summary_plot(
    shap_values,
    X_sample,
    max_display=15,
    show=False
)
plt.title("Feature-level SHAP Distribution (Beeswarm Plot)", pad=10)
plt.tight_layout()
plt.show()

# === Feature-Level SHAP Direction and Strength ===

# --- Convert SHAP values to DataFrame for analysis ---
sv = pd.DataFrame(shap_values, columns=feature_names, index=X_sample.index)

# --- Mean absolute SHAP values (magnitude of importance) ---
abs_mean = sv.abs().mean().sort_values(ascending=False)

# --- Directionality: Spearman correlation (feature ↔ SHAP value) ---
dirs = pd.Series(
    {f: X_sample[f].corr(sv[f], method="spearman") for f in feature_names}
).reindex(abs_mean.index)

# --- Combine into summary table ---
top_features = (
    pd.DataFrame({
        "mean_|SHAP|": abs_mean,
        "direction (Spearman)": dirs
    })
    .round(4)
    .head(15)
)

# --- Display styled table ---
display(
    top_features.style
        .background_gradient(subset=["mean_|SHAP|"], cmap="Blues")
        .set_caption("Top 15 Features — SHAP Magnitude and Direction")
)

# === SHAP Dependence Plots (Top Features) ===

# --- Select top 2–3 features by global SHAP importance ---
top_features = list(top_features.index[:3])  # top 3 for richer comparison

# --- Generate SHAP dependence plots with automatic interaction highlighting ---
for f in top_features:
    shap.dependence_plot(
        f,
        shap_values,
        X_sample,
        interaction_index="auto",  # auto-selects strongest interacting variable
        show=False
    )
    plt.title(f"SHAP Dependence Plot — {f}", fontsize=12, pad=10)
    plt.xlabel(f)
    plt.ylabel("SHAP Value (Impact on Prediction)")
    plt.tight_layout()
    plt.show()

# === Regional Sensitivity Analysis — Mean |SHAP| by Feature ===

# --- Compute regional mean absolute SHAP impacts ---
sv_abs = sv.abs().copy()

# Map regions from df_context (index-aligned sample)
sv_abs["Region"] = df_context.loc[X_sample.index, "Region"].values

# --- Aggregate mean absolute SHAP values by region ---
region_shap = (
    sv_abs.groupby("Region", dropna=False)[feature_names]
    .mean()
    .sort_index()
)

# --- Visualization: Heatmap of regional mean |SHAP| ---
fig, ax = plt.subplots(
    figsize=(min(14, 0.5 * len(region_shap.columns) + 4), 0.5 * len(region_shap) + 2)
)

sns.heatmap(
    region_shap,
    cmap="Blues",
    cbar_kws={"label": "Mean |SHAP| (Impact Strength)"},
    linewidths=0.3,
    ax=ax
)

ax.set_title("Regional Sensitivity — Mean |SHAP| by Feature", pad=12)
ax.set_xlabel("Feature")
ax.set_ylabel("Region")

plt.xticks(rotation=75, ha="right")
plt.tight_layout()
plt.show()

# --- Show top 5 features per region for inspection ---
top_per_region = (
    region_shap.apply(lambda r: r.sort_values(ascending=False).head(5).index.tolist(), axis=1)
    .rename("Top 5 Drivers")
    .to_frame()
)

display(top_per_region.style.set_caption("Top 5 SHAP Drivers by Region"))

# === Regional Composition — Share of Total Mean |SHAP| (Top-5 Drivers) ===

# --- Identify Top-5 Globally Most Influential Features ---
top_features = region_shap.mean().sort_values(ascending=False).head(5).index

# --- Compute Each Region’s Proportional Share of These Drivers ---
region_share = region_shap[top_features].div(region_shap[top_features].sum(axis=1), axis=0)

# --- Visualization ---
ax = region_share.plot(
    kind="barh",
    stacked=True,
    figsize=(10, 5),
    width=0.8,
    alpha=0.9,
)

plt.title("Regional Composition of Top-5 SHAP Drivers", fontsize=13, pad=14)
plt.xlabel("Proportion of Total Mean |SHAP|", fontsize=11)
plt.ylabel("Region", fontsize=11)
plt.tight_layout(rect=[0, 0, 0.78, 1])  # Add right margin for legend

# --- Legend Styling (placed outside the plot) ---
plt.legend(
    title="Feature",
    bbox_to_anchor=(1.02, 0.5),
    loc="center left",
    frameon=False,
)

# Adjust spacing between bars and legend area
plt.subplots_adjust(left=0.15, right=0.78)
plt.show()

# === Local Explanations — SHAP Waterfall Plots (3 Random Samples) ===

# --- Convert expected value to scalar (if array-like) ---
expected_value_scalar = (
    explainer.expected_value[0]
    if isinstance(explainer.expected_value, (list, np.ndarray))
    else explainer.expected_value
)

# --- Reproducible random selection of samples ---
random.seed(42)
idxs = random.sample(list(X_sample.index), 3)

# --- Try new SHAP API; fall back to legacy if needed for static HTML ---
try:
    from shap.plots import waterfall as shap_waterfall_new
    use_legacy = False
except Exception:
    use_legacy = True

if use_legacy:
    # Legacy import for maximum compatibility with nbconvert/HTML export
    try:
        from shap.plots._waterfall import waterfall_legacy as shap_waterfall_legacy
    except Exception as e:
        print(f"⚠️ Could not import legacy waterfall plotter: {e}")
        shap_waterfall_legacy = None

for i in idxs:
    print(f"\n💧 Local SHAP Waterfall — Sample Index: {i}")

    # Ensure plain numpy arrays for plotting stability
    sample_features = X_sample.loc[i].to_numpy()
    sample_feature_names = list(X_sample.columns)
    sample_values = sv.loc[i].to_numpy()

    # New API path (preferred)
    if not use_legacy:
        exp = shap.Explanation(
            values=sample_values.astype(float),
            base_values=float(expected_value_scalar),
            data=sample_features.astype(float),
            feature_names=sample_feature_names,
        )
        # Use show=False so nbconvert captures the Matplotlib figure
        shap.plots.waterfall(exp, max_display=12, show=False)
        plt.title(f"SHAP Local Explanation — Sample {i}", pad=10)
        plt.tight_layout()
        plt.show()
    else:
        # Legacy fallback (still matplotlib-based)
        if shap_waterfall_legacy is not None:
            shap_waterfall_legacy(
                shap_values=sample_values.astype(float),
                feature_names=sample_feature_names,
                max_display=12,
                show=False,
                expected_value=float(expected_value_scalar),
                features=sample_features.astype(float),
            )
            plt.title(f"SHAP Local Explanation — Sample {i}", pad=10)
            plt.tight_layout()
            plt.show()
        else:
            print("❌ Skipping plot: no compatible SHAP waterfall function available.")

💧 Local SHAP Waterfall — Sample Index: 3904

💧 Local SHAP Waterfall — Sample Index: 577

💧 Local SHAP Waterfall — Sample Index: 2209

# === Predictive Scenario Simulation — GDP Growth Shocks ===

# --- Define Counterfactual Scenarios (±2 Percentage Points) ---
scenarios = {
    "Baseline": X.copy(),
    "High GDP Growth (+2pp)": X.assign(gdp_growth=X["gdp_growth"] + 2),
    "Low GDP Growth (−2pp)": X.assign(gdp_growth=X["gdp_growth"] - 2),
}

# --- Generate Mean Predictions for Each Scenario ---
results = {}
for name, X_s in scenarios.items():
    preds = model.predict(X_s)
    results[name] = preds.mean()

results_series = pd.Series(results)
deltas = results_series - results_series["Baseline"]

# --- Visualization ---
fig, ax = plt.subplots(figsize=(6, 4))
bars = results_series.plot(kind="bar", color="teal", ax=ax)

# Annotate bars with numeric labels
ax.bar_label(ax.containers[0], fmt="%.2f", padding=4)
plt.title("Scenario: Impact of GDP Growth on Predicted Migration", pad=10)
plt.ylabel("Average Predicted Net Migration (per 1,000)")
plt.xlabel("")
plt.tight_layout()
plt.show()

# --- Δ vs. Baseline ---
print("Mean prediction differences vs. Baseline (Δ per 1,000):")
display(deltas.round(4).to_frame("Δ Prediction"))

Mean prediction differences vs. Baseline (Δ per 1,000):

# === Predictive Scenario Simulation — Demographic Shifts ===

# --- Define "What-If" Demographic Scenarios ---
demo_scenarios = {
    "Baseline": X.copy(),
    "Higher Adolescent Fertility (+10%)": X.assign(adol_fertility=X["adol_fertility"] * 1.10),
    "Lower Adolescent Fertility (−10%)": X.assign(adol_fertility=X["adol_fertility"] * 0.90),
    "Higher Human Development (+5%)": X.assign(hdi=X["hdi"] * 1.05),
}

# --- Generate Mean Predictions for Each Scenario ---
demo_results = {}
for name, X_s in demo_scenarios.items():
    preds = model.predict(X_s)
    demo_results[name] = preds.mean()

demo_series = pd.Series(demo_results)
demo_deltas = demo_series - demo_series["Baseline"]

# --- Visualization ---
fig, ax = plt.subplots(figsize=(7, 4))
bars = demo_series.plot(kind="bar", color="darkorange", ax=ax)

# Annotate bars with numeric labels
ax.bar_label(ax.containers[0], fmt="%.2f", padding=4)
plt.title("Scenario: Demographic and Development Shifts — Migration Outcomes", pad=10)
plt.ylabel("Average Predicted Net Migration (per 1,000)")
plt.xlabel("")
plt.tight_layout()
plt.show()

# --- Δ vs. Baseline ---
print("Mean prediction differences vs. Baseline (Δ per 1,000):")
display(demo_deltas.round(4).to_frame("Δ Prediction"))

Mean prediction differences vs. Baseline (Δ per 1,000):

# === Scenario Runner — Δ vs Baseline (Global, Regional, Local) ===

def run_scenarios(scenarios: dict):
    """
    Run multiple counterfactual scenarios and compute deltas vs baseline.
    Returns:
        preds: dict of full model predictions
        deltas: dict of (prediction - baseline) for each scenario
    """
    preds = {name: model.predict(df_) for name, df_ in scenarios.items()}
    base = preds["Baseline"]
    deltas = {
        name: pd.Series(pred - base, index=scenarios[name].index)
        for name, pred in preds.items()
        if name != "Baseline"
    }
    return preds, deltas


# --- Run Scenarios (e.g., GDP growth shocks from previous block)
preds, deltas = run_scenarios(scenarios)

# --- Summary Table of Global Mean Δ ---
summary_global = pd.DataFrame(
    {k: v.mean() for k, v in deltas.items()},
    index=["Δ mean prediction (net migration per 1,000 population)"]
).T

display(summary_global.style.format("{:+.3f}"))

# --- Global Δ Plot ---
plt.figure(figsize=(7, 4.5))
ax = sns.barplot(
    x=summary_global.index,
    y=summary_global["Δ mean prediction (net migration per 1,000 population)"],
    color="steelblue",
    edgecolor="black",
)

plt.title("Scenario Δ Predictions vs Baseline (Net Migration per 1,000)", pad=14, fontsize=12)
plt.xlabel("Scenario", fontsize=11)
plt.ylabel("Δ Prediction vs Baseline", fontsize=11)

# Annotate values above bars with more spacing
for i, v in enumerate(summary_global["Δ mean prediction (net migration per 1,000 population)"]):
    ax.text(i, v + np.sign(v) * 0.03, f"{v:+.2f}",
            ha="center", va="bottom", fontsize=10, fontweight="medium")

# Improve layout spacing
plt.xticks(rotation=0)
plt.grid(axis="y", linestyle="--", alpha=0.6)
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()


# === Regional Breakdown of Scenario Effects ===

# Ensure df_context aligns with model-ready data indices
df_context = df_context.reindex(df_model.index)

if "Region" in df_context.columns:
    by_region = (
        pd.concat(
            {k: v.groupby(df_context.loc[v.index, "Region"]).mean() for k, v in deltas.items()},
            axis=1
        )
        .sort_index()
    )

    # Sort regions by magnitude of mean Δ (strongest effects first)
    by_region = by_region.reindex(
        by_region.abs().mean(axis=1).sort_values(ascending=False).index
    )

    print("Regional mean Δ vs Baseline (net migration per 1,000):")
    display(by_region.style.format("{:+.3f}").background_gradient(cmap="coolwarm", axis=None))

    # --- Regional Heatmap ---
    plt.figure(figsize=(8, 4.5))
    sns.heatmap(by_region, annot=True, fmt="+.2f", cmap="coolwarm", center=0)
    plt.title("Regional Scenario Effects (Δ vs Baseline)", pad=10)
    plt.xlabel("Scenario")
    plt.ylabel("Region")
    plt.tight_layout()
    plt.show()

Regional mean Δ vs Baseline (net migration per 1,000):

# === Top Movers — Country-Year Level ===

if {"Country Name", "year"}.issubset(df_context.columns):
    for k, v in deltas.items():
        top_up = v.sort_values(ascending=False).head(10)
        top_down = v.sort_values().head(10)

        print(f"\n📈 {k} — Top 10 Increases:")
        display(
            df_context.loc[top_up.index, ["Country Name", "year"]]
            .assign(Δ=top_up.values)
            .reset_index(drop=True)
        )

        print(f"\n📉 {k} — Top 10 Decreases:")
        display(
            df_context.loc[top_down.index, ["Country Name", "year"]]
            .assign(Δ=top_down.values)
            .reset_index(drop=True)
        )

📈 High GDP Growth (+2pp) — Top 10 Increases:

📉 High GDP Growth (+2pp) — Top 10 Decreases:

📈 Low GDP Growth (−2pp) — Top 10 Increases:

📉 Low GDP Growth (−2pp) — Top 10 Decreases:

# === Regional Feature Context — Average Key Indicators ===

# --- Attach Region back to feature matrix if missing ---
if "Region" not in X.columns and "Region" in df_context.columns:
    X["Region"] = df_context["Region"]

# --- Compute regional averages for selected core features ---
region_effects = (
    X.groupby("Region")[["gdp_growth", "adol_fertility", "hdi"]]
    .mean()
    .sort_index()
)

# --- Visualization ---
fig, ax = plt.subplots(figsize=(10, 6))
region_effects.plot(kind="bar", ax=ax)

plt.title("Regional Average Socioeconomic and Demographic Indicators", pad=10)
plt.ylabel("Mean Value")
plt.xlabel("Region")
plt.xticks(rotation=45, ha="right")

# Add grid and adjust layout
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.tight_layout()
plt.show()

Author	Golib Sanaev
Project	Forecasting Migration Flows with Machine Learning
Created	2025-09-23
Last Updated	2025-10-14

	Mean Importance
pop_growth	1.4900
gdp_per_capita	0.2662
adol_fertility	0.1099
trade_openness_ratio	0.0866
hdi	0.0628
gdp_growth	0.0404
unemployment	0.0401
is_crisis_lag1	0.0228
urbanization_rate_stable	0.0222
year	0.0159
gdp_growth_x_Low income	0.0000
gdp_growth_x_Lower middle income	0.0000
gdp_growth_x_Upper middle income	0.0000
IncomeGroup_Low income	0.0000
IncomeGroup_Lower middle income	0.0000

Metric	Description
mean \|SHAP\|	Average magnitude of impact (overall importance).
Spearman correlation	Indicates direction: positive values mean that higher feature values raise predicted net migration; negative values imply the opposite.

	mean_\|SHAP\|	direction (Spearman)
pop_growth	4.403900	0.970100
gdp_per_capita	2.807900	0.909500
adol_fertility	1.101900	-0.871000
trade_openness_ratio	0.707500	0.779900
hdi	0.616000	0.852200
unemployment	0.261700	0.572800
year	0.168100	0.670600
urbanization_rate_stable	0.161800	0.393500
gdp_growth	0.136300	0.791600
Region_Europe & Central Asia	0.108500	nan
pop_growth_x_Lower middle income	0.073400	nan
gdp_growth_x_Low income	0.073400	nan
Region_Middle East & North Africa	0.058900	nan
is_crisis_lag1	0.041700	0.109300
pop_growth_x_Upper middle income	0.041300	nan

	Top 5 Drivers
Region
East Asia & Pacific	['pop_growth', 'gdp_per_capita', 'adol_fertility', 'trade_openness_ratio', 'hdi']
Europe & Central Asia	['pop_growth', 'gdp_per_capita', 'adol_fertility', 'trade_openness_ratio', 'hdi']
Latin America & Caribbean	['pop_growth', 'gdp_per_capita', 'adol_fertility', 'hdi', 'trade_openness_ratio']
Middle East & North Africa	['pop_growth', 'gdp_per_capita', 'adol_fertility', 'hdi', 'trade_openness_ratio']
North America	['gdp_per_capita', 'pop_growth', 'adol_fertility', 'hdi', 'trade_openness_ratio']
South Asia	['pop_growth', 'gdp_per_capita', 'adol_fertility', 'trade_openness_ratio', 'hdi']
Sub-Saharan Africa	['pop_growth', 'gdp_per_capita', 'adol_fertility', 'hdi', 'trade_openness_ratio']

📘 Forecasting Migration Flows — Notebook 04: Model Interpretation and Scenario Analysis¶

🎯 Purpose¶

📑 Table of Contents¶

⚙️ 1. Setup and Load Model-Ready Data¶

🌲 2. Global Feature Importance — Random Forest ¶

🧠 3. Model Explainability with SHAP ¶

🧩 3.1. SHAP Setup (TreeExplainer) ¶

🔍 3.2. Global SHAP Feature Importance ¶

↕️ 3.3. Feature-Level Direction and Strength ¶

📈 3.4. SHAP Dependence Plots ¶

🌍 3.5. Regional Sensitivity Analysis ¶

🗺️ 3.6. Regional Composition — Top-5 Drivers ¶

💧 3.7. Local Explanations — Waterfall Plots ¶

🧪 4. Scenario Simulation Experiments ¶

💹 4.1. GDP Growth Shocks ¶

👥 4.2. Demographic and Development Shifts ¶

🚀 4.3. Scenario Runner — Global, Regional and Local Δ ¶

📊 4.4. Top Movers — Country-Year Highlights ¶

🧭 5. Regional Feature Context ¶

🧾 6. Key Insights and Next Steps ¶

	Δ Prediction
Baseline	0.0000
High GDP Growth (+2pp)	0.0635
Low GDP Growth (−2pp)	-0.0688

	Δ Prediction
Baseline	0.0000
Higher Adolescent Fertility (+10%)	-0.1808
Lower Adolescent Fertility (−10%)	0.1797
Higher Human Development (+5%)	0.2185

	High GDP Growth (+2pp)	Low GDP Growth (−2pp)
Region
Middle East & North Africa	+0.124	-0.129
Europe & Central Asia	+0.132	-0.094
East Asia & Pacific	+0.073	-0.069
North America	-0.010	-0.081
South Asia	+0.011	-0.051
Sub-Saharan Africa	-0.007	-0.032
Latin America & Caribbean	+0.006	-0.031

	Country Name	year	Δ
0	Albania	1995	19.219643
1	Armenia	2006	16.412004
2	Moldova	2021	16.403101
3	Armenia	2002	15.941600
4	Armenia	2007	15.336004
5	Armenia	2003	14.524618
6	Armenia	2005	12.940585
7	Georgia	2007	8.984493
8	China	2006	6.517327
9	United Arab Emirates	2017	5.017619

	Country Name	year	Δ
0	Zimbabwe	1992	-2.544630
1	Eritrea	1995	-1.689419
2	Eritrea	2003	-1.636124
3	Jordan	2013	-1.450958
4	South Sudan	1999	-1.213621
5	South Sudan	1997	-1.185994
6	United Arab Emirates	2014	-1.125625
7	Guinea-Bissau	1996	-0.790550
8	Syrian Arab Republic	2018	-0.752600
9	South Sudan	2020	-0.733347

	Country Name	year	Δ
0	Cameroon	1993	2.849440
1	Gambia, The	2011	1.755261
2	Afghanistan	2007	0.968255
3	Uganda	1995	0.941282
4	Korea, Rep.	2007	0.880607
5	United Arab Emirates	2016	0.860106
6	Germany	2006	0.855100
7	Mauritania	1995	0.850444
8	Timor-Leste	2003	0.772941
9	Madagascar	2020	0.759903

	Country Name	year	Δ
0	Bosnia and Herzegovina	1995	-15.858746
1	Cabo Verde	2022	-12.603443
2	Togo	1994	-8.713796
3	Croatia	2001	-3.649290
4	Georgia	1996	-3.541570
5	Kuwait	2018	-3.079227
6	Serbia	2022	-2.887825
7	Bahrain	1991	-2.810712
8	Singapore	1995	-2.715122
9	Belgium	2020	-2.713292