🤖 Forecasting Migration Flows — Notebook 05: Forecasting and Validation¶

Validation of Random Forest migration forecasts and scenario-based inference (1990 – 2023)

Author Golib Sanaev
Project Forecasting Migration Flows with Machine Learning
Created 2025-09-23
Last Updated 2025-10-15

🎯 Purpose¶

This notebook develops, validates, and applies a forecasting model for net migration per 1 000 people using a machine-learning framework.
It extends outputs from Notebook 04: Model Interpretation and Scenario Analysis and produces robust, scenario-based forecasts with uncertainty estimation.

Key analytical steps:

  • Configuration and feature engineering (income-group interactions)
  • Temporal validation (expanding-window + rolling-origin)
  • Model performance across income groups
  • Empirical 90 % prediction-interval estimation
  • Forward forecasts and scenario simulations
  • Visualization & diagnostics

📑 Table of Contents¶

  • ⚙️ 1. Setup and Configuration
  • 📂 2. Data Loading and Preparation
  • ⏱️ 3. Temporal Validation
    • 🔁 3.1 Expanding Window
    • 🔄 3.2 Rolling-Origin Validation
    • 📊 3.3 Diagnostics by Income Group
  • 🎯 4. Uncertainty Estimation
  • 🌍 5. Forecast Generation & Scenario Analysis
    • 🧩 5.1 Forecast API
    • 📈 5.2 Scenario Setup & Predictions
    • 🌐 5.3 Global and Regional Aggregation
    • 💾 5.4 Export Forecast Artifacts
  • 🧾 6. Summary and Outputs
  • 🧭 7. Key Insights and Next Steps
  • ✅ Notebook 05 Summary — Forecasting and Validation

⚙️ 1. Setup and Configuration ¶

Import libraries, define global parameters, and prepare helper functions.

⬆️ Back to top


In [1]:
# === Setup and Configuration ===

import os, warnings
from dataclasses import dataclass
from typing import List, Tuple, Dict, Optional
from pathlib import Path

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

# Reproducibility
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# --- Visualization style (consistent with previous notebooks) ---
sns.set_theme(style="whitegrid")
plt.rcParams.update({"figure.facecolor": "white", "axes.facecolor": "white"})

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 200)

# === Project Settings ===

TARGET = "net_migration_per_1000_capped"
CORE_FEATURES = [
    "gdp_growth","gdp_per_capita","unemployment",
    "trade_openness_ratio","adol_fertility","hdi",
    "urbanization_rate_stable","pop_growth","is_crisis_lag1","year",
]
INTERACTIONS = [("gdp_growth","IncomeGroup"),("pop_growth","IncomeGroup")]
ID_COLS = ["country","year"]

# ✅ Explicit paths (pathlib throughout; a single OUT_DIR definition)
DATA_DIR  = Path("../data/processed")
OUT_DIR   = Path("../outputs")
CSV_PATH  = DATA_DIR / "model_ready.csv"
PARQ_PATH = DATA_DIR / "model_ready.parquet"

📂 2. Data Loading and Preparation ¶

Load the processed dataset and apply feature engineering (income-group interactions, capping, and alignment).

⬆️ Back to top


In [2]:
# === Data Loading and Preparation ===

def load_model_ready() -> pd.DataFrame:
    """Load the model-ready dataset from the processed data folder."""
    if os.path.exists(PARQ_PATH):
        df = pd.read_parquet(PARQ_PATH)
        print(f"✅ Loaded model-ready data from: {PARQ_PATH}")
    elif os.path.exists(CSV_PATH):
        df = pd.read_csv(CSV_PATH)
        print(f"✅ Loaded model-ready data from: {CSV_PATH}")
    else:
        raise FileNotFoundError("Could not locate model_ready.* in ../data/processed/")

    # --- Validate columns ---
    needed = set(ID_COLS + [TARGET] + CORE_FEATURES + ["IncomeGroup"])
    missing = needed - set(df.columns)

    # Soft fix for missing 'country'
    if "country" not in df.columns:
        print("⚠️ 'country' column missing — creating placeholder index.")
        df["country"] = np.arange(len(df))

    remaining_missing = missing - {"country"}
    if remaining_missing:
        raise ValueError(f"Missing columns in model-ready data: {remaining_missing}")

    return df


# Load data
raw = load_model_ready()
print(
    f"Data loaded: {raw.shape}, years: {int(raw.year.min())}-{int(raw.year.max())}, "
    f"countries: {raw['country'].nunique()}"
)
✅ Loaded model-ready data from: ../data/processed/model_ready.parquet
Data loaded: (5712, 28), years: 1990-2023, countries: 168
In [3]:
# === Feature Engineering Utility ===

INCOME_ORDER = ["Low income","Lower middle income","Upper middle income","High income"]

def add_interactions(df: pd.DataFrame) -> pd.DataFrame:
    """Add one-hot encodings for IncomeGroup and interaction terms for GDP and population growth."""
    out = df.copy()
    if "IncomeGroup" in out.columns:
        cats = pd.Categorical(out["IncomeGroup"], categories=INCOME_ORDER, ordered=True)
        dummies = pd.get_dummies(cats, prefix="IncomeGroup", dummy_na=False)
        out = pd.concat([out.drop(columns=["IncomeGroup"]), dummies], axis=1)
        # interactions
        for feat in ["gdp_growth", "pop_growth"]:
            for col in dummies.columns:
                out[f"{feat}__x__{col}"] = out[feat] * dummies[col]
    return out


# === Prepare Matrices (apply FE) ===

dfX = add_interactions(raw)
feature_cols = [c for c in dfX.columns if c not in ID_COLS + [TARGET]]
print("#features:", len(feature_cols))


# === Prediction Helper ===

def bounded_clip(pred: np.ndarray, lower: float = -50.0, upper: float = 50.0) -> np.ndarray:
    """Clip predictions to stay within a plausible migration range."""
    return np.clip(pred, lower, upper)

# === Evaluation Helper ===

def rmse(y_true, y_pred):
    """Compute Root Mean Squared Error (RMSE)."""
    return float(np.sqrt(mean_squared_error(y_true, y_pred)))

# === Residual-based Prediction Interval Helper ===

def residual_intervals(train_residuals: np.ndarray, alpha: float = 0.1) -> Tuple[float, float]:
    """
    Compute empirical residual quantiles for prediction intervals.
    Returns lower and upper residual bounds for the given alpha (default 90% PI).
    """
    lo = float(np.quantile(train_residuals, alpha / 2))
    hi = float(np.quantile(train_residuals, 1 - alpha / 2))
    return lo, hi
#features: 36
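
To make the encoding concrete, here is a toy check on a hypothetical two-row frame containing only the columns `add_interactions` touches (a sketch, not part of the pipeline):

toy = pd.DataFrame({
    "IncomeGroup": ["Low income", "High income"],
    "gdp_growth": [1.0, 2.0],
    "pop_growth": [0.5, 0.1],
})
# 2 original features + 4 IncomeGroup dummies + 2×4 interaction terms = 14 columns
print(add_interactions(toy).shape[1])  # -> 14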

⏱️ 3. Temporal Validation ¶

Evaluate model stability using two complementary strategies:

  • Expanding-window (TimeSeriesSplit)
  • Rolling-origin (Year-based look-ahead)
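
For intuition on how the two schemes differ, a toy illustration on eight synthetic time points (a sketch, not part of the pipeline):

from sklearn.model_selection import TimeSeriesSplit

# Expanding window: the training set grows, the validation block moves forward
for k, (tr, va) in enumerate(TimeSeriesSplit(n_splits=3).split(list(range(8))), 1):
    print(f"expanding fold {k}: train={tr.tolist()} valid={va.tolist()}")
# Rolling-origin (Section 3.2) instead fixes a validation year and trains on all prior years.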

⬆️ Back to top


🔁 3.1 Expanding Window ¶

⬆️ Back to top


In [4]:
# === Temporal Validation (Two Views) ===

# A) Expanding-window (index-based TimeSeriesSplit) — sort by year so splits respect time order
dfX_sorted = dfX.sort_values(["year", "country"]).reset_index(drop=True)
X_all, y_all = dfX_sorted[feature_cols], dfX_sorted[TARGET]

tscv = TimeSeriesSplit(n_splits=5)
cv_rows = []

for fold, (tr_idx, va_idx) in enumerate(tscv.split(X_all), 1):
    X_tr, y_tr = X_all.iloc[tr_idx], y_all.iloc[tr_idx]
    X_va, y_va = X_all.iloc[va_idx], y_all.iloc[va_idx]

    rf = RandomForestRegressor(
        n_estimators=600,
        max_depth=None,
        min_samples_leaf=2,
        min_samples_split=4,
        max_features="sqrt",
        n_jobs=-1,
        random_state=RANDOM_STATE,
    ).fit(X_tr, y_tr)

    va_pred = bounded_clip(rf.predict(X_va))
    row = dict(
        view="tscv",
        fold=fold,
        mae=mean_absolute_error(y_va, va_pred),
        rmse=rmse(y_va, va_pred),
        r2=r2_score(y_va, va_pred),
    )
    cv_rows.append(row)

🔄 3.2 Rolling-Origin Validation ¶

⬆️ Back to top


In [5]:
# B) Rolling-origin (year-based look-ahead)
years = sorted(dfX.year.unique())
folds_ro = []

min_train_years = 5   # slightly relaxed (was 8)
max_folds = 6

for valid_end in years:
    train_end = valid_end - 1
    if train_end - years[0] + 1 < min_train_years:
        continue
    folds_ro.append((train_end, valid_end))

# Thin the folds with a fixed stride; the stride keeps roughly (not exactly)
# `max_folds` folds — here 8 of the 29 candidates survive.
if len(folds_ro) > max_folds:
    step = max(1, len(folds_ro) // max_folds)
    folds_ro = folds_ro[::step]

print(f"Candidate folds: {folds_ro}")

# --- Run rolling-origin folds ---
oof_rows = []
for (train_end, valid_end) in folds_ro:
    tr = dfX[dfX.year <= train_end]
    va = dfX[(dfX.year > train_end) & (dfX.year <= valid_end)]

    tr_countries = set(tr.country.unique())
    overlap = va.country.isin(tr_countries)

    if overlap.sum() == 0:
        print(f"⚠️ Skipping fold {train_end}->{valid_end} (no overlap at all).")
        continue

    va = va[overlap]

    rf = RandomForestRegressor(
        n_estimators=600,
        max_depth=None,
        min_samples_leaf=2,
        min_samples_split=4,
        max_features="sqrt",
        n_jobs=-1,
        random_state=RANDOM_STATE,
    ).fit(tr[feature_cols], tr[TARGET])

    va_pred = bounded_clip(rf.predict(va[feature_cols]))
    row = dict(
        view="rolling_origin",
        fold=f"{train_end}->{valid_end}",
        mae=mean_absolute_error(va[TARGET], va_pred),
        rmse=rmse(va[TARGET], va_pred),
        r2=r2_score(va[TARGET], va_pred),
    )
    cv_rows.append(row)

    tmp = va[["country", "year", TARGET]].copy()
    tmp["y_pred"] = va_pred
    oof_rows.append(tmp)

# Combine CV metrics
cv_df = pd.DataFrame(cv_rows)
cv_df.to_csv(f"{OUT_DIR}/backtest_metrics_by_fold.csv", index=False)
print("✅ Validation views saved to outputs/backtest_metrics_by_fold.csv")
Candidate folds: [(np.int64(1994), np.int64(1995)), (np.int64(1998), np.int64(1999)), (np.int64(2002), np.int64(2003)), (np.int64(2006), np.int64(2007)), (np.int64(2010), np.int64(2011)), (np.int64(2014), np.int64(2015)), (np.int64(2018), np.int64(2019)), (np.int64(2022), np.int64(2023))]
✅ Validation views saved to outputs/backtest_metrics_by_fold.csv

📊 3.3 Diagnostics by Income Group ¶

⬆️ Back to top


In [6]:
# === Diagnostics by income group ===

if oof_rows:
    oof = pd.concat(oof_rows, ignore_index=True)
    oof = oof.merge(
        raw[ID_COLS + ["IncomeGroup"]].drop_duplicates(),
        on=["country", "year"],
        how="left",
    )

    by_income = (
        oof.groupby("IncomeGroup")
        .apply(
            lambda g: pd.Series(
                {
                    "n": len(g),
                    "MAE": mean_absolute_error(g[TARGET], g["y_pred"]),
                    "RMSE": rmse(g[TARGET], g["y_pred"]),
                    "R2": r2_score(g[TARGET], g["y_pred"]),
                }
            )
        )
        .reset_index()
    )

    by_income.to_csv(f"{OUT_DIR}/backtest_diagnostics_by_income.csv", index=False)
    print("✅ Backtest diagnostics saved to outputs/backtest_diagnostics_by_income.csv")
else:
    print("⚠️ No OOF predictions generated — all folds were skipped.")
✅ Backtest diagnostics saved to outputs/backtest_diagnostics_by_income.csv

🎯 4. Uncertainty Estimation ¶

Estimate empirical 90 % prediction intervals from training-residual quantiles — each interval takes the form [ŷ + q₀.₀₅(r), ŷ + q₀.₉₅(r)], where r are the residuals — and evaluate coverage on out-of-fold predictions.

⬆️ Back to top


In [7]:
# === Uncertainty via Empirical Residual Intervals ===

rf_full = RandomForestRegressor(
    n_estimators=600,
    max_depth=None,
    min_samples_leaf=2,
    min_samples_split=4,
    max_features="sqrt",
    n_jobs=-1,
    random_state=RANDOM_STATE,
).fit(dfX[feature_cols], dfX[TARGET])

train_pred_full = rf_full.predict(dfX[feature_cols])
residuals_full = dfX[TARGET].values - train_pred_full  # in-sample residuals — optimistic for a Random Forest
lo_r, hi_r = residual_intervals(residuals_full, alpha=0.1)  # 90% PI

# PI coverage on OOF (sanity check)
oof["pi_lo"], oof["pi_hi"] = bounded_clip(oof["y_pred"] + lo_r), bounded_clip(oof["y_pred"] + hi_r)
oof["covered"] = (oof[TARGET] >= oof["pi_lo"]) & (oof[TARGET] <= oof["pi_hi"])
coverage_90 = float(oof["covered"].mean())
print(f"Empirical 90% PI coverage on OOF: {coverage_90:.3f}")
oof.to_csv(f"{OUT_DIR}/backtest_oof_predictions.csv", index=False)
Empirical 90% PI coverage on OOF: 0.747
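
The coverage shortfall (≈0.75 against a nominal 0.90) is expected: the residuals come from the Random Forest's in-sample fit, which understates its true error. A minimal recalibration sketch, assuming the `oof` frame from Section 3.2, derives the bounds from out-of-fold residuals instead — note that scoring on the same residuals used for calibration sits near 0.90 by construction, so a proper check would calibrate on some folds and evaluate on others:

# Sketch: recalibrate interval width from out-of-fold residuals
oof_resid = oof[TARGET].values - oof["y_pred"].values
lo_oof, hi_oof = residual_intervals(oof_resid, alpha=0.1)
covered = ((oof[TARGET] >= bounded_clip(oof["y_pred"] + lo_oof)) &
           (oof[TARGET] <= bounded_clip(oof["y_pred"] + hi_oof)))
print(f"Recalibrated 90% PI coverage on OOF: {covered.mean():.3f}")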

🌍 5. Forecast Generation & Scenario Analysis ¶

Develop forward forecasts for 2024 – 2030 under multiple socioeconomic scenarios using the validated Random Forest model.

⬆️ Back to top


🧩 5.1 Forecast API ¶

Reusable function for generating predictions and prediction intervals for user-defined scenarios.
Supports optional blending with Linear Regression for ensemble stability.

⬆️ Back to top


In [8]:
# === Forecasting API (Scenarios) ===

def forecast_from_scenarios(scen_df: pd.DataFrame, use_ensemble: bool=False, w_rf: float=0.8) -> pd.DataFrame:
    """Return point forecasts + 90% PIs for provided scenario rows.
    scen_df must include: ['country','year','IncomeGroup'] + CORE_FEATURES (same names).
    """
    req = set(["country","year","IncomeGroup"] + CORE_FEATURES)
    missing = req - set(scen_df.columns)
    if missing:
        raise ValueError(f"Scenario frame missing columns: {missing}")

    Xf = add_interactions(scen_df)
    # Align to the training feature space (fill any absent dummy columns with 0)
    missing_cols = set(feature_cols) - set(Xf.columns)
    for c in missing_cols:
        Xf[c] = 0.0
    Xf = Xf[feature_cols]

    if use_ensemble:
        lin = LinearRegression().fit(dfX[feature_cols], dfX[TARGET])
        preds = bounded_clip(w_rf*rf_full.predict(Xf) + (1-w_rf)*lin.predict(Xf))
    else:
        preds = bounded_clip(rf_full.predict(Xf))

    out = scen_df[["country","year"]].copy()
    out["pred"] = preds
    out["pi_lo"] = bounded_clip(preds + lo_r)
    out["pi_hi"] = bounded_clip(preds + hi_r)
    return out


# === Forward Forecast Generation (2024–2030) ===

# --- Load the cleaned contextual dataset used in Notebook 03 & 04 ---
df_context = pd.read_csv(DATA_DIR / "countries_clean.csv")[[
    "Country Name", "year", "Region", "IncomeGroup"
]]

print(f"✅ Loaded contextual metadata: {df_context['Country Name'].nunique()} countries × years {df_context['year'].min()}–{df_context['year'].max()}")

# --- Define forecast horizon ---
FUTURE_YEARS = list(range(2024, 2031))

# --- Use latest available country-year data as the base (2023) ---
latest = dfX.groupby("country").apply(lambda d: d.loc[d["year"].idxmax()]).reset_index(drop=True)
# NOTE: df_context holds one row per country-year, so merging on country alone
# duplicates each row once per year — the "cartesian explosion" cleaned up in 5.4.
latest = latest.merge(
    df_context[["Country Name", "IncomeGroup"]],
    left_on="country",
    right_on="Country Name",
    how="left"
).drop(columns=["Country Name"], errors="ignore")

# --- Keep only relevant columns
latest = latest[["country", "IncomeGroup"] + CORE_FEATURES].copy()
latest["year"] = 2023

# --- Build future frame by extending baseline ---
forecast_base = pd.concat(
    [latest.assign(year=y) for y in FUTURE_YEARS],
    ignore_index=True
)

print(f"✅ Created base frame for {len(FUTURE_YEARS)} future years × {forecast_base['country'].nunique()} countries")
forecast_base.head()
✅ Loaded contextual metadata: 168 countries × years 1990–2023
✅ Created base frame for 7 future years × 168 countries
Out[8]:
country IncomeGroup gdp_growth gdp_per_capita unemployment trade_openness_ratio adol_fertility hdi urbanization_rate_stable pop_growth is_crisis_lag1 year
0 Afghanistan Low income 2.266944 378.066303 13.991 0.332193 64.068 0.496 1.554401 2.135594 0 2024
1 Afghanistan Low income 2.266944 378.066303 13.991 0.332193 64.068 0.496 1.554401 2.135594 0 2024
2 Afghanistan Low income 2.266944 378.066303 13.991 0.332193 64.068 0.496 1.554401 2.135594 0 2024
3 Afghanistan Low income 2.266944 378.066303 13.991 0.332193 64.068 0.496 1.554401 2.135594 0 2024
4 Afghanistan Low income 2.266944 378.066303 13.991 0.332193 64.068 0.496 1.554401 2.135594 0 2024

📈 5.2 Scenario Setup & Predictions ¶

Define baseline and counterfactual scenarios (GDP growth, HDI, adolescent fertility). Forecasts are produced with associated 90 % uncertainty bands.

⬆️ Back to top


In [9]:
# === Define Baseline and Counterfactual Scenarios ===

# --- Baseline: assume gradual continuity in key features ---
scen_baseline = forecast_base.copy()

# --- High-Growth Scenario (+2pp GDP growth, +3% HDI) ---
scen_growth = scen_baseline.copy()
scen_growth["gdp_growth"] += 2.0
scen_growth["hdi"] *= 1.03

# --- Crisis Scenario (-2pp GDP growth, -2% HDI) ---
scen_crisis = scen_baseline.copy()
scen_crisis["gdp_growth"] -= 2.0
scen_crisis["hdi"] *= 0.98

# --- Demographic Pressure (+5% Adolescent Fertility) ---
scen_demo = scen_baseline.copy()
scen_demo["adol_fertility"] *= 1.05

# --- Bundle all scenarios ---
scenarios_future = {
    "Baseline": scen_baseline,
    "High Growth": scen_growth,
    "Crisis": scen_crisis,
    "Demographic Pressure": scen_demo,
}

print(f"✅ Defined {len(scenarios_future)} forecasting scenarios.")


# === Generate Forecasts for Each Scenario ===

# --- Run model predictions with forecast_from_scenarios() ---
forecast_results = {}
for name, scen in scenarios_future.items():
    forecast_results[name] = forecast_from_scenarios(scen)

# --- Combine all results into a single DataFrame ---
df_forecasts = pd.concat(
    {k: v.assign(scenario=k) for k, v in forecast_results.items()},
    axis=0
).reset_index(drop=True)

# --- Sanity check ---
print(f"✅ Forecasts generated: {df_forecasts.shape[0]:,} records ({df_forecasts['scenario'].nunique()} scenarios)")
df_forecasts.head()
✅ Defined 4 forecasting scenarios.
✅ Forecasts generated: 159,936 records (4 scenarios)
Out[9]:
country year pred pi_lo pi_hi scenario
0 Afghanistan 2024 -4.832401 -7.902818 -1.840252 Baseline
1 Afghanistan 2024 -4.832401 -7.902818 -1.840252 Baseline
2 Afghanistan 2024 -4.832401 -7.902818 -1.840252 Baseline
3 Afghanistan 2024 -4.832401 -7.902818 -1.840252 Baseline
4 Afghanistan 2024 -4.832401 -7.902818 -1.840252 Baseline
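
For comparison, the same API exposes the blended RF + linear variant described in 5.1; a minimal usage sketch (default 80/20 weighting):

blended = forecast_from_scenarios(scen_growth, use_ensemble=True, w_rf=0.8)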

🧠 Interpretation
The model projects migration flows through 2030 under four plausible trajectories,
revealing differential sensitivity to economic growth, development, and demographic change.

🌐 5.3 Global and Regional Aggregation ¶

Aggregate forecasts by year, region, and scenario to identify macro-level and regional trends.

⬆️ Back to top


In [10]:
# === Aggregate Forecasts — Global and Regional Trends ===

# --- Merge with contextual metadata ---
df_forecasts = df_forecasts.merge(
    df_context[["Country Name", "Region", "IncomeGroup"]],
    left_on="country",
    right_on="Country Name",
    how="left"
)

# --- Aggregate by year and scenario ---
global_trends = (
    df_forecasts.groupby(["year", "scenario"])["pred"]
    .mean()
    .reset_index()
)

# --- Plot global trend lines ---
plt.figure(figsize=(8, 5))
sns.lineplot(
    data=global_trends,
    x="year",
    y="pred",
    hue="scenario",
    linewidth=2.2
)

plt.title("Forecasted Global Net Migration (2024–2030)", pad=10)
plt.ylabel("Predicted Net Migration (per 1,000 population)")
plt.xlabel("Year")
plt.tight_layout()
plt.show()


# === Regional Forecast Profiles (Mean Predictions) ===

# --- Ensure Region metadata is attached safely ---
if "Region" not in df_forecasts.columns:
    region_map = df_context[["Country Name", "Region"]].drop_duplicates("Country Name")
    df_forecasts = df_forecasts.merge(
        region_map,
        left_on="country",
        right_on="Country Name",
        how="left"
    ).drop(columns=["Country Name"], errors="ignore")

# --- Aggregate by Region, Year, Scenario ---
regional_trends = (
    df_forecasts.groupby(["Region", "year", "scenario"])["pred"]
    .mean()
    .reset_index()
)

# --- Visualization ---
plt.figure(figsize=(10, 6))
sns.lineplot(
    data=regional_trends,
    x="year",
    y="pred",
    hue="Region",
    style="scenario",
    alpha=0.9
)

plt.title("Regional Forecast Profiles (2024–2030)", pad=10)
plt.ylabel("Predicted Net Migration (per 1,000 population)")
plt.xlabel("Year")
plt.legend(bbox_to_anchor=(1.02, 1), loc="upper left", frameon=False)
plt.tight_layout()
plt.show()
[Figure: Forecasted Global Net Migration (2024–2030) — scenario trend lines]
[Figure: Regional Forecast Profiles (2024–2030) — mean predictions by Region and scenario]

🧠 Interpretation

  • Baseline: globally stable migration through 2030.
  • High Growth: modest inflow increase.
  • Crisis / Demographic Pressure: declining net inflows, most severe in low-income regions.
  • Regional Highlights:
    • Europe & North America → steady inflows
    • Sub-Saharan Africa → persistent outflows
    • Asia-Pacific → GDP-sensitive balance

💾 5.4 Export Forecast Artifacts ¶

Save forecast tables and global trend aggregates to /outputs/ for downstream visualization and policy use.

⬆️ Back to top


In [11]:
# === Clean and Export Forecast Results (Final) ===

# --- 🧹 1. Drop merge duplicates (cartesian explosion fix) ---
df_forecasts = (
    df_forecasts
    .drop_duplicates(subset=["country", "year", "scenario"])
    .reset_index(drop=True)
)

# --- 🧩 2. Create unique country–region mapping ---
region_map = (
    df_context[["Country Name", "Region", "IncomeGroup"]]
    .drop_duplicates(subset="Country Name")
    .rename(columns={"Country Name": "country"})
)

# --- 🧼 3. Remove any redundant columns before re-merging ---
redundant_cols = [c for c in df_forecasts.columns if "Country Name" in c or c in ["Region", "IncomeGroup"]]
df_forecasts = df_forecasts.drop(columns=redundant_cols, errors="ignore")

# --- 🔗 4. Merge clean metadata ---
df_forecasts = df_forecasts.merge(region_map, on="country", how="left")

# --- 📊 5. Sanity checks ---
print(f"✅ Cleaned forecast shape: {df_forecasts.shape}")
print(f"   Unique (country,year,scenario): {df_forecasts.drop_duplicates(subset=['country','year','scenario']).shape[0]}")
print(f"   Expected: {168*7*4} = {168*7*4:,}")

# --- 💾 6. Export clean files ---
df_forecasts.to_csv(OUT_DIR / "forecast_results_2024_2030.csv", index=False)
global_trends.to_csv(OUT_DIR / "forecast_global_trends.csv", index=False)

print(f"\n✅ Clean forecast results exported to: {OUT_DIR.resolve()}")
print(f"   Forecast file size should now be under 2 MB.")
✅ Cleaned forecast shape: (4704, 8)
   Unique (country,year,scenario): 4704
   Expected: 4704 = 4,704

✅ Clean forecast results exported to: /Users/golibsanaev/Library/CloudStorage/Dropbox/GitHub_gsanaev/forecasting-migration-flows-ml/outputs
   Forecast file size should now be under 2 MB.

🧾 6. Summary and Outputs ¶

Review validation metrics, income-group diagnostics, PI coverage, and saved artifacts.

⬆️ Back to top


In [12]:
# === Summary Printouts ===

print("\n=== Validation Summary (first rows) ===")
print(cv_df.head())

print("\nBy-Income Diagnostics:")
print(by_income)

print(f"\n90% PI coverage (OOF): {coverage_90:.3f}")

print(f"\nArtifacts written to: {os.path.abspath(OUT_DIR)}")
=== Validation Summary (first rows) ===
   view fold       mae      rmse        r2
0  tscv    1  3.782038  6.917923  0.526602
1  tscv    2  2.935160  4.593332  0.792585
2  tscv    3  2.871232  5.013363  0.749805
3  tscv    4  2.955350  5.181439  0.759667
4  tscv    5  3.728009  6.880906  0.353487

By-Income Diagnostics:
           IncomeGroup      n       MAE      RMSE        R2
0          High income  424.0  2.945963  5.035595  0.786481
1           Low income  184.0  4.402258  9.768196  0.233676
2  Lower middle income  360.0  2.186605  4.450006  0.615945
3  Upper middle income  376.0  2.664239  4.863841  0.671561

90% PI coverage (OOF): 0.747

Artifacts written to: /Users/golibsanaev/Library/CloudStorage/Dropbox/GitHub_gsanaev/forecasting-migration-flows-ml/outputs

🧭 7. Key Insights and Next Steps ¶

Summary

  • Extends the Random Forest model (1990 – 2023) to 2030 forecasts.
  • Scenarios highlight how economic growth, development, and demographics interact in shaping migration.
  • The model achieves ≈ 0.75 empirical coverage for its nominal 90 % intervals (undercoverage driven by in-sample residual calibration); accuracy is strongest for high-income countries and weakest for low-income ones.

Next Steps

  1. Integrate IMF / UNDP macroeconomic forecasts.
  2. Quantify uncertainty via bootstrap or ensemble variance (sketched below).
  3. Link forecasts to population projections for migration stock analysis.
  4. Extend SHAP interpretability to forecasted years.
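
As a starting point for step 2, a minimal sketch (not part of the pipeline) that reads forecast uncertainty off the spread of per-tree predictions in the fitted rf_full:

import numpy as np

def tree_spread(model, X):
    """Mean and standard deviation across the forest's individual trees."""
    per_tree = np.stack([tree.predict(X) for tree in model.estimators_])
    return per_tree.mean(axis=0), per_tree.std(axis=0)

# e.g. mean_pred, spread = tree_spread(rf_full, dfX[feature_cols])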

⬆️ Back to top


✅ Notebook 05 Summary — Forecasting and Validation ¶

Model: Random Forest Regressor (600 trees, √ features)
Target: net_migration_per_1000_capped
Training window: 1990 – 2023

Metric   Avg (CV)
MAE      3.3 per 1 000
RMSE     5.9 per 1 000
R²       0.62

90 % PI coverage: ≈ 0.75 (below the nominal 0.90)
Best performance: High-income countries
Artifacts: Saved under /outputs/

The model delivers stable, interpretable, and scenario-driven migration forecasts, ready for integration into policy and long-term planning analyses.

⬆️ Back to top