# %% ===============================================================
# STEP 0 — ENVIRONMENT SETUP
# ===============================================================
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings("ignore")

plt.style.use("seaborn-v0_8-whitegrid")
sns.set_palette("viridis")

BASE_DIR = Path("..")
DATA_PROCESSED = BASE_DIR / "data" / "processed"

for path in [DATA_PROCESSED]:
    path.mkdir(parents=True, exist_ok=True)

print("✅ Environment setup complete.")

✅ Environment setup complete.

# %% ===============================================================
# STEP 1 — LOAD DATASET
# Purpose: Load the cleaned hotel dataset produced in Notebook 1.
# ===============================================================

DATA_PATH = DATA_PROCESSED / "hotel_clean.csv"
df = pd.read_csv(DATA_PATH, parse_dates=["month"])

print(f"✅ Dataset loaded: {df.shape[0]:,} rows × {df.shape[1]} columns")
print(f"✅ Range: {df['month'].min().strftime('%Y-%m')} → {df['month'].max().strftime('%Y-%m')}")
print(f"✅ Countries: {df['region'].nunique()}")

✅ Dataset loaded: 3,328 rows × 10 columns
✅ Range: 2015-01 → 2025-08
✅ Countries: 26

# %% ===============================================================
# STEP 2 — FEATURE ENGINEERING: LAG VARIABLES
# Purpose: Create lag features for selected exogenous variables (L = 1, 2, 3).
# ===============================================================

lag_features = [
    "log_gdp", 
    "unemployment_rate", 
    "turnover_index", 
    "weighted_stringency_index", 
    "eurusd", 
    "eurgbp"
]

for col in lag_features:
    for L in [1, 2, 3]:
        df[f"{col}_lag{L}"] = df.groupby("region")[col].shift(L)

print(f"✅ Added {len(lag_features)*3} lag features.")

✅ Added 18 lag features.

# %% ===============================================================
# STEP 3 — FEATURE ENGINEERING: MONTH-ON-MONTH (MoM) CHANGES
# Purpose: Capture monthly growth dynamics.
# ===============================================================

mom_vars = ["log_gdp", "turnover_index"]

for col in mom_vars:
    df[f"{col}_mom"] = (
        df.groupby("region")[col].pct_change() * 100
    )

df.replace([np.inf, -np.inf], np.nan, inplace=True)

print(f"✅ Added {len(mom_vars)} MoM change features.")

✅ Added 2 MoM change features.

# %% ===============================================================
# STEP 4 — TEMPORAL FEATURES
# Purpose: Extract time-based components to capture seasonality.
# ===============================================================

df["m"] = df["month"].dt.month
df["quarter"] = df["month"].dt.quarter

print("✅ Added time-based features (month, quarter).")

✅ Added time-based features (month, quarter).

# %% ===============================================================
# STEP 5 — DATA QUALITY CHECK
# Purpose: Summarize missing values and basic descriptive stats..
# ===============================================================

# --- Missing values summary ---
missing_tbl = (
    df.isna()
    .sum()
    .reset_index()
    .rename(columns={"index": "column", 0: "missing_count"})
)
missing_tbl["missing_pct"] = (missing_tbl["missing_count"] / len(df) * 100).round(2)

print("✅ Missing Values Summary:")
display(missing_tbl.sort_values("missing_count", ascending=False).head(10))

# --- Quick numeric overview ---
print("\n✅ Numeric Feature Summary:")
display(df.describe().T.round(2))

✅ Missing Values Summary:

✅ Numeric Feature Summary:

# %% ===============================================================
# STEP 6 — EXPORT FEATURE-ENGINEERED DATASET
# Purpose: Save all relevant columns as a modeling-ready dataset.
# ===============================================================

# --- Ensure year column exists ---
df["year"] = df["month"].dt.year

# --- Identify feature columns ---
lag_cols = [c for c in df.columns if "_lag" in c or "_mom" in c]

# --- Define export column order ---
export_cols = [
    "region", "month", "year", "log_nights_spent"
] + lag_cols + [
    "log_gdp", "unemployment_rate", "turnover_index",
    "weighted_stringency_index",
    "eurusd", "eurgbp"
]

# --- Create a clean feature DataFrame ---
df_features = df[export_cols].copy()

# --- Save feature-engineered dataset ---
FEATURE_PATH = DATA_PROCESSED / "hotel_features.csv"
df_features.to_csv(FEATURE_PATH, index=False)

# --- Confirm output ---
print(f"💾 Feature-engineered dataset saved → {FEATURE_PATH.resolve()}")
print(f"✅ Shape: {df_features.shape[0]} rows × {df_features.shape[1]} columns")

💾 Feature-engineered dataset saved → /Users/golibsanaev/Library/CloudStorage/Dropbox/GitHub_gsanaev/forecasting-explaining-hotel-demand-in-eu/data/processed/hotel_features.csv
✅ Shape: 3328 rows × 30 columns

	column	missing_count	missing_pct
18	turnover_index_lag3	78	2.34
27	eurgbp_lag3	78	2.34
24	eurusd_lag3	78	2.34
21	weighted_stringency_index_lag3	78	2.34
12	log_gdp_lag3	78	2.34
15	unemployment_rate_lag3	78	2.34
20	weighted_stringency_index_lag2	52	1.56
14	unemployment_rate_lag2	52	1.56
17	turnover_index_lag2	52	1.56
26	eurgbp_lag2	52	1.56

	count	mean	min	25%	50%	75%	max	std
month	3328	2020-04-16 04:52:30	2015-01-01 00:00:00	2017-08-24 06:00:00	2020-04-16 00:00:00	2022-12-08 18:00:00	2025-08-01 00:00:00	NaN
year	3328.0	2019.84375	2015.0	2017.0	2020.0	2022.25	2025.0	3.083779
log_nights_spent	3328.0	13.448233	5.147494	12.100265	13.631481	14.594523	17.22325	1.812738
log_gdp	3328.0	10.389341	7.427144	9.032616	10.493316	11.423246	13.415018	1.561698
unemployment_rate	3328.0	6.672386	2.0	4.6	6.0	7.7	25.1	3.238662
turnover_index	3328.0	119.759856	6.3	78.7	114.5	155.525	419.6	56.721679
weighted_stringency_index	3328.0	49.18058	36.617093	44.671611	48.849586	52.96561	65.876549	6.46075
eurusd	3328.0	1.118962	0.982956	1.084225	1.115387	1.157142	1.240957	0.051836
eurgbp	3328.0	0.849934	0.701	0.842647	0.859745	0.88004	0.9201	0.049174
log_gdp_lag1	3302.0	10.388478	7.427144	9.032162	10.49297	11.421645	13.415018	1.561861
log_gdp_lag2	3276.0	10.387601	7.427144	9.031178	10.4924	11.420809	13.415018	1.562027
log_gdp_lag3	3250.0	10.386709	7.427144	9.029664	10.492111	11.420097	13.415018	1.562191
unemployment_rate_lag1	3302.0	6.680254	2.0	4.6	6.0	7.7	25.1	3.244585
unemployment_rate_lag2	3276.0	6.689347	2.0	4.6	6.0	7.7	25.1	3.25085
unemployment_rate_lag3	3250.0	6.698308	2.0	4.6	6.0	7.7	25.1	3.256633
turnover_index_lag1	3302.0	118.916172	6.3	77.75	114.5	154.275	419.6	55.82491
turnover_index_lag2	3276.0	118.070604	6.3	77.5	114.5	153.35	419.6	54.915688
turnover_index_lag3	3250.0	117.339446	6.3	76.825	114.5	151.9	419.6	54.370259
weighted_stringency_index_lag1	3302.0	49.18058	36.617093	44.671611	48.849586	52.96561	65.876549	6.460758
weighted_stringency_index_lag2	3276.0	49.18058	36.617093	44.671611	48.849586	52.96561	65.876549	6.460766
weighted_stringency_index_lag3	3250.0	49.18058	36.617093	44.671611	48.849586	52.96561	65.876549	6.460774
eurusd_lag1	3302.0	1.118571	0.982956	1.083882	1.115324	1.156136	1.240957	0.051851
eurusd_lag2	3276.0	1.118377	0.982956	1.083882	1.115281	1.156136	1.240957	0.052011
eurusd_lag3	3250.0	1.117943	0.982956	1.083882	1.115237	1.148765	1.240957	0.05199
eurgbp_lag1	3302.0	0.849811	0.701	0.8424	0.85974	0.8801	0.9201	0.049348
eurgbp_lag2	3276.0	0.849714	0.701	0.8424	0.859085	0.8801	0.9201	0.049531
eurgbp_lag3	3250.0	0.849674	0.701	0.8424	0.85974	0.8801	0.9201	0.049727
log_gdp_mom	3302.0	0.025431	-1.706215	-0.105329	0.039785	0.16847	0.783342	0.300298
turnover_index_mom	3302.0	2.727435	-86.123348	-4.107911	0.0	8.815959	209.74026	19.86269
m	3328.0	6.375	1.0	3.0	6.0	9.0	12.0	3.426062
quarter	3328.0	2.460938	1.0	1.0	2.0	3.0	4.0	1.110504

📘 Notebook 2 — Feature Engineering for Hotel Demand Forecasting (2015–2025)¶

Structure Overview¶

✅ Summary¶