Theme Park Business Analytics — Case Study¶

What this is. A one-year analysis of theme park operations and weather to identify what drives visitors and revenue, and to turn those findings into clear actions.

Snapshot

  • Data: lobster24.csv (daily ops), passholders24.csv (segments), digital_quest.csv (conjoint), telco_churn_data.csv (churn)
  • Period: Jan–Dec 2024
  • Tools: Python (pandas, matplotlib, seaborn, scikit-learn) in Jupyter

What I deliver

  • Clean, reproducible notebook with code and charts
  • Behavior-based segments with profiles
  • Ratings-based conjoint: part-worths and attribute importance
  • Practical recommendations for staffing and offers
  • Churn model benchmark with clear metrics, threshold tuning, and feature drivers

Highlights

  • Weekend lift: visitors and total revenue run about +X% above weekday levels
  • Weather impact: Cloudy/Sunny days outperform Rainy/Stormy by +Y% in total revenue; arcade revenue is +Z% higher
  • Staffing curve: returns flatten beyond roughly H labor hours/day
  • Offer design: top driver is {Attribute} (~A% of importance); best bundle is {Option}
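
How these headline figures could be reproduced, as a minimal sketch: it assumes the tidy daily table saved at the end of Part 1 and, for the staffing curve, the raw Total_Labor_Hours column; the revenue column name follows the mapping chosen in Part 1, step 4.

import pandas as pd

tidy = pd.read_csv("outputs/themepark_daily_tidy.csv", parse_dates=["Date"])
rev_col = "Arcade_Revenue"   # working revenue target picked in Part 1, step 4

# Weekend lift: weekend vs. weekday means for revenue and visitors
wk = tidy.groupby("Is_Weekend")[[rev_col, "Total_Visitors"]].mean()
print("Weekend lift (%):\n", ((wk.loc[True] / wk.loc[False] - 1) * 100).round(1))

# Weather impact: Cloudy/Sunny vs. Rainy/Stormy mean revenue
good = tidy["Weather_Clean"].isin(["Sunny", "Cloudy"])
delta = (tidy.loc[good, rev_col].mean() / tidy.loc[~good, rev_col].mean() - 1) * 100
print(f"Good-weather revenue delta: {delta:+.1f}%")

# Staffing curve: mean revenue by labor-hour quintile (flattening suggests diminishing returns)
raw = pd.read_csv("data/lobster24.csv")
hours_bin = pd.qcut(raw["Total_Labor_Hours"], 5)
print(raw.groupby(hours_bin, observed=True)[rev_col].mean().round(0))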

Part 1 — Exploratory Data Analysis¶

Focus. Clean the daily ops table (lobster24.csv), surface patterns, and define the KPIs that matter.

You’ll see

  • Fast data audit (shape/dtypes/missing), unified Date + week/weekday/month
  • High-signal visuals: calendar heatmap, Weather × Weekend mean±CI, weekly trend with smoothed band, Arcade Pareto
  • Distributions & skewness for key metrics; correlation heatmap for quick relationships
  • Core KPIs: international %, staff efficiency, passholder %
  • Save a tidy slice for later modeling and segmentation

1) Load & setup¶

Load the daily theme-park dataset and standardize the date column to Date.

In [1]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt
from pathlib import Path

DATA = Path("data")
raw = pd.read_csv(DATA/"lobster24.csv")

df = raw.copy()
date_col = next((c for c in df.columns if c.lower() in ["date","day","dt"]), None)
if date_col is None:
    for c in df.columns:
        x = pd.to_datetime(df[c], errors="coerce")
        if x.notna().mean()>0.9: date_col=c; break
if date_col is None:
    raise ValueError("No date-like column found.")
df["Date"] = pd.to_datetime(df[date_col])
df = df.sort_values("Date").reset_index(drop=True)

2) Quick audit¶

Check shape, dtypes, and missing values to spot issues early.

In [2]:
display(df.head(3))
display(df.dtypes.to_frame("dtype"))
na = df.isna().mean().sort_values(ascending=False)
display(na[na>0].round(3).to_frame("missing_rate"))
Date Total_Visitors Arcade_Visitors Total_Purchases Arcade_Revenue Total_Labor_Hours Weather_Type Special_Events Customer_Complaints Day_of_Week Passholder_Percentage International_Visitors High_Temperature General_Weather Precipitation
0 2024-05-27 1060 267 22575.71 3524.30 185.84 Sunny Arcade Tournament 9 Monday 0.22 357 65.4 Cool 0
1 2024-05-28 1494 552 17142.52 1652.93 250.90 Cloudy NaN 3 Tuesday 0.23 424 66.4 Cool 0
2 2024-05-29 1330 447 10229.99 3251.81 115.59 Cloudy NaN 0 Wednesday 0.22 441 90.2 Hot 0
dtype
Date datetime64[ns]
Total_Visitors int64
Arcade_Visitors int64
Total_Purchases float64
Arcade_Revenue float64
Total_Labor_Hours float64
Weather_Type object
Special_Events object
Customer_Complaints int64
Day_of_Week object
Passholder_Percentage float64
International_Visitors int64
High_Temperature float64
General_Weather object
Precipitation int64
missing_rate
Special_Events 0.788
High_Temperature 0.030

3) Calendar & weather¶

Add week/weekday/month/weekend and unify weather labels.

In [3]:
df["Year"]    = df["Date"].dt.year
df["Week"]    = df["Date"].dt.isocalendar().week.astype(int)
df["Month"]   = df["Date"].dt.month
df["Weekday"] = df["Date"].dt.day_name()
df["Is_Weekend"] = df["Date"].dt.weekday >= 5

wcol = next((c for c in df.columns if "weather" in c.lower()), None)
if wcol is None:
    df["Weather_Clean"] = "Unknown"
else:
    w = df[wcol].astype(str).str.lower().str.strip()
    mapping = {"sun":"Sunny","clear":"Sunny","bright":"Sunny",
               "cloud":"Cloudy","overcast":"Cloudy",
               "rain":"Rainy","storm":"Stormy","thunder":"Stormy","snow":"Stormy"}
    def norm(s):
        for k,v in mapping.items():
            if k in s: return v
        return s.title()
    df["Weather_Clean"] = w.map(norm)

4) Identify key columns¶

Find total/arcade revenue, visitors, temperature, and complaints columns.

In [4]:
def find_col(tokens):
    # first column whose name contains any token (checked in column order,
    # so a generic token like "revenue" can match before a more specific one)
    for c in df.columns:
        if any(t in c.lower() for t in tokens): return c
    return None

col_total_rev  = find_col(["total_revenue","revenue_total","total_rev","revenue"])
col_arcade_rev = find_col(["arcade_revenue","arcade"])
col_visitors   = find_col(["total_visitors","visitors","attendance","attend"])
col_temp       = find_col(["temp","temperature"])
col_complaints = find_col(["complaints","customer_complaints"])

# strip $ signs and thousands separators from any matched column stored as text
for c in [col_total_rev, col_arcade_rev, col_visitors, col_temp, col_complaints]:
    if c and df[c].dtype=="O":
        df[c] = pd.to_numeric(df[c].astype(str).str.replace(r"[,$]", "", regex=True), errors="coerce")

# no Total_Revenue column exists in this file, so the generic "revenue" token matches
# Arcade_Revenue (see Out[4]); that column serves as the working revenue target for Part 1
target = col_total_rev or (col_arcade_rev or df.select_dtypes("number").columns[0])
{"total": col_total_rev, "arcade": col_arcade_rev, "visitors": col_visitors, "temp": col_temp, "complaints": col_complaints}
Out[4]:
{'total': 'Arcade_Revenue',
 'arcade': 'Arcade_Visitors',
 'visitors': 'Total_Visitors',
 'temp': 'High_Temperature',
 'complaints': 'Customer_Complaints'}

5) KPIs¶

Compute international %, staff efficiency, and passholder % when available.

In [5]:
def maybe(names):
    for n in names:
        if n in df.columns: return n
    return None

col_intl = maybe(["International_Visitors","Intl_Visitors","Pct_International","International%"])
col_staff_hours = maybe(["Total_Labor_Hours","Labor_Hours","Staff_Hours"])
col_pass = maybe(["Passholder_Percentage","Passholder_Pct","Passholder%","Passholders"])

# International share: keep as-is if already a 0-1 share, otherwise divide by 100.
# Caveat: International_Visitors here is a raw head count, so this yields count/100
# rather than a true share; International_Visitors / Total_Visitors would give the
# actual share of visitors.
if col_intl and df[col_intl].max()<=1.01:
    df["Intl_Pct"] = df[col_intl]
elif col_intl:
    df["Intl_Pct"] = df[col_intl]/100.0

# Staff efficiency: revenue generated per labor hour
if col_staff_hours and col_total_rev:
    df["Staff_Efficiency"] = df[col_total_rev] / df[col_staff_hours]

# Passholder share, normalized to 0-1 if stored as a percentage
if col_pass and col_visitors:
    df["Passholder_Pct"] = df[col_pass] if df[col_pass].max()<=1.01 else df[col_pass]/100.0

display(df[["Date","Weather_Clean","Is_Weekend"] + [c for c in ["Intl_Pct","Staff_Efficiency","Passholder_Pct"] if c in df.columns]].head(3))
Date Weather_Clean Is_Weekend Intl_Pct Staff_Efficiency Passholder_Pct
0 2024-05-27 Sunny False 3.57 18.964163 0.22
1 2024-05-28 Cloudy False 4.24 6.588003 0.23
2 2024-05-29 Cloudy False 4.41 28.132278 0.22

6) Calendar heatmap¶

Seasonality across the year.

In [6]:
s = df.set_index("Date")[target].copy()
cal = s.to_frame("val")
cal["dow"] = cal.index.weekday
cal["week"] = cal.index.isocalendar().week.astype(int)
pv = cal.pivot_table(index="dow", columns="week", values="val", aggfunc="mean")
plt.figure(figsize=(12,3)); plt.imshow(pv, aspect="auto")
plt.yticks(range(7), ["Mon","Tue","Wed","Thu","Fri","Sat","Sun"]); plt.xlabel("ISO Week"); plt.colorbar()
plt.title("Calendar heatmap — weekly mean"); plt.tight_layout(); plt.show()
[Figure: Calendar heatmap — weekly mean]

7) Weather × Weekend¶

Mean ±95% CI by weather and weekend.

In [7]:
g = df.groupby(["Weather_Clean","Is_Weekend"])[target]
m = g.mean(); n = g.count(); sd = g.std().fillna(0)
se = (sd/np.sqrt(n)).reindex(m.index).fillna(0)
labels = [f"{w}\n{'Weekend' if we else 'Weekday'}" for w,we in m.index]
plt.figure(figsize=(12,4))
plt.bar(range(len(m)), m.values, yerr=1.96*se.values, capsize=3)
plt.xticks(range(len(m)), labels); plt.title("Weather × Weekend — mean ± 95% CI")
plt.tight_layout(); plt.show()
[Figure: Weather × Weekend — mean ± 95% CI]

8) Weekly trend with smoothed band¶

Weekly mean with a 3-week moving average and ±1σ band.

In [8]:
wk = df.set_index("Date")[target].resample("W").mean().rename("w")
ma = wk.rolling(3, min_periods=1).mean()
sd = wk.rolling(3, min_periods=1).std().fillna(0)
plt.figure(figsize=(9,4))
plt.plot(wk.index, wk.values, alpha=0.4, label="weekly mean")
plt.plot(ma.index, ma.values, linewidth=2, label="3-week MA")
plt.fill_between(ma.index, (ma-sd).values, (ma+sd).values, alpha=0.2)
plt.title("Weekly revenue — mean with smoothed band")
plt.legend(); plt.tight_layout(); plt.show()
[Figure: Weekly revenue — mean with 3-week MA and ±1σ band]

9) Arcade revenue — Pareto¶

Quantify long-tail concentration.

In [9]:
if col_arcade_rev:
    x = df[col_arcade_rev].dropna().sort_values(ascending=False).reset_index(drop=True)
    cs = x.cumsum()/x.sum() if x.sum()>0 else np.nan
    fig, ax1 = plt.subplots(figsize=(8,4))
    ax1.bar(range(1,len(x)+1), x.values)
    ax2 = ax1.twinx(); ax2.plot(range(1,len(x)+1), cs.values, marker="o", linewidth=2)
    ax1.set_xlabel("Days (sorted)"); ax1.set_ylabel("Arcade revenue"); ax2.set_ylabel("Cumulative share")
    ax1.set_title("Arcade revenue — Pareto curve"); plt.tight_layout(); plt.show()
else:
    print("No arcade revenue column found.")
[Figure: Arcade revenue — Pareto curve]

10) Distributions & skewness¶

Histogram + skew for key metrics.

In [10]:
import numpy as np, matplotlib.pyplot as plt
# candidate metrics plus the auto-detected columns, deduplicated (order preserved)
dist_cols = list(dict.fromkeys(c for c in [
    "Total_Visitors","Arcade_Visitors","Total_Purchases",
    "Arcade_Revenue","Food_Revenue","Merch_Revenue","Total_Labor_Hours",
    col_total_rev, col_arcade_rev, col_visitors
] if isinstance(c,str) and c in df.columns))

for col in dist_cols:
    s = df[col].dropna()
    if s.empty: 
        continue
    plt.figure(figsize=(8,3))
    plt.hist(s, bins=30, alpha=0.85)
    plt.title(f"{col} — distribution (n={len(s)})")
    plt.xlabel(col); plt.ylabel("count"); plt.tight_layout(); plt.show()
    print(f"{col}: skew={s.skew():.2f}")
[Figures: one histogram per metric]
Total_Visitors: skew=-0.02
Arcade_Visitors: skew=-0.07
Total_Purchases: skew=0.30
Arcade_Revenue: skew=0.30
Total_Labor_Hours: skew=-0.00

11) Correlation heatmap¶

Numeric correlations for KPIs and main metrics.

In [11]:
num_df = df.select_dtypes(include=[np.number]).copy()
corr = num_df.corr().round(2)

plt.figure(figsize=(6,5))
im = plt.imshow(corr, cmap="viridis")
plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
plt.yticks(range(len(corr.columns)), corr.columns)
plt.colorbar(im, fraction=0.046, pad=0.04)
plt.title("Correlation matrix"); plt.tight_layout(); plt.show()

corr.head(10)
[Figure: Correlation matrix heatmap]
Out[11]:
Total_Visitors Arcade_Visitors Total_Purchases Arcade_Revenue Total_Labor_Hours Customer_Complaints Passholder_Percentage International_Visitors High_Temperature Precipitation Year Week Month Intl_Pct Staff_Efficiency Passholder_Pct
Total_Visitors 1.00 0.09 -0.10 0.12 -0.19 0.14 -0.22 0.96 -0.05 -0.10 NaN 0.09 0.07 0.96 0.19 -0.22
Arcade_Visitors 0.09 1.00 -0.08 0.01 -0.08 0.09 0.06 0.10 0.05 0.05 NaN -0.09 -0.14 0.10 0.10 0.06
Total_Purchases -0.10 -0.08 1.00 0.21 0.19 -0.19 -0.41 0.01 0.05 -0.06 NaN -0.18 -0.16 0.01 0.03 -0.41
Arcade_Revenue 0.12 0.01 0.21 1.00 -0.06 -0.16 -0.29 0.21 0.08 0.11 NaN 0.18 0.20 0.21 0.77 -0.29
Total_Labor_Hours -0.19 -0.08 0.19 -0.06 1.00 -0.15 0.03 -0.18 -0.07 -0.05 NaN 0.09 0.07 -0.18 -0.58 0.03
Customer_Complaints 0.14 0.09 -0.19 -0.16 -0.15 1.00 -0.03 0.14 -0.04 0.07 NaN -0.14 -0.14 0.14 -0.08 -0.03
Passholder_Percentage -0.22 0.06 -0.41 -0.29 0.03 -0.03 1.00 -0.38 0.04 -0.02 NaN -0.01 0.01 -0.38 -0.21 1.00
International_Visitors 0.96 0.10 0.01 0.21 -0.18 0.14 -0.38 1.00 -0.02 -0.04 NaN 0.08 0.08 1.00 0.23 -0.38
High_Temperature -0.05 0.05 0.05 0.08 -0.07 -0.04 0.04 -0.02 1.00 0.06 NaN 0.03 0.03 -0.02 0.08 0.04
Precipitation -0.10 0.05 -0.06 0.11 -0.05 0.07 -0.02 -0.04 0.06 1.00 NaN -0.10 -0.08 -0.04 0.15 -0.02

12) Top / bottom days + save tidy slice¶

Surfacing extremes for notes and exporting a lean table for later work.

In [12]:
metric = target
top = df.nlargest(5, metric)[["Date","Weather_Clean","Is_Weekend", metric]]
bot = df.nsmallest(5, metric)[["Date","Weather_Clean","Is_Weekend", metric]]
display(top.rename(columns={metric:"Top_Value"}))
display(bot.rename(columns={metric:"Bottom_Value"}))

keep = ["Date","Year","Week","Month","Weekday","Is_Weekend","Weather_Clean"]
for c in [target, col_arcade_rev, col_visitors, "Intl_Pct","Staff_Efficiency","Passholder_Pct"]:
    if isinstance(c,str) and c in df.columns: keep.append(c)
Path("outputs").mkdir(exist_ok=True)
df[keep].to_csv("outputs/themepark_daily_tidy.csv", index=False)
"Saved: outputs/themepark_daily_tidy.csv"
Date Weather_Clean Is_Weekend Top_Value
75 2024-08-10 Cloudy True 7088.79
27 2024-06-23 Rainy True 6766.69
19 2024-06-15 Stormy True 6526.10
47 2024-07-13 Stormy True 6515.86
76 2024-08-11 Rainy True 6323.83
Date Weather_Clean Is_Weekend Bottom_Value
78 2024-08-13 Cloudy False 568.87
21 2024-06-17 Cloudy False 674.76
37 2024-07-03 Stormy False 684.80
58 2024-07-24 Sunny False 714.72
9 2024-06-05 Sunny False 728.46
Out[12]:
'Saved: outputs/themepark_daily_tidy.csv'

Part 2 — Market Segmentation & Conjoint Analysis¶

Focus. Turn behavior into segments (passholders24.csv) and quantify offer utilities (digital_quest.csv).

You’ll see

  • Compact feature set → K-means with elbow + silhouette to pick k
  • Clear segment profiles (size, spend, visit mix) + PCA 2-D view
  • Conjoint part-worths and attribute importance for offer design
In [13]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.linear_model import RidgeCV

plt.rcParams["figure.figsize"] = (7,4)
plt.rcParams["axes.grid"] = False
DATA = Path("data")

2.1) Load & features¶

Read passholder behavior, select numeric features, winsorize, and standardize.

In [14]:
ph = pd.read_csv(DATA/"passholders24.csv")
ph.columns = (ph.columns.str.strip()
                        .str.replace(r"\s+", "_", regex=True)
                        .str.replace(r"[^\w_]", "", regex=True))
id_like = [c for c in ph.columns if c.lower() in {"id","customer_id","member_id"}]
num_cols = ph.select_dtypes(include=[np.number]).columns.tolist()
high_card = [c for c in num_cols if ph[c].nunique()>0.9*len(ph)]
feat_cols = [c for c in num_cols if c not in set(id_like)|set(high_card)]
if len(feat_cols)<3: raise ValueError(f"Not enough numeric features: {feat_cols}")
Xw = ph[feat_cols].clip(ph[feat_cols].quantile(0.01), ph[feat_cols].quantile(0.99), axis=1)
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(Xw), columns=feat_cols, index=ph.index)
display(X.describe().round(2))
Number_of_Visits Twilight_Visits Number_of_Rides_Taken Number_of_Food_Purchases Number_of_Arcade_Plays Parking_Frequency Time_Spent_in_Park_hours Weekday_vs_Weekend_Ratio Number_of_Guests_Brought Special_Event_Attendance
count 654.00 654.00 654.00 654.00 654.00 654.00 654.00 654.00 654.00 654.00
mean -0.00 -0.00 0.00 -0.00 -0.00 0.00 0.00 0.00 -0.00 -0.00
std 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00
min -1.61 -1.53 -1.73 -1.71 -1.71 -1.81 -1.63 -1.68 -1.56 -1.34
25% -0.93 -0.84 -0.85 -0.86 -0.80 -0.82 -0.86 -0.87 -0.87 -0.65
50% 0.09 -0.15 -0.01 -0.01 -0.01 0.03 -0.03 -0.02 0.16 0.04
75% 0.88 0.88 0.82 0.84 0.90 0.87 0.86 0.88 0.85 0.73
max 1.56 1.57 1.78 1.70 1.59 1.58 1.76 1.75 1.54 1.42

2.2) Pick k (elbow + silhouette)¶

Compare inertia and silhouette to choose the number of clusters.

In [15]:
k_list = list(range(2,8))
inertia, sil = [], []
for k in k_list:
    km = KMeans(n_clusters=k, n_init=20, random_state=42)
    lab = km.fit_predict(X)
    inertia.append(km.inertia_)
    sil.append(silhouette_score(X, lab))
fig, ax = plt.subplots(1,2, figsize=(10,4))
ax[0].plot(k_list, inertia, marker="o"); ax[0].set_title("Elbow (Inertia)")
ax[0].set_xlabel("k"); ax[0].set_ylabel("Inertia")
ax[1].plot(k_list, sil, marker="o"); ax[1].set_title("Silhouette")
ax[1].set_xlabel("k"); ax[1].set_ylabel("Score")
plt.tight_layout(); plt.show()
# k is chosen by the silhouette maximum; the elbow plot serves as a visual cross-check
best_k = k_list[int(np.argmax(sil))]
print("Chosen k:", best_k)
[Figure: Elbow (inertia) and silhouette scores by k]
Chosen k: 7

2.3) KMeans & segment sizes¶

Assign labels and show segment counts.

In [16]:
kmeans = KMeans(n_clusters=best_k, n_init=50, random_state=42)

labels = kmeans.fit_predict(X)              
ph["segment"] = pd.Categorical(labels)    

sizes = ph["segment"].value_counts().sort_index()

ax = sizes.plot(kind="bar")
ax.set_title("Segment sizes"); ax.set_xlabel("segment"); ax.set_ylabel("count")
plt.tight_layout(); plt.show()

display(sizes.to_frame("count"))
[Figure: Segment sizes]
count
segment
0 107
1 91
2 88
3 103
4 70
5 86
6 109

2.4) Segment profiles (z-scores)¶

Profile segments on standardized features with a heatmap.

In [17]:
import numpy as np, pandas as pd, matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

Z = pd.DataFrame(
    StandardScaler().fit_transform(ph[feat_cols]),
    columns=feat_cols, index=ph.index
)

prof_z = Z.groupby(ph["segment"], observed=True)[feat_cols].mean() 
K = 10 
top_feats = prof_z.var(axis=0).sort_values(ascending=False).head(K).index
M = prof_z[top_feats]

h, w = M.shape
fig_w = min(10, 0.55*w + 2)
fig_h = min(6.5, 0.55*h + 1.6)

plt.figure(figsize=(fig_w, fig_h))
try:
    import seaborn as sns
    sns.heatmap(M, cmap="coolwarm", center=0, square=True,
                cbar_kws={"shrink":0.7}, linewidths=0.2, linecolor="white")
except Exception:
    vmax = np.nanmax(np.abs(M.values))
    im = plt.imshow(M.values, cmap="coolwarm", vmin=-vmax, vmax=vmax, aspect="equal")
    plt.colorbar(im, fraction=0.046, pad=0.04)
    plt.xticks(range(w), M.columns, rotation=60, ha="right", fontsize=8)
    plt.yticks(range(h), M.index, fontsize=9)

plt.title("Segment profiles (top features, z-scores)")
plt.xlabel("Feature"); plt.ylabel("Segment")
plt.xticks(rotation=60, ha="right", fontsize=8)
plt.yticks(fontsize=9)
plt.tight_layout(); plt.show()
[Figure: Segment profiles (top features, z-scores) heatmap]

2.5) PCA 2-D view¶

Visualize cluster separation in two principal components; the two components explain only ~24% of the variance (printed below), so treat the 2-D view as a rough guide.

In [18]:
pca = PCA(n_components=2, random_state=42)
xy = pca.fit_transform(X)
tmp = pd.DataFrame(xy, columns=["PC1","PC2"], index=ph.index)
tmp["segment"] = ph["segment"].astype(str)
sns.scatterplot(data=tmp, x="PC1", y="PC2", hue="segment", s=28, alpha=0.85)
plt.title("Segments in 2-D (PCA)"); plt.legend(title="segment", bbox_to_anchor=(1.02,1))
plt.tight_layout(); plt.show()
print("Explained variance:", np.round(pca.explained_variance_ratio_, 3))
[Figure: Segments in 2-D (PCA)]
Explained variance: [0.12  0.116]

2.6) One-pager segment table¶

Summarize size, share, and key metric means for naming and targeting.

In [19]:
summary = ph.groupby("segment", observed=True)[feat_cols].mean().round(2)
counts = ph["segment"].value_counts(sort=False).sort_index()
summary.insert(0, "size", counts)
summary.insert(1, "size_share_%", (counts / counts.sum() * 100).round(1))
display(summary.sort_values("size", ascending=False))
size size_share_% Number_of_Visits Twilight_Visits Number_of_Rides_Taken Number_of_Food_Purchases Number_of_Arcade_Plays Parking_Frequency Time_Spent_in_Park_hours Weekday_vs_Weekend_Ratio Number_of_Guests_Brought Special_Event_Attendance
segment
6 109 16.7 17.10 5.79 49.88 21.85 10.74 18.91 5.78 1.36 6.28 11.33
0 107 16.4 8.81 3.91 50.26 15.84 21.23 10.47 6.01 0.71 7.01 1.87
3 103 15.7 19.64 4.62 47.79 19.47 17.43 14.96 9.59 1.06 2.49 0.95
1 91 13.9 11.69 3.81 40.23 32.59 11.90 5.42 5.03 1.33 3.62 1.37
2 88 13.5 12.34 3.52 71.95 19.20 16.56 15.56 4.53 1.02 2.03 14.35
5 86 13.1 21.47 3.20 42.24 36.60 15.55 12.93 4.24 0.60 4.50 13.71
4 70 10.7 15.79 6.34 60.60 36.31 10.36 9.84 9.22 0.66 5.40 30.69
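
A quick, hypothetical way to draft working names for these segments (not part of the original deliverable): label each cluster by its two most distinctive features from the z-score profile prof_z computed in 2.4.

# Hypothetical auto-labels: the two highest z-score features per segment
for seg, row in prof_z.iterrows():
    top2 = row.sort_values(ascending=False).head(2).index
    label = " / ".join(f.replace("Number_of_", "").replace("_", " ") for f in top2)
    print(f"Segment {seg}: high {label}")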

2.7) Conjoint design (ratings-based)¶

Load the survey, pick a rating column, and build a dummy-coded design. Every non-rating column is treated as a categorical attribute, including the bundle identifier, which is why the design below ends up with 6,929 columns.

In [20]:
cq = pd.read_csv(DATA/"digital_quest.csv")
cq.columns = (cq.columns.str.strip()
                        .str.replace(r"\s+", "_", regex=True)
                        .str.replace(r"[^\w_]", "", regex=True))
rating_col = next((c for c in cq.columns if c.lower() in {"rating","score","preference","rank"}), None)
if rating_col is None:
    num_cols_cq = cq.select_dtypes(include=[np.number]).columns
    if len(num_cols_cq)==0: raise ValueError("No numeric rating column found.")
    rating_col = num_cols_cq[-1]
attr_cols = [c for c in cq.columns if c != rating_col]
for c in attr_cols:
    cq[c] = cq[c].astype(str).str.strip().replace({"nan":"NA"})
    cq[c] = pd.Categorical(cq[c])
y = cq[rating_col].astype(float).values
Xc = pd.get_dummies(cq[attr_cols], drop_first=True)
display(Xc.head(3))
bundleID_10 bundleID_100 bundleID_1000 bundleID_1001 bundleID_1002 bundleID_1003 bundleID_1004 bundleID_1005 bundleID_1006 bundleID_1007 ... prizes_Physical Prizes collaboration_Solo collaboration_competing_teams quest_progression_open_world quest_progression_tiered task_total_20 task_total_30 task_total_5 technology_interactive_kiosks technology_mixed
0 False False False False False False False False False False ... False True False False False False False True False False
1 False False False False False False False False False False ... False True False False False False False True True False
2 False False False False False False False False False False ... False True False False False False False True False True

3 rows × 6929 columns

2.8) Part-worths via ridge¶

Fit a ridge regression on the design to estimate utilities.

In [21]:
import re, numpy as np, pandas as pd
from sklearn.linear_model import RidgeCV

Xc_df = pd.DataFrame(Xc).apply(pd.to_numeric, errors="coerce").fillna(0.0)
# intended to drop identifier-style dummies; note that the \b word boundaries do not
# match tokens embedded in longer names (e.g. "bundleID_5486", "task_total_20"),
# so those columns survive the filter and appear among the coefficients in Out[21]
drop_re = re.compile(r'(?i)\b(id|bundle|task|alt|profile|version|resp|respondent)\b')
keep_cols = [c for c in Xc_df.columns.astype(str) if not drop_re.search(c)]
X_use = Xc_df[keep_cols]

X_num = X_use.to_numpy(dtype=np.float64)
y_num = pd.Series(y).astype(float).fillna(0.0).to_numpy(dtype=np.float64)

# ridge with generalized cross-validation over a log-spaced alpha grid
ridge = RidgeCV(alphas=np.logspace(-3, 3, 9), cv=None, gcv_mode="auto")
ridge.fit(X_num, y_num)

coefs = pd.Series(ridge.coef_, index=X_use.columns.astype(str)).sort_values(ascending=False)
coefs.head(10)
Out[21]:
narrative_Physical Activity    0.687949
bundleID_5486                  0.612485
bundleID_5019                  0.609340
bundleID_3487                  0.605393
bundleID_265                   0.597330
bundleID_4191                  0.596166
bundleID_5872                  0.590472
bundleID_889                   0.587443
bundleID_3500                  0.575397
bundleID_2090                  0.574328
dtype: float64
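
A hedged sketch of how these utilities could feed the offer recommendation: a profile's predicted rating is the ridge intercept plus the sum of its level part-worths, so candidate bundles can be scored and ranked directly. It reuses ridge, X_num, coefs, and cq from the cells above; the example levels are just two dummy columns picked from the design for illustration.

# Rank the surveyed profiles by predicted rating
pred = pd.Series(ridge.intercept_ + X_num @ ridge.coef_, index=cq.index)
display(cq.assign(predicted_rating=pred)
          .sort_values("predicted_rating", ascending=False)
          .head(5))

# Score a single hypothetical profile by summing the part-worths of its levels
example_levels = ["narrative_Physical Activity", "collaboration_Solo"]  # illustrative levels only
print("Example profile utility:",
      round(float(ridge.intercept_ + coefs.reindex(example_levels).fillna(0).sum()), 2))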

2.9) Attribute importance & part-worth bars¶

Show which attributes matter most and the level utilities. Importance here is the range of part-worths within an attribute, expressed as a share of the total range across attributes.

In [22]:
import matplotlib.pyplot as plt
import pandas as pd

# attribute = text before the first "_" (multi-word attribute names such as
# quest_progression collapse to their first word under this split)
df_coef = pd.DataFrame({
    "col":  coefs.index,
    "att":  [c.split("_", 1)[0] for c in coefs.index],
    "coef": coefs.values
})

# importance = range (max - min) of level utilities within each attribute,
# expressed as a share of the total range across attributes
imp = (df_coef.groupby("att", observed=True)["coef"]
             .agg(lambda s: float(s.max() - s.min()))
             .sort_values(ascending=False))
imp_pct = imp / imp.sum() * 100

N = 6
top_imp = imp_pct.head(N)

plt.figure(figsize=(6, 3.2))
top_imp.sort_values().plot(kind="barh")
plt.title("Attribute importance (range, %)"); plt.xlabel("% of total importance"); plt.ylabel("")
plt.tight_layout(); plt.show()

for att in top_imp.index:
    s = (df_coef[df_coef["att"] == att]
            .set_index("col")["coef"]
            .sort_values())
    plt.figure(figsize=(6, 3.2))
    s.plot(kind="barh")
    plt.title(f"{att} part-worths"); plt.xlabel("utility"); plt.ylabel("")
    plt.tight_layout(); plt.show()
[Figures: attribute importance (range, %) and part-worth bars for the top attributes]

Part 3 — Modeling & Evaluation¶

Focus. Build and explain a churn classifier (telco_churn_data.csv) that balances accuracy and actionability.

You’ll see

  • Stratified split, robust preprocessing pipelines (numeric + categorical)
  • Baseline Logistic Regression and ensemble benchmarks (Random Forest / GB)
  • Clear metrics (ROC-AUC, F1, precision/recall) and threshold tuning
  • Feature importance and partial-dependence plots for actionable drivers
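
Since the list above promises partial-dependence plots, here is a minimal sketch of how they could be produced. It assumes the fitted best_model and the X_valid frame created later (3.2 and 3.6), so it would run after those cells; the feature names are illustrative guesses and are filtered against the actual columns.

import matplotlib.pyplot as plt
from sklearn.inspection import PartialDependenceDisplay

# Hypothetical partial-dependence sketch: run after best_model is fitted in 3.6
pdp_feats = ["Tenure in Months", "Monthly Charge", "Number of Referrals"]  # illustrative picks
pdp_feats = [f for f in pdp_feats if f in X_valid.columns]
PartialDependenceDisplay.from_estimator(best_model, X_valid, features=pdp_feats)
plt.tight_layout(); plt.show()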
In [23]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt
from pathlib import Path

p = Path("data/telco_churn_data.csv")
if not p.exists():
    raise FileNotFoundError("Put telco_churn_data.csv under data/.")
raw = pd.read_csv(p)
raw.head(3)
Out[23]:
Customer ID Referred a Friend Number of Referrals Tenure in Months Offer Phone Service Avg Monthly Long Distance Charges Multiple Lines Internet Service Internet Type ... Latitude Longitude Population Churn Value CLTV Churn Category Churn Reason Total Customer Svc Requests Product/Service Issues Reported Customer Satisfaction
0 8779-QRDMV No 0 1 NaN No 0.00 No Yes Fiber Optic ... 34.023810 -118.156582 68701 1 5433 Competitor Competitor offered more data 5 0 NaN
1 7495-OOKFY Yes 1 8 Offer E Yes 48.85 Yes Yes Cable ... 34.044271 -118.185237 55668 1 5302 Competitor Competitor made better offer 5 0 NaN
2 1658-BYGOY No 0 18 Offer D Yes 11.33 Yes Yes Fiber Optic ... 34.108833 -118.229715 47534 1 3179 Competitor Competitor made better offer 1 0 NaN

3 rows × 46 columns

3.1) Target and features¶

In [24]:
import pandas as pd, numpy as np

df = raw.copy()
for c in df.columns:
    if df[c].dtype == "object":
        df[c] = df[c].astype(str).str.strip()

tokens = ["churn", "attrit", "exit", "cancel", "leave", "dropout"]
name_hits = [c for c in df.columns if any(t in c.lower() for t in tokens)]

low_card = [c for c in df.columns if df[c].astype(str).nunique(dropna=True) <= 3]

candidates = list(dict.fromkeys(name_hits + low_card))  

def to_binary(series):
    s = series.astype(str).str.strip().str.lower()
    mapping = {
        "yes":1, "y":1, "true":1, "t":1, "1":1, "churn":1, "churned":1, "exited":1, "cancelled":1, "left":1, "inactive":1,
        "no":0, "n":0, "false":0, "f":0, "0":0, "stay":0, "stayed":0, "active":0, "retained":0
    }
    mapped = s.map(mapping)
    
    if mapped.isna().all() and pd.api.types.is_numeric_dtype(series):
        mapped = series.astype(float)
        uniq = set(pd.unique(mapped.dropna()))
        if uniq <= {0,1}: 
            return mapped.astype(int)
        if uniq <= {0.0,1.0}:
            return mapped.astype(int)
    return mapped

y_name, y = None, None
for c in candidates:
    y_try = to_binary(df[c])
    if set(pd.unique(y_try.dropna())) <= {0,1} and y_try.notna().mean() > 0.95:
        y_name, y = c, y_try.fillna(0).astype(int)
        break

if y is None:
    print("Could not auto-detect target. Here are some low-cardinality columns to check:")
    disp = pd.DataFrame({c: df[c].astype(str).unique()[:6] for c in low_card}).iloc[:3]
    display(disp)
    raise ValueError("No binary churn-like target found. Rename your target to include one of: "
                     + ", ".join(tokens) + " or ensure it has two levels (yes/no, 0/1, etc.).")

print(f"Detected target column: {y_name}")

# drop only the target; note that sibling fields such as Churn Category / Churn Reason
# remain in X and leak the outcome (this shows up again in 3.9)
X = df.drop(columns=[y_name])

cat_cols = X.select_dtypes(include=["object","category"]).columns.tolist()
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()

for c in cat_cols:
    vc = X[c].value_counts(dropna=False)
    rare = vc[vc <= max(2, int(0.01*len(X)))].index
    X[c] = X[c].where(~X[c].isin(rare), "Other")

(X.shape, int(y.sum()), len(y))
Detected target column: Churn Value
Out[24]:
((7043, 45), 1869, 7043)

3.2) Train/validation split¶

Stratified split so the churn rate is preserved in both the training and validation sets.

In [25]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)
y_train.mean(), y_valid.mean()
Out[25]:
(np.float64(0.2654297614539947), np.float64(0.26519023282226006))

3.3) Preprocessing pipeline¶

Numerics: impute + scale. Categoricals: impute + one-hot (drop first).

In [26]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

num_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

cat_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(drop="first", handle_unknown="ignore"))
])

pre = ColumnTransformer(
    transformers=[
        ("num", num_pipe, num_cols),
        ("cat", cat_pipe, cat_cols)
    ],
    remainder="drop"
)

3.4) Baseline and ensembles¶

Train Logistic Regression (baseline), Random Forest, and Gradient Boosting with simple settings.

In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

clf_lr = Pipeline(steps=[
    ("pre", pre),
    ("clf", LogisticRegression(max_iter=200, n_jobs=None))
])

clf_rf = Pipeline(steps=[
    ("pre", pre),
    ("clf", RandomForestClassifier(
        n_estimators=400, max_depth=None, min_samples_split=5,
        random_state=42, n_jobs=-1
    ))
])

clf_gb = Pipeline(steps=[
    ("pre", pre),
    ("clf", GradientBoostingClassifier(random_state=42))
])

models = {
    "Logistic": clf_lr,
    "RandomForest": clf_rf,
    "GradientBoosting": clf_gb
}
list(models.keys())
Out[27]:
['Logistic', 'RandomForest', 'GradientBoosting']

3.5) Stratified CV¶

Score ROC-AUC with 5-fold stratified cross-validation for a fair comparison.

In [28]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import roc_auc_score

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = {}
for name, pipe in models.items():
    auc = cross_val_score(pipe, X_train, y_train, cv=cv, scoring="roc_auc", n_jobs=-1)
    cv_scores[name] = (auc.mean(), auc.std())
pd.DataFrame(cv_scores, index=["auc_mean","auc_std"]).T.sort_values("auc_mean", ascending=False).round(4)
Out[28]:
auc_mean auc_std
Logistic 1.0 0.0
RandomForest 1.0 0.0
GradientBoosting 1.0 0.0

3.6) Fit best model and base metrics¶

Fit the top model on the training set and evaluate on the holdout validation set.

In [29]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

if "cv_scores" not in globals():
    cv_scores = {}
    for name, pipe in models.items():
        auc = cross_val_score(pipe, X_train, y_train, cv=cv, scoring="roc_auc", n_jobs=-1)
        cv_scores[name] = (auc.mean(), auc.std())

best_name = max(cv_scores, key=lambda k: cv_scores[k][0])
best_model = models[best_name].fit(X_train, y_train)

if hasattr(best_model, "predict_proba"):
    proba = best_model.predict_proba(X_valid)[:, 1]
elif hasattr(best_model, "decision_function"):
    s = best_model.decision_function(X_valid)
    proba = (s - s.min()) / (s.max() - s.min() + 1e-9)
else:
    proba = best_model.predict(X_valid).astype(float)

pred05 = (proba >= 0.5).astype(int)

from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, precision_score, recall_score
metrics_base = {
    "model": best_name,
    "roc_auc": roc_auc_score(y_valid, proba),
    "[email protected]": accuracy_score(y_valid, pred05),
    "[email protected]": precision_score(y_valid, pred05, zero_division=0),
    "[email protected]": recall_score(y_valid, pred05, zero_division=0),
    "[email protected]": f1_score(y_valid, pred05, zero_division=0),
}
base_df = pd.DataFrame([metrics_base]).round(4)
display(base_df.T)
0
model Logistic
roc_auc 1.0
accuracy@0.5 1.0
precision@0.5 1.0
recall@0.5 1.0
f1@0.5 1.0

3.7) Threshold tuning¶

Sweep thresholds to trade off precision and recall, and surface the thresholds with the best F1.

In [30]:
ths = np.linspace(0.05, 0.95, 19)
rows = []
for t in ths:
    pred = (proba >= t).astype(int)
    rows.append([
        t,
        precision_score(y_valid, pred, zero_division=0),
        recall_score(y_valid, pred, zero_division=0),
        f1_score(y_valid, pred, zero_division=0),
        accuracy_score(y_valid, pred)
    ])
thr_df = pd.DataFrame(rows, columns=["threshold","precision","recall","f1","accuracy"]).round(4)
thr_df.sort_values("f1", ascending=False).head(5)
Out[30]:
threshold precision recall f1 accuracy
0 0.05 1.0 1.0 1.0 1.0
10 0.55 1.0 1.0 1.0 1.0
17 0.90 1.0 1.0 1.0 1.0
16 0.85 1.0 1.0 1.0 1.0
15 0.80 1.0 1.0 1.0 1.0
In [31]:
plt.figure(figsize=(8,4))
plt.plot(thr_df["threshold"], thr_df["precision"], label="precision")
plt.plot(thr_df["threshold"], thr_df["recall"], label="recall")
plt.plot(thr_df["threshold"], thr_df["f1"], label="f1")
plt.xlabel("threshold"); plt.ylabel("score"); plt.legend(); plt.tight_layout(); plt.show()
[Figure: Precision, recall, and F1 across thresholds]

3.8) Confusion matrix at tuned threshold¶

Pick the best-F1 threshold and show the confusion matrix.

In [32]:
t_star = float(thr_df.sort_values("f1", ascending=False).iloc[0]["threshold"])
pred_star = (proba >= t_star).astype(int)

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_valid, pred_star, labels=[0,1])

plt.figure(figsize=(4,3))
plt.imshow(cm, cmap="Blues")
plt.xticks([0,1], ["No churn","Churn"])
plt.yticks([0,1], ["No churn","Churn"])
for i in range(2):
    for j in range(2):
        plt.text(j, i, cm[i,j], ha="center", va="center")
plt.title(f"Confusion matrix @ {t_star:.2f}")
plt.tight_layout(); plt.show()
[Figure: Confusion matrix at the tuned threshold]

3.9) Feature importance¶

Show global importance from the best model: coefficients for Logistic, impurity importance for tree models. The dominant features below are Churn Reason / Churn Category dummies, fields that appear to be populated only after a customer has churned, which points to target leakage and explains the perfect validation scores above.

In [33]:
import numpy as np, pandas as pd

pre = best_model.named_steps["pre"]
est = best_model.named_steps["clf"]

try:
    Xt_sample = pre.transform(X_train.iloc[: min(50, len(X_train))])
except Exception:
    Xt_sample = pre.fit_transform(X_train.iloc[: min(50, len(X_train))], y_train)
n_feat = Xt_sample.shape[1]

try:
    feats = pre.get_feature_names_out()
    feats = np.array([f.split("__", 1)[-1] for f in feats])
    if len(feats) != n_feat:
        raise ValueError("name length mismatch")
except Exception:
    feats = np.array([f"feature_{i}" for i in range(n_feat)])

vals = None
kind = None
if hasattr(est, "feature_importances_"):
    vals = np.asarray(est.feature_importances_, dtype=float)
    kind = "importance"
elif hasattr(est, "coef_"):
    vals = np.asarray(est.coef_).ravel()
    kind = "coef"

imp_df = None
if vals is not None:
    if len(vals) != n_feat:
        m = min(len(vals), n_feat)
        vals = vals[:m]
        feats = feats[:m]
    order = np.argsort(np.abs(vals))[::-1] if kind == "coef" else np.argsort(vals)[::-1]
    imp_df = pd.DataFrame({kind: np.round(vals[order], 4)}, index=feats[order]).head(20)

display(imp_df.head(10) if imp_df is not None else "No importance available for this estimator.")
coef
Churn Reason_nan -4.6624
Churn Category_nan -4.6624
Churn Category_Competitor 1.3453
Churn Reason_Attitude of support person 0.9130
Churn Reason_Other 0.8708
Churn Category_Dissatisfaction 0.7909
Churn Category_Price 0.7383
Number of Referrals -0.6605
Customer Satisfaction -0.6511
Churn Category_Other 0.6421
In [34]:
if imp_df is not None:
    vals = imp_df.iloc[:15,0].values
    labs = imp_df.index[:15]
    plt.figure(figsize=(8,5))
    plt.barh(range(len(vals))[::-1], vals[::-1])
    plt.yticks(range(len(vals))[::-1], labs[::-1])
    plt.title("Top features")
    plt.tight_layout(); plt.show()
[Figure: Top features]
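
As a robustness check (a sketch only; results not shown here), the post-outcome fields flagged above can be dropped and the baseline re-fit on the same split, reusing the numeric/categorical pipelines from 3.3. A materially lower AUC would confirm that the perfect scores were driven by leakage; the 500-iteration setting is an assumption, not the original configuration.

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Hypothetical leakage check: drop fields that appear to be populated only after a churn
leaky = [c for c in X.columns if c.startswith(("Churn Category", "Churn Reason"))]

pre_clean = ColumnTransformer([
    ("num", num_pipe, [c for c in num_cols if c not in leaky]),
    ("cat", cat_pipe, [c for c in cat_cols if c not in leaky]),
], remainder="drop")

clean_lr = Pipeline([("pre", pre_clean), ("clf", LogisticRegression(max_iter=500))])
clean_lr.fit(X_train.drop(columns=leaky), y_train)
proba_clean = clean_lr.predict_proba(X_valid.drop(columns=leaky))[:, 1]
print("ROC-AUC without leaky fields:", round(roc_auc_score(y_valid, proba_clean), 3))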

3.10) Save a lean output¶

Keep IDs, actuals, probabilities, and a default 0.5 prediction.

In [35]:
out = pd.DataFrame({
    "row_id": np.arange(len(y_valid)),
    "actual": y_valid.to_numpy(),
    "proba": proba,
    "[email protected]": pred05
})
display(out.head(10))
row_id actual proba pred@0.5
0 0 0 0.000692 0
1 1 0 0.000189 0
2 2 0 0.001049 0
3 3 0 0.000329 0
4 4 0 0.000488 0
5 5 1 0.991993 1
6 6 0 0.001122 0
7 7 0 0.000989 0
8 8 0 0.000228 0
9 9 0 0.000181 0