From Jupyter to Production: Packaging an ML Model for a REST API
Your notebook has a trained XGBoost model. Cross-validation looks great. You need it serving live predictions at low latency behind a REST API — but the leap from pickle.dump(model, f) in a Jupyter cell to a production FastAPI service has about a dozen moving parts that aren't obvious.
This post walks through a real packaging pattern: serializing the model with metadata, verifying integrity at load time, building a FastAPI endpoint, handling versioning, and gracefully falling back when the model file goes missing. Every block is copy-pasteable.
Step 1: Save the Model With Metadata, Not Just the Model
Naive pickle loses critical context: which features did the model expect? What's its training date? Its calibration metrics? What calibrator wraps it? Wrap everything in a dict before pickling.
import datetime as dt
import hashlib
import importlib
import pickle
import sys
from pathlib import Path
def save_model_bundle(out_path: Path,
model,
feature_cols: list,
calibrator=None,
metrics: dict = None,
meta: dict = None) -> str:
"""Serialize model + all metadata needed to run it in production.
Returns the SHA-256 of the pickle for integrity tracking.
"""
bundle = {
"model": model,
"calibrator": calibrator,
"feature_cols": list(feature_cols),
"trained_at": dt.datetime.utcnow().isoformat() + "Z",
"python_version": __import__("sys").version.split()[0],
"xgboost_version": __import__("xgboost").__version__,
"sklearn_version": __import__("sklearn").__version__,
"metrics": metrics or {},
"meta": meta or {},
}
with open(out_path, "wb") as f:
pickle.dump(bundle, f)
sha = hashlib.sha256(out_path.read_bytes()).hexdigest()
# Write companion hash file
out_path.with_suffix(".sha256").write_text(f"{sha} {out_path.name}\n")
return sha
The companion `.sha256` file enables integrity verification at load time and in CI. If either file is corrupted or tampered with, the hashes won't match.
Step 2: Load With Integrity Verification
class ModelBundle:
    """In-memory model bundle: estimator + optional calibrator + serving metadata."""

    def __init__(self, model, calibrator, feature_cols, metrics, meta):
        self.model = model
        self.calibrator = calibrator
        self.feature_cols = feature_cols
        self.metrics = metrics
        self.meta = meta

    @classmethod
    def load(cls, path: Path, verify: bool = True) -> "ModelBundle":
        """Deserialize a bundle, optionally checking the companion .sha256 file.

        Verification is skipped silently when the hash file is absent.
        SECURITY: pickle.load executes arbitrary code on deserialization —
        only load bundles produced by your own pipeline.
        """
        if not path.exists():
            raise FileNotFoundError(f"Model not found at {path}")
        hash_path = path.with_suffix(".sha256")
        if verify and hash_path.exists():
            expected = hash_path.read_text().strip().split()[0]
            actual = hashlib.sha256(path.read_bytes()).hexdigest()
            if actual != expected:
                raise ValueError(
                    f"INTEGRITY FAIL: {path.name} hash mismatch. "
                    f"Expected {expected[:16]}... got {actual[:16]}..."
                )
        payload = pickle.loads(path.read_bytes())
        return cls(
            model=payload["model"],
            calibrator=payload.get("calibrator"),
            feature_cols=payload["feature_cols"],
            metrics=payload.get("metrics", {}),
            meta=payload.get("meta", {}),
        )

    def predict_proba(self, X):
        """Delegate scoring to the calibrator when present, else the raw model."""
        scorer = self.model if self.calibrator is None else self.calibrator
        return scorer.predict_proba(X)
Step 3: Build a Feature-Order-Agnostic Predict Function
The order of your feature columns at training time must match the order at inference. Hard-coding the order is brittle. Instead, let the bundle remember which features it needs and let the caller pass a dict.
import numpy as np
def bundle_predict(bundle: "ModelBundle", features: dict) -> float:
    """Predict P(class=1) from a dict of feature values.

    The bundle's stored ``feature_cols`` fixes the column order, so callers
    never deal with positional features. Missing features default to 0.0;
    extra features are ignored.

    Args:
        bundle: a loaded ModelBundle (anything exposing ``feature_cols`` and
            ``predict_proba``). Quoted as a forward reference so this function
            imports cleanly even when ModelBundle is defined later/elsewhere.
        features: mapping of feature name -> numeric value.

    Returns:
        Probability of the positive class, clipped to [0.01, 0.99] so
        downstream consumers never receive a hard 0/1 claim.
    """
    row = [features.get(col, 0.0) for col in bundle.feature_cols]
    x = np.array([row], dtype=np.float32)
    proba = bundle.predict_proba(x)[0, 1]
    # Clip away degenerate certainties; well-calibrated models rarely hit this.
    return float(np.clip(proba, 0.01, 0.99))
# Usage: callers pass a plain dict; key order and completeness don't matter.
features = {
    "score_diff": 5,
    "seconds_remaining": 300,
    "time_fraction": 0.105,
    # model can have more features; missing ones default to 0.0
}
wp = bundle_predict(bundle, features)  # `bundle` is a previously loaded ModelBundle
This pattern means your API callers don't need to know about feature ordering or the exact set of features in use. Swap a model with different features in and out without touching API code.
Step 4: Wrap in FastAPI
Load the model once at startup, hold it in module state, and serve predictions via a simple endpoint.
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from pathlib import Path
app = FastAPI(title="WP Prediction API")

# Module-level state: the bundle is loaded once at startup and shared by all
# request handlers. Handlers only ever rebind `_bundle` (atomic in CPython),
# so no lock is needed for readers.
MODEL_PATH = Path("./model_bundle.pkl")
_bundle: ModelBundle | None = None
@app.on_event("startup")
async def load_model():
    """Load the model bundle once at process startup.

    Failures are deliberately logged but not re-raised: the API still boots so
    /v1/health can report "model_not_loaded" instead of the whole service
    crash-looping on a missing or corrupt file.
    """
    global _bundle
    try:
        _bundle = ModelBundle.load(MODEL_PATH)
        print(f"Loaded model: {_bundle.meta} metrics={_bundle.metrics}")
    except Exception as e:
        print(f"Failed to load model: {e}")
class PredictRequest(BaseModel):
    # Arbitrary feature mapping; bundle_predict ignores unknown keys and
    # defaults missing ones to 0.0.
    features: dict
class PredictResponse(BaseModel):
    probability: float   # P(class=1) from bundle_predict, clipped to [0.01, 0.99]
    model_version: str   # taken from bundle meta["version"]; "unknown" if unset
    trained_at: str      # taken from bundle meta["trained_at"]; "unknown" if unset
@app.post("/v1/predict", response_model=PredictResponse)
async def predict(req: PredictRequest):
    """Score one feature dict; 503 before the model loads, 500 on model errors."""
    if _bundle is None:
        raise HTTPException(503, "Model not loaded")
    try:
        p = bundle_predict(_bundle, req.features)
        return PredictResponse(
            probability=p,
            # NOTE(review): save_model_bundle stores "trained_at" at the bundle's
            # top level, not inside meta — so this lookup returns "unknown" unless
            # the trainer also duplicates version/trained_at into meta. Confirm.
            model_version=_bundle.meta.get("version", "unknown"),
            trained_at=_bundle.meta.get("trained_at", "unknown"),
        )
    except Exception as e:
        raise HTTPException(500, f"Prediction failed: {e}")
@app.get("/v1/health")
async def health():
    """Liveness + model-state probe; also surfaces the bundle's training metrics."""
    return {
        "status": "ok" if _bundle else "model_not_loaded",
        "model_metrics": _bundle.metrics if _bundle else None,
    }
Run with uvicorn main:app --host 0.0.0.0 --port 8000. Supports thousands of requests/sec on modest hardware once the model is cached in process memory.
Step 5: Handle Model Reloads Gracefully
You'll want to swap models without restarting the API (retraining is a continuous process). A simple file-watcher pattern:
import time
import asyncio
_model_mtime: float = 0.0  # mtime of the last successfully loaded bundle file

async def watch_model_file():
    """Background task: reload bundle when the .pkl file changes.

    Polls the file's mtime every 10 seconds; any difference (newer or older)
    triggers a reload, so rollbacks to an older file are picked up too.
    """
    global _bundle, _model_mtime
    while True:
        try:
            mtime = MODEL_PATH.stat().st_mtime
            if mtime != _model_mtime:
                # Load fully (including hash verification) BEFORE swapping, so a
                # half-copied or corrupt file never replaces the serving bundle.
                new_bundle = ModelBundle.load(MODEL_PATH)
                # Rebinding is atomic: in-flight requests see old or new, never a mix.
                _bundle = new_bundle
                _model_mtime = mtime
                print(f"Reloaded model, new mtime={mtime}")
        except Exception as e:
            # Swallow-and-log: a transient stat()/load error must not kill the
            # watcher; _model_mtime stays stale so the load is retried next tick.
            print(f"Watch error: {e}")
        await asyncio.sleep(10)
_watch_task: asyncio.Task | None = None  # keeps the watcher alive (see docstring)

@app.on_event("startup")
async def start_watcher():
    """Start the background model-file watcher.

    The task handle is stored in module state because the event loop holds
    only a weak reference to tasks: a fire-and-forget asyncio.create_task()
    result can be garbage-collected mid-flight, silently killing the watcher
    (documented pitfall in the asyncio.create_task docs).
    """
    global _watch_task
    _watch_task = asyncio.create_task(watch_model_file())
Deploy a new model_bundle.pkl + .sha256, wait 10 seconds, and the API is serving the new model. Zero downtime. Zero restart needed.
Step 6: Fall Back Gracefully When the Model Is Missing
Production systems must handle the case where the model file isn't there, is corrupted, or has a mismatched hash. Never serve garbage — either degrade to a sensible fallback or fail loudly.
@app.post("/v1/predict")
async def predict(req: PredictRequest):
    """Predict endpoint with a non-ML fallback when the bundle is unavailable.

    NOTE(review): this redefines the /v1/predict handler shown earlier — in a
    real module, merge this fallback branch into that single handler instead
    of registering the route twice.
    """
    if _bundle is None:
        # Fallback: Elo-based baseline (no ML)
        elo_diff = req.features.get("elo_diff", 0)
        # Standard Elo expected score: P(win) = 1 / (1 + 10^(-diff/400)).
        fallback_wp = 1.0 / (1.0 + 10 ** (-elo_diff / 400))
        return PredictResponse(
            probability=fallback_wp,
            model_version="fallback:elo",
            trained_at="n/a",
        )
    # Normal path
    ...
Log every fallback invocation so you notice when your model is silently missing in prod.
Step 7: CI — Test Both Training and Serving
Your deploy pipeline should verify:
- The bundle loads cleanly with integrity check passing
- Predict returns a value in [0, 1] for a known feature dict
- Predict latency under some threshold (e.g., < 10ms on CPU)
- The `feature_cols` list matches your feature engineering module's output
import pytest
import time
def test_bundle_loads():
    """Bundle deserializes with a model and a non-empty feature list."""
    loaded = ModelBundle.load(Path("./model_bundle.pkl"))
    assert loaded.model is not None
    assert loaded.feature_cols, "bundle must declare at least one feature"
def test_predict_returns_valid_probability():
    """An all-zeros feature dict must still score to a probability in [0, 1]."""
    loaded = ModelBundle.load(Path("./model_bundle.pkl"))
    zeros = dict.fromkeys(loaded.feature_cols, 0.0)
    assert 0 <= bundle_predict(loaded, zeros) <= 1
def test_predict_latency_under_threshold():
    """Average single-prediction latency must stay under the 10ms CPU budget."""
    loaded = ModelBundle.load(Path("./model_bundle.pkl"))
    zeros = dict.fromkeys(loaded.feature_cols, 0.0)
    n_calls = 100
    start = time.perf_counter()
    for _ in range(n_calls):
        bundle_predict(loaded, zeros)
    # Explicit per-call average in ms (same value as the old `elapsed * 10`).
    avg_ms = (time.perf_counter() - start) / n_calls * 1000
    assert avg_ms < 10, f"Avg latency {avg_ms:.1f}ms exceeds 10ms budget"
def test_feature_cols_match_engineering():
    """Every feature the model expects must be produced by the live pipeline."""
    from my_pipeline import engineer_features
    import pandas as pd
    # Minimal dummy row; only the output *columns* matter here, not the values.
    dummy = pd.DataFrame([{"score": 0, "time": 0}])
    engineered = engineer_features(dummy)
    bundle = ModelBundle.load(Path("./model_bundle.pkl"))
    # A missing column would only fail silently at serve time (bundle_predict
    # defaults it to 0.0), so fail loudly here in CI instead.
    missing = [c for c in bundle.feature_cols if c not in engineered.columns]
    assert not missing, f"Features missing from pipeline: {missing}"
Production Checklist
- [ ] Model saved as a dict with all metadata (features, metrics, calibrator, versions)
- [ ] Companion `.sha256` hash file committed alongside the `.pkl`
- [ ] Integrity verification at load time; crashes loudly on mismatch
- [ ] Dict-based predict (no brittle positional feature ordering)
- [ ] File-watcher for zero-downtime model swaps
- [ ] Graceful fallback when model is missing/corrupt
- [ ] Health endpoint exposing current model metadata + metrics
- [ ] CI tests for load, predict, latency, and feature-list parity
- [ ] Structured logs including model_version on every prediction
Building a model-serving pipeline is a week of work. ZenHodl's prediction API is already deployed with all of this — calibrated sports probabilities for 11 sports via REST.
See the API. Further reading: Calibrating XGBoost Probabilities · Sample Weights in XGBoost · Feature Engineering for Sports Win Probability
Related Reading
- NCAAMB 2025-26 Season Report — what a production model's ECE looks like after deployment.
- Build a March Madness prediction model — end-to-end example of the pipeline.
- Build a Super Bowl prediction model — NFL version of the same deployment pattern.
- Build an MLB prediction model — baseball version.
- Best College Basketball Prediction Sites 2026 — how to evaluate competing APIs.