Tail events, why they matter and how to model them (code)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))
plt.figure(figsize=(20, 12))
np.random.seed(123)
n = 100000
fs = {"fontsize" : 16}
sns.kdeplot(stats.distributions.gumbel_r().rvs(n), label="Gumbel")
sns.kdeplot(stats.distributions.norm().rvs(n), label="Normal")
plt.legend(**fs)
plt.xticks(**fs)
plt.yticks(**fs)
plt.ylabel("Density", **fs)
plt.xlabel("X", **fs);
df = pd.read_csv(
"/Users/conormcdonald/Downloads/archive-5/DailyDelhiClimateTrain.csv",
parse_dates=["date"]
)
block_maxima_month = (
df
.set_index("date")
.resample("M")
.max()
.wind_speed
.reset_index()
.query("wind_speed > 5")
)
fs = {"fontsize" : 16}
plt.figure(figsize=(24, 10))
plt.scatter(df.date, df.wind_speed, alpha=.3)
plt.plot(df.date, df.wind_speed, alpha=.7, c="b")
plt.scatter(block_maxima_month.date, block_maxima_month.wind_speed, c="red", s=80)
plt.xticks(**fs)
plt.yticks(**fs)
plt.ylabel("Wind speed", **fs)
plt.xlabel("Date", **fs);
plt.figure(figsize=(24, 10))
sns.distplot(block_maxima_month.wind_speed)
plt.xticks(**fs)
plt.yticks(**fs)
plt.ylabel("Density", **fs)
plt.xlabel("Wind speed", **fs);
ev_model = pm.Model()
with ev_model:
loc = pm.Normal("loc", 5, 10)
scale = pm.HalfCauchy("scale", 1)
lik = pm.Gumbel("lik", loc, scale, observed=block_maxima_month.wind_speed)
ev_trace = pm.sample()
norm_model = pm.Model()
with norm_model:
loc = pm.Normal("loc", 5, 10)
scale = pm.HalfCauchy("scale", 1)
lik = pm.Normal("lik", loc, scale, observed=block_maxima_month.wind_speed)
norm_trace = pm.sample()
t_model = pm.Model()
with t_model:
loc = pm.Normal("loc", 5, 10)
scale = pm.HalfCauchy("scale", 1)
nu = pm.Exponential("nu", 20)
lik = pm.StudentT("lik", nu, loc, scale, observed=block_maxima_month.wind_speed)
t_trace = pm.sample()
p1 = pm.traceplot(ev_trace, figsize=(14, 7))
p2 = pm.traceplot(norm_trace, figsize=(14, 7));
pm.plot_posterior(ev_trace)
pm.plot_posterior(norm_trace)
ev_post = pm.sample_posterior_predictive(ev_trace, model=ev_model)["lik"].T
norm_post = pm.sample_posterior_predictive(norm_trace, model=norm_model)["lik"].T
t_post = pm.sample_posterior_predictive(t_trace, model=t_model)["lik"].T
ev_post = pm.sample_posterior_predictive(ev_trace, model=ev_model)["lik"].T
norm_post = pm.sample_posterior_predictive(norm_trace, model=norm_model)["lik"].T
# Plot density
plt.figure(figsize=(20, 10))
pm.plot_dist(ev_post, label="Gumbel", textsize=20, hist_kwargs=fs)
pm.plot_dist(norm_post, label="Normal", textsize=20, )
sns.kdeplot(block_maxima_month.wind_speed, label="Actual")
plt.legend(fontsize=14)
def calcualte_exceedance_probability(exceed_values, posterior):
n = posterior.shape[1]
ix = []
probs_m = []
probs_05 = []
probs_95 = []
for i in exceed_values:
p = ((posterior>i).sum(1)/n)
p05 = np.quantile(p, 0.01)
p95 = np.quantile(p, 0.99)
probs_m.append(p.mean())
probs_05.append(p05)
probs_95.append(p95)
ix.append(i)
return pd.DataFrame(dict(exceed_value=ix, ep_mean=probs_m, ep_lb=probs_05, ep_ub=probs_95))
exceedance = calcualte_exceedance_probability(list(range(1, 60)), ev_post)
plt.figure(figsize=(20, 12))
plt.plot(exceedance.exceed_value, exceedance.ep_mean, c="r", label="ev")
plt.fill_between(exceedance.exceed_value, exceedance.ep_lb, exceedance.ep_ub, color="b", alpha=.3)
plt.xticks(**fs)
plt.yticks(**fs)
plt.ylabel("Wind speed", **fs)
plt.xlabel("Date", **fs);
plt.legend()